summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/core
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--drivers/infiniband/core/Makefile44
-rw-r--r--drivers/infiniband/core/addr.c889
-rw-r--r--drivers/infiniband/core/agent.c221
-rw-r--r--drivers/infiniband/core/agent.h51
-rw-r--r--drivers/infiniband/core/cache.c1674
-rw-r--r--drivers/infiniband/core/cgroup.c53
-rw-r--r--drivers/infiniband/core/cm.c4528
-rw-r--r--drivers/infiniband/core/cm_msgs.h68
-rw-r--r--drivers/infiniband/core/cm_trace.c15
-rw-r--r--drivers/infiniband/core/cm_trace.h414
-rw-r--r--drivers/infiniband/core/cma.c5455
-rw-r--r--drivers/infiniband/core/cma_configfs.c366
-rw-r--r--drivers/infiniband/core/cma_priv.h139
-rw-r--r--drivers/infiniband/core/cma_trace.c16
-rw-r--r--drivers/infiniband/core/cma_trace.h361
-rw-r--r--drivers/infiniband/core/core_priv.h376
-rw-r--r--drivers/infiniband/core/counters.c669
-rw-r--r--drivers/infiniband/core/cq.c507
-rw-r--r--drivers/infiniband/core/device.c2878
-rw-r--r--drivers/infiniband/core/ib_core_uverbs.c367
-rw-r--r--drivers/infiniband/core/iwcm.c1223
-rw-r--r--drivers/infiniband/core/iwcm.h62
-rw-r--r--drivers/infiniband/core/iwpm_msg.c846
-rw-r--r--drivers/infiniband/core/iwpm_util.c793
-rw-r--r--drivers/infiniband/core/iwpm_util.h265
-rw-r--r--drivers/infiniband/core/lag.c137
-rw-r--r--drivers/infiniband/core/mad.c3156
-rw-r--r--drivers/infiniband/core/mad_priv.h225
-rw-r--r--drivers/infiniband/core/mad_rmpp.c960
-rw-r--r--drivers/infiniband/core/mad_rmpp.h58
-rw-r--r--drivers/infiniband/core/mr_pool.c82
-rw-r--r--drivers/infiniband/core/multicast.c906
-rw-r--r--drivers/infiniband/core/netlink.c331
-rw-r--r--drivers/infiniband/core/nldev.c2548
-rw-r--r--drivers/infiniband/core/opa_smi.h78
-rw-r--r--drivers/infiniband/core/packer.c201
-rw-r--r--drivers/infiniband/core/rdma_core.c1015
-rw-r--r--drivers/infiniband/core/rdma_core.h191
-rw-r--r--drivers/infiniband/core/restrack.c353
-rw-r--r--drivers/infiniband/core/restrack.h36
-rw-r--r--drivers/infiniband/core/roce_gid_mgmt.c929
-rw-r--r--drivers/infiniband/core/rw.c734
-rw-r--r--drivers/infiniband/core/sa.h64
-rw-r--r--drivers/infiniband/core/sa_query.c2359
-rw-r--r--drivers/infiniband/core/security.c750
-rw-r--r--drivers/infiniband/core/smi.c338
-rw-r--r--drivers/infiniband/core/smi.h90
-rw-r--r--drivers/infiniband/core/sysfs.c1480
-rw-r--r--drivers/infiniband/core/trace.c12
-rw-r--r--drivers/infiniband/core/ucma.c1896
-rw-r--r--drivers/infiniband/core/ud_header.c547
-rw-r--r--drivers/infiniband/core/umem.c312
-rw-r--r--drivers/infiniband/core/umem_dmabuf.c234
-rw-r--r--drivers/infiniband/core/umem_odp.c515
-rw-r--r--drivers/infiniband/core/user_mad.c1497
-rw-r--r--drivers/infiniband/core/uverbs.h322
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c4052
-rw-r--r--drivers/infiniband/core/uverbs_ioctl.c837
-rw-r--r--drivers/infiniband/core/uverbs_main.c1315
-rw-r--r--drivers/infiniband/core/uverbs_marshall.c215
-rw-r--r--drivers/infiniband/core/uverbs_std_types.c269
-rw-r--r--drivers/infiniband/core/uverbs_std_types_async_fd.c79
-rw-r--r--drivers/infiniband/core/uverbs_std_types_counters.c163
-rw-r--r--drivers/infiniband/core/uverbs_std_types_cq.c222
-rw-r--r--drivers/infiniband/core/uverbs_std_types_device.c503
-rw-r--r--drivers/infiniband/core/uverbs_std_types_dm.c116
-rw-r--r--drivers/infiniband/core/uverbs_std_types_flow_action.c66
-rw-r--r--drivers/infiniband/core/uverbs_std_types_mr.c385
-rw-r--r--drivers/infiniband/core/uverbs_std_types_qp.c380
-rw-r--r--drivers/infiniband/core/uverbs_std_types_srq.c234
-rw-r--r--drivers/infiniband/core/uverbs_std_types_wq.c194
-rw-r--r--drivers/infiniband/core/uverbs_uapi.c734
-rw-r--r--drivers/infiniband/core/verbs.c3030
73 files changed, 57430 insertions, 0 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
new file mode 100644
index 000000000..8ab4eea5a
--- /dev/null
+++ b/drivers/infiniband/core/Makefile
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: GPL-2.0
+infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o
+user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o
+
+obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \
+ $(infiniband-y)
+obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
+obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y)
+
+ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
+ device.o cache.o netlink.o \
+ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
+ multicast.o mad.o smi.o agent.o mad_rmpp.o \
+ nldev.o restrack.o counters.o ib_core_uverbs.o \
+ trace.o lag.o
+
+ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
+ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
+
+ib_cm-y := cm.o cm_trace.o
+
+iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o
+
+CFLAGS_cma_trace.o += -I$(src)
+rdma_cm-y := cma.o cma_trace.o
+
+rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o
+
+rdma_ucm-y := ucma.o
+
+ib_umad-y := user_mad.o
+
+ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \
+ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \
+ uverbs_std_types_cq.o \
+ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \
+ uverbs_std_types_mr.o uverbs_std_types_counters.o \
+ uverbs_uapi.o uverbs_std_types_device.o \
+ uverbs_std_types_async_fd.o \
+ uverbs_std_types_srq.o \
+ uverbs_std_types_wq.o \
+ uverbs_std_types_qp.o
+ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o umem_dmabuf.o
+ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
new file mode 100644
index 000000000..f25329579
--- /dev/null
+++ b/drivers/infiniband/core/addr.c
@@ -0,0 +1,889 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <net/arp.h>
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <net/netevent.h>
+#include <net/ipv6_stubs.h>
+#include <net/ip6_route.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+
+#include "core_priv.h"
+
+struct addr_req {
+ struct list_head list;
+ struct sockaddr_storage src_addr;
+ struct sockaddr_storage dst_addr;
+ struct rdma_dev_addr *addr;
+ void *context;
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context);
+ unsigned long timeout;
+ struct delayed_work work;
+ bool resolve_by_gid_attr; /* Consider gid attr in resolve phase */
+ int status;
+ u32 seq;
+};
+
+static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0);
+
+static DEFINE_SPINLOCK(lock);
+static LIST_HEAD(req_list);
+static struct workqueue_struct *addr_wq;
+
+static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = {
+ [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid),
+ .validation_type = NLA_VALIDATE_MIN,
+ .min = sizeof(struct rdma_nla_ls_gid)},
+};
+
+static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh)
+{
+ struct nlattr *tb[LS_NLA_TYPE_MAX] = {};
+ int ret;
+
+ if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+ return false;
+
+ ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_addr_policy, NULL);
+ if (ret)
+ return false;
+
+ return true;
+}
+
+static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh)
+{
+ const struct nlattr *head, *curr;
+ union ib_gid gid;
+ struct addr_req *req;
+ int len, rem;
+ int found = 0;
+
+ head = (const struct nlattr *)nlmsg_data(nlh);
+ len = nlmsg_len(nlh);
+
+ nla_for_each_attr(curr, head, len, rem) {
+ if (curr->nla_type == LS_NLA_TYPE_DGID)
+ memcpy(&gid, nla_data(curr), nla_len(curr));
+ }
+
+ spin_lock_bh(&lock);
+ list_for_each_entry(req, &req_list, list) {
+ if (nlh->nlmsg_seq != req->seq)
+ continue;
+ /* We set the DGID part, the rest was set earlier */
+ rdma_addr_set_dgid(req->addr, &gid);
+ req->status = 0;
+ found = 1;
+ break;
+ }
+ spin_unlock_bh(&lock);
+
+ if (!found)
+ pr_info("Couldn't find request waiting for DGID: %pI6\n",
+ &gid);
+}
+
+int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ !(NETLINK_CB(skb).sk))
+ return -EPERM;
+
+ if (ib_nl_is_good_ip_resp(nlh))
+ ib_nl_process_good_ip_rsep(nlh);
+
+ return 0;
+}
+
+static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr,
+ const void *daddr,
+ u32 seq, u16 family)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ struct rdma_ls_ip_resolve_header *header;
+ void *data;
+ size_t size;
+ int attrtype;
+ int len;
+
+ if (family == AF_INET) {
+ size = sizeof(struct in_addr);
+ attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4;
+ } else {
+ size = sizeof(struct in6_addr);
+ attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6;
+ }
+
+ len = nla_total_size(sizeof(size));
+ len += NLMSG_ALIGN(sizeof(*header));
+
+ skb = nlmsg_new(len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS,
+ RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST);
+ if (!data) {
+ nlmsg_free(skb);
+ return -ENODATA;
+ }
+
+ /* Construct the family header first */
+ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+ header->ifindex = dev_addr->bound_dev_if;
+ nla_put(skb, attrtype, size, daddr);
+
+ /* Repair the nlmsg header length */
+ nlmsg_end(skb, nlh);
+ rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, GFP_KERNEL);
+
+ /* Make the request retry, so when we get the response from userspace
+ * we will have something.
+ */
+ return -ENODATA;
+}
+
+int rdma_addr_size(const struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ return sizeof(struct sockaddr_in);
+ case AF_INET6:
+ return sizeof(struct sockaddr_in6);
+ case AF_IB:
+ return sizeof(struct sockaddr_ib);
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(rdma_addr_size);
+
+int rdma_addr_size_in6(struct sockaddr_in6 *addr)
+{
+ int ret = rdma_addr_size((struct sockaddr *) addr);
+
+ return ret <= sizeof(*addr) ? ret : 0;
+}
+EXPORT_SYMBOL(rdma_addr_size_in6);
+
+int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr)
+{
+ int ret = rdma_addr_size((struct sockaddr *) addr);
+
+ return ret <= sizeof(*addr) ? ret : 0;
+}
+EXPORT_SYMBOL(rdma_addr_size_kss);
+
+/**
+ * rdma_copy_src_l2_addr - Copy netdevice source addresses
+ * @dev_addr: Destination address pointer where to copy the addresses
+ * @dev: Netdevice whose source addresses to copy
+ *
+ * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice.
+ * This includes unicast address, broadcast address, device type and
+ * interface index.
+ */
+void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+ const struct net_device *dev)
+{
+ dev_addr->dev_type = dev->type;
+ memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+ memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
+ dev_addr->bound_dev_if = dev->ifindex;
+}
+EXPORT_SYMBOL(rdma_copy_src_l2_addr);
+
+static struct net_device *
+rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in)
+{
+ struct net_device *dev = NULL;
+ int ret = -EADDRNOTAVAIL;
+
+ switch (src_in->sa_family) {
+ case AF_INET:
+ dev = __ip_dev_find(net,
+ ((const struct sockaddr_in *)src_in)->sin_addr.s_addr,
+ false);
+ if (dev)
+ ret = 0;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ for_each_netdev_rcu(net, dev) {
+ if (ipv6_chk_addr(net,
+ &((const struct sockaddr_in6 *)src_in)->sin6_addr,
+ dev, 1)) {
+ ret = 0;
+ break;
+ }
+ }
+ break;
+#endif
+ }
+ return ret ? ERR_PTR(ret) : dev;
+}
+
+int rdma_translate_ip(const struct sockaddr *addr,
+ struct rdma_dev_addr *dev_addr)
+{
+ struct net_device *dev;
+
+ if (dev_addr->bound_dev_if) {
+ dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ rdma_copy_src_l2_addr(dev_addr, dev);
+ dev_put(dev);
+ return 0;
+ }
+
+ rcu_read_lock();
+ dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr);
+ if (!IS_ERR(dev))
+ rdma_copy_src_l2_addr(dev_addr, dev);
+ rcu_read_unlock();
+ return PTR_ERR_OR_ZERO(dev);
+}
+EXPORT_SYMBOL(rdma_translate_ip);
+
+static void set_timeout(struct addr_req *req, unsigned long time)
+{
+ unsigned long delay;
+
+ delay = time - jiffies;
+ if ((long)delay < 0)
+ delay = 0;
+
+ mod_delayed_work(addr_wq, &req->work, delay);
+}
+
+static void queue_req(struct addr_req *req)
+{
+ spin_lock_bh(&lock);
+ list_add_tail(&req->list, &req_list);
+ set_timeout(req, req->timeout);
+ spin_unlock_bh(&lock);
+}
+
+static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr,
+ const void *daddr, u32 seq, u16 family)
+{
+ if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
+ return -EADDRNOTAVAIL;
+
+ return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
+}
+
+static int dst_fetch_ha(const struct dst_entry *dst,
+ struct rdma_dev_addr *dev_addr,
+ const void *daddr)
+{
+ struct neighbour *n;
+ int ret = 0;
+
+ n = dst_neigh_lookup(dst, daddr);
+ if (!n)
+ return -ENODATA;
+
+ if (!(n->nud_state & NUD_VALID)) {
+ neigh_event_send(n, NULL);
+ ret = -ENODATA;
+ } else {
+ neigh_ha_snapshot(dev_addr->dst_dev_addr, n, dst->dev);
+ }
+
+ neigh_release(n);
+
+ return ret;
+}
+
+static bool has_gateway(const struct dst_entry *dst, sa_family_t family)
+{
+ struct rtable *rt;
+ struct rt6_info *rt6;
+
+ if (family == AF_INET) {
+ rt = container_of(dst, struct rtable, dst);
+ return rt->rt_uses_gateway;
+ }
+
+ rt6 = container_of(dst, struct rt6_info, dst);
+ return rt6->rt6i_flags & RTF_GATEWAY;
+}
+
+static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
+ const struct sockaddr *dst_in, u32 seq)
+{
+ const struct sockaddr_in *dst_in4 =
+ (const struct sockaddr_in *)dst_in;
+ const struct sockaddr_in6 *dst_in6 =
+ (const struct sockaddr_in6 *)dst_in;
+ const void *daddr = (dst_in->sa_family == AF_INET) ?
+ (const void *)&dst_in4->sin_addr.s_addr :
+ (const void *)&dst_in6->sin6_addr;
+ sa_family_t family = dst_in->sa_family;
+
+ might_sleep();
+
+ /* If we have a gateway in IB mode then it must be an IB network */
+ if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB)
+ return ib_nl_fetch_ha(dev_addr, daddr, seq, family);
+ else
+ return dst_fetch_ha(dst, dev_addr, daddr);
+}
+
+static int addr4_resolve(struct sockaddr *src_sock,
+ const struct sockaddr *dst_sock,
+ struct rdma_dev_addr *addr,
+ struct rtable **prt)
+{
+ struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock;
+ const struct sockaddr_in *dst_in =
+ (const struct sockaddr_in *)dst_sock;
+
+ __be32 src_ip = src_in->sin_addr.s_addr;
+ __be32 dst_ip = dst_in->sin_addr.s_addr;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ int ret;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = dst_ip;
+ fl4.saddr = src_ip;
+ fl4.flowi4_oif = addr->bound_dev_if;
+ rt = ip_route_output_key(addr->net, &fl4);
+ ret = PTR_ERR_OR_ZERO(rt);
+ if (ret)
+ return ret;
+
+ src_in->sin_addr.s_addr = fl4.saddr;
+
+ addr->hoplimit = ip4_dst_hoplimit(&rt->dst);
+
+ *prt = rt;
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int addr6_resolve(struct sockaddr *src_sock,
+ const struct sockaddr *dst_sock,
+ struct rdma_dev_addr *addr,
+ struct dst_entry **pdst)
+{
+ struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock;
+ const struct sockaddr_in6 *dst_in =
+ (const struct sockaddr_in6 *)dst_sock;
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+
+ memset(&fl6, 0, sizeof fl6);
+ fl6.daddr = dst_in->sin6_addr;
+ fl6.saddr = src_in->sin6_addr;
+ fl6.flowi6_oif = addr->bound_dev_if;
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(addr->net, NULL, &fl6, NULL);
+ if (IS_ERR(dst))
+ return PTR_ERR(dst);
+
+ if (ipv6_addr_any(&src_in->sin6_addr))
+ src_in->sin6_addr = fl6.saddr;
+
+ addr->hoplimit = ip6_dst_hoplimit(dst);
+
+ *pdst = dst;
+ return 0;
+}
+#else
+static int addr6_resolve(struct sockaddr *src_sock,
+ const struct sockaddr *dst_sock,
+ struct rdma_dev_addr *addr,
+ struct dst_entry **pdst)
+{
+ return -EADDRNOTAVAIL;
+}
+#endif
+
+static int addr_resolve_neigh(const struct dst_entry *dst,
+ const struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr,
+ unsigned int ndev_flags,
+ u32 seq)
+{
+ int ret = 0;
+
+ if (ndev_flags & IFF_LOOPBACK) {
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+ } else {
+ if (!(ndev_flags & IFF_NOARP)) {
+ /* If the device doesn't do ARP internally */
+ ret = fetch_ha(dst, addr, dst_in, seq);
+ }
+ }
+ return ret;
+}
+
+static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+ const struct sockaddr *dst_in,
+ const struct dst_entry *dst,
+ const struct net_device *ndev)
+{
+ int ret = 0;
+
+ if (dst->dev->flags & IFF_LOOPBACK)
+ ret = rdma_translate_ip(dst_in, dev_addr);
+ else
+ rdma_copy_src_l2_addr(dev_addr, dst->dev);
+
+ /*
+ * If there's a gateway and type of device not ARPHRD_INFINIBAND,
+ * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the
+ * network type accordingly.
+ */
+ if (has_gateway(dst, dst_in->sa_family) &&
+ ndev->type != ARPHRD_INFINIBAND)
+ dev_addr->network = dst_in->sa_family == AF_INET ?
+ RDMA_NETWORK_IPV4 :
+ RDMA_NETWORK_IPV6;
+ else
+ dev_addr->network = RDMA_NETWORK_IB;
+
+ return ret;
+}
+
+static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
+ unsigned int *ndev_flags,
+ const struct sockaddr *dst_in,
+ const struct dst_entry *dst)
+{
+ struct net_device *ndev = READ_ONCE(dst->dev);
+
+ *ndev_flags = ndev->flags;
+ /* A physical device must be the RDMA device to use */
+ if (ndev->flags & IFF_LOOPBACK) {
+ /*
+ * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or
+ * loopback IP address. So if route is resolved to loopback
+ * interface, translate that to a real ndev based on non
+ * loopback IP address.
+ */
+ ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in);
+ if (IS_ERR(ndev))
+ return -ENODEV;
+ }
+
+ return copy_src_l2_addr(dev_addr, dst_in, dst, ndev);
+}
+
+static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr)
+{
+ struct net_device *ndev;
+
+ ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr);
+ if (IS_ERR(ndev))
+ return PTR_ERR(ndev);
+
+ /*
+ * Since we are holding the rcu, reading net and ifindex
+ * are safe without any additional reference; because
+ * change_net_namespace() in net/core/dev.c does rcu sync
+ * after it changes the state to IFF_DOWN and before
+ * updating netdev fields {net, ifindex}.
+ */
+ addr->net = dev_net(ndev);
+ addr->bound_dev_if = ndev->ifindex;
+ return 0;
+}
+
+static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr)
+{
+ addr->net = &init_net;
+ addr->bound_dev_if = 0;
+}
+
+static int addr_resolve(struct sockaddr *src_in,
+ const struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr,
+ bool resolve_neigh,
+ bool resolve_by_gid_attr,
+ u32 seq)
+{
+ struct dst_entry *dst = NULL;
+ unsigned int ndev_flags = 0;
+ struct rtable *rt = NULL;
+ int ret;
+
+ if (!addr->net) {
+ pr_warn_ratelimited("%s: missing namespace\n", __func__);
+ return -EINVAL;
+ }
+
+ rcu_read_lock();
+ if (resolve_by_gid_attr) {
+ if (!addr->sgid_attr) {
+ rcu_read_unlock();
+ pr_warn_ratelimited("%s: missing gid_attr\n", __func__);
+ return -EINVAL;
+ }
+ /*
+ * If the request is for a specific gid attribute of the
+ * rdma_dev_addr, derive net from the netdevice of the
+ * GID attribute.
+ */
+ ret = set_addr_netns_by_gid_rcu(addr);
+ if (ret) {
+ rcu_read_unlock();
+ return ret;
+ }
+ }
+ if (src_in->sa_family == AF_INET) {
+ ret = addr4_resolve(src_in, dst_in, addr, &rt);
+ dst = &rt->dst;
+ } else {
+ ret = addr6_resolve(src_in, dst_in, addr, &dst);
+ }
+ if (ret) {
+ rcu_read_unlock();
+ goto done;
+ }
+ ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst);
+ rcu_read_unlock();
+
+ /*
+ * Resolve neighbor destination address if requested and
+ * only if src addr translation didn't fail.
+ */
+ if (!ret && resolve_neigh)
+ ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
+
+ if (src_in->sa_family == AF_INET)
+ ip_rt_put(rt);
+ else
+ dst_release(dst);
+done:
+ /*
+ * Clear the addr net to go back to its original state, only if it was
+ * derived from GID attribute in this context.
+ */
+ if (resolve_by_gid_attr)
+ rdma_addr_set_net_defaults(addr);
+ return ret;
+}
+
+static void process_one_req(struct work_struct *_work)
+{
+ struct addr_req *req;
+ struct sockaddr *src_in, *dst_in;
+
+ req = container_of(_work, struct addr_req, work.work);
+
+ if (req->status == -ENODATA) {
+ src_in = (struct sockaddr *)&req->src_addr;
+ dst_in = (struct sockaddr *)&req->dst_addr;
+ req->status = addr_resolve(src_in, dst_in, req->addr,
+ true, req->resolve_by_gid_attr,
+ req->seq);
+ if (req->status && time_after_eq(jiffies, req->timeout)) {
+ req->status = -ETIMEDOUT;
+ } else if (req->status == -ENODATA) {
+ /* requeue the work for retrying again */
+ spin_lock_bh(&lock);
+ if (!list_empty(&req->list))
+ set_timeout(req, req->timeout);
+ spin_unlock_bh(&lock);
+ return;
+ }
+ }
+
+ req->callback(req->status, (struct sockaddr *)&req->src_addr,
+ req->addr, req->context);
+ req->callback = NULL;
+
+ spin_lock_bh(&lock);
+ /*
+ * Although the work will normally have been canceled by the workqueue,
+ * it can still be requeued as long as it is on the req_list.
+ */
+ cancel_delayed_work(&req->work);
+ if (!list_empty(&req->list)) {
+ list_del_init(&req->list);
+ kfree(req);
+ }
+ spin_unlock_bh(&lock);
+}
+
+int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr, unsigned long timeout_ms,
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context),
+ bool resolve_by_gid_attr, void *context)
+{
+ struct sockaddr *src_in, *dst_in;
+ struct addr_req *req;
+ int ret = 0;
+
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ src_in = (struct sockaddr *) &req->src_addr;
+ dst_in = (struct sockaddr *) &req->dst_addr;
+
+ if (src_addr) {
+ if (src_addr->sa_family != dst_addr->sa_family) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ memcpy(src_in, src_addr, rdma_addr_size(src_addr));
+ } else {
+ src_in->sa_family = dst_addr->sa_family;
+ }
+
+ memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr));
+ req->addr = addr;
+ req->callback = callback;
+ req->context = context;
+ req->resolve_by_gid_attr = resolve_by_gid_attr;
+ INIT_DELAYED_WORK(&req->work, process_one_req);
+ req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);
+
+ req->status = addr_resolve(src_in, dst_in, addr, true,
+ req->resolve_by_gid_attr, req->seq);
+ switch (req->status) {
+ case 0:
+ req->timeout = jiffies;
+ queue_req(req);
+ break;
+ case -ENODATA:
+ req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
+ queue_req(req);
+ break;
+ default:
+ ret = req->status;
+ goto err;
+ }
+ return ret;
+err:
+ kfree(req);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ip);
+
+int roce_resolve_route_from_path(struct sa_path_rec *rec,
+ const struct ib_gid_attr *attr)
+{
+ union {
+ struct sockaddr _sockaddr;
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid, dgid;
+ struct rdma_dev_addr dev_addr = {};
+ int ret;
+
+ might_sleep();
+
+ if (rec->roce.route_resolved)
+ return 0;
+
+ rdma_gid2ip((struct sockaddr *)&sgid, &rec->sgid);
+ rdma_gid2ip((struct sockaddr *)&dgid, &rec->dgid);
+
+ if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family)
+ return -EINVAL;
+
+ if (!attr || !attr->ndev)
+ return -EINVAL;
+
+ dev_addr.net = &init_net;
+ dev_addr.sgid_attr = attr;
+
+ ret = addr_resolve((struct sockaddr *)&sgid, (struct sockaddr *)&dgid,
+ &dev_addr, false, true, 0);
+ if (ret)
+ return ret;
+
+ if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
+ dev_addr.network == RDMA_NETWORK_IPV6) &&
+ rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2)
+ return -EINVAL;
+
+ rec->roce.route_resolved = true;
+ return 0;
+}
+
+/**
+ * rdma_addr_cancel - Cancel resolve ip request
+ * @addr: Pointer to address structure given previously
+ * during rdma_resolve_ip().
+ * rdma_addr_cancel() is synchronous function which cancels any pending
+ * request if there is any.
+ */
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{
+ struct addr_req *req, *temp_req;
+ struct addr_req *found = NULL;
+
+ spin_lock_bh(&lock);
+ list_for_each_entry_safe(req, temp_req, &req_list, list) {
+ if (req->addr == addr) {
+ /*
+ * Removing from the list means we take ownership of
+ * the req
+ */
+ list_del_init(&req->list);
+ found = req;
+ break;
+ }
+ }
+ spin_unlock_bh(&lock);
+
+ if (!found)
+ return;
+
+ /*
+ * sync canceling the work after removing it from the req_list
+ * guarentees no work is running and none will be started.
+ */
+ cancel_delayed_work_sync(&found->work);
+ kfree(found);
+}
+EXPORT_SYMBOL(rdma_addr_cancel);
+
+struct resolve_cb_context {
+ struct completion comp;
+ int status;
+};
+
+static void resolve_cb(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context)
+{
+ ((struct resolve_cb_context *)context)->status = status;
+ complete(&((struct resolve_cb_context *)context)->comp);
+}
+
+int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
+ const union ib_gid *dgid,
+ u8 *dmac, const struct ib_gid_attr *sgid_attr,
+ int *hoplimit)
+{
+ struct rdma_dev_addr dev_addr;
+ struct resolve_cb_context ctx;
+ union {
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid_addr, dgid_addr;
+ int ret;
+
+ rdma_gid2ip((struct sockaddr *)&sgid_addr, sgid);
+ rdma_gid2ip((struct sockaddr *)&dgid_addr, dgid);
+
+ memset(&dev_addr, 0, sizeof(dev_addr));
+ dev_addr.net = &init_net;
+ dev_addr.sgid_attr = sgid_attr;
+
+ init_completion(&ctx.comp);
+ ret = rdma_resolve_ip((struct sockaddr *)&sgid_addr,
+ (struct sockaddr *)&dgid_addr, &dev_addr, 1000,
+ resolve_cb, true, &ctx);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&ctx.comp);
+
+ ret = ctx.status;
+ if (ret)
+ return ret;
+
+ memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
+ *hoplimit = dev_addr.hoplimit;
+ return 0;
+}
+
+static int netevent_callback(struct notifier_block *self, unsigned long event,
+ void *ctx)
+{
+ struct addr_req *req;
+
+ if (event == NETEVENT_NEIGH_UPDATE) {
+ struct neighbour *neigh = ctx;
+
+ if (neigh->nud_state & NUD_VALID) {
+ spin_lock_bh(&lock);
+ list_for_each_entry(req, &req_list, list)
+ set_timeout(req, jiffies);
+ spin_unlock_bh(&lock);
+ }
+ }
+ return 0;
+}
+
+static struct notifier_block nb = {
+ .notifier_call = netevent_callback
+};
+
+int addr_init(void)
+{
+ addr_wq = alloc_ordered_workqueue("ib_addr", 0);
+ if (!addr_wq)
+ return -ENOMEM;
+
+ register_netevent_notifier(&nb);
+
+ return 0;
+}
+
+void addr_cleanup(void)
+{
+ unregister_netevent_notifier(&nb);
+ destroy_workqueue(addr_wq);
+ WARN_ON(!list_empty(&req_list));
+}
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
new file mode 100644
index 000000000..f82b4260d
--- /dev/null
+++ b/drivers/infiniband/core/agent.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "agent.h"
+#include "smi.h"
+#include "mad_priv.h"
+
+#define SPFX "ib_agent: "
+
+struct ib_agent_port_private {
+ struct list_head port_list;
+ struct ib_mad_agent *agent[2];
+};
+
+static DEFINE_SPINLOCK(ib_agent_port_list_lock);
+static LIST_HEAD(ib_agent_port_list);
+
+static struct ib_agent_port_private *
+__ib_get_agent_port(const struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *entry;
+
+ list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+ if (entry->agent[1]->device == device &&
+ entry->agent[1]->port_num == port_num)
+ return entry;
+ }
+ return NULL;
+}
+
+static struct ib_agent_port_private *
+ib_get_agent_port(const struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ entry = __ib_get_agent_port(device, port_num);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ return entry;
+}
+
+void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh,
+ const struct ib_wc *wc, const struct ib_device *device,
+ int port_num, int qpn, size_t resp_mad_len, bool opa)
+{
+ struct ib_agent_port_private *port_priv;
+ struct ib_mad_agent *agent;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_ah *ah;
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ if (rdma_cap_ib_switch(device))
+ port_priv = ib_get_agent_port(device, 0);
+ else
+ port_priv = ib_get_agent_port(device, port_num);
+
+ if (!port_priv) {
+ dev_err(&device->dev, "Unable to find port agent\n");
+ return;
+ }
+
+ agent = port_priv->agent[qpn];
+ ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
+ if (IS_ERR(ah)) {
+ dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n",
+ PTR_ERR(ah));
+ return;
+ }
+
+ if (opa && mad_hdr->base_version != OPA_MGMT_BASE_VERSION)
+ resp_mad_len = IB_MGMT_MAD_SIZE;
+
+ send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
+ IB_MGMT_MAD_HDR,
+ resp_mad_len - IB_MGMT_MAD_HDR,
+ GFP_KERNEL,
+ mad_hdr->base_version);
+ if (IS_ERR(send_buf)) {
+ dev_err(&device->dev, "ib_create_send_mad error\n");
+ goto err1;
+ }
+
+ memcpy(send_buf->mad, mad_hdr, resp_mad_len);
+ send_buf->ah = ah;
+
+ if (rdma_cap_ib_switch(device)) {
+ mad_send_wr = container_of(send_buf,
+ struct ib_mad_send_wr_private,
+ send_buf);
+ mad_send_wr->send_wr.port_num = port_num;
+ }
+
+ if (ib_post_send_mad(send_buf, NULL)) {
+ dev_err(&device->dev, "ib_post_send_mad error\n");
+ goto err2;
+ }
+ return;
+err2:
+ ib_free_send_mad(send_buf);
+err1:
+ rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
+}
+
+static void agent_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ rdma_destroy_ah(mad_send_wc->send_buf->ah, RDMA_DESTROY_AH_SLEEPABLE);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int ib_agent_port_open(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+ int ret;
+
+ /* Create new device info */
+ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv) {
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ if (rdma_cap_ib_smi(device, port_num)) {
+ /* Obtain send only MAD agent for SMI QP */
+ port_priv->agent[0] = ib_register_mad_agent(device, port_num,
+ IB_QPT_SMI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(port_priv->agent[0])) {
+ ret = PTR_ERR(port_priv->agent[0]);
+ goto error2;
+ }
+ }
+
+ /* Obtain send only MAD agent for GSI QP */
+ port_priv->agent[1] = ib_register_mad_agent(device, port_num,
+ IB_QPT_GSI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(port_priv->agent[1])) {
+ ret = PTR_ERR(port_priv->agent[1]);
+ goto error3;
+ }
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_agent_port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ return 0;
+
+error3:
+ if (port_priv->agent[0])
+ ib_unregister_mad_agent(port_priv->agent[0]);
+error2:
+ kfree(port_priv);
+error1:
+ return ret;
+}
+
+int ib_agent_port_close(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ port_priv = __ib_get_agent_port(device, port_num);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ dev_err(&device->dev, "Port %d not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ ib_unregister_mad_agent(port_priv->agent[1]);
+ if (port_priv->agent[0])
+ ib_unregister_mad_agent(port_priv->agent[0]);
+
+ kfree(port_priv);
+ return 0;
+}
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
new file mode 100644
index 000000000..65f92beda
--- /dev/null
+++ b/drivers/infiniband/core/agent.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __AGENT_H_
+#define __AGENT_H_
+
+#include <linux/err.h>
+#include <rdma/ib_mad.h>
+
+extern int ib_agent_port_open(struct ib_device *device, int port_num);
+
+extern int ib_agent_port_close(struct ib_device *device, int port_num);
+
+extern void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *grh,
+ const struct ib_wc *wc, const struct ib_device *device,
+ int port_num, int qpn, size_t resp_mad_len, bool opa);
+
+#endif /* __AGENT_H_ */
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
new file mode 100644
index 000000000..4084d05a4
--- /dev/null
+++ b/drivers/infiniband/core/cache.c
@@ -0,0 +1,1674 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/if_vlan.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
+
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {
+ int table_len;
+ u16 table[];
+};
+
+struct ib_update_work {
+ struct work_struct work;
+ struct ib_event event;
+ bool enforce_security;
+};
+
+union ib_gid zgid;
+EXPORT_SYMBOL(zgid);
+
+enum gid_attr_find_mask {
+ GID_ATTR_FIND_MASK_GID = 1UL << 0,
+ GID_ATTR_FIND_MASK_NETDEV = 1UL << 1,
+ GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2,
+ GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3,
+};
+
+enum gid_table_entry_state {
+ GID_TABLE_ENTRY_INVALID = 1,
+ GID_TABLE_ENTRY_VALID = 2,
+ /*
+ * Indicates that entry is pending to be removed, there may
+ * be active users of this GID entry.
+ * When last user of the GID entry releases reference to it,
+ * GID entry is detached from the table.
+ */
+ GID_TABLE_ENTRY_PENDING_DEL = 3,
+};
+
+struct roce_gid_ndev_storage {
+ struct rcu_head rcu_head;
+ struct net_device *ndev;
+};
+
+struct ib_gid_table_entry {
+ struct kref kref;
+ struct work_struct del_work;
+ struct ib_gid_attr attr;
+ void *context;
+ /* Store the ndev pointer to release reference later on in
+ * call_rcu context because by that time gid_table_entry
+ * and attr might be already freed. So keep a copy of it.
+ * ndev_storage is freed by rcu callback.
+ */
+ struct roce_gid_ndev_storage *ndev_storage;
+ enum gid_table_entry_state state;
+};
+
+struct ib_gid_table {
+ int sz;
+ /* In RoCE, adding a GID to the table requires:
+ * (a) Find if this GID is already exists.
+ * (b) Find a free space.
+ * (c) Write the new GID
+ *
+ * Delete requires different set of operations:
+ * (a) Find the GID
+ * (b) Delete it.
+ *
+ **/
+ /* Any writer to data_vec must hold this lock and the write side of
+ * rwlock. Readers must hold only rwlock. All writers must be in a
+ * sleepable context.
+ */
+ struct mutex lock;
+ /* rwlock protects data_vec[ix]->state and entry pointer.
+ */
+ rwlock_t rwlock;
+ struct ib_gid_table_entry **data_vec;
+ /* bit field, each bit indicates the index of default GID */
+ u32 default_gid_indices;
+};
+
+static void dispatch_gid_change_event(struct ib_device *ib_dev, u32 port)
+{
+ struct ib_event event;
+
+ event.device = ib_dev;
+ event.element.port_num = port;
+ event.event = IB_EVENT_GID_CHANGE;
+
+ ib_dispatch_event_clients(&event);
+}
+
+static const char * const gid_type_str[] = {
+ /* IB/RoCE v1 value is set for IB_GID_TYPE_IB and IB_GID_TYPE_ROCE for
+ * user space compatibility reasons.
+ */
+ [IB_GID_TYPE_IB] = "IB/RoCE v1",
+ [IB_GID_TYPE_ROCE] = "IB/RoCE v1",
+ [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2",
+};
+
+const char *ib_cache_gid_type_str(enum ib_gid_type gid_type)
+{
+ if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type])
+ return gid_type_str[gid_type];
+
+ return "Invalid GID type";
+}
+EXPORT_SYMBOL(ib_cache_gid_type_str);
+
+/** rdma_is_zero_gid - Check if given GID is zero or not.
+ * @gid: GID to check
+ * Returns true if given GID is zero, returns false otherwise.
+ */
+bool rdma_is_zero_gid(const union ib_gid *gid)
+{
+ return !memcmp(gid, &zgid, sizeof(*gid));
+}
+EXPORT_SYMBOL(rdma_is_zero_gid);
+
+/** is_gid_index_default - Check if a given index belongs to
+ * reserved default GIDs or not.
+ * @table: GID table pointer
+ * @index: Index to check in GID table
+ * Returns true if index is one of the reserved default GID index otherwise
+ * returns false.
+ */
+static bool is_gid_index_default(const struct ib_gid_table *table,
+ unsigned int index)
+{
+ return index < 32 && (BIT(index) & table->default_gid_indices);
+}
+
+int ib_cache_gid_parse_type_str(const char *buf)
+{
+ unsigned int i;
+ size_t len;
+ int err = -EINVAL;
+
+ len = strlen(buf);
+ if (len == 0)
+ return -EINVAL;
+
+ if (buf[len - 1] == '\n')
+ len--;
+
+ for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i)
+ if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) &&
+ len == strlen(gid_type_str[i])) {
+ err = i;
+ break;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
+
+static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u32 port)
+{
+ return device->port_data[port].cache.gid;
+}
+
+static bool is_gid_entry_free(const struct ib_gid_table_entry *entry)
+{
+ return !entry;
+}
+
+static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry)
+{
+ return entry && entry->state == GID_TABLE_ENTRY_VALID;
+}
+
+static void schedule_free_gid(struct kref *kref)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(kref, struct ib_gid_table_entry, kref);
+
+ queue_work(ib_wq, &entry->del_work);
+}
+
+static void put_gid_ndev(struct rcu_head *head)
+{
+ struct roce_gid_ndev_storage *storage =
+ container_of(head, struct roce_gid_ndev_storage, rcu_head);
+
+ WARN_ON(!storage->ndev);
+ /* At this point its safe to release netdev reference,
+ * as all callers working on gid_attr->ndev are done
+ * using this netdev.
+ */
+ dev_put(storage->ndev);
+ kfree(storage);
+}
+
+static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
+{
+ struct ib_device *device = entry->attr.device;
+ u32 port_num = entry->attr.port_num;
+ struct ib_gid_table *table = rdma_gid_table(device, port_num);
+
+ dev_dbg(&device->dev, "%s port=%u index=%u gid %pI6\n", __func__,
+ port_num, entry->attr.index, entry->attr.gid.raw);
+
+ write_lock_irq(&table->rwlock);
+
+ /*
+ * The only way to avoid overwriting NULL in table is
+ * by comparing if it is same entry in table or not!
+ * If new entry in table is added by the time we free here,
+ * don't overwrite the table entry.
+ */
+ if (entry == table->data_vec[entry->attr.index])
+ table->data_vec[entry->attr.index] = NULL;
+ /* Now this index is ready to be allocated */
+ write_unlock_irq(&table->rwlock);
+
+ if (entry->ndev_storage)
+ call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev);
+ kfree(entry);
+}
+
+static void free_gid_entry(struct kref *kref)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(kref, struct ib_gid_table_entry, kref);
+
+ free_gid_entry_locked(entry);
+}
+
+/**
+ * free_gid_work - Release reference to the GID entry
+ * @work: Work structure to refer to GID entry which needs to be
+ * deleted.
+ *
+ * free_gid_work() frees the entry from the HCA's hardware table
+ * if provider supports it. It releases reference to netdevice.
+ */
+static void free_gid_work(struct work_struct *work)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(work, struct ib_gid_table_entry, del_work);
+ struct ib_device *device = entry->attr.device;
+ u32 port_num = entry->attr.port_num;
+ struct ib_gid_table *table = rdma_gid_table(device, port_num);
+
+ mutex_lock(&table->lock);
+ free_gid_entry_locked(entry);
+ mutex_unlock(&table->lock);
+}
+
+static struct ib_gid_table_entry *
+alloc_gid_entry(const struct ib_gid_attr *attr)
+{
+ struct ib_gid_table_entry *entry;
+ struct net_device *ndev;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return NULL;
+
+ ndev = rcu_dereference_protected(attr->ndev, 1);
+ if (ndev) {
+ entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage),
+ GFP_KERNEL);
+ if (!entry->ndev_storage) {
+ kfree(entry);
+ return NULL;
+ }
+ dev_hold(ndev);
+ entry->ndev_storage->ndev = ndev;
+ }
+ kref_init(&entry->kref);
+ memcpy(&entry->attr, attr, sizeof(*attr));
+ INIT_WORK(&entry->del_work, free_gid_work);
+ entry->state = GID_TABLE_ENTRY_INVALID;
+ return entry;
+}
+
+static void store_gid_entry(struct ib_gid_table *table,
+ struct ib_gid_table_entry *entry)
+{
+ entry->state = GID_TABLE_ENTRY_VALID;
+
+ dev_dbg(&entry->attr.device->dev, "%s port=%u index=%u gid %pI6\n",
+ __func__, entry->attr.port_num, entry->attr.index,
+ entry->attr.gid.raw);
+
+ lockdep_assert_held(&table->lock);
+ write_lock_irq(&table->rwlock);
+ table->data_vec[entry->attr.index] = entry;
+ write_unlock_irq(&table->rwlock);
+}
+
+static void get_gid_entry(struct ib_gid_table_entry *entry)
+{
+ kref_get(&entry->kref);
+}
+
+static void put_gid_entry(struct ib_gid_table_entry *entry)
+{
+ kref_put(&entry->kref, schedule_free_gid);
+}
+
+static void put_gid_entry_locked(struct ib_gid_table_entry *entry)
+{
+ kref_put(&entry->kref, free_gid_entry);
+}
+
+static int add_roce_gid(struct ib_gid_table_entry *entry)
+{
+ const struct ib_gid_attr *attr = &entry->attr;
+ int ret;
+
+ if (!attr->ndev) {
+ dev_err(&attr->device->dev, "%s NULL netdev port=%u index=%u\n",
+ __func__, attr->port_num, attr->index);
+ return -EINVAL;
+ }
+ if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) {
+ ret = attr->device->ops.add_gid(attr, &entry->context);
+ if (ret) {
+ dev_err(&attr->device->dev,
+ "%s GID add failed port=%u index=%u\n",
+ __func__, attr->port_num, attr->index);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/**
+ * del_gid - Delete GID table entry
+ *
+ * @ib_dev: IB device whose GID entry to be deleted
+ * @port: Port number of the IB device
+ * @table: GID table of the IB device for a port
+ * @ix: GID entry index to delete
+ *
+ */
+static void del_gid(struct ib_device *ib_dev, u32 port,
+ struct ib_gid_table *table, int ix)
+{
+ struct roce_gid_ndev_storage *ndev_storage;
+ struct ib_gid_table_entry *entry;
+
+ lockdep_assert_held(&table->lock);
+
+ dev_dbg(&ib_dev->dev, "%s port=%u index=%d gid %pI6\n", __func__, port,
+ ix, table->data_vec[ix]->attr.gid.raw);
+
+ write_lock_irq(&table->rwlock);
+ entry = table->data_vec[ix];
+ entry->state = GID_TABLE_ENTRY_PENDING_DEL;
+ /*
+ * For non RoCE protocol, GID entry slot is ready to use.
+ */
+ if (!rdma_protocol_roce(ib_dev, port))
+ table->data_vec[ix] = NULL;
+ write_unlock_irq(&table->rwlock);
+
+ ndev_storage = entry->ndev_storage;
+ if (ndev_storage) {
+ entry->ndev_storage = NULL;
+ rcu_assign_pointer(entry->attr.ndev, NULL);
+ call_rcu(&ndev_storage->rcu_head, put_gid_ndev);
+ }
+
+ if (rdma_cap_roce_gid_table(ib_dev, port))
+ ib_dev->ops.del_gid(&entry->attr, &entry->context);
+
+ put_gid_entry_locked(entry);
+}
+
+/**
+ * add_modify_gid - Add or modify GID table entry
+ *
+ * @table: GID table in which GID to be added or modified
+ * @attr: Attributes of the GID
+ *
+ * Returns 0 on success or appropriate error code. It accepts zero
+ * GID addition for non RoCE ports for HCA's who report them as valid
+ * GID. However such zero GIDs are not added to the cache.
+ */
+static int add_modify_gid(struct ib_gid_table *table,
+ const struct ib_gid_attr *attr)
+{
+ struct ib_gid_table_entry *entry;
+ int ret = 0;
+
+ /*
+ * Invalidate any old entry in the table to make it safe to write to
+ * this index.
+ */
+ if (is_gid_entry_valid(table->data_vec[attr->index]))
+ del_gid(attr->device, attr->port_num, table, attr->index);
+
+ /*
+ * Some HCA's report multiple GID entries with only one valid GID, and
+ * leave other unused entries as the zero GID. Convert zero GIDs to
+ * empty table entries instead of storing them.
+ */
+ if (rdma_is_zero_gid(&attr->gid))
+ return 0;
+
+ entry = alloc_gid_entry(attr);
+ if (!entry)
+ return -ENOMEM;
+
+ if (rdma_protocol_roce(attr->device, attr->port_num)) {
+ ret = add_roce_gid(entry);
+ if (ret)
+ goto done;
+ }
+
+ store_gid_entry(table, entry);
+ return 0;
+
+done:
+ put_gid_entry(entry);
+ return ret;
+}
+
+/* rwlock should be read locked, or lock should be held */
+static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
+ const struct ib_gid_attr *val, bool default_gid,
+ unsigned long mask, int *pempty)
+{
+ int i = 0;
+ int found = -1;
+ int empty = pempty ? -1 : 0;
+
+ while (i < table->sz && (found < 0 || empty < 0)) {
+ struct ib_gid_table_entry *data = table->data_vec[i];
+ struct ib_gid_attr *attr;
+ int curr_index = i;
+
+ i++;
+
+ /* find_gid() is used during GID addition where it is expected
+ * to return a free entry slot which is not duplicate.
+ * Free entry slot is requested and returned if pempty is set,
+ * so lookup free slot only if requested.
+ */
+ if (pempty && empty < 0) {
+ if (is_gid_entry_free(data) &&
+ default_gid ==
+ is_gid_index_default(table, curr_index)) {
+ /*
+ * Found an invalid (free) entry; allocate it.
+ * If default GID is requested, then our
+ * found slot must be one of the DEFAULT
+ * reserved slots or we fail.
+ * This ensures that only DEFAULT reserved
+ * slots are used for default property GIDs.
+ */
+ empty = curr_index;
+ }
+ }
+
+ /*
+ * Additionally find_gid() is used to find valid entry during
+ * lookup operation; so ignore the entries which are marked as
+ * pending for removal and the entries which are marked as
+ * invalid.
+ */
+ if (!is_gid_entry_valid(data))
+ continue;
+
+ if (found >= 0)
+ continue;
+
+ attr = &data->attr;
+ if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
+ attr->gid_type != val->gid_type)
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_GID &&
+ memcmp(gid, &data->attr.gid, sizeof(*gid)))
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_NETDEV &&
+ attr->ndev != val->ndev)
+ continue;
+
+ if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
+ is_gid_index_default(table, curr_index) != default_gid)
+ continue;
+
+ found = curr_index;
+ }
+
+ if (pempty)
+ *pempty = empty;
+
+ return found;
+}
+
+static void make_default_gid(struct net_device *dev, union ib_gid *gid)
+{
+ gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ addrconf_ifid_eui48(&gid->raw[8], dev);
+}
+
+static int __ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
+ union ib_gid *gid, struct ib_gid_attr *attr,
+ unsigned long mask, bool default_gid)
+{
+ struct ib_gid_table *table;
+ int ret = 0;
+ int empty;
+ int ix;
+
+ /* Do not allow adding zero GID in support of
+ * IB spec version 1.3 section 4.1.1 point (6) and
+ * section 12.7.10 and section 12.7.20
+ */
+ if (rdma_is_zero_gid(gid))
+ return -EINVAL;
+
+ table = rdma_gid_table(ib_dev, port);
+
+ mutex_lock(&table->lock);
+
+ ix = find_gid(table, gid, attr, default_gid, mask, &empty);
+ if (ix >= 0)
+ goto out_unlock;
+
+ if (empty < 0) {
+ ret = -ENOSPC;
+ goto out_unlock;
+ }
+ attr->device = ib_dev;
+ attr->index = empty;
+ attr->port_num = port;
+ attr->gid = *gid;
+ ret = add_modify_gid(table, attr);
+ if (!ret)
+ dispatch_gid_change_event(ib_dev, port);
+
+out_unlock:
+ mutex_unlock(&table->lock);
+ if (ret)
+ pr_warn("%s: unable to add gid %pI6 error=%d\n",
+ __func__, gid->raw, ret);
+ return ret;
+}
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_NETDEV;
+
+ return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
+}
+
+static int
+_ib_cache_gid_del(struct ib_device *ib_dev, u32 port,
+ union ib_gid *gid, struct ib_gid_attr *attr,
+ unsigned long mask, bool default_gid)
+{
+ struct ib_gid_table *table;
+ int ret = 0;
+ int ix;
+
+ table = rdma_gid_table(ib_dev, port);
+
+ mutex_lock(&table->lock);
+
+ ix = find_gid(table, gid, attr, default_gid, mask, NULL);
+ if (ix < 0) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ del_gid(ib_dev, port, table, ix);
+ dispatch_gid_change_event(ib_dev, port);
+
+out_unlock:
+ mutex_unlock(&table->lock);
+ if (ret)
+ pr_debug("%s: can't delete gid %pI6 error=%d\n",
+ __func__, gid->raw, ret);
+ return ret;
+}
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u32 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_DEFAULT |
+ GID_ATTR_FIND_MASK_NETDEV;
+
+ return _ib_cache_gid_del(ib_dev, port, gid, attr, mask, false);
+}
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port,
+ struct net_device *ndev)
+{
+ struct ib_gid_table *table;
+ int ix;
+ bool deleted = false;
+
+ table = rdma_gid_table(ib_dev, port);
+
+ mutex_lock(&table->lock);
+
+ for (ix = 0; ix < table->sz; ix++) {
+ if (is_gid_entry_valid(table->data_vec[ix]) &&
+ table->data_vec[ix]->attr.ndev == ndev) {
+ del_gid(ib_dev, port, table, ix);
+ deleted = true;
+ }
+ }
+
+ mutex_unlock(&table->lock);
+
+ if (deleted)
+ dispatch_gid_change_event(ib_dev, port);
+
+ return 0;
+}
+
+/**
+ * rdma_find_gid_by_port - Returns the GID entry attributes when it finds
+ * a valid GID entry for given search parameters. It searches for the specified
+ * GID value in the local software cache.
+ * @ib_dev: The device to query.
+ * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
+ * @port: The port number of the device where the GID value should be searched.
+ * @ndev: In RoCE, the net device of the device. NULL means ignore.
+ *
+ * Returns sgid attributes if the GID is found with valid reference or
+ * returns ERR_PTR for the error.
+ * The caller must invoke rdma_put_gid_attr() to release the reference.
+ */
+const struct ib_gid_attr *
+rdma_find_gid_by_port(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ u32 port, struct net_device *ndev)
+{
+ int local_index;
+ struct ib_gid_table *table;
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE;
+ struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type};
+ const struct ib_gid_attr *attr;
+ unsigned long flags;
+
+ if (!rdma_is_port_valid(ib_dev, port))
+ return ERR_PTR(-ENOENT);
+
+ table = rdma_gid_table(ib_dev, port);
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ read_lock_irqsave(&table->rwlock, flags);
+ local_index = find_gid(table, gid, &val, false, mask, NULL);
+ if (local_index >= 0) {
+ get_gid_entry(table->data_vec[local_index]);
+ attr = &table->data_vec[local_index]->attr;
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return attr;
+ }
+
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(rdma_find_gid_by_port);
+
+/**
+ * rdma_find_gid_by_filter - Returns the GID table attribute where a
+ * specified GID value occurs
+ * @ib_dev: The device to query.
+ * @gid: The GID value to search for.
+ * @port: The port number of the device where the GID value could be
+ * searched.
+ * @filter: The filter function is executed on any matching GID in the table.
+ * If the filter function returns true, the corresponding index is returned,
+ * otherwise, we continue searching the GID table. It's guaranteed that
+ * while filter is executed, ndev field is valid and the structure won't
+ * change. filter is executed in an atomic context. filter must not be NULL.
+ * @context: Private data to pass into the call-back.
+ *
+ * rdma_find_gid_by_filter() searches for the specified GID value
+ * of which the filter function returns true in the port's GID table.
+ *
+ */
+const struct ib_gid_attr *rdma_find_gid_by_filter(
+ struct ib_device *ib_dev, const union ib_gid *gid, u32 port,
+ bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *,
+ void *),
+ void *context)
+{
+ const struct ib_gid_attr *res = ERR_PTR(-ENOENT);
+ struct ib_gid_table *table;
+ unsigned long flags;
+ unsigned int i;
+
+ if (!rdma_is_port_valid(ib_dev, port))
+ return ERR_PTR(-EINVAL);
+
+ table = rdma_gid_table(ib_dev, port);
+
+ read_lock_irqsave(&table->rwlock, flags);
+ for (i = 0; i < table->sz; i++) {
+ struct ib_gid_table_entry *entry = table->data_vec[i];
+
+ if (!is_gid_entry_valid(entry))
+ continue;
+
+ if (memcmp(gid, &entry->attr.gid, sizeof(*gid)))
+ continue;
+
+ if (filter(gid, &entry->attr, context)) {
+ get_gid_entry(entry);
+ res = &entry->attr;
+ break;
+ }
+ }
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return res;
+}
+
+static struct ib_gid_table *alloc_gid_table(int sz)
+{
+ struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL);
+
+ if (!table)
+ return NULL;
+
+ table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
+ if (!table->data_vec)
+ goto err_free_table;
+
+ mutex_init(&table->lock);
+
+ table->sz = sz;
+ rwlock_init(&table->rwlock);
+ return table;
+
+err_free_table:
+ kfree(table);
+ return NULL;
+}
+
+static void release_gid_table(struct ib_device *device,
+ struct ib_gid_table *table)
+{
+ bool leak = false;
+ int i;
+
+ if (!table)
+ return;
+
+ for (i = 0; i < table->sz; i++) {
+ if (is_gid_entry_free(table->data_vec[i]))
+ continue;
+ if (kref_read(&table->data_vec[i]->kref) > 1) {
+ dev_err(&device->dev,
+ "GID entry ref leak for index %d ref=%u\n", i,
+ kref_read(&table->data_vec[i]->kref));
+ leak = true;
+ }
+ }
+ if (leak)
+ return;
+
+ mutex_destroy(&table->lock);
+ kfree(table->data_vec);
+ kfree(table);
+}
+
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u32 port,
+ struct ib_gid_table *table)
+{
+ int i;
+
+ if (!table)
+ return;
+
+ mutex_lock(&table->lock);
+ for (i = 0; i < table->sz; ++i) {
+ if (is_gid_entry_valid(table->data_vec[i]))
+ del_gid(ib_dev, port, table, i);
+ }
+ mutex_unlock(&table->lock);
+}
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port,
+ struct net_device *ndev,
+ unsigned long gid_type_mask,
+ enum ib_cache_gid_default_mode mode)
+{
+ union ib_gid gid = { };
+ struct ib_gid_attr gid_attr;
+ unsigned int gid_type;
+ unsigned long mask;
+
+ mask = GID_ATTR_FIND_MASK_GID_TYPE |
+ GID_ATTR_FIND_MASK_DEFAULT |
+ GID_ATTR_FIND_MASK_NETDEV;
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) {
+ if (1UL << gid_type & ~gid_type_mask)
+ continue;
+
+ gid_attr.gid_type = gid_type;
+
+ if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) {
+ make_default_gid(ndev, &gid);
+ __ib_cache_gid_add(ib_dev, port, &gid,
+ &gid_attr, mask, true);
+ } else if (mode == IB_CACHE_GID_DEFAULT_MODE_DELETE) {
+ _ib_cache_gid_del(ib_dev, port, &gid,
+ &gid_attr, mask, true);
+ }
+ }
+}
+
+static void gid_table_reserve_default(struct ib_device *ib_dev, u32 port,
+ struct ib_gid_table *table)
+{
+ unsigned int i;
+ unsigned long roce_gid_type_mask;
+ unsigned int num_default_gids;
+
+ roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+ num_default_gids = hweight_long(roce_gid_type_mask);
+ /* Reserve starting indices for default GIDs */
+ for (i = 0; i < num_default_gids && i < table->sz; i++)
+ table->default_gid_indices |= BIT(i);
+}
+
+
+static void gid_table_release_one(struct ib_device *ib_dev)
+{
+ u32 p;
+
+ rdma_for_each_port (ib_dev, p) {
+ release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid);
+ ib_dev->port_data[p].cache.gid = NULL;
+ }
+}
+
+static int _gid_table_setup_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table *table;
+ u32 rdma_port;
+
+ rdma_for_each_port (ib_dev, rdma_port) {
+ table = alloc_gid_table(
+ ib_dev->port_data[rdma_port].immutable.gid_tbl_len);
+ if (!table)
+ goto rollback_table_setup;
+
+ gid_table_reserve_default(ib_dev, rdma_port, table);
+ ib_dev->port_data[rdma_port].cache.gid = table;
+ }
+ return 0;
+
+rollback_table_setup:
+ gid_table_release_one(ib_dev);
+ return -ENOMEM;
+}
+
+static void gid_table_cleanup_one(struct ib_device *ib_dev)
+{
+ u32 p;
+
+ rdma_for_each_port (ib_dev, p)
+ cleanup_gid_table_port(ib_dev, p,
+ ib_dev->port_data[p].cache.gid);
+}
+
+static int gid_table_setup_one(struct ib_device *ib_dev)
+{
+ int err;
+
+ err = _gid_table_setup_one(ib_dev);
+
+ if (err)
+ return err;
+
+ rdma_roce_rescan_device(ib_dev);
+
+ return err;
+}
+
+/**
+ * rdma_query_gid - Read the GID content from the GID software cache
+ * @device: Device to query the GID
+ * @port_num: Port number of the device
+ * @index: Index of the GID table entry to read
+ * @gid: Pointer to GID where to store the entry's GID
+ *
+ * rdma_query_gid() only reads the GID entry content for requested device,
+ * port and index. It reads for IB, RoCE and iWarp link layers. It doesn't
+ * hold any reference to the GID table entry in the HCA or software cache.
+ *
+ * Returns 0 on success or appropriate error code.
+ *
+ */
+int rdma_query_gid(struct ib_device *device, u32 port_num,
+ int index, union ib_gid *gid)
+{
+ struct ib_gid_table *table;
+ unsigned long flags;
+ int res;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ table = rdma_gid_table(device, port_num);
+ read_lock_irqsave(&table->rwlock, flags);
+
+ if (index < 0 || index >= table->sz) {
+ res = -EINVAL;
+ goto done;
+ }
+
+ if (!is_gid_entry_valid(table->data_vec[index])) {
+ res = -ENOENT;
+ goto done;
+ }
+
+ memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid));
+ res = 0;
+
+done:
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return res;
+}
+EXPORT_SYMBOL(rdma_query_gid);
+
+/**
+ * rdma_read_gid_hw_context - Read the HW GID context from GID attribute
+ * @attr: Potinter to the GID attribute
+ *
+ * rdma_read_gid_hw_context() reads the drivers GID HW context corresponding
+ * to the SGID attr. Callers are required to already be holding the reference
+ * to an existing GID entry.
+ *
+ * Returns the HW GID context
+ *
+ */
+void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr)
+{
+ return container_of(attr, struct ib_gid_table_entry, attr)->context;
+}
+EXPORT_SYMBOL(rdma_read_gid_hw_context);
+
+/**
+ * rdma_find_gid - Returns SGID attributes if the matching GID is found.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
+ * @ndev: In RoCE, the net device of the device. NULL means ignore.
+ *
+ * rdma_find_gid() searches for the specified GID value in the software cache.
+ *
+ * Returns GID attributes if a valid GID is found or returns ERR_PTR for the
+ * error. The caller must invoke rdma_put_gid_attr() to release the reference.
+ *
+ */
+const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ struct net_device *ndev)
+{
+ unsigned long mask = GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_GID_TYPE;
+ struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
+ u32 p;
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ rdma_for_each_port(device, p) {
+ struct ib_gid_table *table;
+ unsigned long flags;
+ int index;
+
+ table = device->port_data[p].cache.gid;
+ read_lock_irqsave(&table->rwlock, flags);
+ index = find_gid(table, gid, &gid_attr_val, false, mask, NULL);
+ if (index >= 0) {
+ const struct ib_gid_attr *attr;
+
+ get_gid_entry(table->data_vec[index]);
+ attr = &table->data_vec[index]->attr;
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return attr;
+ }
+ read_unlock_irqrestore(&table->rwlock, flags);
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(rdma_find_gid);
+
+int ib_get_cached_pkey(struct ib_device *device,
+ u32 port_num,
+ int index,
+ u16 *pkey)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int ret = 0;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache_lock, flags);
+
+ cache = device->port_data[port_num].cache.pkey;
+
+ if (!cache || index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *pkey = cache->table[index];
+
+ read_unlock_irqrestore(&device->cache_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+void ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num,
+ u64 *sn_pfx)
+{
+ unsigned long flags;
+
+ read_lock_irqsave(&device->cache_lock, flags);
+ *sn_pfx = device->port_data[port_num].cache.subnet_prefix;
+ read_unlock_irqrestore(&device->cache_lock, flags);
+}
+EXPORT_SYMBOL(ib_get_cached_subnet_prefix);
+
+int ib_find_cached_pkey(struct ib_device *device, u32 port_num,
+ u16 pkey, u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+ int partial_ix = -1;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache_lock, flags);
+
+ cache = device->port_data[port_num].cache.pkey;
+ if (!cache) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ *index = -1;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+ if (cache->table[i] & 0x8000) {
+ *index = i;
+ ret = 0;
+ break;
+ } else {
+ partial_ix = i;
+ }
+ }
+
+ if (ret && partial_ix >= 0) {
+ *index = partial_ix;
+ ret = 0;
+ }
+
+err:
+ read_unlock_irqrestore(&device->cache_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num,
+ u16 pkey, u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache_lock, flags);
+
+ cache = device->port_data[port_num].cache.pkey;
+ if (!cache) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ *index = -1;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if (cache->table[i] == pkey) {
+ *index = i;
+ ret = 0;
+ break;
+ }
+
+err:
+ read_unlock_irqrestore(&device->cache_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_exact_cached_pkey);
+
+int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache_lock, flags);
+ *lmc = device->port_data[port_num].cache.lmc;
+ read_unlock_irqrestore(&device->cache_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
+
+int ib_get_cached_port_state(struct ib_device *device, u32 port_num,
+ enum ib_port_state *port_state)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache_lock, flags);
+ *port_state = device->port_data[port_num].cache.port_state;
+ read_unlock_irqrestore(&device->cache_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_port_state);
+
+/**
+ * rdma_get_gid_attr - Returns GID attributes for a port of a device
+ * at a requested gid_index, if a valid GID entry exists.
+ * @device: The device to query.
+ * @port_num: The port number on the device where the GID value
+ * is to be queried.
+ * @index: Index of the GID table entry whose attributes are to
+ * be queried.
+ *
+ * rdma_get_gid_attr() acquires reference count of gid attributes from the
+ * cached GID table. Caller must invoke rdma_put_gid_attr() to release
+ * reference to gid attribute regardless of link layer.
+ *
+ * Returns pointer to valid gid attribute or ERR_PTR for the appropriate error
+ * code.
+ */
+const struct ib_gid_attr *
+rdma_get_gid_attr(struct ib_device *device, u32 port_num, int index)
+{
+ const struct ib_gid_attr *attr = ERR_PTR(-ENODATA);
+ struct ib_gid_table *table;
+ unsigned long flags;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return ERR_PTR(-EINVAL);
+
+ table = rdma_gid_table(device, port_num);
+ if (index < 0 || index >= table->sz)
+ return ERR_PTR(-EINVAL);
+
+ read_lock_irqsave(&table->rwlock, flags);
+ if (!is_gid_entry_valid(table->data_vec[index]))
+ goto done;
+
+ get_gid_entry(table->data_vec[index]);
+ attr = &table->data_vec[index]->attr;
+done:
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return attr;
+}
+EXPORT_SYMBOL(rdma_get_gid_attr);
+
+/**
+ * rdma_query_gid_table - Reads GID table entries of all the ports of a device up to max_entries.
+ * @device: The device to query.
+ * @entries: Entries where GID entries are returned.
+ * @max_entries: Maximum number of entries that can be returned.
+ * Entries array must be allocated to hold max_entries number of entries.
+ *
+ * Returns number of entries on success or appropriate error code.
+ */
+ssize_t rdma_query_gid_table(struct ib_device *device,
+ struct ib_uverbs_gid_entry *entries,
+ size_t max_entries)
+{
+ const struct ib_gid_attr *gid_attr;
+ ssize_t num_entries = 0, ret;
+ struct ib_gid_table *table;
+ u32 port_num, i;
+ struct net_device *ndev;
+ unsigned long flags;
+
+ rdma_for_each_port(device, port_num) {
+ table = rdma_gid_table(device, port_num);
+ read_lock_irqsave(&table->rwlock, flags);
+ for (i = 0; i < table->sz; i++) {
+ if (!is_gid_entry_valid(table->data_vec[i]))
+ continue;
+ if (num_entries >= max_entries) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ gid_attr = &table->data_vec[i]->attr;
+
+ memcpy(&entries->gid, &gid_attr->gid,
+ sizeof(gid_attr->gid));
+ entries->gid_index = gid_attr->index;
+ entries->port_num = gid_attr->port_num;
+ entries->gid_type = gid_attr->gid_type;
+ ndev = rcu_dereference_protected(
+ gid_attr->ndev,
+ lockdep_is_held(&table->rwlock));
+ if (ndev)
+ entries->netdev_ifindex = ndev->ifindex;
+
+ num_entries++;
+ entries++;
+ }
+ read_unlock_irqrestore(&table->rwlock, flags);
+ }
+
+ return num_entries;
+err:
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_query_gid_table);
+
+/**
+ * rdma_put_gid_attr - Release reference to the GID attribute
+ * @attr: Pointer to the GID attribute whose reference
+ * needs to be released.
+ *
+ * rdma_put_gid_attr() must be used to release reference whose
+ * reference is acquired using rdma_get_gid_attr() or any APIs
+ * which returns a pointer to the ib_gid_attr regardless of link layer
+ * of IB or RoCE.
+ *
+ */
+void rdma_put_gid_attr(const struct ib_gid_attr *attr)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(attr, struct ib_gid_table_entry, attr);
+
+ put_gid_entry(entry);
+}
+EXPORT_SYMBOL(rdma_put_gid_attr);
+
+/**
+ * rdma_hold_gid_attr - Get reference to existing GID attribute
+ *
+ * @attr: Pointer to the GID attribute whose reference
+ * needs to be taken.
+ *
+ * Increase the reference count to a GID attribute to keep it from being
+ * freed. Callers are required to already be holding a reference to attribute.
+ *
+ */
+void rdma_hold_gid_attr(const struct ib_gid_attr *attr)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(attr, struct ib_gid_table_entry, attr);
+
+ get_gid_entry(entry);
+}
+EXPORT_SYMBOL(rdma_hold_gid_attr);
+
+/**
+ * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice
+ * which must be in UP state.
+ *
+ * @attr:Pointer to the GID attribute
+ *
+ * Returns pointer to netdevice if the netdevice was attached to GID and
+ * netdevice is in UP state. Caller must hold RCU lock as this API
+ * reads the netdev flags which can change while netdevice migrates to
+ * different net namespace. Returns ERR_PTR with error code otherwise.
+ *
+ */
+struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr)
+{
+ struct ib_gid_table_entry *entry =
+ container_of(attr, struct ib_gid_table_entry, attr);
+ struct ib_device *device = entry->attr.device;
+ struct net_device *ndev = ERR_PTR(-EINVAL);
+ u32 port_num = entry->attr.port_num;
+ struct ib_gid_table *table;
+ unsigned long flags;
+ bool valid;
+
+ table = rdma_gid_table(device, port_num);
+
+ read_lock_irqsave(&table->rwlock, flags);
+ valid = is_gid_entry_valid(table->data_vec[attr->index]);
+ if (valid) {
+ ndev = rcu_dereference(attr->ndev);
+ if (!ndev)
+ ndev = ERR_PTR(-ENODEV);
+ }
+ read_unlock_irqrestore(&table->rwlock, flags);
+ return ndev;
+}
+EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu);
+
+static int get_lower_dev_vlan(struct net_device *lower_dev,
+ struct netdev_nested_priv *priv)
+{
+ u16 *vlan_id = (u16 *)priv->data;
+
+ if (is_vlan_dev(lower_dev))
+ *vlan_id = vlan_dev_vlan_id(lower_dev);
+
+ /* We are interested only in first level vlan device, so
+ * always return 1 to stop iterating over next level devices.
+ */
+ return 1;
+}
+
+/**
+ * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address
+ * of a GID entry.
+ *
+ * @attr: GID attribute pointer whose L2 fields to be read
+ * @vlan_id: Pointer to vlan id to fill up if the GID entry has
+ * vlan id. It is optional.
+ * @smac: Pointer to smac to fill up for a GID entry. It is optional.
+ *
+ * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id
+ * (if gid entry has vlan) and source MAC, or returns error.
+ */
+int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr,
+ u16 *vlan_id, u8 *smac)
+{
+ struct netdev_nested_priv priv = {
+ .data = (void *)vlan_id,
+ };
+ struct net_device *ndev;
+
+ rcu_read_lock();
+ ndev = rcu_dereference(attr->ndev);
+ if (!ndev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ if (smac)
+ ether_addr_copy(smac, ndev->dev_addr);
+ if (vlan_id) {
+ *vlan_id = 0xffff;
+ if (is_vlan_dev(ndev)) {
+ *vlan_id = vlan_dev_vlan_id(ndev);
+ } else {
+ /* If the netdev is upper device and if it's lower
+ * device is vlan device, consider vlan id of the
+ * the lower vlan device for this gid entry.
+ */
+ netdev_walk_all_lower_dev_rcu(attr->ndev,
+ get_lower_dev_vlan, &priv);
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+EXPORT_SYMBOL(rdma_read_gid_l2_fields);
+
+static int config_non_roce_gid_cache(struct ib_device *device,
+ u32 port, struct ib_port_attr *tprops)
+{
+ struct ib_gid_attr gid_attr = {};
+ struct ib_gid_table *table;
+ int ret = 0;
+ int i;
+
+ gid_attr.device = device;
+ gid_attr.port_num = port;
+ table = rdma_gid_table(device, port);
+
+ mutex_lock(&table->lock);
+ for (i = 0; i < tprops->gid_tbl_len; ++i) {
+ if (!device->ops.query_gid)
+ continue;
+ ret = device->ops.query_gid(device, port, i, &gid_attr.gid);
+ if (ret) {
+ dev_warn(&device->dev,
+ "query_gid failed (%d) for index %d\n", ret,
+ i);
+ goto err;
+ }
+ gid_attr.index = i;
+ tprops->subnet_prefix =
+ be64_to_cpu(gid_attr.gid.global.subnet_prefix);
+ add_modify_gid(table, &gid_attr);
+ }
+err:
+ mutex_unlock(&table->lock);
+ return ret;
+}
+
+static int
+ib_cache_update(struct ib_device *device, u32 port, bool update_gids,
+ bool update_pkeys, bool enforce_security)
+{
+ struct ib_port_attr *tprops = NULL;
+ struct ib_pkey_cache *pkey_cache = NULL;
+ struct ib_pkey_cache *old_pkey_cache = NULL;
+ int i;
+ int ret;
+
+ if (!rdma_is_port_valid(device, port))
+ return -EINVAL;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ return -ENOMEM;
+
+ ret = ib_query_port(device, port, tprops);
+ if (ret) {
+ dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret);
+ goto err;
+ }
+
+ if (!rdma_protocol_roce(device, port) && update_gids) {
+ ret = config_non_roce_gid_cache(device, port,
+ tprops);
+ if (ret)
+ goto err;
+ }
+
+ update_pkeys &= !!tprops->pkey_tbl_len;
+
+ if (update_pkeys) {
+ pkey_cache = kmalloc(struct_size(pkey_cache, table,
+ tprops->pkey_tbl_len),
+ GFP_KERNEL);
+ if (!pkey_cache) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ pkey_cache->table_len = tprops->pkey_tbl_len;
+
+ for (i = 0; i < pkey_cache->table_len; ++i) {
+ ret = ib_query_pkey(device, port, i,
+ pkey_cache->table + i);
+ if (ret) {
+ dev_warn(&device->dev,
+ "ib_query_pkey failed (%d) for index %d\n",
+ ret, i);
+ goto err;
+ }
+ }
+ }
+
+ write_lock_irq(&device->cache_lock);
+
+ if (update_pkeys) {
+ old_pkey_cache = device->port_data[port].cache.pkey;
+ device->port_data[port].cache.pkey = pkey_cache;
+ }
+ device->port_data[port].cache.lmc = tprops->lmc;
+ device->port_data[port].cache.port_state = tprops->state;
+
+ device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix;
+ write_unlock_irq(&device->cache_lock);
+
+ if (enforce_security)
+ ib_security_cache_change(device,
+ port,
+ tprops->subnet_prefix);
+
+ kfree(old_pkey_cache);
+ kfree(tprops);
+ return 0;
+
+err:
+ kfree(pkey_cache);
+ kfree(tprops);
+ return ret;
+}
+
+static void ib_cache_event_task(struct work_struct *_work)
+{
+ struct ib_update_work *work =
+ container_of(_work, struct ib_update_work, work);
+ int ret;
+
+ /* Before distributing the cache update event, first sync
+ * the cache.
+ */
+ ret = ib_cache_update(work->event.device, work->event.element.port_num,
+ work->event.event == IB_EVENT_GID_CHANGE,
+ work->event.event == IB_EVENT_PKEY_CHANGE,
+ work->enforce_security);
+
+ /* GID event is notified already for individual GID entries by
+ * dispatch_gid_change_event(). Hence, notifiy for rest of the
+ * events.
+ */
+ if (!ret && work->event.event != IB_EVENT_GID_CHANGE)
+ ib_dispatch_event_clients(&work->event);
+
+ kfree(work);
+}
+
+static void ib_generic_event_task(struct work_struct *_work)
+{
+ struct ib_update_work *work =
+ container_of(_work, struct ib_update_work, work);
+
+ ib_dispatch_event_clients(&work->event);
+ kfree(work);
+}
+
+static bool is_cache_update_event(const struct ib_event *event)
+{
+ return (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER ||
+ event->event == IB_EVENT_GID_CHANGE);
+}
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(const struct ib_event *event)
+{
+ struct ib_update_work *work;
+
+ work = kzalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return;
+
+ if (is_cache_update_event(event))
+ INIT_WORK(&work->work, ib_cache_event_task);
+ else
+ INIT_WORK(&work->work, ib_generic_event_task);
+
+ work->event = *event;
+ if (event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_GID_CHANGE)
+ work->enforce_security = true;
+
+ queue_work(ib_wq, &work->work);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+int ib_cache_setup_one(struct ib_device *device)
+{
+ u32 p;
+ int err;
+
+ err = gid_table_setup_one(device);
+ if (err)
+ return err;
+
+ rdma_for_each_port (device, p) {
+ err = ib_cache_update(device, p, true, true, true);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+void ib_cache_release_one(struct ib_device *device)
+{
+ u32 p;
+
+ /*
+ * The release function frees all the cache elements.
+ * This function should be called as part of freeing
+ * all the device's resources when the cache could no
+ * longer be accessed.
+ */
+ rdma_for_each_port (device, p)
+ kfree(device->port_data[p].cache.pkey);
+
+ gid_table_release_one(device);
+}
+
+void ib_cache_cleanup_one(struct ib_device *device)
+{
+ /* The cleanup function waits for all in-progress workqueue
+ * elements and cleans up the GID cache. This function should be
+ * called after the device was removed from the devices list and
+ * all clients were removed, so the cache exists but is
+ * non-functional and shouldn't be updated anymore.
+ */
+ flush_workqueue(ib_wq);
+ gid_table_cleanup_one(device);
+
+ /*
+ * Flush the wq second time for any pending GID delete work.
+ */
+ flush_workqueue(ib_wq);
+}
diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c
new file mode 100644
index 000000000..1f037fe01
--- /dev/null
+++ b/drivers/infiniband/core/cgroup.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ */
+
+#include "core_priv.h"
+
+/**
+ * ib_device_register_rdmacg - register with rdma cgroup.
+ * @device: device to register to participate in resource
+ * accounting by rdma cgroup.
+ *
+ * Register with the rdma cgroup. Should be called before
+ * exposing rdma device to user space applications to avoid
+ * resource accounting leak.
+ */
+void ib_device_register_rdmacg(struct ib_device *device)
+{
+ device->cg_device.name = device->name;
+ rdmacg_register_device(&device->cg_device);
+}
+
+/**
+ * ib_device_unregister_rdmacg - unregister with rdma cgroup.
+ * @device: device to unregister.
+ *
+ * Unregister with the rdma cgroup. Should be called after
+ * all the resources are deallocated, and after a stage when any
+ * other resource allocation by user application cannot be done
+ * for this device to avoid any leak in accounting.
+ */
+void ib_device_unregister_rdmacg(struct ib_device *device)
+{
+ rdmacg_unregister_device(&device->cg_device);
+}
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{
+ return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
+ resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_try_charge);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{
+ rdmacg_uncharge(cg_obj->cg, &device->cg_device,
+ resource_index);
+}
+EXPORT_SYMBOL(ib_rdmacg_uncharge);
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
new file mode 100644
index 000000000..b7f902344
--- /dev/null
+++ b/drivers/infiniband/core/cm.c
@@ -0,0 +1,4528 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2004-2007 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/kdev_t.h>
+#include <linux/etherdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sysfs.h>
+#include "cm_msgs.h"
+#include "core_priv.h"
+#include "cm_trace.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static const char * const ibcm_rej_reason_strs[] = {
+ [IB_CM_REJ_NO_QP] = "no QP",
+ [IB_CM_REJ_NO_EEC] = "no EEC",
+ [IB_CM_REJ_NO_RESOURCES] = "no resources",
+ [IB_CM_REJ_TIMEOUT] = "timeout",
+ [IB_CM_REJ_UNSUPPORTED] = "unsupported",
+ [IB_CM_REJ_INVALID_COMM_ID] = "invalid comm ID",
+ [IB_CM_REJ_INVALID_COMM_INSTANCE] = "invalid comm instance",
+ [IB_CM_REJ_INVALID_SERVICE_ID] = "invalid service ID",
+ [IB_CM_REJ_INVALID_TRANSPORT_TYPE] = "invalid transport type",
+ [IB_CM_REJ_STALE_CONN] = "stale conn",
+ [IB_CM_REJ_RDC_NOT_EXIST] = "RDC not exist",
+ [IB_CM_REJ_INVALID_GID] = "invalid GID",
+ [IB_CM_REJ_INVALID_LID] = "invalid LID",
+ [IB_CM_REJ_INVALID_SL] = "invalid SL",
+ [IB_CM_REJ_INVALID_TRAFFIC_CLASS] = "invalid traffic class",
+ [IB_CM_REJ_INVALID_HOP_LIMIT] = "invalid hop limit",
+ [IB_CM_REJ_INVALID_PACKET_RATE] = "invalid packet rate",
+ [IB_CM_REJ_INVALID_ALT_GID] = "invalid alt GID",
+ [IB_CM_REJ_INVALID_ALT_LID] = "invalid alt LID",
+ [IB_CM_REJ_INVALID_ALT_SL] = "invalid alt SL",
+ [IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS] = "invalid alt traffic class",
+ [IB_CM_REJ_INVALID_ALT_HOP_LIMIT] = "invalid alt hop limit",
+ [IB_CM_REJ_INVALID_ALT_PACKET_RATE] = "invalid alt packet rate",
+ [IB_CM_REJ_PORT_CM_REDIRECT] = "port CM redirect",
+ [IB_CM_REJ_PORT_REDIRECT] = "port redirect",
+ [IB_CM_REJ_INVALID_MTU] = "invalid MTU",
+ [IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES] = "insufficient resp resources",
+ [IB_CM_REJ_CONSUMER_DEFINED] = "consumer defined",
+ [IB_CM_REJ_INVALID_RNR_RETRY] = "invalid RNR retry",
+ [IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID] = "duplicate local comm ID",
+ [IB_CM_REJ_INVALID_CLASS_VERSION] = "invalid class version",
+ [IB_CM_REJ_INVALID_FLOW_LABEL] = "invalid flow label",
+ [IB_CM_REJ_INVALID_ALT_FLOW_LABEL] = "invalid alt flow label",
+ [IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED] =
+ "vendor option is not supported",
+};
+
+const char *__attribute_const__ ibcm_reject_msg(int reason)
+{
+ size_t index = reason;
+
+ if (index < ARRAY_SIZE(ibcm_rej_reason_strs) &&
+ ibcm_rej_reason_strs[index])
+ return ibcm_rej_reason_strs[index];
+ else
+ return "unrecognized reason";
+}
+EXPORT_SYMBOL(ibcm_reject_msg);
+
+struct cm_id_private;
+struct cm_work;
+static int cm_add_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device, void *client_data);
+static void cm_process_work(struct cm_id_private *cm_id_priv,
+ struct cm_work *work);
+static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv,
+ struct ib_cm_sidr_rep_param *param);
+static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
+ const void *private_data, u8 private_data_len);
+static int cm_send_drep_locked(struct cm_id_private *cm_id_priv,
+ void *private_data, u8 private_data_len);
+static int cm_send_rej_locked(struct cm_id_private *cm_id_priv,
+ enum ib_cm_rej_reason reason, void *ari,
+ u8 ari_length, const void *private_data,
+ u8 private_data_len);
+
+static struct ib_client cm_client = {
+ .name = "cm",
+ .add = cm_add_one,
+ .remove = cm_remove_one
+};
+
+static struct ib_cm {
+ spinlock_t lock;
+ struct list_head device_list;
+ rwlock_t device_lock;
+ struct rb_root listen_service_table;
+ u64 listen_service_id;
+ /* struct rb_root peer_service_table; todo: fix peer to peer */
+ struct rb_root remote_qp_table;
+ struct rb_root remote_id_table;
+ struct rb_root remote_sidr_table;
+ struct xarray local_id_table;
+ u32 local_id_next;
+ __be32 random_id_operand;
+ struct list_head timewait_list;
+ struct workqueue_struct *wq;
+} cm;
+
+/* Counter indexes ordered by attribute ID */
+enum {
+ CM_REQ_COUNTER,
+ CM_MRA_COUNTER,
+ CM_REJ_COUNTER,
+ CM_REP_COUNTER,
+ CM_RTU_COUNTER,
+ CM_DREQ_COUNTER,
+ CM_DREP_COUNTER,
+ CM_SIDR_REQ_COUNTER,
+ CM_SIDR_REP_COUNTER,
+ CM_LAP_COUNTER,
+ CM_APR_COUNTER,
+ CM_ATTR_COUNT,
+ CM_ATTR_ID_OFFSET = 0x0010,
+};
+
+enum {
+ CM_XMIT,
+ CM_XMIT_RETRIES,
+ CM_RECV,
+ CM_RECV_DUPLICATES,
+ CM_COUNTER_GROUPS
+};
+
+struct cm_counter_attribute {
+ struct ib_port_attribute attr;
+ unsigned short group;
+ unsigned short index;
+};
+
+struct cm_port {
+ struct cm_device *cm_dev;
+ struct ib_mad_agent *mad_agent;
+ u32 port_num;
+ atomic_long_t counters[CM_COUNTER_GROUPS][CM_ATTR_COUNT];
+};
+
+struct cm_device {
+ struct kref kref;
+ struct list_head list;
+ spinlock_t mad_agent_lock;
+ struct ib_device *ib_device;
+ u8 ack_delay;
+ int going_down;
+ struct cm_port *port[];
+};
+
+struct cm_av {
+ struct cm_port *port;
+ struct rdma_ah_attr ah_attr;
+ u16 dlid_datapath;
+ u16 pkey_index;
+ u8 timeout;
+};
+
+struct cm_work {
+ struct delayed_work work;
+ struct list_head list;
+ struct cm_port *port;
+ struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */
+ __be32 local_id; /* Established / timewait */
+ __be32 remote_id;
+ struct ib_cm_event cm_event;
+ struct sa_path_rec path[];
+};
+
+struct cm_timewait_info {
+ struct cm_work work;
+ struct list_head list;
+ struct rb_node remote_qp_node;
+ struct rb_node remote_id_node;
+ __be64 remote_ca_guid;
+ __be32 remote_qpn;
+ u8 inserted_remote_qp;
+ u8 inserted_remote_id;
+};
+
+struct cm_id_private {
+ struct ib_cm_id id;
+
+ struct rb_node service_node;
+ struct rb_node sidr_id_node;
+ u32 sidr_slid;
+ spinlock_t lock; /* Do not acquire inside cm.lock */
+ struct completion comp;
+ refcount_t refcount;
+ /* Number of clients sharing this ib_cm_id. Only valid for listeners.
+ * Protected by the cm.lock spinlock.
+ */
+ int listen_sharecount;
+ struct rcu_head rcu;
+
+ struct ib_mad_send_buf *msg;
+ struct cm_timewait_info *timewait_info;
+ /* todo: use alternate port on send failure */
+ struct cm_av av;
+ struct cm_av alt_av;
+
+ void *private_data;
+ __be64 tid;
+ __be32 local_qpn;
+ __be32 remote_qpn;
+ enum ib_qp_type qp_type;
+ __be32 sq_psn;
+ __be32 rq_psn;
+ int timeout_ms;
+ enum ib_mtu path_mtu;
+ __be16 pkey;
+ u8 private_data_len;
+ u8 max_cm_retries;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 retry_count;
+ u8 rnr_retry_count;
+ u8 service_timeout;
+ u8 target_ack_delay;
+
+ struct list_head work_list;
+ atomic_t work_count;
+
+ struct rdma_ucm_ece ece;
+};
+
+static void cm_dev_release(struct kref *kref)
+{
+ struct cm_device *cm_dev = container_of(kref, struct cm_device, kref);
+ u32 i;
+
+ rdma_for_each_port(cm_dev->ib_device, i)
+ kfree(cm_dev->port[i - 1]);
+
+ kfree(cm_dev);
+}
+
+static void cm_device_put(struct cm_device *cm_dev)
+{
+ kref_put(&cm_dev->kref, cm_dev_release);
+}
+
+static void cm_work_handler(struct work_struct *work);
+
+static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
+{
+ if (refcount_dec_and_test(&cm_id_priv->refcount))
+ complete(&cm_id_priv->comp);
+}
+
+static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
+{
+ struct ib_mad_agent *mad_agent;
+ struct ib_mad_send_buf *m;
+ struct ib_ah *ah;
+
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ if (!cm_id_priv->av.port)
+ return ERR_PTR(-EINVAL);
+
+ spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ mad_agent = cm_id_priv->av.port->mad_agent;
+ if (!mad_agent) {
+ m = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ ah = rdma_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr, 0);
+ if (IS_ERR(ah)) {
+ m = ERR_CAST(ah);
+ goto out;
+ }
+
+ m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
+ cm_id_priv->av.pkey_index,
+ 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC,
+ IB_MGMT_BASE_VERSION);
+ if (IS_ERR(m)) {
+ rdma_destroy_ah(ah, 0);
+ goto out;
+ }
+
+ /* Timeout set by caller if response is expected. */
+ m->ah = ah;
+ m->retries = cm_id_priv->max_cm_retries;
+
+ refcount_inc(&cm_id_priv->refcount);
+ m->context[0] = cm_id_priv;
+
+out:
+ spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ return m;
+}
+
+static void cm_free_msg(struct ib_mad_send_buf *msg)
+{
+ struct cm_id_private *cm_id_priv = msg->context[0];
+
+ if (msg->ah)
+ rdma_destroy_ah(msg->ah, 0);
+ cm_deref_id(cm_id_priv);
+ ib_free_send_mad(msg);
+}
+
+static struct ib_mad_send_buf *
+cm_alloc_priv_msg(struct cm_id_private *cm_id_priv)
+{
+ struct ib_mad_send_buf *msg;
+
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ msg = cm_alloc_msg(cm_id_priv);
+ if (IS_ERR(msg))
+ return msg;
+ cm_id_priv->msg = msg;
+ return msg;
+}
+
+static void cm_free_priv_msg(struct ib_mad_send_buf *msg)
+{
+ struct cm_id_private *cm_id_priv = msg->context[0];
+
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ if (!WARN_ON(cm_id_priv->msg != msg))
+ cm_id_priv->msg = NULL;
+
+ if (msg->ah)
+ rdma_destroy_ah(msg->ah, 0);
+ cm_deref_id(cm_id_priv);
+ ib_free_send_mad(msg);
+}
+
+static struct ib_mad_send_buf *cm_alloc_response_msg_no_ah(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ return ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
+ 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC,
+ IB_MGMT_BASE_VERSION);
+}
+
+static int cm_create_response_msg_ah(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ struct ib_mad_send_buf *msg)
+{
+ struct ib_ah *ah;
+
+ ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh, port->port_num);
+ if (IS_ERR(ah))
+ return PTR_ERR(ah);
+
+ msg->ah = ah;
+ return 0;
+}
+
+static int cm_alloc_response_msg(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ struct ib_mad_send_buf **msg)
+{
+ struct ib_mad_send_buf *m;
+ int ret;
+
+ m = cm_alloc_response_msg_no_ah(port, mad_recv_wc);
+ if (IS_ERR(m))
+ return PTR_ERR(m);
+
+ ret = cm_create_response_msg_ah(port, mad_recv_wc, m);
+ if (ret) {
+ ib_free_send_mad(m);
+ return ret;
+ }
+
+ *msg = m;
+ return 0;
+}
+
+static void cm_free_response_msg(struct ib_mad_send_buf *msg)
+{
+ if (msg->ah)
+ rdma_destroy_ah(msg->ah, 0);
+ ib_free_send_mad(msg);
+}
+
+static void *cm_copy_private_data(const void *private_data, u8 private_data_len)
+{
+ void *data;
+
+ if (!private_data || !private_data_len)
+ return NULL;
+
+ data = kmemdup(private_data, private_data_len, GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ return data;
+}
+
+static void cm_set_private_data(struct cm_id_private *cm_id_priv,
+ void *private_data, u8 private_data_len)
+{
+ if (cm_id_priv->private_data && cm_id_priv->private_data_len)
+ kfree(cm_id_priv->private_data);
+
+ cm_id_priv->private_data = private_data;
+ cm_id_priv->private_data_len = private_data_len;
+}
+
+static void cm_set_av_port(struct cm_av *av, struct cm_port *port)
+{
+ struct cm_port *old_port = av->port;
+
+ if (old_port == port)
+ return;
+
+ av->port = port;
+ if (old_port)
+ cm_device_put(old_port->cm_dev);
+ if (port)
+ kref_get(&port->cm_dev->kref);
+}
+
+static void cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
+ struct rdma_ah_attr *ah_attr, struct cm_av *av)
+{
+ cm_set_av_port(av, port);
+ av->pkey_index = wc->pkey_index;
+ rdma_move_ah_attr(&av->ah_attr, ah_attr);
+}
+
+static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
+ struct ib_grh *grh, struct cm_av *av)
+{
+ cm_set_av_port(av, port);
+ av->pkey_index = wc->pkey_index;
+ return ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
+ port->port_num, wc,
+ grh, &av->ah_attr);
+}
+
+static struct cm_port *
+get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr)
+{
+ struct cm_device *cm_dev;
+ struct cm_port *port = NULL;
+ unsigned long flags;
+
+ if (attr) {
+ read_lock_irqsave(&cm.device_lock, flags);
+ list_for_each_entry(cm_dev, &cm.device_list, list) {
+ if (cm_dev->ib_device == attr->device) {
+ port = cm_dev->port[attr->port_num - 1];
+ break;
+ }
+ }
+ read_unlock_irqrestore(&cm.device_lock, flags);
+ } else {
+ /* SGID attribute can be NULL in following
+ * conditions.
+ * (a) Alternative path
+ * (b) IB link layer without GRH
+ * (c) LAP send messages
+ */
+ read_lock_irqsave(&cm.device_lock, flags);
+ list_for_each_entry(cm_dev, &cm.device_list, list) {
+ attr = rdma_find_gid(cm_dev->ib_device,
+ &path->sgid,
+ sa_conv_pathrec_to_gid_type(path),
+ NULL);
+ if (!IS_ERR(attr)) {
+ port = cm_dev->port[attr->port_num - 1];
+ break;
+ }
+ }
+ read_unlock_irqrestore(&cm.device_lock, flags);
+ if (port)
+ rdma_put_gid_attr(attr);
+ }
+ return port;
+}
+
+static int cm_init_av_by_path(struct sa_path_rec *path,
+ const struct ib_gid_attr *sgid_attr,
+ struct cm_av *av)
+{
+ struct rdma_ah_attr new_ah_attr;
+ struct cm_device *cm_dev;
+ struct cm_port *port;
+ int ret;
+
+ port = get_cm_port_from_path(path, sgid_attr);
+ if (!port)
+ return -EINVAL;
+ cm_dev = port->cm_dev;
+
+ ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num,
+ be16_to_cpu(path->pkey), &av->pkey_index);
+ if (ret)
+ return ret;
+
+ cm_set_av_port(av, port);
+
+ /*
+ * av->ah_attr might be initialized based on wc or during
+ * request processing time which might have reference to sgid_attr.
+ * So initialize a new ah_attr on stack.
+ * If initialization fails, old ah_attr is used for sending any
+ * responses. If initialization is successful, than new ah_attr
+ * is used by overwriting the old one. So that right ah_attr
+ * can be used to return an error response.
+ */
+ ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path,
+ &new_ah_attr, sgid_attr);
+ if (ret)
+ return ret;
+
+ av->timeout = path->packet_life_time + 1;
+ rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
+ return 0;
+}
+
+/* Move av created by cm_init_av_by_path(), so av.dgid is not moved */
+static void cm_move_av_from_path(struct cm_av *dest, struct cm_av *src)
+{
+ cm_set_av_port(dest, src->port);
+ cm_set_av_port(src, NULL);
+ dest->pkey_index = src->pkey_index;
+ rdma_move_ah_attr(&dest->ah_attr, &src->ah_attr);
+ dest->timeout = src->timeout;
+}
+
+static void cm_destroy_av(struct cm_av *av)
+{
+ rdma_destroy_ah_attr(&av->ah_attr);
+ cm_set_av_port(av, NULL);
+}
+
+static u32 cm_local_id(__be32 local_id)
+{
+ return (__force u32) (local_id ^ cm.random_id_operand);
+}
+
+static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id)
+{
+ struct cm_id_private *cm_id_priv;
+
+ rcu_read_lock();
+ cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id));
+ if (!cm_id_priv || cm_id_priv->id.remote_id != remote_id ||
+ !refcount_inc_not_zero(&cm_id_priv->refcount))
+ cm_id_priv = NULL;
+ rcu_read_unlock();
+
+ return cm_id_priv;
+}
+
+/*
+ * Trivial helpers to strip endian annotation and compare; the
+ * endianness doesn't actually matter since we just need a stable
+ * order for the RB tree.
+ */
+static int be32_lt(__be32 a, __be32 b)
+{
+ return (__force u32) a < (__force u32) b;
+}
+
+static int be32_gt(__be32 a, __be32 b)
+{
+ return (__force u32) a > (__force u32) b;
+}
+
+static int be64_lt(__be64 a, __be64 b)
+{
+ return (__force u64) a < (__force u64) b;
+}
+
+static int be64_gt(__be64 a, __be64 b)
+{
+ return (__force u64) a > (__force u64) b;
+}
+
+/*
+ * Inserts a new cm_id_priv into the listen_service_table. Returns cm_id_priv
+ * if the new ID was inserted, NULL if it could not be inserted due to a
+ * collision, or the existing cm_id_priv ready for shared usage.
+ */
+static struct cm_id_private *cm_insert_listen(struct cm_id_private *cm_id_priv,
+ ib_cm_handler shared_handler)
+{
+ struct rb_node **link = &cm.listen_service_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_id_private *cur_cm_id_priv;
+ __be64 service_id = cm_id_priv->id.service_id;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ while (*link) {
+ parent = *link;
+ cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+ service_node);
+
+ if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
+ link = &(*link)->rb_left;
+ else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
+ link = &(*link)->rb_right;
+ else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_left;
+ else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_right;
+ else {
+ /*
+ * Sharing an ib_cm_id with different handlers is not
+ * supported
+ */
+ if (cur_cm_id_priv->id.cm_handler != shared_handler ||
+ cur_cm_id_priv->id.context ||
+ WARN_ON(!cur_cm_id_priv->id.cm_handler)) {
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return NULL;
+ }
+ refcount_inc(&cur_cm_id_priv->refcount);
+ cur_cm_id_priv->listen_sharecount++;
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return cur_cm_id_priv;
+ }
+ }
+ cm_id_priv->listen_sharecount++;
+ rb_link_node(&cm_id_priv->service_node, parent, link);
+ rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table);
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return cm_id_priv;
+}
+
+static struct cm_id_private *cm_find_listen(struct ib_device *device,
+ __be64 service_id)
+{
+ struct rb_node *node = cm.listen_service_table.rb_node;
+ struct cm_id_private *cm_id_priv;
+
+ while (node) {
+ cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
+
+ if (device < cm_id_priv->id.device)
+ node = node->rb_left;
+ else if (device > cm_id_priv->id.device)
+ node = node->rb_right;
+ else if (be64_lt(service_id, cm_id_priv->id.service_id))
+ node = node->rb_left;
+ else if (be64_gt(service_id, cm_id_priv->id.service_id))
+ node = node->rb_right;
+ else {
+ refcount_inc(&cm_id_priv->refcount);
+ return cm_id_priv;
+ }
+ }
+ return NULL;
+}
+
+static struct cm_timewait_info *
+cm_insert_remote_id(struct cm_timewait_info *timewait_info)
+{
+ struct rb_node **link = &cm.remote_id_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_timewait_info *cur_timewait_info;
+ __be64 remote_ca_guid = timewait_info->remote_ca_guid;
+ __be32 remote_id = timewait_info->work.remote_id;
+
+ while (*link) {
+ parent = *link;
+ cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+ remote_id_node);
+ if (be32_lt(remote_id, cur_timewait_info->work.remote_id))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_id, cur_timewait_info->work.remote_id))
+ link = &(*link)->rb_right;
+ else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_left;
+ else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_right;
+ else
+ return cur_timewait_info;
+ }
+ timewait_info->inserted_remote_id = 1;
+ rb_link_node(&timewait_info->remote_id_node, parent, link);
+ rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table);
+ return NULL;
+}
+
+static struct cm_id_private *cm_find_remote_id(__be64 remote_ca_guid,
+ __be32 remote_id)
+{
+ struct rb_node *node = cm.remote_id_table.rb_node;
+ struct cm_timewait_info *timewait_info;
+ struct cm_id_private *res = NULL;
+
+ spin_lock_irq(&cm.lock);
+ while (node) {
+ timewait_info = rb_entry(node, struct cm_timewait_info,
+ remote_id_node);
+ if (be32_lt(remote_id, timewait_info->work.remote_id))
+ node = node->rb_left;
+ else if (be32_gt(remote_id, timewait_info->work.remote_id))
+ node = node->rb_right;
+ else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid))
+ node = node->rb_left;
+ else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid))
+ node = node->rb_right;
+ else {
+ res = cm_acquire_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+ break;
+ }
+ }
+ spin_unlock_irq(&cm.lock);
+ return res;
+}
+
+static struct cm_timewait_info *
+cm_insert_remote_qpn(struct cm_timewait_info *timewait_info)
+{
+ struct rb_node **link = &cm.remote_qp_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_timewait_info *cur_timewait_info;
+ __be64 remote_ca_guid = timewait_info->remote_ca_guid;
+ __be32 remote_qpn = timewait_info->remote_qpn;
+
+ while (*link) {
+ parent = *link;
+ cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+ remote_qp_node);
+ if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn))
+ link = &(*link)->rb_right;
+ else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_left;
+ else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_right;
+ else
+ return cur_timewait_info;
+ }
+ timewait_info->inserted_remote_qp = 1;
+ rb_link_node(&timewait_info->remote_qp_node, parent, link);
+ rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+ return NULL;
+}
+
+static struct cm_id_private *
+cm_insert_remote_sidr(struct cm_id_private *cm_id_priv)
+{
+ struct rb_node **link = &cm.remote_sidr_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_id_private *cur_cm_id_priv;
+ __be32 remote_id = cm_id_priv->id.remote_id;
+
+ while (*link) {
+ parent = *link;
+ cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+ sidr_id_node);
+ if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id))
+ link = &(*link)->rb_right;
+ else {
+ if (cur_cm_id_priv->sidr_slid < cm_id_priv->sidr_slid)
+ link = &(*link)->rb_left;
+ else if (cur_cm_id_priv->sidr_slid > cm_id_priv->sidr_slid)
+ link = &(*link)->rb_right;
+ else
+ return cur_cm_id_priv;
+ }
+ }
+ rb_link_node(&cm_id_priv->sidr_id_node, parent, link);
+ rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ return NULL;
+}
+
+static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ void *context)
+{
+ struct cm_id_private *cm_id_priv;
+ u32 id;
+ int ret;
+
+ cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL);
+ if (!cm_id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ cm_id_priv->id.state = IB_CM_IDLE;
+ cm_id_priv->id.device = device;
+ cm_id_priv->id.cm_handler = cm_handler;
+ cm_id_priv->id.context = context;
+ cm_id_priv->id.remote_cm_qpn = 1;
+
+ RB_CLEAR_NODE(&cm_id_priv->service_node);
+ RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
+ spin_lock_init(&cm_id_priv->lock);
+ init_completion(&cm_id_priv->comp);
+ INIT_LIST_HEAD(&cm_id_priv->work_list);
+ atomic_set(&cm_id_priv->work_count, -1);
+ refcount_set(&cm_id_priv->refcount, 1);
+
+ ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, xa_limit_32b,
+ &cm.local_id_next, GFP_KERNEL);
+ if (ret < 0)
+ goto error;
+ cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
+
+ return cm_id_priv;
+
+error:
+ kfree(cm_id_priv);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Make the ID visible to the MAD handlers and other threads that use the
+ * xarray.
+ */
+static void cm_finalize_id(struct cm_id_private *cm_id_priv)
+{
+ xa_store(&cm.local_id_table, cm_local_id(cm_id_priv->id.local_id),
+ cm_id_priv, GFP_ATOMIC);
+}
+
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ void *context)
+{
+ struct cm_id_private *cm_id_priv;
+
+ cm_id_priv = cm_alloc_id_priv(device, cm_handler, context);
+ if (IS_ERR(cm_id_priv))
+ return ERR_CAST(cm_id_priv);
+
+ cm_finalize_id(cm_id_priv);
+ return &cm_id_priv->id;
+}
+EXPORT_SYMBOL(ib_create_cm_id);
+
+static struct cm_work *cm_dequeue_work(struct cm_id_private *cm_id_priv)
+{
+ struct cm_work *work;
+
+ if (list_empty(&cm_id_priv->work_list))
+ return NULL;
+
+ work = list_entry(cm_id_priv->work_list.next, struct cm_work, list);
+ list_del(&work->list);
+ return work;
+}
+
+static void cm_free_work(struct cm_work *work)
+{
+ if (work->mad_recv_wc)
+ ib_free_recv_mad(work->mad_recv_wc);
+ kfree(work);
+}
+
+static void cm_queue_work_unlock(struct cm_id_private *cm_id_priv,
+ struct cm_work *work)
+ __releases(&cm_id_priv->lock)
+{
+ bool immediate;
+
+ /*
+ * To deliver the event to the user callback we have the drop the
+ * spinlock, however, we need to ensure that the user callback is single
+ * threaded and receives events in the temporal order. If there are
+ * already events being processed then thread new events onto a list,
+ * the thread currently processing will pick them up.
+ */
+ immediate = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!immediate) {
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ /*
+ * This routine always consumes incoming reference. Once queued
+ * to the work_list then a reference is held by the thread
+ * currently running cm_process_work() and this reference is not
+ * needed.
+ */
+ cm_deref_id(cm_id_priv);
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (immediate)
+ cm_process_work(cm_id_priv, work);
+}
+
+static inline int cm_convert_to_ms(int iba_time)
+{
+ /* approximate conversion to ms from 4.096us x 2^iba_time */
+ return 1 << max(iba_time - 8, 0);
+}
+
+/*
+ * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time
+ * Because of how ack_timeout is stored, adding one doubles the timeout.
+ * To avoid large timeouts, select the max(ack_delay, life_time + 1), and
+ * increment it (round up) only if the other is within 50%.
+ */
+static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time)
+{
+ int ack_timeout = packet_life_time + 1;
+
+ if (ack_timeout >= ca_ack_delay)
+ ack_timeout += (ca_ack_delay >= (ack_timeout - 1));
+ else
+ ack_timeout = ca_ack_delay +
+ (ack_timeout >= (ca_ack_delay - 1));
+
+ return min(31, ack_timeout);
+}
+
+static void cm_remove_remote(struct cm_id_private *cm_id_priv)
+{
+ struct cm_timewait_info *timewait_info = cm_id_priv->timewait_info;
+
+ if (timewait_info->inserted_remote_id) {
+ rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table);
+ timewait_info->inserted_remote_id = 0;
+ }
+
+ if (timewait_info->inserted_remote_qp) {
+ rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+ timewait_info->inserted_remote_qp = 0;
+ }
+}
+
+static struct cm_timewait_info *cm_create_timewait_info(__be32 local_id)
+{
+ struct cm_timewait_info *timewait_info;
+
+ timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL);
+ if (!timewait_info)
+ return ERR_PTR(-ENOMEM);
+
+ timewait_info->work.local_id = local_id;
+ INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler);
+ timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT;
+ return timewait_info;
+}
+
+static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
+{
+ int wait_time;
+ unsigned long flags;
+ struct cm_device *cm_dev;
+
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client);
+ if (!cm_dev)
+ return;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ cm_remove_remote(cm_id_priv);
+ list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ /*
+ * The cm_id could be destroyed by the user before we exit timewait.
+ * To protect against this, we search for the cm_id after exiting
+ * timewait before notifying the user that we've exited timewait.
+ */
+ cm_id_priv->id.state = IB_CM_TIMEWAIT;
+ wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
+
+ /* Check if the device started its remove_one */
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!cm_dev->going_down)
+ queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
+ msecs_to_jiffies(wait_time));
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ /*
+ * The timewait_info is converted into a work and gets freed during
+ * cm_free_work() in cm_timewait_handler().
+ */
+ BUILD_BUG_ON(offsetof(struct cm_timewait_info, work) != 0);
+ cm_id_priv->timewait_info = NULL;
+}
+
+static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
+{
+ unsigned long flags;
+
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ cm_id_priv->id.state = IB_CM_IDLE;
+ if (cm_id_priv->timewait_info) {
+ spin_lock_irqsave(&cm.lock, flags);
+ cm_remove_remote(cm_id_priv);
+ spin_unlock_irqrestore(&cm.lock, flags);
+ kfree(cm_id_priv->timewait_info);
+ cm_id_priv->timewait_info = NULL;
+ }
+}
+
+static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_work *work;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irq(&cm_id_priv->lock);
+retest:
+ switch (cm_id->state) {
+ case IB_CM_LISTEN:
+ spin_lock(&cm.lock);
+ if (--cm_id_priv->listen_sharecount > 0) {
+ /* The id is still shared. */
+ WARN_ON(refcount_read(&cm_id_priv->refcount) == 1);
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_deref_id(cm_id_priv);
+ return;
+ }
+ cm_id->state = IB_CM_IDLE;
+ rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
+ RB_CLEAR_NODE(&cm_id_priv->service_node);
+ spin_unlock(&cm.lock);
+ break;
+ case IB_CM_SIDR_REQ_SENT:
+ cm_id->state = IB_CM_IDLE;
+ ib_cancel_mad(cm_id_priv->msg);
+ break;
+ case IB_CM_SIDR_REQ_RCVD:
+ cm_send_sidr_rep_locked(cm_id_priv,
+ &(struct ib_cm_sidr_rep_param){
+ .status = IB_SIDR_REJECT });
+ /* cm_send_sidr_rep_locked will not move to IDLE if it fails */
+ cm_id->state = IB_CM_IDLE;
+ break;
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ ib_cancel_mad(cm_id_priv->msg);
+ cm_send_rej_locked(cm_id_priv, IB_CM_REJ_TIMEOUT,
+ &cm_id_priv->id.device->node_guid,
+ sizeof(cm_id_priv->id.device->node_guid),
+ NULL, 0);
+ break;
+ case IB_CM_REQ_RCVD:
+ if (err == -ENOMEM) {
+ /* Do not reject to allow future retries. */
+ cm_reset_to_idle(cm_id_priv);
+ } else {
+ cm_send_rej_locked(cm_id_priv,
+ IB_CM_REJ_CONSUMER_DEFINED, NULL, 0,
+ NULL, 0);
+ }
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ib_cancel_mad(cm_id_priv->msg);
+ cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL,
+ 0, NULL, 0);
+ goto retest;
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL,
+ 0, NULL, 0);
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
+ cm_id->state = IB_CM_IDLE;
+ break;
+ }
+ cm_send_dreq_locked(cm_id_priv, NULL, 0);
+ goto retest;
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->msg);
+ cm_enter_timewait(cm_id_priv);
+ goto retest;
+ case IB_CM_DREQ_RCVD:
+ cm_send_drep_locked(cm_id_priv, NULL, 0);
+ WARN_ON(cm_id->state != IB_CM_TIMEWAIT);
+ goto retest;
+ case IB_CM_TIMEWAIT:
+ /*
+ * The cm_acquire_id in cm_timewait_handler will stop working
+ * once we do xa_erase below, so just move to idle here for
+ * consistency.
+ */
+ cm_id->state = IB_CM_IDLE;
+ break;
+ case IB_CM_IDLE:
+ break;
+ }
+ WARN_ON(cm_id->state != IB_CM_IDLE);
+
+ spin_lock(&cm.lock);
+ /* Required for cleanup paths related cm_req_handler() */
+ if (cm_id_priv->timewait_info) {
+ cm_remove_remote(cm_id_priv);
+ kfree(cm_id_priv->timewait_info);
+ cm_id_priv->timewait_info = NULL;
+ }
+
+ WARN_ON(cm_id_priv->listen_sharecount);
+ WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node));
+ if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
+ rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
+ cm_deref_id(cm_id_priv);
+ wait_for_completion(&cm_id_priv->comp);
+ while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
+ cm_free_work(work);
+
+ cm_destroy_av(&cm_id_priv->av);
+ cm_destroy_av(&cm_id_priv->alt_av);
+ kfree(cm_id_priv->private_data);
+ kfree_rcu(cm_id_priv, rcu);
+}
+
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+ cm_destroy_id(cm_id, 0);
+}
+EXPORT_SYMBOL(ib_destroy_cm_id);
+
+static int cm_init_listen(struct cm_id_private *cm_id_priv, __be64 service_id)
+{
+ if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
+ (service_id != IB_CM_ASSIGN_SERVICE_ID))
+ return -EINVAL;
+
+ if (service_id == IB_CM_ASSIGN_SERVICE_ID)
+ cm_id_priv->id.service_id = cpu_to_be64(cm.listen_service_id++);
+ else
+ cm_id_priv->id.service_id = service_id;
+
+ return 0;
+}
+
+/**
+ * ib_cm_listen - Initiates listening on the specified service ID for
+ * connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ */
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id)
+{
+ struct cm_id_private *cm_id_priv =
+ container_of(cm_id, struct cm_id_private, id);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->id.state != IB_CM_IDLE) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_init_listen(cm_id_priv, service_id);
+ if (ret)
+ goto out;
+
+ if (!cm_insert_listen(cm_id_priv, NULL)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ cm_id_priv->id.state = IB_CM_LISTEN;
+ ret = 0;
+
+out:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_listen);
+
+/**
+ * ib_cm_insert_listen - Create a new listening ib_cm_id and listen on
+ * the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id. All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ *
+ * Callers should call ib_destroy_cm_id when done with the listener ID.
+ */
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ __be64 service_id)
+{
+ struct cm_id_private *listen_id_priv;
+ struct cm_id_private *cm_id_priv;
+ int err = 0;
+
+ /* Create an ID in advance, since the creation may sleep */
+ cm_id_priv = cm_alloc_id_priv(device, cm_handler, NULL);
+ if (IS_ERR(cm_id_priv))
+ return ERR_CAST(cm_id_priv);
+
+ err = cm_init_listen(cm_id_priv, service_id);
+ if (err) {
+ ib_destroy_cm_id(&cm_id_priv->id);
+ return ERR_PTR(err);
+ }
+
+ spin_lock_irq(&cm_id_priv->lock);
+ listen_id_priv = cm_insert_listen(cm_id_priv, cm_handler);
+ if (listen_id_priv != cm_id_priv) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_destroy_cm_id(&cm_id_priv->id);
+ if (!listen_id_priv)
+ return ERR_PTR(-EINVAL);
+ return &listen_id_priv->id;
+ }
+ cm_id_priv->id.state = IB_CM_LISTEN;
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ /*
+ * A listen ID does not need to be in the xarray since it does not
+ * receive mads, is not placed in the remote_id or remote_qpn rbtree,
+ * and does not enter timewait.
+ */
+
+ return &cm_id_priv->id;
+}
+EXPORT_SYMBOL(ib_cm_insert_listen);
+
+static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
+{
+ u64 hi_tid = 0, low_tid;
+
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ low_tid = (u64)cm_id_priv->id.local_id;
+ if (!cm_id_priv->av.port)
+ return cpu_to_be64(low_tid);
+
+ spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ if (cm_id_priv->av.port->mad_agent)
+ hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
+ spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
+ return cpu_to_be64(hi_tid | low_tid);
+}
+
+static void cm_format_mad_hdr(struct ib_mad_hdr *hdr,
+ __be16 attr_id, __be64 tid)
+{
+ hdr->base_version = IB_MGMT_BASE_VERSION;
+ hdr->mgmt_class = IB_MGMT_CLASS_CM;
+ hdr->class_version = IB_CM_CLASS_VERSION;
+ hdr->method = IB_MGMT_METHOD_SEND;
+ hdr->attr_id = attr_id;
+ hdr->tid = tid;
+}
+
+static void cm_format_mad_ece_hdr(struct ib_mad_hdr *hdr, __be16 attr_id,
+ __be64 tid, u32 attr_mod)
+{
+ cm_format_mad_hdr(hdr, attr_id, tid);
+ hdr->attr_mod = cpu_to_be32(attr_mod);
+}
+
+static void cm_format_req(struct cm_req_msg *req_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_req_param *param)
+{
+ struct sa_path_rec *pri_path = param->primary_path;
+ struct sa_path_rec *alt_path = param->alternate_path;
+ bool pri_ext = false;
+ __be16 lid;
+
+ if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA)
+ pri_ext = opa_is_extended_lid(pri_path->opa.dlid,
+ pri_path->opa.slid);
+
+ cm_format_mad_ece_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
+ cm_form_tid(cm_id_priv), param->ece.attr_mod);
+
+ IBA_SET(CM_REQ_LOCAL_COMM_ID, req_msg,
+ be32_to_cpu(cm_id_priv->id.local_id));
+ IBA_SET(CM_REQ_SERVICE_ID, req_msg, be64_to_cpu(param->service_id));
+ IBA_SET(CM_REQ_LOCAL_CA_GUID, req_msg,
+ be64_to_cpu(cm_id_priv->id.device->node_guid));
+ IBA_SET(CM_REQ_LOCAL_QPN, req_msg, param->qp_num);
+ IBA_SET(CM_REQ_INITIATOR_DEPTH, req_msg, param->initiator_depth);
+ IBA_SET(CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT, req_msg,
+ param->remote_cm_response_timeout);
+ cm_req_set_qp_type(req_msg, param->qp_type);
+ IBA_SET(CM_REQ_END_TO_END_FLOW_CONTROL, req_msg, param->flow_control);
+ IBA_SET(CM_REQ_STARTING_PSN, req_msg, param->starting_psn);
+ IBA_SET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg,
+ param->local_cm_response_timeout);
+ IBA_SET(CM_REQ_PARTITION_KEY, req_msg,
+ be16_to_cpu(param->primary_path->pkey));
+ IBA_SET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg,
+ param->primary_path->mtu);
+ IBA_SET(CM_REQ_MAX_CM_RETRIES, req_msg, param->max_cm_retries);
+
+ if (param->qp_type != IB_QPT_XRC_INI) {
+ IBA_SET(CM_REQ_RESPONDER_RESOURCES, req_msg,
+ param->responder_resources);
+ IBA_SET(CM_REQ_RETRY_COUNT, req_msg, param->retry_count);
+ IBA_SET(CM_REQ_RNR_RETRY_COUNT, req_msg,
+ param->rnr_retry_count);
+ IBA_SET(CM_REQ_SRQ, req_msg, param->srq);
+ }
+
+ *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg) =
+ pri_path->sgid;
+ *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg) =
+ pri_path->dgid;
+ if (pri_ext) {
+ IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg)
+ ->global.interface_id =
+ OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid));
+ IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg)
+ ->global.interface_id =
+ OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid));
+ }
+ if (pri_path->hop_limit <= 1) {
+ IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
+ be16_to_cpu(pri_ext ? 0 :
+ htons(ntohl(sa_path_get_slid(
+ pri_path)))));
+ IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg,
+ be16_to_cpu(pri_ext ? 0 :
+ htons(ntohl(sa_path_get_dlid(
+ pri_path)))));
+ } else {
+
+ if (param->primary_path_inbound) {
+ lid = param->primary_path_inbound->ib.dlid;
+ IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
+ be16_to_cpu(lid));
+ } else
+ IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
+ be16_to_cpu(IB_LID_PERMISSIVE));
+
+ /* Work-around until there's a way to obtain remote LID info */
+ IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg,
+ be16_to_cpu(IB_LID_PERMISSIVE));
+ }
+ IBA_SET(CM_REQ_PRIMARY_FLOW_LABEL, req_msg,
+ be32_to_cpu(pri_path->flow_label));
+ IBA_SET(CM_REQ_PRIMARY_PACKET_RATE, req_msg, pri_path->rate);
+ IBA_SET(CM_REQ_PRIMARY_TRAFFIC_CLASS, req_msg, pri_path->traffic_class);
+ IBA_SET(CM_REQ_PRIMARY_HOP_LIMIT, req_msg, pri_path->hop_limit);
+ IBA_SET(CM_REQ_PRIMARY_SL, req_msg, pri_path->sl);
+ IBA_SET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg,
+ (pri_path->hop_limit <= 1));
+ IBA_SET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ pri_path->packet_life_time));
+
+ if (alt_path) {
+ bool alt_ext = false;
+
+ if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA)
+ alt_ext = opa_is_extended_lid(alt_path->opa.dlid,
+ alt_path->opa.slid);
+
+ *IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg) =
+ alt_path->sgid;
+ *IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg) =
+ alt_path->dgid;
+ if (alt_ext) {
+ IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID,
+ req_msg)
+ ->global.interface_id =
+ OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid));
+ IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_REMOTE_PORT_GID,
+ req_msg)
+ ->global.interface_id =
+ OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid));
+ }
+ if (alt_path->hop_limit <= 1) {
+ IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg,
+ be16_to_cpu(
+ alt_ext ? 0 :
+ htons(ntohl(sa_path_get_slid(
+ alt_path)))));
+ IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg,
+ be16_to_cpu(
+ alt_ext ? 0 :
+ htons(ntohl(sa_path_get_dlid(
+ alt_path)))));
+ } else {
+ IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg,
+ be16_to_cpu(IB_LID_PERMISSIVE));
+ IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg,
+ be16_to_cpu(IB_LID_PERMISSIVE));
+ }
+ IBA_SET(CM_REQ_ALTERNATE_FLOW_LABEL, req_msg,
+ be32_to_cpu(alt_path->flow_label));
+ IBA_SET(CM_REQ_ALTERNATE_PACKET_RATE, req_msg, alt_path->rate);
+ IBA_SET(CM_REQ_ALTERNATE_TRAFFIC_CLASS, req_msg,
+ alt_path->traffic_class);
+ IBA_SET(CM_REQ_ALTERNATE_HOP_LIMIT, req_msg,
+ alt_path->hop_limit);
+ IBA_SET(CM_REQ_ALTERNATE_SL, req_msg, alt_path->sl);
+ IBA_SET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg,
+ (alt_path->hop_limit <= 1));
+ IBA_SET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ alt_path->packet_life_time));
+ }
+ IBA_SET(CM_REQ_VENDOR_ID, req_msg, param->ece.vendor_id);
+
+ if (param->private_data && param->private_data_len)
+ IBA_SET_MEM(CM_REQ_PRIVATE_DATA, req_msg, param->private_data,
+ param->private_data_len);
+}
+
+static int cm_validate_req_param(struct ib_cm_req_param *param)
+{
+ if (!param->primary_path)
+ return -EINVAL;
+
+ if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC &&
+ param->qp_type != IB_QPT_XRC_INI)
+ return -EINVAL;
+
+ if (param->private_data &&
+ param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ if (param->alternate_path &&
+ (param->alternate_path->pkey != param->primary_path->pkey ||
+ param->alternate_path->mtu != param->primary_path->mtu))
+ return -EINVAL;
+
+ return 0;
+}
+
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+ struct ib_cm_req_param *param)
+{
+ struct cm_av av = {}, alt_av = {};
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ struct cm_req_msg *req_msg;
+ unsigned long flags;
+ int ret;
+
+ ret = cm_validate_req_param(param);
+ if (ret)
+ return ret;
+
+ /* Verify that we're not in timewait. */
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_IDLE || WARN_ON(cm_id_priv->timewait_info)) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return -EINVAL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+ id.local_id);
+ if (IS_ERR(cm_id_priv->timewait_info)) {
+ ret = PTR_ERR(cm_id_priv->timewait_info);
+ cm_id_priv->timewait_info = NULL;
+ return ret;
+ }
+
+ ret = cm_init_av_by_path(param->primary_path,
+ param->ppath_sgid_attr, &av);
+ if (ret)
+ return ret;
+ if (param->alternate_path) {
+ ret = cm_init_av_by_path(param->alternate_path, NULL,
+ &alt_av);
+ if (ret) {
+ cm_destroy_av(&av);
+ return ret;
+ }
+ }
+ cm_id->service_id = param->service_id;
+ cm_id_priv->timeout_ms = cm_convert_to_ms(
+ param->primary_path->packet_life_time) * 2 +
+ cm_convert_to_ms(
+ param->remote_cm_response_timeout);
+ cm_id_priv->max_cm_retries = param->max_cm_retries;
+ cm_id_priv->initiator_depth = param->initiator_depth;
+ cm_id_priv->responder_resources = param->responder_resources;
+ cm_id_priv->retry_count = param->retry_count;
+ cm_id_priv->path_mtu = param->primary_path->mtu;
+ cm_id_priv->pkey = param->primary_path->pkey;
+ cm_id_priv->qp_type = param->qp_type;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ cm_move_av_from_path(&cm_id_priv->av, &av);
+ if (param->primary_path_outbound)
+ cm_id_priv->av.dlid_datapath =
+ be16_to_cpu(param->primary_path_outbound->ib.dlid);
+
+ if (param->alternate_path)
+ cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av);
+
+ msg = cm_alloc_priv_msg(cm_id_priv);
+ if (IS_ERR(msg)) {
+ ret = PTR_ERR(msg);
+ goto out_unlock;
+ }
+
+ req_msg = (struct cm_req_msg *)msg->mad;
+ cm_format_req(req_msg, cm_id_priv, param);
+ cm_id_priv->tid = req_msg->hdr.tid;
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *)(unsigned long)IB_CM_REQ_SENT;
+
+ cm_id_priv->local_qpn = cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg));
+ cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg));
+
+ trace_icm_send_req(&cm_id_priv->id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto out_free;
+ BUG_ON(cm_id->state != IB_CM_IDLE);
+ cm_id->state = IB_CM_REQ_SENT;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+out_free:
+ cm_free_priv_msg(msg);
+out_unlock:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_req);
+
+static int cm_issue_rej(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ enum ib_cm_rej_reason reason,
+ enum cm_msg_response msg_rejected,
+ void *ari, u8 ari_length)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ struct cm_rej_msg *rej_msg, *rcv_msg;
+ int ret;
+
+ ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ if (ret)
+ return ret;
+
+ /* We just need common CM header information. Cast to any message. */
+ rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad;
+ rej_msg = (struct cm_rej_msg *) msg->mad;
+
+ cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid);
+ IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg,
+ IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg));
+ IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg,
+ IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg));
+ IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, msg_rejected);
+ IBA_SET(CM_REJ_REASON, rej_msg, reason);
+
+ if (ari && ari_length) {
+ IBA_SET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg, ari_length);
+ IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length);
+ }
+
+ trace_icm_issue_rej(
+ IBA_GET(CM_REJ_LOCAL_COMM_ID, rcv_msg),
+ IBA_GET(CM_REJ_REMOTE_COMM_ID, rcv_msg));
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_response_msg(msg);
+
+ return ret;
+}
+
+static bool cm_req_has_alt_path(struct cm_req_msg *req_msg)
+{
+ return ((cpu_to_be16(
+ IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg))) ||
+ (ib_is_opa_gid(IBA_GET_MEM_PTR(CM_REQ_ALTERNATE_LOCAL_PORT_GID,
+ req_msg))));
+}
+
+static void cm_path_set_rec_type(struct ib_device *ib_device, u32 port_num,
+ struct sa_path_rec *path, union ib_gid *gid)
+{
+ if (ib_is_opa_gid(gid) && rdma_cap_opa_ah(ib_device, port_num))
+ path->rec_type = SA_PATH_REC_TYPE_OPA;
+ else
+ path->rec_type = SA_PATH_REC_TYPE_IB;
+}
+
+static void cm_format_path_lid_from_req(struct cm_req_msg *req_msg,
+ struct sa_path_rec *primary_path,
+ struct sa_path_rec *alt_path,
+ struct ib_wc *wc)
+{
+ u32 lid;
+
+ if (primary_path->rec_type != SA_PATH_REC_TYPE_OPA) {
+ sa_path_set_dlid(primary_path, wc->slid);
+ sa_path_set_slid(primary_path,
+ IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID,
+ req_msg));
+ } else {
+ lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
+ CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg));
+ sa_path_set_dlid(primary_path, lid);
+
+ lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
+ CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg));
+ sa_path_set_slid(primary_path, lid);
+ }
+
+ if (!cm_req_has_alt_path(req_msg))
+ return;
+
+ if (alt_path->rec_type != SA_PATH_REC_TYPE_OPA) {
+ sa_path_set_dlid(alt_path,
+ IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID,
+ req_msg));
+ sa_path_set_slid(alt_path,
+ IBA_GET(CM_REQ_ALTERNATE_REMOTE_PORT_LID,
+ req_msg));
+ } else {
+ lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
+ CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg));
+ sa_path_set_dlid(alt_path, lid);
+
+ lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
+ CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg));
+ sa_path_set_slid(alt_path, lid);
+ }
+}
+
+static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
+ struct sa_path_rec *primary_path,
+ struct sa_path_rec *alt_path,
+ struct ib_wc *wc)
+{
+ primary_path->dgid =
+ *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID, req_msg);
+ primary_path->sgid =
+ *IBA_GET_MEM_PTR(CM_REQ_PRIMARY_REMOTE_PORT_GID, req_msg);
+ primary_path->flow_label =
+ cpu_to_be32(IBA_GET(CM_REQ_PRIMARY_FLOW_LABEL, req_msg));
+ primary_path->hop_limit = IBA_GET(CM_REQ_PRIMARY_HOP_LIMIT, req_msg);
+ primary_path->traffic_class =
+ IBA_GET(CM_REQ_PRIMARY_TRAFFIC_CLASS, req_msg);
+ primary_path->reversible = 1;
+ primary_path->pkey =
+ cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg));
+ primary_path->sl = IBA_GET(CM_REQ_PRIMARY_SL, req_msg);
+ primary_path->mtu_selector = IB_SA_EQ;
+ primary_path->mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg);
+ primary_path->rate_selector = IB_SA_EQ;
+ primary_path->rate = IBA_GET(CM_REQ_PRIMARY_PACKET_RATE, req_msg);
+ primary_path->packet_life_time_selector = IB_SA_EQ;
+ primary_path->packet_life_time =
+ IBA_GET(CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT, req_msg);
+ primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+ primary_path->service_id =
+ cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg));
+ if (sa_path_is_roce(primary_path))
+ primary_path->roce.route_resolved = false;
+
+ if (cm_req_has_alt_path(req_msg)) {
+ alt_path->dgid = *IBA_GET_MEM_PTR(
+ CM_REQ_ALTERNATE_LOCAL_PORT_GID, req_msg);
+ alt_path->sgid = *IBA_GET_MEM_PTR(
+ CM_REQ_ALTERNATE_REMOTE_PORT_GID, req_msg);
+ alt_path->flow_label = cpu_to_be32(
+ IBA_GET(CM_REQ_ALTERNATE_FLOW_LABEL, req_msg));
+ alt_path->hop_limit =
+ IBA_GET(CM_REQ_ALTERNATE_HOP_LIMIT, req_msg);
+ alt_path->traffic_class =
+ IBA_GET(CM_REQ_ALTERNATE_TRAFFIC_CLASS, req_msg);
+ alt_path->reversible = 1;
+ alt_path->pkey =
+ cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg));
+ alt_path->sl = IBA_GET(CM_REQ_ALTERNATE_SL, req_msg);
+ alt_path->mtu_selector = IB_SA_EQ;
+ alt_path->mtu =
+ IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg);
+ alt_path->rate_selector = IB_SA_EQ;
+ alt_path->rate = IBA_GET(CM_REQ_ALTERNATE_PACKET_RATE, req_msg);
+ alt_path->packet_life_time_selector = IB_SA_EQ;
+ alt_path->packet_life_time =
+ IBA_GET(CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT, req_msg);
+ alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+ alt_path->service_id =
+ cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg));
+
+ if (sa_path_is_roce(alt_path))
+ alt_path->roce.route_resolved = false;
+ }
+ cm_format_path_lid_from_req(req_msg, primary_path, alt_path, wc);
+}
+
+static u16 cm_get_bth_pkey(struct cm_work *work)
+{
+ struct ib_device *ib_dev = work->port->cm_dev->ib_device;
+ u32 port_num = work->port->port_num;
+ u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
+ u16 pkey;
+ int ret;
+
+ ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
+ if (ret) {
+ dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %u, pkey index %u). %d\n",
+ port_num, pkey_index, ret);
+ return 0;
+ }
+
+ return pkey;
+}
+
+/**
+ * cm_opa_to_ib_sgid - Convert OPA SGID to IB SGID
+ * ULPs (such as IPoIB) do not understand OPA GIDs and will
+ * reject them as the local_gid will not match the sgid. Therefore,
+ * change the pathrec's SGID to an IB SGID.
+ *
+ * @work: Work completion
+ * @path: Path record
+ */
+static void cm_opa_to_ib_sgid(struct cm_work *work,
+ struct sa_path_rec *path)
+{
+ struct ib_device *dev = work->port->cm_dev->ib_device;
+ u32 port_num = work->port->port_num;
+
+ if (rdma_cap_opa_ah(dev, port_num) &&
+ (ib_is_opa_gid(&path->sgid))) {
+ union ib_gid sgid;
+
+ if (rdma_query_gid(dev, port_num, 0, &sgid)) {
+ dev_warn(&dev->dev,
+ "Error updating sgid in CM request\n");
+ return;
+ }
+
+ path->sgid = sgid;
+ }
+}
+
+static void cm_format_req_event(struct cm_work *work,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_id *listen_id)
+{
+ struct cm_req_msg *req_msg;
+ struct ib_cm_req_event_param *param;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.req_rcvd;
+ param->listen_id = listen_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
+ param->port = cm_id_priv->av.port->port_num;
+ param->primary_path = &work->path[0];
+ cm_opa_to_ib_sgid(work, param->primary_path);
+ if (cm_req_has_alt_path(req_msg)) {
+ param->alternate_path = &work->path[1];
+ cm_opa_to_ib_sgid(work, param->alternate_path);
+ } else {
+ param->alternate_path = NULL;
+ }
+ param->remote_ca_guid =
+ cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg));
+ param->remote_qkey = IBA_GET(CM_REQ_LOCAL_Q_KEY, req_msg);
+ param->remote_qpn = IBA_GET(CM_REQ_LOCAL_QPN, req_msg);
+ param->qp_type = cm_req_get_qp_type(req_msg);
+ param->starting_psn = IBA_GET(CM_REQ_STARTING_PSN, req_msg);
+ param->responder_resources = IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg);
+ param->initiator_depth = IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg);
+ param->local_cm_response_timeout =
+ IBA_GET(CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT, req_msg);
+ param->flow_control = IBA_GET(CM_REQ_END_TO_END_FLOW_CONTROL, req_msg);
+ param->remote_cm_response_timeout =
+ IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg);
+ param->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg);
+ param->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg);
+ param->srq = IBA_GET(CM_REQ_SRQ, req_msg);
+ param->ppath_sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr;
+ param->ece.vendor_id = IBA_GET(CM_REQ_VENDOR_ID, req_msg);
+ param->ece.attr_mod = be32_to_cpu(req_msg->hdr.attr_mod);
+
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_REQ_PRIVATE_DATA, req_msg);
+}
+
+static void cm_process_work(struct cm_id_private *cm_id_priv,
+ struct cm_work *work)
+{
+ int ret;
+
+ /* We will typically only have the current event to report. */
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
+ cm_free_work(work);
+
+ while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) {
+ spin_lock_irq(&cm_id_priv->lock);
+ work = cm_dequeue_work(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ if (!work)
+ return;
+
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id,
+ &work->cm_event);
+ cm_free_work(work);
+ }
+ cm_deref_id(cm_id_priv);
+ if (ret)
+ cm_destroy_id(&cm_id_priv->id, ret);
+}
+
+static void cm_format_mra(struct cm_mra_msg *mra_msg,
+ struct cm_id_private *cm_id_priv,
+ enum cm_msg_response msg_mraed, u8 service_timeout,
+ const void *private_data, u8 private_data_len)
+{
+ cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
+ IBA_SET(CM_MRA_MESSAGE_MRAED, mra_msg, msg_mraed);
+ IBA_SET(CM_MRA_LOCAL_COMM_ID, mra_msg,
+ be32_to_cpu(cm_id_priv->id.local_id));
+ IBA_SET(CM_MRA_REMOTE_COMM_ID, mra_msg,
+ be32_to_cpu(cm_id_priv->id.remote_id));
+ IBA_SET(CM_MRA_SERVICE_TIMEOUT, mra_msg, service_timeout);
+
+ if (private_data && private_data_len)
+ IBA_SET_MEM(CM_MRA_PRIVATE_DATA, mra_msg, private_data,
+ private_data_len);
+}
+
+static void cm_format_rej(struct cm_rej_msg *rej_msg,
+ struct cm_id_private *cm_id_priv,
+ enum ib_cm_rej_reason reason, void *ari,
+ u8 ari_length, const void *private_data,
+ u8 private_data_len, enum ib_cm_state state)
+{
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid);
+ IBA_SET(CM_REJ_REMOTE_COMM_ID, rej_msg,
+ be32_to_cpu(cm_id_priv->id.remote_id));
+
+ switch (state) {
+ case IB_CM_REQ_RCVD:
+ IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg, be32_to_cpu(0));
+ IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ);
+ break;
+ case IB_CM_MRA_REQ_SENT:
+ IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg,
+ be32_to_cpu(cm_id_priv->id.local_id));
+ IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REQ);
+ break;
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg,
+ be32_to_cpu(cm_id_priv->id.local_id));
+ IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg, CM_MSG_RESPONSE_REP);
+ break;
+ default:
+ IBA_SET(CM_REJ_LOCAL_COMM_ID, rej_msg,
+ be32_to_cpu(cm_id_priv->id.local_id));
+ IBA_SET(CM_REJ_MESSAGE_REJECTED, rej_msg,
+ CM_MSG_RESPONSE_OTHER);
+ break;
+ }
+
+ IBA_SET(CM_REJ_REASON, rej_msg, reason);
+ if (ari && ari_length) {
+ IBA_SET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg, ari_length);
+ IBA_SET_MEM(CM_REJ_ARI, rej_msg, ari, ari_length);
+ }
+
+ if (private_data && private_data_len)
+ IBA_SET_MEM(CM_REJ_PRIVATE_DATA, rej_msg, private_data,
+ private_data_len);
+}
+
+static void cm_dup_req_handler(struct cm_work *work,
+ struct cm_id_private *cm_id_priv)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ atomic_long_inc(
+ &work->port->counters[CM_RECV_DUPLICATES][CM_REQ_COUNTER]);
+
+ /* Quick state check to discard duplicate REQs. */
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state == IB_CM_REQ_RCVD) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ return;
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ if (ret)
+ return;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_MRA_REQ_SENT:
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ break;
+ case IB_CM_TIMEWAIT:
+ cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv,
+ IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0,
+ IB_CM_TIMEWAIT);
+ break;
+ default:
+ goto unlock;
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ trace_icm_send_dup_req(&cm_id_priv->id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto free;
+ return;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+free: cm_free_response_msg(msg);
+}
+
+static struct cm_id_private *cm_match_req(struct cm_work *work,
+ struct cm_id_private *cm_id_priv)
+{
+ struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv;
+ struct cm_timewait_info *timewait_info;
+ struct cm_req_msg *req_msg;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+ /* Check for possible duplicate REQ. */
+ spin_lock_irq(&cm.lock);
+ timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+ spin_unlock_irq(&cm.lock);
+ if (cur_cm_id_priv) {
+ cm_dup_req_handler(work, cur_cm_id_priv);
+ cm_deref_id(cur_cm_id_priv);
+ }
+ return NULL;
+ }
+
+ /* Check for stale connections. */
+ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cm_remove_remote(cm_id_priv);
+ cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+
+ spin_unlock_irq(&cm.lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ if (cur_cm_id_priv) {
+ ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0);
+ cm_deref_id(cur_cm_id_priv);
+ }
+ return NULL;
+ }
+
+ /* Find matching listen request. */
+ listen_cm_id_priv = cm_find_listen(
+ cm_id_priv->id.device,
+ cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg)));
+ if (!listen_cm_id_priv) {
+ cm_remove_remote(cm_id_priv);
+ spin_unlock_irq(&cm.lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ return NULL;
+ }
+ spin_unlock_irq(&cm.lock);
+ return listen_cm_id_priv;
+}
+
+/*
+ * Work-around for inter-subnet connections. If the LIDs are permissive,
+ * we need to override the LID/SL data in the REQ with the LID information
+ * in the work completion.
+ */
+static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
+{
+ if (!IBA_GET(CM_REQ_PRIMARY_SUBNET_LOCAL, req_msg)) {
+ if (cpu_to_be16(IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID,
+ req_msg)) == IB_LID_PERMISSIVE) {
+ IBA_SET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg,
+ be16_to_cpu(ib_lid_be16(wc->slid)));
+ IBA_SET(CM_REQ_PRIMARY_SL, req_msg, wc->sl);
+ }
+
+ if (cpu_to_be16(IBA_GET(CM_REQ_PRIMARY_REMOTE_PORT_LID,
+ req_msg)) == IB_LID_PERMISSIVE)
+ IBA_SET(CM_REQ_PRIMARY_REMOTE_PORT_LID, req_msg,
+ wc->dlid_path_bits);
+ }
+
+ if (!IBA_GET(CM_REQ_ALTERNATE_SUBNET_LOCAL, req_msg)) {
+ if (cpu_to_be16(IBA_GET(CM_REQ_ALTERNATE_LOCAL_PORT_LID,
+ req_msg)) == IB_LID_PERMISSIVE) {
+ IBA_SET(CM_REQ_ALTERNATE_LOCAL_PORT_LID, req_msg,
+ be16_to_cpu(ib_lid_be16(wc->slid)));
+ IBA_SET(CM_REQ_ALTERNATE_SL, req_msg, wc->sl);
+ }
+
+ if (cpu_to_be16(IBA_GET(CM_REQ_ALTERNATE_REMOTE_PORT_LID,
+ req_msg)) == IB_LID_PERMISSIVE)
+ IBA_SET(CM_REQ_ALTERNATE_REMOTE_PORT_LID, req_msg,
+ wc->dlid_path_bits);
+ }
+}
+
+static int cm_req_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
+ struct cm_req_msg *req_msg;
+ const struct ib_global_route *grh;
+ const struct ib_gid_attr *gid_attr;
+ int ret;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+ cm_id_priv =
+ cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL);
+ if (IS_ERR(cm_id_priv))
+ return PTR_ERR(cm_id_priv);
+
+ cm_id_priv->id.remote_id =
+ cpu_to_be32(IBA_GET(CM_REQ_LOCAL_COMM_ID, req_msg));
+ cm_id_priv->id.service_id =
+ cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg));
+ cm_id_priv->tid = req_msg->hdr.tid;
+ cm_id_priv->timeout_ms = cm_convert_to_ms(
+ IBA_GET(CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT, req_msg));
+ cm_id_priv->max_cm_retries = IBA_GET(CM_REQ_MAX_CM_RETRIES, req_msg);
+ cm_id_priv->remote_qpn =
+ cpu_to_be32(IBA_GET(CM_REQ_LOCAL_QPN, req_msg));
+ cm_id_priv->initiator_depth =
+ IBA_GET(CM_REQ_RESPONDER_RESOURCES, req_msg);
+ cm_id_priv->responder_resources =
+ IBA_GET(CM_REQ_INITIATOR_DEPTH, req_msg);
+ cm_id_priv->path_mtu = IBA_GET(CM_REQ_PATH_PACKET_PAYLOAD_MTU, req_msg);
+ cm_id_priv->pkey = cpu_to_be16(IBA_GET(CM_REQ_PARTITION_KEY, req_msg));
+ cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REQ_STARTING_PSN, req_msg));
+ cm_id_priv->retry_count = IBA_GET(CM_REQ_RETRY_COUNT, req_msg);
+ cm_id_priv->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg);
+ cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
+
+ ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+ if (ret)
+ goto destroy;
+ cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+ id.local_id);
+ if (IS_ERR(cm_id_priv->timewait_info)) {
+ ret = PTR_ERR(cm_id_priv->timewait_info);
+ cm_id_priv->timewait_info = NULL;
+ goto destroy;
+ }
+ cm_id_priv->timewait_info->work.remote_id = cm_id_priv->id.remote_id;
+ cm_id_priv->timewait_info->remote_ca_guid =
+ cpu_to_be64(IBA_GET(CM_REQ_LOCAL_CA_GUID, req_msg));
+ cm_id_priv->timewait_info->remote_qpn = cm_id_priv->remote_qpn;
+
+ /*
+ * Note that the ID pointer is not in the xarray at this point,
+ * so this set is only visible to the local thread.
+ */
+ cm_id_priv->id.state = IB_CM_REQ_RCVD;
+
+ listen_cm_id_priv = cm_match_req(work, cm_id_priv);
+ if (!listen_cm_id_priv) {
+ trace_icm_no_listener_err(&cm_id_priv->id);
+ cm_id_priv->id.state = IB_CM_IDLE;
+ ret = -EINVAL;
+ goto destroy;
+ }
+
+ memset(&work->path[0], 0, sizeof(work->path[0]));
+ if (cm_req_has_alt_path(req_msg))
+ memset(&work->path[1], 0, sizeof(work->path[1]));
+ grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr);
+ gid_attr = grh->sgid_attr;
+
+ if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) {
+ work->path[0].rec_type =
+ sa_conv_gid_to_pathrec_type(gid_attr->gid_type);
+ } else {
+ cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
+ cm_path_set_rec_type(
+ work->port->cm_dev->ib_device, work->port->port_num,
+ &work->path[0],
+ IBA_GET_MEM_PTR(CM_REQ_PRIMARY_LOCAL_PORT_GID,
+ req_msg));
+ }
+ if (cm_req_has_alt_path(req_msg))
+ work->path[1].rec_type = work->path[0].rec_type;
+ cm_format_paths_from_req(req_msg, &work->path[0],
+ &work->path[1], work->mad_recv_wc->wc);
+ if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
+ sa_path_set_dmac(&work->path[0],
+ cm_id_priv->av.ah_attr.roce.dmac);
+ work->path[0].hop_limit = grh->hop_limit;
+
+ /* This destroy call is needed to pair with cm_init_av_for_response */
+ cm_destroy_av(&cm_id_priv->av);
+ ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av);
+ if (ret) {
+ int err;
+
+ err = rdma_query_gid(work->port->cm_dev->ib_device,
+ work->port->port_num, 0,
+ &work->path[0].sgid);
+ if (err)
+ ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID,
+ NULL, 0, NULL, 0);
+ else
+ ib_send_cm_rej(&cm_id_priv->id, IB_CM_REJ_INVALID_GID,
+ &work->path[0].sgid,
+ sizeof(work->path[0].sgid),
+ NULL, 0);
+ goto rejected;
+ }
+ if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_IB)
+ cm_id_priv->av.dlid_datapath =
+ IBA_GET(CM_REQ_PRIMARY_LOCAL_PORT_LID, req_msg);
+
+ if (cm_req_has_alt_path(req_msg)) {
+ ret = cm_init_av_by_path(&work->path[1], NULL,
+ &cm_id_priv->alt_av);
+ if (ret) {
+ ib_send_cm_rej(&cm_id_priv->id,
+ IB_CM_REJ_INVALID_ALT_GID,
+ &work->path[0].sgid,
+ sizeof(work->path[0].sgid), NULL, 0);
+ goto rejected;
+ }
+ }
+
+ cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+ cm_id_priv->id.context = listen_cm_id_priv->id.context;
+ cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+
+ /* Now MAD handlers can see the new ID */
+ spin_lock_irq(&cm_id_priv->lock);
+ cm_finalize_id(cm_id_priv);
+
+ /* Refcount belongs to the event, pairs with cm_process_work() */
+ refcount_inc(&cm_id_priv->refcount);
+ cm_queue_work_unlock(cm_id_priv, work);
+ /*
+ * Since this ID was just created and was not made visible to other MAD
+ * handlers until the cm_finalize_id() above we know that the
+ * cm_process_work() will deliver the event and the listen_cm_id
+ * embedded in the event can be derefed here.
+ */
+ cm_deref_id(listen_cm_id_priv);
+ return 0;
+
+rejected:
+ cm_deref_id(listen_cm_id_priv);
+destroy:
+ ib_destroy_cm_id(&cm_id_priv->id);
+ return ret;
+}
+
+static void cm_format_rep(struct cm_rep_msg *rep_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_rep_param *param)
+{
+ cm_format_mad_ece_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid,
+ param->ece.attr_mod);
+ IBA_SET(CM_REP_LOCAL_COMM_ID, rep_msg,
+ be32_to_cpu(cm_id_priv->id.local_id));
+ IBA_SET(CM_REP_REMOTE_COMM_ID, rep_msg,
+ be32_to_cpu(cm_id_priv->id.remote_id));
+ IBA_SET(CM_REP_STARTING_PSN, rep_msg, param->starting_psn);
+ IBA_SET(CM_REP_RESPONDER_RESOURCES, rep_msg,
+ param->responder_resources);
+ IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg,
+ cm_id_priv->av.port->cm_dev->ack_delay);
+ IBA_SET(CM_REP_FAILOVER_ACCEPTED, rep_msg, param->failover_accepted);
+ IBA_SET(CM_REP_RNR_RETRY_COUNT, rep_msg, param->rnr_retry_count);
+ IBA_SET(CM_REP_LOCAL_CA_GUID, rep_msg,
+ be64_to_cpu(cm_id_priv->id.device->node_guid));
+
+ if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) {
+ IBA_SET(CM_REP_INITIATOR_DEPTH, rep_msg,
+ param->initiator_depth);
+ IBA_SET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg,
+ param->flow_control);
+ IBA_SET(CM_REP_SRQ, rep_msg, param->srq);
+ IBA_SET(CM_REP_LOCAL_QPN, rep_msg, param->qp_num);
+ } else {
+ IBA_SET(CM_REP_SRQ, rep_msg, 1);
+ IBA_SET(CM_REP_LOCAL_EE_CONTEXT_NUMBER, rep_msg, param->qp_num);
+ }
+
+ IBA_SET(CM_REP_VENDOR_ID_L, rep_msg, param->ece.vendor_id);
+ IBA_SET(CM_REP_VENDOR_ID_M, rep_msg, param->ece.vendor_id >> 8);
+ IBA_SET(CM_REP_VENDOR_ID_H, rep_msg, param->ece.vendor_id >> 16);
+
+ if (param->private_data && param->private_data_len)
+ IBA_SET_MEM(CM_REP_PRIVATE_DATA, rep_msg, param->private_data,
+ param->private_data_len);
+}
+
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_rep_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ struct cm_rep_msg *rep_msg;
+ unsigned long flags;
+ int ret;
+
+ if (param->private_data &&
+ param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_REQ_RCVD &&
+ cm_id->state != IB_CM_MRA_REQ_SENT) {
+ trace_icm_send_rep_err(cm_id_priv->id.local_id, cm_id->state);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ msg = cm_alloc_priv_msg(cm_id_priv);
+ if (IS_ERR(msg)) {
+ ret = PTR_ERR(msg);
+ goto out;
+ }
+
+ rep_msg = (struct cm_rep_msg *) msg->mad;
+ cm_format_rep(rep_msg, cm_id_priv, param);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
+
+ trace_icm_send_rep(cm_id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto out_free;
+
+ cm_id->state = IB_CM_REP_SENT;
+ cm_id_priv->initiator_depth = param->initiator_depth;
+ cm_id_priv->responder_resources = param->responder_resources;
+ cm_id_priv->rq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg));
+ WARN_ONCE(param->qp_num & 0xFF000000,
+ "IBTA declares QPN to be 24 bits, but it is 0x%X\n",
+ param->qp_num);
+ cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+out_free:
+ cm_free_priv_msg(msg);
+out:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rep);
+
+static void cm_format_rtu(struct cm_rtu_msg *rtu_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid);
+ IBA_SET(CM_RTU_LOCAL_COMM_ID, rtu_msg,
+ be32_to_cpu(cm_id_priv->id.local_id));
+ IBA_SET(CM_RTU_REMOTE_COMM_ID, rtu_msg,
+ be32_to_cpu(cm_id_priv->id.remote_id));
+
+ if (private_data && private_data_len)
+ IBA_SET_MEM(CM_RTU_PRIVATE_DATA, rtu_msg, private_data,
+ private_data_len);
+}
+
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ void *data;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_REP_RCVD &&
+ cm_id->state != IB_CM_MRA_REP_SENT) {
+ trace_icm_send_cm_rtu_err(cm_id);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ msg = cm_alloc_msg(cm_id_priv);
+ if (IS_ERR(msg)) {
+ ret = PTR_ERR(msg);
+ goto error;
+ }
+
+ cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+
+ trace_icm_send_rtu(cm_id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ kfree(data);
+ return ret;
+ }
+
+ cm_id->state = IB_CM_ESTABLISHED;
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rtu);
+
+static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type)
+{
+ struct cm_rep_msg *rep_msg;
+ struct ib_cm_rep_event_param *param;
+
+ rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.rep_rcvd;
+ param->remote_ca_guid =
+ cpu_to_be64(IBA_GET(CM_REP_LOCAL_CA_GUID, rep_msg));
+ param->remote_qkey = IBA_GET(CM_REP_LOCAL_Q_KEY, rep_msg);
+ param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type));
+ param->starting_psn = IBA_GET(CM_REP_STARTING_PSN, rep_msg);
+ param->responder_resources = IBA_GET(CM_REP_INITIATOR_DEPTH, rep_msg);
+ param->initiator_depth = IBA_GET(CM_REP_RESPONDER_RESOURCES, rep_msg);
+ param->target_ack_delay = IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg);
+ param->failover_accepted = IBA_GET(CM_REP_FAILOVER_ACCEPTED, rep_msg);
+ param->flow_control = IBA_GET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg);
+ param->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg);
+ param->srq = IBA_GET(CM_REP_SRQ, rep_msg);
+ param->ece.vendor_id = IBA_GET(CM_REP_VENDOR_ID_H, rep_msg) << 16;
+ param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_M, rep_msg) << 8;
+ param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_L, rep_msg);
+ param->ece.attr_mod = be32_to_cpu(rep_msg->hdr.attr_mod);
+
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_REP_PRIVATE_DATA, rep_msg);
+}
+
+static void cm_dup_rep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rep_msg *rep_msg;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)),
+ cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg)));
+ if (!cm_id_priv)
+ return;
+
+ atomic_long_inc(
+ &work->port->counters[CM_RECV_DUPLICATES][CM_REP_COUNTER]);
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ if (ret)
+ goto deref;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state == IB_CM_ESTABLISHED)
+ cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ else
+ goto unlock;
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ trace_icm_send_dup_rep(&cm_id_priv->id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto free;
+ goto deref;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+free: cm_free_response_msg(msg);
+deref: cm_deref_id(cm_id_priv);
+}
+
+static int cm_rep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rep_msg *rep_msg;
+ int ret;
+ struct cm_id_private *cur_cm_id_priv;
+ struct cm_timewait_info *timewait_info;
+
+ rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)), 0);
+ if (!cm_id_priv) {
+ cm_dup_rep_handler(work);
+ trace_icm_remote_no_priv_err(
+ IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg));
+ return -EINVAL;
+ }
+
+ cm_format_rep_event(work, cm_id_priv->qp_type);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ break;
+ default:
+ ret = -EINVAL;
+ trace_icm_rep_unknown_err(
+ IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg),
+ IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg),
+ cm_id_priv->id.state);
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto error;
+ }
+
+ cm_id_priv->timewait_info->work.remote_id =
+ cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg));
+ cm_id_priv->timewait_info->remote_ca_guid =
+ cpu_to_be64(IBA_GET(CM_REP_LOCAL_CA_GUID, rep_msg));
+ cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
+
+ spin_lock(&cm.lock);
+ /* Check for duplicate REP. */
+ if (cm_insert_remote_id(cm_id_priv->timewait_info)) {
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ trace_icm_insert_failed_err(
+ IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg));
+ goto error;
+ }
+ /* Check for a stale connection. */
+ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cm_remove_remote(cm_id_priv);
+ cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
+ NULL, 0);
+ ret = -EINVAL;
+ trace_icm_staleconn_err(
+ IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg),
+ IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg));
+
+ if (cur_cm_id_priv) {
+ ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0);
+ cm_deref_id(cur_cm_id_priv);
+ }
+
+ goto error;
+ }
+ spin_unlock(&cm.lock);
+
+ cm_id_priv->id.state = IB_CM_REP_RCVD;
+ cm_id_priv->id.remote_id =
+ cpu_to_be32(IBA_GET(CM_REP_LOCAL_COMM_ID, rep_msg));
+ cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
+ cm_id_priv->initiator_depth =
+ IBA_GET(CM_REP_RESPONDER_RESOURCES, rep_msg);
+ cm_id_priv->responder_resources =
+ IBA_GET(CM_REP_INITIATOR_DEPTH, rep_msg);
+ cm_id_priv->sq_psn = cpu_to_be32(IBA_GET(CM_REP_STARTING_PSN, rep_msg));
+ cm_id_priv->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg);
+ cm_id_priv->target_ack_delay =
+ IBA_GET(CM_REP_TARGET_ACK_DELAY, rep_msg);
+ cm_id_priv->av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->av.timeout - 1);
+ cm_id_priv->alt_av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->alt_av.timeout - 1);
+
+ ib_cancel_mad(cm_id_priv->msg);
+ cm_queue_work_unlock(cm_id_priv, work);
+ return 0;
+
+error:
+ cm_deref_id(cm_id_priv);
+ return ret;
+}
+
+static int cm_establish_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+
+ /* See comment in cm_establish about lookup. */
+ cm_id_priv = cm_acquire_id(work->local_id, work->remote_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+
+ ib_cancel_mad(cm_id_priv->msg);
+ cm_queue_work_unlock(cm_id_priv, work);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_rtu_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rtu_msg *rtu_msg;
+
+ rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_RTU_REMOTE_COMM_ID, rtu_msg)),
+ cpu_to_be32(IBA_GET(CM_RTU_LOCAL_COMM_ID, rtu_msg)));
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_RTU_PRIVATE_DATA, rtu_msg);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_REP_SENT &&
+ cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
+ [CM_RTU_COUNTER]);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_ESTABLISHED;
+
+ ib_cancel_mad(cm_id_priv->msg);
+ cm_queue_work_unlock(cm_id_priv, work);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID,
+ cm_form_tid(cm_id_priv));
+ IBA_SET(CM_DREQ_LOCAL_COMM_ID, dreq_msg,
+ be32_to_cpu(cm_id_priv->id.local_id));
+ IBA_SET(CM_DREQ_REMOTE_COMM_ID, dreq_msg,
+ be32_to_cpu(cm_id_priv->id.remote_id));
+ IBA_SET(CM_DREQ_REMOTE_QPN_EECN, dreq_msg,
+ be32_to_cpu(cm_id_priv->remote_qpn));
+
+ if (private_data && private_data_len)
+ IBA_SET_MEM(CM_DREQ_PRIVATE_DATA, dreq_msg, private_data,
+ private_data_len);
+}
+
+static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv,
+ const void *private_data, u8 private_data_len)
+{
+ struct ib_mad_send_buf *msg;
+ int ret;
+
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
+ trace_icm_dreq_skipped(&cm_id_priv->id);
+ return -EINVAL;
+ }
+
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
+ cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ ib_cancel_mad(cm_id_priv->msg);
+
+ msg = cm_alloc_priv_msg(cm_id_priv);
+ if (IS_ERR(msg)) {
+ cm_enter_timewait(cm_id_priv);
+ return PTR_ERR(msg);
+ }
+
+ cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
+
+ trace_icm_send_dreq(&cm_id_priv->id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ cm_enter_timewait(cm_id_priv);
+ cm_free_priv_msg(msg);
+ return ret;
+ }
+
+ cm_id_priv->id.state = IB_CM_DREQ_SENT;
+ return 0;
+}
+
+int ib_send_cm_dreq(struct ib_cm_id *cm_id, const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv =
+ container_of(cm_id, struct cm_id_private, id);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ ret = cm_send_dreq_locked(cm_id_priv, private_data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_dreq);
+
+static void cm_format_drep(struct cm_drep_msg *drep_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid);
+ IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg,
+ be32_to_cpu(cm_id_priv->id.local_id));
+ IBA_SET(CM_DREP_REMOTE_COMM_ID, drep_msg,
+ be32_to_cpu(cm_id_priv->id.remote_id));
+
+ if (private_data && private_data_len)
+ IBA_SET_MEM(CM_DREP_PRIVATE_DATA, drep_msg, private_data,
+ private_data_len);
+}
+
+static int cm_send_drep_locked(struct cm_id_private *cm_id_priv,
+ void *private_data, u8 private_data_len)
+{
+ struct ib_mad_send_buf *msg;
+ int ret;
+
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ if (cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
+ trace_icm_send_drep_err(&cm_id_priv->id);
+ kfree(private_data);
+ return -EINVAL;
+ }
+
+ cm_set_private_data(cm_id_priv, private_data, private_data_len);
+ cm_enter_timewait(cm_id_priv);
+
+ msg = cm_alloc_msg(cm_id_priv);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+
+ trace_icm_send_drep(&cm_id_priv->id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ cm_free_msg(msg);
+ return ret;
+ }
+ return 0;
+}
+
+int ib_send_cm_drep(struct ib_cm_id *cm_id, const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv =
+ container_of(cm_id, struct cm_id_private, id);
+ unsigned long flags;
+ void *data;
+ int ret;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ ret = cm_send_drep_locked(cm_id_priv, data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_drep);
+
+static int cm_issue_drep(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ struct cm_dreq_msg *dreq_msg;
+ struct cm_drep_msg *drep_msg;
+ int ret;
+
+ ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ if (ret)
+ return ret;
+
+ dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad;
+ drep_msg = (struct cm_drep_msg *) msg->mad;
+
+ cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid);
+ IBA_SET(CM_DREP_REMOTE_COMM_ID, drep_msg,
+ IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg));
+ IBA_SET(CM_DREP_LOCAL_COMM_ID, drep_msg,
+ IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg));
+
+ trace_icm_issue_drep(
+ IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg),
+ IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg));
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_response_msg(msg);
+
+ return ret;
+}
+
+static int cm_dreq_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_dreq_msg *dreq_msg;
+ struct ib_mad_send_buf *msg = NULL;
+
+ dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg)),
+ cpu_to_be32(IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg)));
+ if (!cm_id_priv) {
+ atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
+ [CM_DREQ_COUNTER]);
+ cm_issue_drep(work->port, work->mad_recv_wc);
+ trace_icm_no_priv_err(
+ IBA_GET(CM_DREQ_LOCAL_COMM_ID, dreq_msg),
+ IBA_GET(CM_DREQ_REMOTE_COMM_ID, dreq_msg));
+ return -EINVAL;
+ }
+
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_DREQ_PRIVATE_DATA, dreq_msg);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->local_qpn !=
+ cpu_to_be32(IBA_GET(CM_DREQ_REMOTE_QPN_EECN, dreq_msg)))
+ goto unlock;
+
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REP_SENT:
+ case IB_CM_DREQ_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ib_cancel_mad(cm_id_priv->msg);
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
+ cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ ib_cancel_mad(cm_id_priv->msg);
+ break;
+ case IB_CM_TIMEWAIT:
+ atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
+ [CM_DREQ_COUNTER]);
+ msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
+ if (IS_ERR(msg))
+ goto unlock;
+
+ cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) ||
+ ib_post_send_mad(msg, NULL))
+ cm_free_response_msg(msg);
+ goto deref;
+ case IB_CM_DREQ_RCVD:
+ atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
+ [CM_DREQ_COUNTER]);
+ goto unlock;
+ default:
+ trace_icm_dreq_unknown_err(&cm_id_priv->id);
+ goto unlock;
+ }
+ cm_id_priv->id.state = IB_CM_DREQ_RCVD;
+ cm_id_priv->tid = dreq_msg->hdr.tid;
+ cm_queue_work_unlock(cm_id_priv, work);
+ return 0;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+deref: cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_drep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_drep_msg *drep_msg;
+
+ drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_DREP_REMOTE_COMM_ID, drep_msg)),
+ cpu_to_be32(IBA_GET(CM_DREP_LOCAL_COMM_ID, drep_msg)));
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_DREP_PRIVATE_DATA, drep_msg);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_DREQ_SENT &&
+ cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_enter_timewait(cm_id_priv);
+
+ ib_cancel_mad(cm_id_priv->msg);
+ cm_queue_work_unlock(cm_id_priv, work);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_send_rej_locked(struct cm_id_private *cm_id_priv,
+ enum ib_cm_rej_reason reason, void *ari,
+ u8 ari_length, const void *private_data,
+ u8 private_data_len)
+{
+ enum ib_cm_state state = cm_id_priv->id.state;
+ struct ib_mad_send_buf *msg;
+ int ret;
+
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) ||
+ (ari && ari_length > IB_CM_REJ_ARI_LENGTH))
+ return -EINVAL;
+
+ trace_icm_send_rej(&cm_id_priv->id, reason);
+
+ switch (state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ cm_reset_to_idle(cm_id_priv);
+ msg = cm_alloc_msg(cm_id_priv);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason,
+ ari, ari_length, private_data, private_data_len,
+ state);
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ cm_enter_timewait(cm_id_priv);
+ msg = cm_alloc_msg(cm_id_priv);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+ cm_format_rej((struct cm_rej_msg *)msg->mad, cm_id_priv, reason,
+ ari, ari_length, private_data, private_data_len,
+ state);
+ break;
+ default:
+ trace_icm_send_unknown_rej_err(&cm_id_priv->id);
+ return -EINVAL;
+ }
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ return 0;
+}
+
+int ib_send_cm_rej(struct ib_cm_id *cm_id, enum ib_cm_rej_reason reason,
+ void *ari, u8 ari_length, const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv =
+ container_of(cm_id, struct cm_id_private, id);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ ret = cm_send_rej_locked(cm_id_priv, reason, ari, ari_length,
+ private_data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rej);
+
+static void cm_format_rej_event(struct cm_work *work)
+{
+ struct cm_rej_msg *rej_msg;
+ struct ib_cm_rej_event_param *param;
+
+ rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.rej_rcvd;
+ param->ari = IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg);
+ param->ari_length = IBA_GET(CM_REJ_REJECTED_INFO_LENGTH, rej_msg);
+ param->reason = IBA_GET(CM_REJ_REASON, rej_msg);
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_REJ_PRIVATE_DATA, rej_msg);
+}
+
+static struct cm_id_private *cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
+{
+ struct cm_id_private *cm_id_priv;
+ __be32 remote_id;
+
+ remote_id = cpu_to_be32(IBA_GET(CM_REJ_LOCAL_COMM_ID, rej_msg));
+
+ if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT) {
+ cm_id_priv = cm_find_remote_id(
+ *((__be64 *)IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg)),
+ remote_id);
+ } else if (IBA_GET(CM_REJ_MESSAGE_REJECTED, rej_msg) ==
+ CM_MSG_RESPONSE_REQ)
+ cm_id_priv = cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)),
+ 0);
+ else
+ cm_id_priv = cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_REJ_REMOTE_COMM_ID, rej_msg)),
+ remote_id);
+
+ return cm_id_priv;
+}
+
+static int cm_rej_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rej_msg *rej_msg;
+
+ rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_rejected_id(rej_msg);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ cm_format_rej_event(work);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ib_cancel_mad(cm_id_priv->msg);
+ fallthrough;
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_STALE_CONN)
+ cm_enter_timewait(cm_id_priv);
+ else
+ cm_reset_to_idle(cm_id_priv);
+ break;
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->msg);
+ fallthrough;
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ cm_enter_timewait(cm_id_priv);
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT ||
+ cm_id_priv->id.lap_state == IB_CM_LAP_SENT) {
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT)
+ ib_cancel_mad(cm_id_priv->msg);
+ cm_enter_timewait(cm_id_priv);
+ break;
+ }
+ fallthrough;
+ default:
+ trace_icm_rej_unknown_err(&cm_id_priv->id);
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+
+ cm_queue_work_unlock(cm_id_priv, work);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+ u8 service_timeout,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ enum ib_cm_state cm_state;
+ enum ib_cm_lap_state lap_state;
+ enum cm_msg_response msg_response;
+ void *data;
+ unsigned long flags;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ cm_state = IB_CM_MRA_REQ_SENT;
+ lap_state = cm_id->lap_state;
+ msg_response = CM_MSG_RESPONSE_REQ;
+ break;
+ case IB_CM_REP_RCVD:
+ cm_state = IB_CM_MRA_REP_SENT;
+ lap_state = cm_id->lap_state;
+ msg_response = CM_MSG_RESPONSE_REP;
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id->lap_state == IB_CM_LAP_RCVD) {
+ cm_state = cm_id->state;
+ lap_state = IB_CM_MRA_LAP_SENT;
+ msg_response = CM_MSG_RESPONSE_OTHER;
+ break;
+ }
+ fallthrough;
+ default:
+ trace_icm_send_mra_unknown_err(&cm_id_priv->id);
+ ret = -EINVAL;
+ goto error_unlock;
+ }
+
+ if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
+ msg = cm_alloc_msg(cm_id_priv);
+ if (IS_ERR(msg)) {
+ ret = PTR_ERR(msg);
+ goto error_unlock;
+ }
+
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ msg_response, service_timeout,
+ private_data, private_data_len);
+ trace_icm_send_mra(cm_id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto error_free_msg;
+ }
+
+ cm_id->state = cm_state;
+ cm_id->lap_state = lap_state;
+ cm_id_priv->service_timeout = service_timeout;
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error_free_msg:
+ cm_free_msg(msg);
+error_unlock:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_mra);
+
+static struct cm_id_private *cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
+{
+ switch (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg)) {
+ case CM_MSG_RESPONSE_REQ:
+ return cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)),
+ 0);
+ case CM_MSG_RESPONSE_REP:
+ case CM_MSG_RESPONSE_OTHER:
+ return cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_MRA_REMOTE_COMM_ID, mra_msg)),
+ cpu_to_be32(IBA_GET(CM_MRA_LOCAL_COMM_ID, mra_msg)));
+ default:
+ return NULL;
+ }
+}
+
+static int cm_mra_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_mra_msg *mra_msg;
+ int timeout;
+
+ mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_mraed_id(mra_msg);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_MRA_PRIVATE_DATA, mra_msg);
+ work->cm_event.param.mra_rcvd.service_timeout =
+ IBA_GET(CM_MRA_SERVICE_TIMEOUT, mra_msg);
+ timeout = cm_convert_to_ms(IBA_GET(CM_MRA_SERVICE_TIMEOUT, mra_msg)) +
+ cm_convert_to_ms(cm_id_priv->av.timeout);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) !=
+ CM_MSG_RESPONSE_REQ ||
+ ib_modify_mad(cm_id_priv->msg, timeout))
+ goto out;
+ cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD;
+ break;
+ case IB_CM_REP_SENT:
+ if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) !=
+ CM_MSG_RESPONSE_REP ||
+ ib_modify_mad(cm_id_priv->msg, timeout))
+ goto out;
+ cm_id_priv->id.state = IB_CM_MRA_REP_RCVD;
+ break;
+ case IB_CM_ESTABLISHED:
+ if (IBA_GET(CM_MRA_MESSAGE_MRAED, mra_msg) !=
+ CM_MSG_RESPONSE_OTHER ||
+ cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
+ ib_modify_mad(cm_id_priv->msg, timeout)) {
+ if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ atomic_long_inc(
+ &work->port->counters[CM_RECV_DUPLICATES]
+ [CM_MRA_COUNTER]);
+ goto out;
+ }
+ cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
+ break;
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_MRA_REP_RCVD:
+ atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
+ [CM_MRA_COUNTER]);
+ fallthrough;
+ default:
+ trace_icm_mra_unknown_err(&cm_id_priv->id);
+ goto out;
+ }
+
+ cm_id_priv->msg->context[1] = (void *) (unsigned long)
+ cm_id_priv->id.state;
+ cm_queue_work_unlock(cm_id_priv, work);
+ return 0;
+out:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_path_lid_from_lap(struct cm_lap_msg *lap_msg,
+ struct sa_path_rec *path)
+{
+ u32 lid;
+
+ if (path->rec_type != SA_PATH_REC_TYPE_OPA) {
+ sa_path_set_dlid(path, IBA_GET(CM_LAP_ALTERNATE_LOCAL_PORT_LID,
+ lap_msg));
+ sa_path_set_slid(path, IBA_GET(CM_LAP_ALTERNATE_REMOTE_PORT_LID,
+ lap_msg));
+ } else {
+ lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
+ CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg));
+ sa_path_set_dlid(path, lid);
+
+ lid = opa_get_lid_from_gid(IBA_GET_MEM_PTR(
+ CM_LAP_ALTERNATE_REMOTE_PORT_GID, lap_msg));
+ sa_path_set_slid(path, lid);
+ }
+}
+
+static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv,
+ struct sa_path_rec *path,
+ struct cm_lap_msg *lap_msg)
+{
+ path->dgid = *IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_LOCAL_PORT_GID, lap_msg);
+ path->sgid =
+ *IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_REMOTE_PORT_GID, lap_msg);
+ path->flow_label =
+ cpu_to_be32(IBA_GET(CM_LAP_ALTERNATE_FLOW_LABEL, lap_msg));
+ path->hop_limit = IBA_GET(CM_LAP_ALTERNATE_HOP_LIMIT, lap_msg);
+ path->traffic_class = IBA_GET(CM_LAP_ALTERNATE_TRAFFIC_CLASS, lap_msg);
+ path->reversible = 1;
+ path->pkey = cm_id_priv->pkey;
+ path->sl = IBA_GET(CM_LAP_ALTERNATE_SL, lap_msg);
+ path->mtu_selector = IB_SA_EQ;
+ path->mtu = cm_id_priv->path_mtu;
+ path->rate_selector = IB_SA_EQ;
+ path->rate = IBA_GET(CM_LAP_ALTERNATE_PACKET_RATE, lap_msg);
+ path->packet_life_time_selector = IB_SA_EQ;
+ path->packet_life_time =
+ IBA_GET(CM_LAP_ALTERNATE_LOCAL_ACK_TIMEOUT, lap_msg);
+ path->packet_life_time -= (path->packet_life_time > 0);
+ cm_format_path_lid_from_lap(lap_msg, path);
+}
+
+static int cm_lap_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_lap_msg *lap_msg;
+ struct ib_cm_lap_event_param *param;
+ struct ib_mad_send_buf *msg = NULL;
+ struct rdma_ah_attr ah_attr;
+ struct cm_av alt_av = {};
+ int ret;
+
+ /* Currently Alternate path messages are not supported for
+ * RoCE link layer.
+ */
+ if (rdma_protocol_roce(work->port->cm_dev->ib_device,
+ work->port->port_num))
+ return -EINVAL;
+
+ /* todo: verify LAP request and send reject APR if invalid. */
+ lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_LAP_REMOTE_COMM_ID, lap_msg)),
+ cpu_to_be32(IBA_GET(CM_LAP_LOCAL_COMM_ID, lap_msg)));
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ param = &work->cm_event.param.lap_rcvd;
+ memset(&work->path[0], 0, sizeof(work->path[1]));
+ cm_path_set_rec_type(work->port->cm_dev->ib_device,
+ work->port->port_num, &work->path[0],
+ IBA_GET_MEM_PTR(CM_LAP_ALTERNATE_LOCAL_PORT_GID,
+ lap_msg));
+ param->alternate_path = &work->path[0];
+ cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg);
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_LAP_PRIVATE_DATA, lap_msg);
+
+ ret = ib_init_ah_attr_from_wc(work->port->cm_dev->ib_device,
+ work->port->port_num,
+ work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &ah_attr);
+ if (ret)
+ goto deref;
+
+ ret = cm_init_av_by_path(param->alternate_path, NULL, &alt_av);
+ if (ret) {
+ rdma_destroy_ah_attr(&ah_attr);
+ goto deref;
+ }
+
+ spin_lock_irq(&cm_id_priv->lock);
+ cm_init_av_for_lap(work->port, work->mad_recv_wc->wc,
+ &ah_attr, &cm_id_priv->av);
+ cm_move_av_from_path(&cm_id_priv->alt_av, &alt_av);
+
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED)
+ goto unlock;
+
+ switch (cm_id_priv->id.lap_state) {
+ case IB_CM_LAP_UNINIT:
+ case IB_CM_LAP_IDLE:
+ break;
+ case IB_CM_MRA_LAP_SENT:
+ atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
+ [CM_LAP_COUNTER]);
+ msg = cm_alloc_response_msg_no_ah(work->port, work->mad_recv_wc);
+ if (IS_ERR(msg))
+ goto unlock;
+
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_OTHER,
+ cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (cm_create_response_msg_ah(work->port, work->mad_recv_wc, msg) ||
+ ib_post_send_mad(msg, NULL))
+ cm_free_response_msg(msg);
+ goto deref;
+ case IB_CM_LAP_RCVD:
+ atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
+ [CM_LAP_COUNTER]);
+ goto unlock;
+ default:
+ goto unlock;
+ }
+
+ cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
+ cm_id_priv->tid = lap_msg->hdr.tid;
+ cm_queue_work_unlock(cm_id_priv, work);
+ return 0;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+deref: cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_apr_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_apr_msg *apr_msg;
+
+ /* Currently Alternate path messages are not supported for
+ * RoCE link layer.
+ */
+ if (rdma_protocol_roce(work->port->cm_dev->ib_device,
+ work->port->port_num))
+ return -EINVAL;
+
+ apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_APR_REMOTE_COMM_ID, apr_msg)),
+ cpu_to_be32(IBA_GET(CM_APR_LOCAL_COMM_ID, apr_msg)));
+ if (!cm_id_priv)
+ return -EINVAL; /* Unmatched reply. */
+
+ work->cm_event.param.apr_rcvd.ap_status =
+ IBA_GET(CM_APR_AR_STATUS, apr_msg);
+ work->cm_event.param.apr_rcvd.apr_info =
+ IBA_GET_MEM_PTR(CM_APR_ADDITIONAL_INFORMATION, apr_msg);
+ work->cm_event.param.apr_rcvd.info_len =
+ IBA_GET(CM_APR_ADDITIONAL_INFORMATION_LENGTH, apr_msg);
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_APR_PRIVATE_DATA, apr_msg);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED ||
+ (cm_id_priv->id.lap_state != IB_CM_LAP_SENT &&
+ cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.lap_state = IB_CM_LAP_IDLE;
+ ib_cancel_mad(cm_id_priv->msg);
+ cm_queue_work_unlock(cm_id_priv, work);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_timewait_handler(struct cm_work *work)
+{
+ struct cm_timewait_info *timewait_info;
+ struct cm_id_private *cm_id_priv;
+
+ timewait_info = container_of(work, struct cm_timewait_info, work);
+ spin_lock_irq(&cm.lock);
+ list_del(&timewait_info->list);
+ spin_unlock_irq(&cm.lock);
+
+ cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_TIMEWAIT ||
+ cm_id_priv->remote_qpn != timewait_info->remote_qpn) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_IDLE;
+ cm_queue_work_unlock(cm_id_priv, work);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_sidr_req_param *param)
+{
+ cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
+ cm_form_tid(cm_id_priv));
+ IBA_SET(CM_SIDR_REQ_REQUESTID, sidr_req_msg,
+ be32_to_cpu(cm_id_priv->id.local_id));
+ IBA_SET(CM_SIDR_REQ_PARTITION_KEY, sidr_req_msg,
+ be16_to_cpu(param->path->pkey));
+ IBA_SET(CM_SIDR_REQ_SERVICEID, sidr_req_msg,
+ be64_to_cpu(param->service_id));
+
+ if (param->private_data && param->private_data_len)
+ IBA_SET_MEM(CM_SIDR_REQ_PRIVATE_DATA, sidr_req_msg,
+ param->private_data, param->private_data_len);
+}
+
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_req_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ struct cm_av av = {};
+ unsigned long flags;
+ int ret;
+
+ if (!param->path || (param->private_data &&
+ param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ ret = cm_init_av_by_path(param->path, param->sgid_attr, &av);
+ if (ret)
+ return ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ cm_move_av_from_path(&cm_id_priv->av, &av);
+ cm_id->service_id = param->service_id;
+ cm_id_priv->timeout_ms = param->timeout_ms;
+ cm_id_priv->max_cm_retries = param->max_cm_retries;
+ if (cm_id->state != IB_CM_IDLE) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ msg = cm_alloc_priv_msg(cm_id_priv);
+ if (IS_ERR(msg)) {
+ ret = PTR_ERR(msg);
+ goto out_unlock;
+ }
+
+ cm_format_sidr_req((struct cm_sidr_req_msg *)msg->mad, cm_id_priv,
+ param);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *)(unsigned long)IB_CM_SIDR_REQ_SENT;
+
+ trace_icm_send_sidr_req(&cm_id_priv->id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto out_free;
+ cm_id->state = IB_CM_SIDR_REQ_SENT;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+out_free:
+ cm_free_priv_msg(msg);
+out_unlock:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_req);
+
+static void cm_format_sidr_req_event(struct cm_work *work,
+ const struct cm_id_private *rx_cm_id,
+ struct ib_cm_id *listen_id)
+{
+ struct cm_sidr_req_msg *sidr_req_msg;
+ struct ib_cm_sidr_req_event_param *param;
+
+ sidr_req_msg = (struct cm_sidr_req_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.sidr_req_rcvd;
+ param->pkey = IBA_GET(CM_SIDR_REQ_PARTITION_KEY, sidr_req_msg);
+ param->listen_id = listen_id;
+ param->service_id =
+ cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg));
+ param->bth_pkey = cm_get_bth_pkey(work);
+ param->port = work->port->port_num;
+ param->sgid_attr = rx_cm_id->av.ah_attr.grh.sgid_attr;
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_SIDR_REQ_PRIVATE_DATA, sidr_req_msg);
+}
+
+static int cm_sidr_req_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
+ struct cm_sidr_req_msg *sidr_req_msg;
+ struct ib_wc *wc;
+ int ret;
+
+ cm_id_priv =
+ cm_alloc_id_priv(work->port->cm_dev->ib_device, NULL, NULL);
+ if (IS_ERR(cm_id_priv))
+ return PTR_ERR(cm_id_priv);
+
+ /* Record SGID/SLID and request ID for lookup. */
+ sidr_req_msg = (struct cm_sidr_req_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+
+ cm_id_priv->id.remote_id =
+ cpu_to_be32(IBA_GET(CM_SIDR_REQ_REQUESTID, sidr_req_msg));
+ cm_id_priv->id.service_id =
+ cpu_to_be64(IBA_GET(CM_SIDR_REQ_SERVICEID, sidr_req_msg));
+ cm_id_priv->tid = sidr_req_msg->hdr.tid;
+
+ wc = work->mad_recv_wc->wc;
+ cm_id_priv->sidr_slid = wc->slid;
+ ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+ if (ret)
+ goto out;
+
+ spin_lock_irq(&cm.lock);
+ listen_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
+ if (listen_cm_id_priv) {
+ spin_unlock_irq(&cm.lock);
+ atomic_long_inc(&work->port->counters[CM_RECV_DUPLICATES]
+ [CM_SIDR_REQ_COUNTER]);
+ goto out; /* Duplicate message. */
+ }
+ cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
+ listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
+ cm_id_priv->id.service_id);
+ if (!listen_cm_id_priv) {
+ spin_unlock_irq(&cm.lock);
+ ib_send_cm_sidr_rep(&cm_id_priv->id,
+ &(struct ib_cm_sidr_rep_param){
+ .status = IB_SIDR_UNSUPPORTED });
+ goto out; /* No match. */
+ }
+ spin_unlock_irq(&cm.lock);
+
+ cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+ cm_id_priv->id.context = listen_cm_id_priv->id.context;
+
+ /*
+ * A SIDR ID does not need to be in the xarray since it does not receive
+ * mads, is not placed in the remote_id or remote_qpn rbtree, and does
+ * not enter timewait.
+ */
+
+ cm_format_sidr_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
+ cm_free_work(work);
+ /*
+ * A pointer to the listen_cm_id is held in the event, so this deref
+ * must be after the event is delivered above.
+ */
+ cm_deref_id(listen_cm_id_priv);
+ if (ret)
+ cm_destroy_id(&cm_id_priv->id, ret);
+ return 0;
+out:
+ ib_destroy_cm_id(&cm_id_priv->id);
+ return -EINVAL;
+}
+
+static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_sidr_rep_param *param)
+{
+ cm_format_mad_ece_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID,
+ cm_id_priv->tid, param->ece.attr_mod);
+ IBA_SET(CM_SIDR_REP_REQUESTID, sidr_rep_msg,
+ be32_to_cpu(cm_id_priv->id.remote_id));
+ IBA_SET(CM_SIDR_REP_STATUS, sidr_rep_msg, param->status);
+ IBA_SET(CM_SIDR_REP_QPN, sidr_rep_msg, param->qp_num);
+ IBA_SET(CM_SIDR_REP_SERVICEID, sidr_rep_msg,
+ be64_to_cpu(cm_id_priv->id.service_id));
+ IBA_SET(CM_SIDR_REP_Q_KEY, sidr_rep_msg, param->qkey);
+ IBA_SET(CM_SIDR_REP_VENDOR_ID_L, sidr_rep_msg,
+ param->ece.vendor_id & 0xFF);
+ IBA_SET(CM_SIDR_REP_VENDOR_ID_H, sidr_rep_msg,
+ (param->ece.vendor_id >> 8) & 0xFF);
+
+ if (param->info && param->info_length)
+ IBA_SET_MEM(CM_SIDR_REP_ADDITIONAL_INFORMATION, sidr_rep_msg,
+ param->info, param->info_length);
+
+ if (param->private_data && param->private_data_len)
+ IBA_SET_MEM(CM_SIDR_REP_PRIVATE_DATA, sidr_rep_msg,
+ param->private_data, param->private_data_len);
+}
+
+static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv,
+ struct ib_cm_sidr_rep_param *param)
+{
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ lockdep_assert_held(&cm_id_priv->lock);
+
+ if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) ||
+ (param->private_data &&
+ param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE))
+ return -EINVAL;
+
+ if (cm_id_priv->id.state != IB_CM_SIDR_REQ_RCVD)
+ return -EINVAL;
+
+ msg = cm_alloc_msg(cm_id_priv);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv,
+ param);
+ trace_icm_send_sidr_rep(&cm_id_priv->id);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ cm_free_msg(msg);
+ return ret;
+ }
+ cm_id_priv->id.state = IB_CM_IDLE;
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node)) {
+ rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ RB_CLEAR_NODE(&cm_id_priv->sidr_id_node);
+ }
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return 0;
+}
+
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_rep_param *param)
+{
+ struct cm_id_private *cm_id_priv =
+ container_of(cm_id, struct cm_id_private, id);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ ret = cm_send_sidr_rep_locked(cm_id_priv, param);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_rep);
+
+static void cm_format_sidr_rep_event(struct cm_work *work,
+ const struct cm_id_private *cm_id_priv)
+{
+ struct cm_sidr_rep_msg *sidr_rep_msg;
+ struct ib_cm_sidr_rep_event_param *param;
+
+ sidr_rep_msg = (struct cm_sidr_rep_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.sidr_rep_rcvd;
+ param->status = IBA_GET(CM_SIDR_REP_STATUS, sidr_rep_msg);
+ param->qkey = IBA_GET(CM_SIDR_REP_Q_KEY, sidr_rep_msg);
+ param->qpn = IBA_GET(CM_SIDR_REP_QPN, sidr_rep_msg);
+ param->info = IBA_GET_MEM_PTR(CM_SIDR_REP_ADDITIONAL_INFORMATION,
+ sidr_rep_msg);
+ param->info_len = IBA_GET(CM_SIDR_REP_ADDITIONAL_INFORMATION_LENGTH,
+ sidr_rep_msg);
+ param->sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr;
+ work->cm_event.private_data =
+ IBA_GET_MEM_PTR(CM_SIDR_REP_PRIVATE_DATA, sidr_rep_msg);
+}
+
+static int cm_sidr_rep_handler(struct cm_work *work)
+{
+ struct cm_sidr_rep_msg *sidr_rep_msg;
+ struct cm_id_private *cm_id_priv;
+
+ sidr_rep_msg = (struct cm_sidr_rep_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(
+ cpu_to_be32(IBA_GET(CM_SIDR_REP_REQUESTID, sidr_rep_msg)), 0);
+ if (!cm_id_priv)
+ return -EINVAL; /* Unmatched reply. */
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_IDLE;
+ ib_cancel_mad(cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ cm_format_sidr_rep_event(work, cm_id_priv);
+ cm_process_work(cm_id_priv, work);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_process_send_error(struct cm_id_private *cm_id_priv,
+ struct ib_mad_send_buf *msg,
+ enum ib_cm_state state,
+ enum ib_wc_status wc_status)
+{
+ struct ib_cm_event cm_event = {};
+ int ret;
+
+ /* Discard old sends or ones without a response. */
+ spin_lock_irq(&cm_id_priv->lock);
+ if (msg != cm_id_priv->msg) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_free_msg(msg);
+ return;
+ }
+ cm_free_priv_msg(msg);
+
+ if (state != cm_id_priv->id.state || wc_status == IB_WC_SUCCESS ||
+ wc_status == IB_WC_WR_FLUSH_ERR)
+ goto out_unlock;
+
+ trace_icm_mad_send_err(state, wc_status);
+ switch (state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ cm_reset_to_idle(cm_id_priv);
+ cm_event.event = IB_CM_REQ_ERROR;
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ cm_reset_to_idle(cm_id_priv);
+ cm_event.event = IB_CM_REP_ERROR;
+ break;
+ case IB_CM_DREQ_SENT:
+ cm_enter_timewait(cm_id_priv);
+ cm_event.event = IB_CM_DREQ_ERROR;
+ break;
+ case IB_CM_SIDR_REQ_SENT:
+ cm_id_priv->id.state = IB_CM_IDLE;
+ cm_event.event = IB_CM_SIDR_REQ_ERROR;
+ break;
+ default:
+ goto out_unlock;
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_event.param.send_status = wc_status;
+
+ /* No other events can occur on the cm_id at this point. */
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
+ if (ret)
+ ib_destroy_cm_id(&cm_id_priv->id);
+ return;
+out_unlock:
+ spin_unlock_irq(&cm_id_priv->lock);
+}
+
+static void cm_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+ struct cm_id_private *cm_id_priv = msg->context[0];
+ enum ib_cm_state state =
+ (enum ib_cm_state)(unsigned long)msg->context[1];
+ struct cm_port *port;
+ u16 attr_index;
+
+ port = mad_agent->context;
+ attr_index = be16_to_cpu(((struct ib_mad_hdr *)
+ msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
+
+ /*
+ * If the send was in response to a received message (context[0] is not
+ * set to a cm_id), and is not a REJ, then it is a send that was
+ * manually retried.
+ */
+ if (!cm_id_priv && (attr_index != CM_REJ_COUNTER))
+ msg->retries = 1;
+
+ atomic_long_add(1 + msg->retries, &port->counters[CM_XMIT][attr_index]);
+ if (msg->retries)
+ atomic_long_add(msg->retries,
+ &port->counters[CM_XMIT_RETRIES][attr_index]);
+
+ if (cm_id_priv)
+ cm_process_send_error(cm_id_priv, msg, state,
+ mad_send_wc->status);
+ else
+ cm_free_response_msg(msg);
+}
+
+static void cm_work_handler(struct work_struct *_work)
+{
+ struct cm_work *work = container_of(_work, struct cm_work, work.work);
+ int ret;
+
+ switch (work->cm_event.event) {
+ case IB_CM_REQ_RECEIVED:
+ ret = cm_req_handler(work);
+ break;
+ case IB_CM_MRA_RECEIVED:
+ ret = cm_mra_handler(work);
+ break;
+ case IB_CM_REJ_RECEIVED:
+ ret = cm_rej_handler(work);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ret = cm_rep_handler(work);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ ret = cm_rtu_handler(work);
+ break;
+ case IB_CM_USER_ESTABLISHED:
+ ret = cm_establish_handler(work);
+ break;
+ case IB_CM_DREQ_RECEIVED:
+ ret = cm_dreq_handler(work);
+ break;
+ case IB_CM_DREP_RECEIVED:
+ ret = cm_drep_handler(work);
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ ret = cm_sidr_req_handler(work);
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ ret = cm_sidr_rep_handler(work);
+ break;
+ case IB_CM_LAP_RECEIVED:
+ ret = cm_lap_handler(work);
+ break;
+ case IB_CM_APR_RECEIVED:
+ ret = cm_apr_handler(work);
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ ret = cm_timewait_handler(work);
+ break;
+ default:
+ trace_icm_handler_err(work->cm_event.event);
+ ret = -EINVAL;
+ break;
+ }
+ if (ret)
+ cm_free_work(work);
+}
+
+static int cm_establish(struct ib_cm_id *cm_id)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_work *work;
+ unsigned long flags;
+ int ret = 0;
+ struct cm_device *cm_dev;
+
+ cm_dev = ib_get_client_data(cm_id->device, &cm_client);
+ if (!cm_dev)
+ return -ENODEV;
+
+ work = kmalloc(sizeof *work, GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id->state) {
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ cm_id->state = IB_CM_ESTABLISHED;
+ break;
+ case IB_CM_ESTABLISHED:
+ ret = -EISCONN;
+ break;
+ default:
+ trace_icm_establish_err(cm_id);
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (ret) {
+ kfree(work);
+ goto out;
+ }
+
+ /*
+ * The CM worker thread may try to destroy the cm_id before it
+ * can execute this work item. To prevent potential deadlock,
+ * we need to find the cm_id once we're in the context of the
+ * worker thread, rather than holding a reference on it.
+ */
+ INIT_DELAYED_WORK(&work->work, cm_work_handler);
+ work->local_id = cm_id->local_id;
+ work->remote_id = cm_id->remote_id;
+ work->mad_recv_wc = NULL;
+ work->cm_event.event = IB_CM_USER_ESTABLISHED;
+
+ /* Check if the device started its remove_one */
+ spin_lock_irqsave(&cm.lock, flags);
+ if (!cm_dev->going_down) {
+ queue_delayed_work(cm.wq, &work->work, 0);
+ } else {
+ kfree(work);
+ ret = -ENODEV;
+ }
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+out:
+ return ret;
+}
+
+static int cm_migrate(struct ib_cm_id *cm_id)
+{
+ struct cm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state == IB_CM_ESTABLISHED &&
+ (cm_id->lap_state == IB_CM_LAP_UNINIT ||
+ cm_id->lap_state == IB_CM_LAP_IDLE)) {
+ cm_id->lap_state = IB_CM_LAP_IDLE;
+ cm_id_priv->av = cm_id_priv->alt_av;
+ } else
+ ret = -EINVAL;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
+{
+ int ret;
+
+ switch (event) {
+ case IB_EVENT_COMM_EST:
+ ret = cm_establish(cm_id);
+ break;
+ case IB_EVENT_PATH_MIG:
+ ret = cm_migrate(cm_id);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_notify);
+
+static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct cm_port *port = mad_agent->context;
+ struct cm_work *work;
+ enum ib_cm_event_type event;
+ bool alt_path = false;
+ u16 attr_id;
+ int paths = 0;
+ int going_down = 0;
+
+ switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
+ case CM_REQ_ATTR_ID:
+ alt_path = cm_req_has_alt_path((struct cm_req_msg *)
+ mad_recv_wc->recv_buf.mad);
+ paths = 1 + (alt_path != 0);
+ event = IB_CM_REQ_RECEIVED;
+ break;
+ case CM_MRA_ATTR_ID:
+ event = IB_CM_MRA_RECEIVED;
+ break;
+ case CM_REJ_ATTR_ID:
+ event = IB_CM_REJ_RECEIVED;
+ break;
+ case CM_REP_ATTR_ID:
+ event = IB_CM_REP_RECEIVED;
+ break;
+ case CM_RTU_ATTR_ID:
+ event = IB_CM_RTU_RECEIVED;
+ break;
+ case CM_DREQ_ATTR_ID:
+ event = IB_CM_DREQ_RECEIVED;
+ break;
+ case CM_DREP_ATTR_ID:
+ event = IB_CM_DREP_RECEIVED;
+ break;
+ case CM_SIDR_REQ_ATTR_ID:
+ event = IB_CM_SIDR_REQ_RECEIVED;
+ break;
+ case CM_SIDR_REP_ATTR_ID:
+ event = IB_CM_SIDR_REP_RECEIVED;
+ break;
+ case CM_LAP_ATTR_ID:
+ paths = 1;
+ event = IB_CM_LAP_RECEIVED;
+ break;
+ case CM_APR_ATTR_ID:
+ event = IB_CM_APR_RECEIVED;
+ break;
+ default:
+ ib_free_recv_mad(mad_recv_wc);
+ return;
+ }
+
+ attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id);
+ atomic_long_inc(&port->counters[CM_RECV][attr_id - CM_ATTR_ID_OFFSET]);
+
+ work = kmalloc(struct_size(work, path, paths), GFP_KERNEL);
+ if (!work) {
+ ib_free_recv_mad(mad_recv_wc);
+ return;
+ }
+
+ INIT_DELAYED_WORK(&work->work, cm_work_handler);
+ work->cm_event.event = event;
+ work->mad_recv_wc = mad_recv_wc;
+ work->port = port;
+
+ /* Check if the device started its remove_one */
+ spin_lock_irq(&cm.lock);
+ if (!port->cm_dev->going_down)
+ queue_delayed_work(cm.wq, &work->work, 0);
+ else
+ going_down = 1;
+ spin_unlock_irq(&cm.lock);
+
+ if (going_down) {
+ kfree(work);
+ ib_free_recv_mad(mad_recv_wc);
+ }
+}
+
+static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX | IB_QP_PORT;
+ qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE;
+ if (cm_id_priv->responder_resources)
+ qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_ATOMIC;
+ qp_attr->pkey_index = cm_id_priv->av.pkey_index;
+ if (cm_id_priv->av.port)
+ qp_attr->port_num = cm_id_priv->av.port->port_num;
+ ret = 0;
+ break;
+ default:
+ trace_icm_qp_init_err(&cm_id_priv->id);
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN | IB_QP_RQ_PSN;
+ qp_attr->ah_attr = cm_id_priv->av.ah_attr;
+ if ((qp_attr->ah_attr.type == RDMA_AH_ATTR_TYPE_IB) &&
+ cm_id_priv->av.dlid_datapath &&
+ (cm_id_priv->av.dlid_datapath != 0xffff))
+ qp_attr->ah_attr.ib.dlid = cm_id_priv->av.dlid_datapath;
+ qp_attr->path_mtu = cm_id_priv->path_mtu;
+ qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
+ qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
+ if (cm_id_priv->qp_type == IB_QPT_RC ||
+ cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
+ *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER;
+ qp_attr->max_dest_rd_atomic =
+ cm_id_priv->responder_resources;
+ qp_attr->min_rnr_timer = 0;
+ }
+ if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr) &&
+ cm_id_priv->alt_av.port) {
+ *qp_attr_mask |= IB_QP_ALT_PATH;
+ qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+ qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+ qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+ qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+ }
+ ret = 0;
+ break;
+ default:
+ trace_icm_qp_rtr_err(&cm_id_priv->id);
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ /* Allow transition to RTS before sending REP */
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) {
+ *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN;
+ qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn);
+ switch (cm_id_priv->qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_XRC_INI:
+ *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC;
+ qp_attr->retry_cnt = cm_id_priv->retry_count;
+ qp_attr->rnr_retry = cm_id_priv->rnr_retry_count;
+ qp_attr->max_rd_atomic = cm_id_priv->initiator_depth;
+ fallthrough;
+ case IB_QPT_XRC_TGT:
+ *qp_attr_mask |= IB_QP_TIMEOUT;
+ qp_attr->timeout = cm_id_priv->av.timeout;
+ break;
+ default:
+ break;
+ }
+ if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) {
+ *qp_attr_mask |= IB_QP_PATH_MIG_STATE;
+ qp_attr->path_mig_state = IB_MIG_REARM;
+ }
+ } else {
+ *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
+ if (cm_id_priv->alt_av.port)
+ qp_attr->alt_port_num =
+ cm_id_priv->alt_av.port->port_num;
+ qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+ qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+ qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+ qp_attr->path_mig_state = IB_MIG_REARM;
+ }
+ ret = 0;
+ break;
+ default:
+ trace_icm_qp_rts_err(&cm_id_priv->id);
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ switch (qp_attr->qp_state) {
+ case IB_QPS_INIT:
+ ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTR:
+ ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTS:
+ ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_init_qp_attr);
+
+static ssize_t cm_show_counter(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *attr, char *buf)
+{
+ struct cm_counter_attribute *cm_attr =
+ container_of(attr, struct cm_counter_attribute, attr);
+ struct cm_device *cm_dev = ib_get_client_data(ibdev, &cm_client);
+
+ if (WARN_ON(!cm_dev))
+ return -EINVAL;
+
+ return sysfs_emit(
+ buf, "%ld\n",
+ atomic_long_read(
+ &cm_dev->port[port_num - 1]
+ ->counters[cm_attr->group][cm_attr->index]));
+}
+
+#define CM_COUNTER_ATTR(_name, _group, _index) \
+ { \
+ .attr = __ATTR(_name, 0444, cm_show_counter, NULL), \
+ .group = _group, .index = _index \
+ }
+
+#define CM_COUNTER_GROUP(_group, _name) \
+ static struct cm_counter_attribute cm_counter_attr_##_group[] = { \
+ CM_COUNTER_ATTR(req, _group, CM_REQ_COUNTER), \
+ CM_COUNTER_ATTR(mra, _group, CM_MRA_COUNTER), \
+ CM_COUNTER_ATTR(rej, _group, CM_REJ_COUNTER), \
+ CM_COUNTER_ATTR(rep, _group, CM_REP_COUNTER), \
+ CM_COUNTER_ATTR(rtu, _group, CM_RTU_COUNTER), \
+ CM_COUNTER_ATTR(dreq, _group, CM_DREQ_COUNTER), \
+ CM_COUNTER_ATTR(drep, _group, CM_DREP_COUNTER), \
+ CM_COUNTER_ATTR(sidr_req, _group, CM_SIDR_REQ_COUNTER), \
+ CM_COUNTER_ATTR(sidr_rep, _group, CM_SIDR_REP_COUNTER), \
+ CM_COUNTER_ATTR(lap, _group, CM_LAP_COUNTER), \
+ CM_COUNTER_ATTR(apr, _group, CM_APR_COUNTER), \
+ }; \
+ static struct attribute *cm_counter_attrs_##_group[] = { \
+ &cm_counter_attr_##_group[0].attr.attr, \
+ &cm_counter_attr_##_group[1].attr.attr, \
+ &cm_counter_attr_##_group[2].attr.attr, \
+ &cm_counter_attr_##_group[3].attr.attr, \
+ &cm_counter_attr_##_group[4].attr.attr, \
+ &cm_counter_attr_##_group[5].attr.attr, \
+ &cm_counter_attr_##_group[6].attr.attr, \
+ &cm_counter_attr_##_group[7].attr.attr, \
+ &cm_counter_attr_##_group[8].attr.attr, \
+ &cm_counter_attr_##_group[9].attr.attr, \
+ &cm_counter_attr_##_group[10].attr.attr, \
+ NULL, \
+ }; \
+ static const struct attribute_group cm_counter_group_##_group = { \
+ .name = _name, \
+ .attrs = cm_counter_attrs_##_group, \
+ };
+
+CM_COUNTER_GROUP(CM_XMIT, "cm_tx_msgs")
+CM_COUNTER_GROUP(CM_XMIT_RETRIES, "cm_tx_retries")
+CM_COUNTER_GROUP(CM_RECV, "cm_rx_msgs")
+CM_COUNTER_GROUP(CM_RECV_DUPLICATES, "cm_rx_duplicates")
+
+static const struct attribute_group *cm_counter_groups[] = {
+ &cm_counter_group_CM_XMIT,
+ &cm_counter_group_CM_XMIT_RETRIES,
+ &cm_counter_group_CM_RECV,
+ &cm_counter_group_CM_RECV_DUPLICATES,
+ NULL,
+};
+
+static int cm_add_one(struct ib_device *ib_device)
+{
+ struct cm_device *cm_dev;
+ struct cm_port *port;
+ struct ib_mad_reg_req reg_req = {
+ .mgmt_class = IB_MGMT_CLASS_CM,
+ .mgmt_class_version = IB_CM_CLASS_VERSION,
+ };
+ struct ib_port_modify port_modify = {
+ .set_port_cap_mask = IB_PORT_CM_SUP
+ };
+ unsigned long flags;
+ int ret;
+ int count = 0;
+ u32 i;
+
+ cm_dev = kzalloc(struct_size(cm_dev, port, ib_device->phys_port_cnt),
+ GFP_KERNEL);
+ if (!cm_dev)
+ return -ENOMEM;
+
+ kref_init(&cm_dev->kref);
+ spin_lock_init(&cm_dev->mad_agent_lock);
+ cm_dev->ib_device = ib_device;
+ cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
+ cm_dev->going_down = 0;
+
+ ib_set_client_data(ib_device, &cm_client, cm_dev);
+
+ set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
+ rdma_for_each_port (ib_device, i) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
+ port = kzalloc(sizeof *port, GFP_KERNEL);
+ if (!port) {
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ cm_dev->port[i-1] = port;
+ port->cm_dev = cm_dev;
+ port->port_num = i;
+
+ ret = ib_port_register_client_groups(ib_device, i,
+ cm_counter_groups);
+ if (ret)
+ goto error1;
+
+ port->mad_agent = ib_register_mad_agent(ib_device, i,
+ IB_QPT_GSI,
+ &reg_req,
+ 0,
+ cm_send_handler,
+ cm_recv_handler,
+ port,
+ 0);
+ if (IS_ERR(port->mad_agent)) {
+ ret = PTR_ERR(port->mad_agent);
+ goto error2;
+ }
+
+ ret = ib_modify_port(ib_device, i, 0, &port_modify);
+ if (ret)
+ goto error3;
+
+ count++;
+ }
+
+ if (!count) {
+ ret = -EOPNOTSUPP;
+ goto free;
+ }
+
+ write_lock_irqsave(&cm.device_lock, flags);
+ list_add_tail(&cm_dev->list, &cm.device_list);
+ write_unlock_irqrestore(&cm.device_lock, flags);
+ return 0;
+
+error3:
+ ib_unregister_mad_agent(port->mad_agent);
+error2:
+ ib_port_unregister_client_groups(ib_device, i, cm_counter_groups);
+error1:
+ port_modify.set_port_cap_mask = 0;
+ port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
+ while (--i) {
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
+ port = cm_dev->port[i-1];
+ ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ ib_unregister_mad_agent(port->mad_agent);
+ ib_port_unregister_client_groups(ib_device, i,
+ cm_counter_groups);
+ }
+free:
+ cm_device_put(cm_dev);
+ return ret;
+}
+
+static void cm_remove_one(struct ib_device *ib_device, void *client_data)
+{
+ struct cm_device *cm_dev = client_data;
+ struct cm_port *port;
+ struct ib_port_modify port_modify = {
+ .clr_port_cap_mask = IB_PORT_CM_SUP
+ };
+ unsigned long flags;
+ u32 i;
+
+ write_lock_irqsave(&cm.device_lock, flags);
+ list_del(&cm_dev->list);
+ write_unlock_irqrestore(&cm.device_lock, flags);
+
+ spin_lock_irq(&cm.lock);
+ cm_dev->going_down = 1;
+ spin_unlock_irq(&cm.lock);
+
+ rdma_for_each_port (ib_device, i) {
+ struct ib_mad_agent *mad_agent;
+
+ if (!rdma_cap_ib_cm(ib_device, i))
+ continue;
+
+ port = cm_dev->port[i-1];
+ mad_agent = port->mad_agent;
+ ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ /*
+ * We flush the queue here after the going_down set, this
+ * verify that no new works will be queued in the recv handler,
+ * after that we can call the unregister_mad_agent
+ */
+ flush_workqueue(cm.wq);
+ /*
+ * The above ensures no call paths from the work are running,
+ * the remaining paths all take the mad_agent_lock.
+ */
+ spin_lock(&cm_dev->mad_agent_lock);
+ port->mad_agent = NULL;
+ spin_unlock(&cm_dev->mad_agent_lock);
+ ib_unregister_mad_agent(mad_agent);
+ ib_port_unregister_client_groups(ib_device, i,
+ cm_counter_groups);
+ }
+
+ cm_device_put(cm_dev);
+}
+
+static int __init ib_cm_init(void)
+{
+ int ret;
+
+ INIT_LIST_HEAD(&cm.device_list);
+ rwlock_init(&cm.device_lock);
+ spin_lock_init(&cm.lock);
+ cm.listen_service_table = RB_ROOT;
+ cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
+ cm.remote_id_table = RB_ROOT;
+ cm.remote_qp_table = RB_ROOT;
+ cm.remote_sidr_table = RB_ROOT;
+ xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC);
+ get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
+ INIT_LIST_HEAD(&cm.timewait_list);
+
+ cm.wq = alloc_workqueue("ib_cm", 0, 1);
+ if (!cm.wq) {
+ ret = -ENOMEM;
+ goto error2;
+ }
+
+ ret = ib_register_client(&cm_client);
+ if (ret)
+ goto error3;
+
+ return 0;
+error3:
+ destroy_workqueue(cm.wq);
+error2:
+ return ret;
+}
+
+static void __exit ib_cm_cleanup(void)
+{
+ struct cm_timewait_info *timewait_info, *tmp;
+
+ spin_lock_irq(&cm.lock);
+ list_for_each_entry(timewait_info, &cm.timewait_list, list)
+ cancel_delayed_work(&timewait_info->work.work);
+ spin_unlock_irq(&cm.lock);
+
+ ib_unregister_client(&cm_client);
+ destroy_workqueue(cm.wq);
+
+ list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) {
+ list_del(&timewait_info->list);
+ kfree(timewait_info);
+ }
+
+ WARN_ON(!xa_empty(&cm.local_id_table));
+}
+
+module_init(ib_cm_init);
+module_exit(ib_cm_cleanup);
diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h
new file mode 100644
index 000000000..8462de7ca
--- /dev/null
+++ b/drivers/infiniband/core/cm_msgs.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved.
+ */
+#ifndef CM_MSGS_H
+#define CM_MSGS_H
+
+#include <rdma/ibta_vol1_c12.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_cm.h>
+
+/*
+ * Parameters to routines below should be in network-byte order, and values
+ * are returned in network-byte order.
+ */
+
+#define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */
+
+static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
+{
+ u8 transport_type = IBA_GET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg);
+ switch (transport_type) {
+ case 0: return IB_QPT_RC;
+ case 1: return IB_QPT_UC;
+ case 3:
+ switch (IBA_GET(CM_REQ_EXTENDED_TRANSPORT_TYPE, req_msg)) {
+ case 1: return IB_QPT_XRC_TGT;
+ default: return 0;
+ }
+ default: return 0;
+ }
+}
+
+static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg,
+ enum ib_qp_type qp_type)
+{
+ switch (qp_type) {
+ case IB_QPT_UC:
+ IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 1);
+ break;
+ case IB_QPT_XRC_INI:
+ IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 3);
+ IBA_SET(CM_REQ_EXTENDED_TRANSPORT_TYPE, req_msg, 1);
+ break;
+ default:
+ IBA_SET(CM_REQ_TRANSPORT_SERVICE_TYPE, req_msg, 0);
+ }
+}
+
+/* Message REJected or MRAed */
+enum cm_msg_response {
+ CM_MSG_RESPONSE_REQ = 0x0,
+ CM_MSG_RESPONSE_REP = 0x1,
+ CM_MSG_RESPONSE_OTHER = 0x2
+};
+
+static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type)
+{
+ return (qp_type == IB_QPT_XRC_INI) ?
+ cpu_to_be32(IBA_GET(CM_REP_LOCAL_EE_CONTEXT_NUMBER,
+ rep_msg)) :
+ cpu_to_be32(IBA_GET(CM_REP_LOCAL_QPN, rep_msg));
+}
+
+#endif /* CM_MSGS_H */
diff --git a/drivers/infiniband/core/cm_trace.c b/drivers/infiniband/core/cm_trace.c
new file mode 100644
index 000000000..8f3482f66
--- /dev/null
+++ b/drivers/infiniband/core/cm_trace.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Trace points for the IB Connection Manager.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2020, Oracle and/or its affiliates.
+ */
+
+#include <rdma/rdma_cm.h>
+#include "cma_priv.h"
+
+#define CREATE_TRACE_POINTS
+
+#include "cm_trace.h"
diff --git a/drivers/infiniband/core/cm_trace.h b/drivers/infiniband/core/cm_trace.h
new file mode 100644
index 000000000..e9d282679
--- /dev/null
+++ b/drivers/infiniband/core/cm_trace.h
@@ -0,0 +1,414 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Trace point definitions for the RDMA Connect Manager.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ib_cma
+
+#if !defined(_TRACE_IB_CMA_H) || defined(TRACE_HEADER_MULTI_READ)
+
+#define _TRACE_IB_CMA_H
+
+#include <linux/tracepoint.h>
+#include <rdma/ib_cm.h>
+#include <trace/events/rdma.h>
+
+/*
+ * enum ib_cm_state, from include/rdma/ib_cm.h
+ */
+#define IB_CM_STATE_LIST \
+ ib_cm_state(IDLE) \
+ ib_cm_state(LISTEN) \
+ ib_cm_state(REQ_SENT) \
+ ib_cm_state(REQ_RCVD) \
+ ib_cm_state(MRA_REQ_SENT) \
+ ib_cm_state(MRA_REQ_RCVD) \
+ ib_cm_state(REP_SENT) \
+ ib_cm_state(REP_RCVD) \
+ ib_cm_state(MRA_REP_SENT) \
+ ib_cm_state(MRA_REP_RCVD) \
+ ib_cm_state(ESTABLISHED) \
+ ib_cm_state(DREQ_SENT) \
+ ib_cm_state(DREQ_RCVD) \
+ ib_cm_state(TIMEWAIT) \
+ ib_cm_state(SIDR_REQ_SENT) \
+ ib_cm_state_end(SIDR_REQ_RCVD)
+
+#undef ib_cm_state
+#undef ib_cm_state_end
+#define ib_cm_state(x) TRACE_DEFINE_ENUM(IB_CM_##x);
+#define ib_cm_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x);
+
+IB_CM_STATE_LIST
+
+#undef ib_cm_state
+#undef ib_cm_state_end
+#define ib_cm_state(x) { IB_CM_##x, #x },
+#define ib_cm_state_end(x) { IB_CM_##x, #x }
+
+#define show_ib_cm_state(x) \
+ __print_symbolic(x, IB_CM_STATE_LIST)
+
+/*
+ * enum ib_cm_lap_state, from include/rdma/ib_cm.h
+ */
+#define IB_CM_LAP_STATE_LIST \
+ ib_cm_lap_state(LAP_UNINIT) \
+ ib_cm_lap_state(LAP_IDLE) \
+ ib_cm_lap_state(LAP_SENT) \
+ ib_cm_lap_state(LAP_RCVD) \
+ ib_cm_lap_state(MRA_LAP_SENT) \
+ ib_cm_lap_state_end(MRA_LAP_RCVD)
+
+#undef ib_cm_lap_state
+#undef ib_cm_lap_state_end
+#define ib_cm_lap_state(x) TRACE_DEFINE_ENUM(IB_CM_##x);
+#define ib_cm_lap_state_end(x) TRACE_DEFINE_ENUM(IB_CM_##x);
+
+IB_CM_LAP_STATE_LIST
+
+#undef ib_cm_lap_state
+#undef ib_cm_lap_state_end
+#define ib_cm_lap_state(x) { IB_CM_##x, #x },
+#define ib_cm_lap_state_end(x) { IB_CM_##x, #x }
+
+#define show_ib_cm_lap_state(x) \
+ __print_symbolic(x, IB_CM_LAP_STATE_LIST)
+
+/*
+ * enum ib_cm_rej_reason, from include/rdma/ib_cm.h
+ */
+#define IB_CM_REJ_REASON_LIST \
+ ib_cm_rej_reason(REJ_NO_QP) \
+ ib_cm_rej_reason(REJ_NO_EEC) \
+ ib_cm_rej_reason(REJ_NO_RESOURCES) \
+ ib_cm_rej_reason(REJ_TIMEOUT) \
+ ib_cm_rej_reason(REJ_UNSUPPORTED) \
+ ib_cm_rej_reason(REJ_INVALID_COMM_ID) \
+ ib_cm_rej_reason(REJ_INVALID_COMM_INSTANCE) \
+ ib_cm_rej_reason(REJ_INVALID_SERVICE_ID) \
+ ib_cm_rej_reason(REJ_INVALID_TRANSPORT_TYPE) \
+ ib_cm_rej_reason(REJ_STALE_CONN) \
+ ib_cm_rej_reason(REJ_RDC_NOT_EXIST) \
+ ib_cm_rej_reason(REJ_INVALID_GID) \
+ ib_cm_rej_reason(REJ_INVALID_LID) \
+ ib_cm_rej_reason(REJ_INVALID_SL) \
+ ib_cm_rej_reason(REJ_INVALID_TRAFFIC_CLASS) \
+ ib_cm_rej_reason(REJ_INVALID_HOP_LIMIT) \
+ ib_cm_rej_reason(REJ_INVALID_PACKET_RATE) \
+ ib_cm_rej_reason(REJ_INVALID_ALT_GID) \
+ ib_cm_rej_reason(REJ_INVALID_ALT_LID) \
+ ib_cm_rej_reason(REJ_INVALID_ALT_SL) \
+ ib_cm_rej_reason(REJ_INVALID_ALT_TRAFFIC_CLASS) \
+ ib_cm_rej_reason(REJ_INVALID_ALT_HOP_LIMIT) \
+ ib_cm_rej_reason(REJ_INVALID_ALT_PACKET_RATE) \
+ ib_cm_rej_reason(REJ_PORT_CM_REDIRECT) \
+ ib_cm_rej_reason(REJ_PORT_REDIRECT) \
+ ib_cm_rej_reason(REJ_INVALID_MTU) \
+ ib_cm_rej_reason(REJ_INSUFFICIENT_RESP_RESOURCES) \
+ ib_cm_rej_reason(REJ_CONSUMER_DEFINED) \
+ ib_cm_rej_reason(REJ_INVALID_RNR_RETRY) \
+ ib_cm_rej_reason(REJ_DUPLICATE_LOCAL_COMM_ID) \
+ ib_cm_rej_reason(REJ_INVALID_CLASS_VERSION) \
+ ib_cm_rej_reason(REJ_INVALID_FLOW_LABEL) \
+ ib_cm_rej_reason(REJ_INVALID_ALT_FLOW_LABEL) \
+ ib_cm_rej_reason_end(REJ_VENDOR_OPTION_NOT_SUPPORTED)
+
+#undef ib_cm_rej_reason
+#undef ib_cm_rej_reason_end
+#define ib_cm_rej_reason(x) TRACE_DEFINE_ENUM(IB_CM_##x);
+#define ib_cm_rej_reason_end(x) TRACE_DEFINE_ENUM(IB_CM_##x);
+
+IB_CM_REJ_REASON_LIST
+
+#undef ib_cm_rej_reason
+#undef ib_cm_rej_reason_end
+#define ib_cm_rej_reason(x) { IB_CM_##x, #x },
+#define ib_cm_rej_reason_end(x) { IB_CM_##x, #x }
+
+#define show_ib_cm_rej_reason(x) \
+ __print_symbolic(x, IB_CM_REJ_REASON_LIST)
+
+DECLARE_EVENT_CLASS(icm_id_class,
+ TP_PROTO(
+ const struct ib_cm_id *cm_id
+ ),
+
+ TP_ARGS(cm_id),
+
+ TP_STRUCT__entry(
+ __field(const void *, cm_id) /* for eBPF scripts */
+ __field(unsigned int, local_id)
+ __field(unsigned int, remote_id)
+ __field(unsigned long, state)
+ __field(unsigned long, lap_state)
+ ),
+
+ TP_fast_assign(
+ __entry->cm_id = cm_id;
+ __entry->local_id = be32_to_cpu(cm_id->local_id);
+ __entry->remote_id = be32_to_cpu(cm_id->remote_id);
+ __entry->state = cm_id->state;
+ __entry->lap_state = cm_id->lap_state;
+ ),
+
+ TP_printk("local_id=%u remote_id=%u state=%s lap_state=%s",
+ __entry->local_id, __entry->remote_id,
+ show_ib_cm_state(__entry->state),
+ show_ib_cm_lap_state(__entry->lap_state)
+ )
+);
+
+#define DEFINE_CM_SEND_EVENT(name) \
+ DEFINE_EVENT(icm_id_class, \
+ icm_send_##name, \
+ TP_PROTO( \
+ const struct ib_cm_id *cm_id \
+ ), \
+ TP_ARGS(cm_id))
+
+DEFINE_CM_SEND_EVENT(req);
+DEFINE_CM_SEND_EVENT(rep);
+DEFINE_CM_SEND_EVENT(dup_req);
+DEFINE_CM_SEND_EVENT(dup_rep);
+DEFINE_CM_SEND_EVENT(rtu);
+DEFINE_CM_SEND_EVENT(mra);
+DEFINE_CM_SEND_EVENT(sidr_req);
+DEFINE_CM_SEND_EVENT(sidr_rep);
+DEFINE_CM_SEND_EVENT(dreq);
+DEFINE_CM_SEND_EVENT(drep);
+
+TRACE_EVENT(icm_send_rej,
+ TP_PROTO(
+ const struct ib_cm_id *cm_id,
+ enum ib_cm_rej_reason reason
+ ),
+
+ TP_ARGS(cm_id, reason),
+
+ TP_STRUCT__entry(
+ __field(const void *, cm_id)
+ __field(u32, local_id)
+ __field(u32, remote_id)
+ __field(unsigned long, state)
+ __field(unsigned long, reason)
+ ),
+
+ TP_fast_assign(
+ __entry->cm_id = cm_id;
+ __entry->local_id = be32_to_cpu(cm_id->local_id);
+ __entry->remote_id = be32_to_cpu(cm_id->remote_id);
+ __entry->state = cm_id->state;
+ __entry->reason = reason;
+ ),
+
+ TP_printk("local_id=%u remote_id=%u state=%s reason=%s",
+ __entry->local_id, __entry->remote_id,
+ show_ib_cm_state(__entry->state),
+ show_ib_cm_rej_reason(__entry->reason)
+ )
+);
+
+#define DEFINE_CM_ERR_EVENT(name) \
+ DEFINE_EVENT(icm_id_class, \
+ icm_##name##_err, \
+ TP_PROTO( \
+ const struct ib_cm_id *cm_id \
+ ), \
+ TP_ARGS(cm_id))
+
+DEFINE_CM_ERR_EVENT(send_cm_rtu);
+DEFINE_CM_ERR_EVENT(establish);
+DEFINE_CM_ERR_EVENT(no_listener);
+DEFINE_CM_ERR_EVENT(send_drep);
+DEFINE_CM_ERR_EVENT(dreq_unknown);
+DEFINE_CM_ERR_EVENT(send_unknown_rej);
+DEFINE_CM_ERR_EVENT(rej_unknown);
+DEFINE_CM_ERR_EVENT(send_mra_unknown);
+DEFINE_CM_ERR_EVENT(mra_unknown);
+DEFINE_CM_ERR_EVENT(qp_init);
+DEFINE_CM_ERR_EVENT(qp_rtr);
+DEFINE_CM_ERR_EVENT(qp_rts);
+
+DEFINE_EVENT(icm_id_class, \
+ icm_dreq_skipped, \
+ TP_PROTO( \
+ const struct ib_cm_id *cm_id \
+ ), \
+ TP_ARGS(cm_id) \
+);
+
+DECLARE_EVENT_CLASS(icm_local_class,
+ TP_PROTO(
+ unsigned int local_id,
+ unsigned int remote_id
+ ),
+
+ TP_ARGS(local_id, remote_id),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, local_id)
+ __field(unsigned int, remote_id)
+ ),
+
+ TP_fast_assign(
+ __entry->local_id = local_id;
+ __entry->remote_id = remote_id;
+ ),
+
+ TP_printk("local_id=%u remote_id=%u",
+ __entry->local_id, __entry->remote_id
+ )
+);
+
+#define DEFINE_CM_LOCAL_EVENT(name) \
+ DEFINE_EVENT(icm_local_class, \
+ icm_##name, \
+ TP_PROTO( \
+ unsigned int local_id, \
+ unsigned int remote_id \
+ ), \
+ TP_ARGS(local_id, remote_id))
+
+DEFINE_CM_LOCAL_EVENT(issue_rej);
+DEFINE_CM_LOCAL_EVENT(issue_drep);
+DEFINE_CM_LOCAL_EVENT(staleconn_err);
+DEFINE_CM_LOCAL_EVENT(no_priv_err);
+
+DECLARE_EVENT_CLASS(icm_remote_class,
+ TP_PROTO(
+ u32 remote_id
+ ),
+
+ TP_ARGS(remote_id),
+
+ TP_STRUCT__entry(
+ __field(u32, remote_id)
+ ),
+
+ TP_fast_assign(
+ __entry->remote_id = remote_id;
+ ),
+
+ TP_printk("remote_id=%u",
+ __entry->remote_id
+ )
+);
+
+#define DEFINE_CM_REMOTE_EVENT(name) \
+ DEFINE_EVENT(icm_remote_class, \
+ icm_##name, \
+ TP_PROTO( \
+ u32 remote_id \
+ ), \
+ TP_ARGS(remote_id))
+
+DEFINE_CM_REMOTE_EVENT(remote_no_priv_err);
+DEFINE_CM_REMOTE_EVENT(insert_failed_err);
+
+TRACE_EVENT(icm_send_rep_err,
+ TP_PROTO(
+ __be32 local_id,
+ enum ib_cm_state state
+ ),
+
+ TP_ARGS(local_id, state),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, local_id)
+ __field(unsigned long, state)
+ ),
+
+ TP_fast_assign(
+ __entry->local_id = be32_to_cpu(local_id);
+ __entry->state = state;
+ ),
+
+ TP_printk("local_id=%u state=%s",
+ __entry->local_id, show_ib_cm_state(__entry->state)
+ )
+);
+
+TRACE_EVENT(icm_rep_unknown_err,
+ TP_PROTO(
+ unsigned int local_id,
+ unsigned int remote_id,
+ enum ib_cm_state state
+ ),
+
+ TP_ARGS(local_id, remote_id, state),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, local_id)
+ __field(unsigned int, remote_id)
+ __field(unsigned long, state)
+ ),
+
+ TP_fast_assign(
+ __entry->local_id = local_id;
+ __entry->remote_id = remote_id;
+ __entry->state = state;
+ ),
+
+ TP_printk("local_id=%u remote_id=%u state=%s",
+ __entry->local_id, __entry->remote_id,
+ show_ib_cm_state(__entry->state)
+ )
+);
+
+TRACE_EVENT(icm_handler_err,
+ TP_PROTO(
+ enum ib_cm_event_type event
+ ),
+
+ TP_ARGS(event),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, event)
+ ),
+
+ TP_fast_assign(
+ __entry->event = event;
+ ),
+
+ TP_printk("unhandled event=%s",
+ rdma_show_ib_cm_event(__entry->event)
+ )
+);
+
+TRACE_EVENT(icm_mad_send_err,
+ TP_PROTO(
+ enum ib_cm_state state,
+ enum ib_wc_status wc_status
+ ),
+
+ TP_ARGS(state, wc_status),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, state)
+ __field(unsigned long, wc_status)
+ ),
+
+ TP_fast_assign(
+ __entry->state = state;
+ __entry->wc_status = wc_status;
+ ),
+
+ TP_printk("state=%s completion status=%s",
+ show_ib_cm_state(__entry->state),
+ rdma_show_wc_status(__entry->wc_status)
+ )
+);
+
+#endif /* _TRACE_IB_CMA_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../drivers/infiniband/core
+#define TRACE_INCLUDE_FILE cm_trace
+
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
new file mode 100644
index 000000000..0773ca7ac
--- /dev/null
+++ b/drivers/infiniband/core/cma.c
@@ -0,0 +1,5455 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ */
+
+#include <linux/completion.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <linux/igmp.h>
+#include <linux/xarray.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/route.h>
+
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/netevent.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/ib.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sa.h>
+#include <rdma/iw_cm.h>
+
+#include "core_priv.h"
+#include "cma_priv.h"
+#include "cma_trace.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("Generic RDMA CM Agent");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define CMA_IBOE_PACKET_LIFETIME 18
+#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP
+
+static const char * const cma_events[] = {
+ [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved",
+ [RDMA_CM_EVENT_ADDR_ERROR] = "address error",
+ [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ",
+ [RDMA_CM_EVENT_ROUTE_ERROR] = "route error",
+ [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request",
+ [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response",
+ [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error",
+ [RDMA_CM_EVENT_UNREACHABLE] = "unreachable",
+ [RDMA_CM_EVENT_REJECTED] = "rejected",
+ [RDMA_CM_EVENT_ESTABLISHED] = "established",
+ [RDMA_CM_EVENT_DISCONNECTED] = "disconnected",
+ [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal",
+ [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join",
+ [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error",
+ [RDMA_CM_EVENT_ADDR_CHANGE] = "address change",
+ [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit",
+};
+
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
+ enum ib_gid_type gid_type);
+
+const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
+{
+ size_t index = event;
+
+ return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ?
+ cma_events[index] : "unrecognized event";
+}
+EXPORT_SYMBOL(rdma_event_msg);
+
+const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id,
+ int reason)
+{
+ if (rdma_ib_or_roce(id->device, id->port_num))
+ return ibcm_reject_msg(reason);
+
+ if (rdma_protocol_iwarp(id->device, id->port_num))
+ return iwcm_reject_msg(reason);
+
+ WARN_ON_ONCE(1);
+ return "unrecognized transport";
+}
+EXPORT_SYMBOL(rdma_reject_msg);
+
+/**
+ * rdma_is_consumer_reject - return true if the consumer rejected the connect
+ * request.
+ * @id: Communication identifier that received the REJECT event.
+ * @reason: Value returned in the REJECT event status field.
+ */
+static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason)
+{
+ if (rdma_ib_or_roce(id->device, id->port_num))
+ return reason == IB_CM_REJ_CONSUMER_DEFINED;
+
+ if (rdma_protocol_iwarp(id->device, id->port_num))
+ return reason == -ECONNREFUSED;
+
+ WARN_ON_ONCE(1);
+ return false;
+}
+
+const void *rdma_consumer_reject_data(struct rdma_cm_id *id,
+ struct rdma_cm_event *ev, u8 *data_len)
+{
+ const void *p;
+
+ if (rdma_is_consumer_reject(id, ev->status)) {
+ *data_len = ev->param.conn.private_data_len;
+ p = ev->param.conn.private_data;
+ } else {
+ *data_len = 0;
+ p = NULL;
+ }
+ return p;
+}
+EXPORT_SYMBOL(rdma_consumer_reject_data);
+
+/**
+ * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id.
+ * @id: Communication Identifier
+ */
+struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id->device->node_type == RDMA_NODE_RNIC)
+ return id_priv->cm_id.iw;
+ return NULL;
+}
+EXPORT_SYMBOL(rdma_iw_cm_id);
+
+/**
+ * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack.
+ * @res: rdma resource tracking entry pointer
+ */
+struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res)
+{
+ struct rdma_id_private *id_priv =
+ container_of(res, struct rdma_id_private, res);
+
+ return &id_priv->id;
+}
+EXPORT_SYMBOL(rdma_res_to_id);
+
+static int cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device, void *client_data);
+
+static struct ib_client cma_client = {
+ .name = "cma",
+ .add = cma_add_one,
+ .remove = cma_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static LIST_HEAD(dev_list);
+static LIST_HEAD(listen_any_list);
+static DEFINE_MUTEX(lock);
+static struct rb_root id_table = RB_ROOT;
+/* Serialize operations of id_table tree */
+static DEFINE_SPINLOCK(id_table_lock);
+static struct workqueue_struct *cma_wq;
+static unsigned int cma_pernet_id;
+
+struct cma_pernet {
+ struct xarray tcp_ps;
+ struct xarray udp_ps;
+ struct xarray ipoib_ps;
+ struct xarray ib_ps;
+};
+
+static struct cma_pernet *cma_pernet(struct net *net)
+{
+ return net_generic(net, cma_pernet_id);
+}
+
+static
+struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ switch (ps) {
+ case RDMA_PS_TCP:
+ return &pernet->tcp_ps;
+ case RDMA_PS_UDP:
+ return &pernet->udp_ps;
+ case RDMA_PS_IPOIB:
+ return &pernet->ipoib_ps;
+ case RDMA_PS_IB:
+ return &pernet->ib_ps;
+ default:
+ return NULL;
+ }
+}
+
+struct id_table_entry {
+ struct list_head id_list;
+ struct rb_node rb_node;
+};
+
+struct cma_device {
+ struct list_head list;
+ struct ib_device *device;
+ struct completion comp;
+ refcount_t refcount;
+ struct list_head id_list;
+ enum ib_gid_type *default_gid_type;
+ u8 *default_roce_tos;
+};
+
+struct rdma_bind_list {
+ enum rdma_ucm_port_space ps;
+ struct hlist_head owners;
+ unsigned short port;
+};
+
+static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps,
+ struct rdma_bind_list *bind_list, int snum)
+{
+ struct xarray *xa = cma_pernet_xa(net, ps);
+
+ return xa_insert(xa, snum, bind_list, GFP_KERNEL);
+}
+
+static struct rdma_bind_list *cma_ps_find(struct net *net,
+ enum rdma_ucm_port_space ps, int snum)
+{
+ struct xarray *xa = cma_pernet_xa(net, ps);
+
+ return xa_load(xa, snum);
+}
+
+static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps,
+ int snum)
+{
+ struct xarray *xa = cma_pernet_xa(net, ps);
+
+ xa_erase(xa, snum);
+}
+
+enum {
+ CMA_OPTION_AFONLY,
+};
+
+void cma_dev_get(struct cma_device *cma_dev)
+{
+ refcount_inc(&cma_dev->refcount);
+}
+
+void cma_dev_put(struct cma_device *cma_dev)
+{
+ if (refcount_dec_and_test(&cma_dev->refcount))
+ complete(&cma_dev->comp);
+}
+
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+ void *cookie)
+{
+ struct cma_device *cma_dev;
+ struct cma_device *found_cma_dev = NULL;
+
+ mutex_lock(&lock);
+
+ list_for_each_entry(cma_dev, &dev_list, list)
+ if (filter(cma_dev->device, cookie)) {
+ found_cma_dev = cma_dev;
+ break;
+ }
+
+ if (found_cma_dev)
+ cma_dev_get(found_cma_dev);
+ mutex_unlock(&lock);
+ return found_cma_dev;
+}
+
+int cma_get_default_gid_type(struct cma_device *cma_dev,
+ u32 port)
+{
+ if (!rdma_is_port_valid(cma_dev->device, port))
+ return -EINVAL;
+
+ return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)];
+}
+
+int cma_set_default_gid_type(struct cma_device *cma_dev,
+ u32 port,
+ enum ib_gid_type default_gid_type)
+{
+ unsigned long supported_gids;
+
+ if (!rdma_is_port_valid(cma_dev->device, port))
+ return -EINVAL;
+
+ if (default_gid_type == IB_GID_TYPE_IB &&
+ rdma_protocol_roce_eth_encap(cma_dev->device, port))
+ default_gid_type = IB_GID_TYPE_ROCE;
+
+ supported_gids = roce_gid_type_mask_support(cma_dev->device, port);
+
+ if (!(supported_gids & 1 << default_gid_type))
+ return -EINVAL;
+
+ cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] =
+ default_gid_type;
+
+ return 0;
+}
+
+int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port)
+{
+ if (!rdma_is_port_valid(cma_dev->device, port))
+ return -EINVAL;
+
+ return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)];
+}
+
+int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port,
+ u8 default_roce_tos)
+{
+ if (!rdma_is_port_valid(cma_dev->device, port))
+ return -EINVAL;
+
+ cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] =
+ default_roce_tos;
+
+ return 0;
+}
+struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev)
+{
+ return cma_dev->device;
+}
+
+/*
+ * Device removal can occur at anytime, so we need extra handling to
+ * serialize notifying the user of device removal with other callbacks.
+ * We do this by disabling removal notification while a callback is in process,
+ * and reporting it after the callback completes.
+ */
+
+struct cma_multicast {
+ struct rdma_id_private *id_priv;
+ union {
+ struct ib_sa_multicast *sa_mc;
+ struct {
+ struct work_struct work;
+ struct rdma_cm_event event;
+ } iboe_join;
+ };
+ struct list_head list;
+ void *context;
+ struct sockaddr_storage addr;
+ u8 join_state;
+};
+
+struct cma_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ enum rdma_cm_state old_state;
+ enum rdma_cm_state new_state;
+ struct rdma_cm_event event;
+};
+
+union cma_ip_addr {
+ struct in6_addr ip6;
+ struct {
+ __be32 pad[3];
+ __be32 addr;
+ } ip4;
+};
+
+struct cma_hdr {
+ u8 cma_version;
+ u8 ip_version; /* IP version: 7:4 */
+ __be16 port;
+ union cma_ip_addr src_addr;
+ union cma_ip_addr dst_addr;
+};
+
+#define CMA_VERSION 0x00
+
+struct cma_req_info {
+ struct sockaddr_storage listen_addr_storage;
+ struct sockaddr_storage src_addr_storage;
+ struct ib_device *device;
+ union ib_gid local_gid;
+ __be64 service_id;
+ int port;
+ bool has_gid;
+ u16 pkey;
+};
+
+static int cma_comp_exch(struct rdma_id_private *id_priv,
+ enum rdma_cm_state comp, enum rdma_cm_state exch)
+{
+ unsigned long flags;
+ int ret;
+
+ /*
+ * The FSM uses a funny double locking where state is protected by both
+ * the handler_mutex and the spinlock. State is not allowed to change
+ * to/from a handler_mutex protected value without also holding
+ * handler_mutex.
+ */
+ if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT)
+ lockdep_assert_held(&id_priv->handler_mutex);
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if ((ret = (id_priv->state == comp)))
+ id_priv->state = exch;
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+
+static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
+{
+ return hdr->ip_version >> 4;
+}
+
+static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+{
+ hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
+}
+
+static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
+{
+ return (struct sockaddr *)&id_priv->id.route.addr.src_addr;
+}
+
+static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
+{
+ return (struct sockaddr *)&id_priv->id.route.addr.dst_addr;
+}
+
+static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
+{
+ struct in_device *in_dev = NULL;
+
+ if (ndev) {
+ rtnl_lock();
+ in_dev = __in_dev_get_rtnl(ndev);
+ if (in_dev) {
+ if (join)
+ ip_mc_inc_group(in_dev,
+ *(__be32 *)(mgid->raw + 12));
+ else
+ ip_mc_dec_group(in_dev,
+ *(__be32 *)(mgid->raw + 12));
+ }
+ rtnl_unlock();
+ }
+ return (in_dev) ? 0 : -ENODEV;
+}
+
+static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa,
+ struct id_table_entry *entry_b)
+{
+ struct rdma_id_private *id_priv = list_first_entry(
+ &entry_b->id_list, struct rdma_id_private, id_list_entry);
+ int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if;
+ struct sockaddr *sb = cma_dst_addr(id_priv);
+
+ if (ifindex_a != ifindex_b)
+ return (ifindex_a > ifindex_b) ? 1 : -1;
+
+ if (sa->sa_family != sb->sa_family)
+ return sa->sa_family - sb->sa_family;
+
+ if (sa->sa_family == AF_INET &&
+ __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) {
+ return memcmp(&((struct sockaddr_in *)sa)->sin_addr,
+ &((struct sockaddr_in *)sb)->sin_addr,
+ sizeof(((struct sockaddr_in *)sa)->sin_addr));
+ }
+
+ if (sa->sa_family == AF_INET6 &&
+ __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) {
+ return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr,
+ &((struct sockaddr_in6 *)sb)->sin6_addr);
+ }
+
+ return -1;
+}
+
+static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv)
+{
+ struct rb_node **new, *parent = NULL;
+ struct id_table_entry *this, *node;
+ unsigned long flags;
+ int result;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&id_table_lock, flags);
+ new = &id_table.rb_node;
+ while (*new) {
+ this = container_of(*new, struct id_table_entry, rb_node);
+ result = compare_netdev_and_ip(
+ node_id_priv->id.route.addr.dev_addr.bound_dev_if,
+ cma_dst_addr(node_id_priv), this);
+
+ parent = *new;
+ if (result < 0)
+ new = &((*new)->rb_left);
+ else if (result > 0)
+ new = &((*new)->rb_right);
+ else {
+ list_add_tail(&node_id_priv->id_list_entry,
+ &this->id_list);
+ kfree(node);
+ goto unlock;
+ }
+ }
+
+ INIT_LIST_HEAD(&node->id_list);
+ list_add_tail(&node_id_priv->id_list_entry, &node->id_list);
+
+ rb_link_node(&node->rb_node, parent, new);
+ rb_insert_color(&node->rb_node, &id_table);
+
+unlock:
+ spin_unlock_irqrestore(&id_table_lock, flags);
+ return 0;
+}
+
+static struct id_table_entry *
+node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa)
+{
+ struct rb_node *node = root->rb_node;
+ struct id_table_entry *data;
+ int result;
+
+ while (node) {
+ data = container_of(node, struct id_table_entry, rb_node);
+ result = compare_netdev_and_ip(ifindex, sa, data);
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return data;
+ }
+
+ return NULL;
+}
+
+static void cma_remove_id_from_tree(struct rdma_id_private *id_priv)
+{
+ struct id_table_entry *data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&id_table_lock, flags);
+ if (list_empty(&id_priv->id_list_entry))
+ goto out;
+
+ data = node_from_ndev_ip(&id_table,
+ id_priv->id.route.addr.dev_addr.bound_dev_if,
+ cma_dst_addr(id_priv));
+ if (!data)
+ goto out;
+
+ list_del_init(&id_priv->id_list_entry);
+ if (list_empty(&data->id_list)) {
+ rb_erase(&data->rb_node, &id_table);
+ kfree(data);
+ }
+out:
+ spin_unlock_irqrestore(&id_table_lock, flags);
+}
+
+static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ cma_dev_get(cma_dev);
+ id_priv->cma_dev = cma_dev;
+ id_priv->id.device = cma_dev->device;
+ id_priv->id.route.addr.dev_addr.transport =
+ rdma_node_get_transport(cma_dev->device->node_type);
+ list_add_tail(&id_priv->device_item, &cma_dev->id_list);
+
+ trace_cm_id_attach(id_priv, cma_dev->device);
+}
+
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ _cma_attach_to_dev(id_priv, cma_dev);
+ id_priv->gid_type =
+ cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(cma_dev->device)];
+}
+
+static void cma_release_dev(struct rdma_id_private *id_priv)
+{
+ mutex_lock(&lock);
+ list_del_init(&id_priv->device_item);
+ cma_dev_put(id_priv->cma_dev);
+ id_priv->cma_dev = NULL;
+ id_priv->id.device = NULL;
+ if (id_priv->id.route.addr.dev_addr.sgid_attr) {
+ rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr);
+ id_priv->id.route.addr.dev_addr.sgid_attr = NULL;
+ }
+ mutex_unlock(&lock);
+}
+
+static inline unsigned short cma_family(struct rdma_id_private *id_priv)
+{
+ return id_priv->id.route.addr.src_addr.ss_family;
+}
+
+static int cma_set_default_qkey(struct rdma_id_private *id_priv)
+{
+ struct ib_sa_mcmember_rec rec;
+ int ret = 0;
+
+ switch (id_priv->id.ps) {
+ case RDMA_PS_UDP:
+ case RDMA_PS_IB:
+ id_priv->qkey = RDMA_UDP_QKEY;
+ break;
+ case RDMA_PS_IPOIB:
+ ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device,
+ id_priv->id.port_num, &rec.mgid,
+ &rec);
+ if (!ret)
+ id_priv->qkey = be32_to_cpu(rec.qkey);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
+{
+ if (!qkey ||
+ (id_priv->qkey && (id_priv->qkey != qkey)))
+ return -EINVAL;
+
+ id_priv->qkey = qkey;
+ return 0;
+}
+
+static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr)
+{
+ dev_addr->dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr);
+ ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey));
+}
+
+static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{
+ int ret;
+
+ if (addr->sa_family != AF_IB) {
+ ret = rdma_translate_ip(addr, dev_addr);
+ } else {
+ cma_translate_ib((struct sockaddr_ib *) addr, dev_addr);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static const struct ib_gid_attr *
+cma_validate_port(struct ib_device *device, u32 port,
+ enum ib_gid_type gid_type,
+ union ib_gid *gid,
+ struct rdma_id_private *id_priv)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int bound_if_index = dev_addr->bound_dev_if;
+ const struct ib_gid_attr *sgid_attr;
+ int dev_type = dev_addr->dev_type;
+ struct net_device *ndev = NULL;
+
+ if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net))
+ return ERR_PTR(-ENODEV);
+
+ if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))
+ return ERR_PTR(-ENODEV);
+
+ if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port))
+ return ERR_PTR(-ENODEV);
+
+ if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
+ ndev = dev_get_by_index(dev_addr->net, bound_if_index);
+ if (!ndev)
+ return ERR_PTR(-ENODEV);
+ } else {
+ gid_type = IB_GID_TYPE_IB;
+ }
+
+ sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev);
+ if (ndev)
+ dev_put(ndev);
+ return sgid_attr;
+}
+
+static void cma_bind_sgid_attr(struct rdma_id_private *id_priv,
+ const struct ib_gid_attr *sgid_attr)
+{
+ WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr);
+ id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr;
+}
+
+/**
+ * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute
+ * based on source ip address.
+ * @id_priv: cm_id which should be bound to cma device
+ *
+ * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute
+ * based on source IP address. It returns 0 on success or error code otherwise.
+ * It is applicable to active and passive side cm_id.
+ */
+static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ const struct ib_gid_attr *sgid_attr;
+ union ib_gid gid, iboe_gid, *gidp;
+ struct cma_device *cma_dev;
+ enum ib_gid_type gid_type;
+ int ret = -ENODEV;
+ u32 port;
+
+ if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
+ id_priv->id.ps == RDMA_PS_IPOIB)
+ return -EINVAL;
+
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &iboe_gid);
+
+ memcpy(&gid, dev_addr->src_dev_addr +
+ rdma_addr_gid_offset(dev_addr), sizeof(gid));
+
+ mutex_lock(&lock);
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ rdma_for_each_port (cma_dev->device, port) {
+ gidp = rdma_protocol_roce(cma_dev->device, port) ?
+ &iboe_gid : &gid;
+ gid_type = cma_dev->default_gid_type[port - 1];
+ sgid_attr = cma_validate_port(cma_dev->device, port,
+ gid_type, gidp, id_priv);
+ if (!IS_ERR(sgid_attr)) {
+ id_priv->id.port_num = port;
+ cma_bind_sgid_attr(id_priv, sgid_attr);
+ cma_attach_to_dev(id_priv, cma_dev);
+ ret = 0;
+ goto out;
+ }
+ }
+ }
+out:
+ mutex_unlock(&lock);
+ return ret;
+}
+
+/**
+ * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute
+ * @id_priv: cm id to bind to cma device
+ * @listen_id_priv: listener cm id to match against
+ * @req: Pointer to req structure containaining incoming
+ * request information
+ * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when
+ * rdma device matches for listen_id and incoming request. It also verifies
+ * that a GID table entry is present for the source address.
+ * Returns 0 on success, or returns error code otherwise.
+ */
+static int cma_ib_acquire_dev(struct rdma_id_private *id_priv,
+ const struct rdma_id_private *listen_id_priv,
+ struct cma_req_info *req)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ const struct ib_gid_attr *sgid_attr;
+ enum ib_gid_type gid_type;
+ union ib_gid gid;
+
+ if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
+ id_priv->id.ps == RDMA_PS_IPOIB)
+ return -EINVAL;
+
+ if (rdma_protocol_roce(req->device, req->port))
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &gid);
+ else
+ memcpy(&gid, dev_addr->src_dev_addr +
+ rdma_addr_gid_offset(dev_addr), sizeof(gid));
+
+ gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1];
+ sgid_attr = cma_validate_port(req->device, req->port,
+ gid_type, &gid, id_priv);
+ if (IS_ERR(sgid_attr))
+ return PTR_ERR(sgid_attr);
+
+ id_priv->id.port_num = req->port;
+ cma_bind_sgid_attr(id_priv, sgid_attr);
+ /* Need to acquire lock to protect against reader
+ * of cma_dev->id_list such as cma_netdev_callback() and
+ * cma_process_remove().
+ */
+ mutex_lock(&lock);
+ cma_attach_to_dev(id_priv, listen_id_priv->cma_dev);
+ mutex_unlock(&lock);
+ rdma_restrack_add(&id_priv->res);
+ return 0;
+}
+
+static int cma_iw_acquire_dev(struct rdma_id_private *id_priv,
+ const struct rdma_id_private *listen_id_priv)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ const struct ib_gid_attr *sgid_attr;
+ struct cma_device *cma_dev;
+ enum ib_gid_type gid_type;
+ int ret = -ENODEV;
+ union ib_gid gid;
+ u32 port;
+
+ if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
+ id_priv->id.ps == RDMA_PS_IPOIB)
+ return -EINVAL;
+
+ memcpy(&gid, dev_addr->src_dev_addr +
+ rdma_addr_gid_offset(dev_addr), sizeof(gid));
+
+ mutex_lock(&lock);
+
+ cma_dev = listen_id_priv->cma_dev;
+ port = listen_id_priv->id.port_num;
+ gid_type = listen_id_priv->gid_type;
+ sgid_attr = cma_validate_port(cma_dev->device, port,
+ gid_type, &gid, id_priv);
+ if (!IS_ERR(sgid_attr)) {
+ id_priv->id.port_num = port;
+ cma_bind_sgid_attr(id_priv, sgid_attr);
+ ret = 0;
+ goto out;
+ }
+
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ rdma_for_each_port (cma_dev->device, port) {
+ if (listen_id_priv->cma_dev == cma_dev &&
+ listen_id_priv->id.port_num == port)
+ continue;
+
+ gid_type = cma_dev->default_gid_type[port - 1];
+ sgid_attr = cma_validate_port(cma_dev->device, port,
+ gid_type, &gid, id_priv);
+ if (!IS_ERR(sgid_attr)) {
+ id_priv->id.port_num = port;
+ cma_bind_sgid_attr(id_priv, sgid_attr);
+ ret = 0;
+ goto out;
+ }
+ }
+ }
+
+out:
+ if (!ret) {
+ cma_attach_to_dev(id_priv, cma_dev);
+ rdma_restrack_add(&id_priv->res);
+ }
+
+ mutex_unlock(&lock);
+ return ret;
+}
+
+/*
+ * Select the source IB device and address to reach the destination IB address.
+ */
+static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev, *cur_dev;
+ struct sockaddr_ib *addr;
+ union ib_gid gid, sgid, *dgid;
+ unsigned int p;
+ u16 pkey, index;
+ enum ib_port_state port_state;
+ int ret;
+ int i;
+
+ cma_dev = NULL;
+ addr = (struct sockaddr_ib *) cma_dst_addr(id_priv);
+ dgid = (union ib_gid *) &addr->sib_addr;
+ pkey = ntohs(addr->sib_pkey);
+
+ mutex_lock(&lock);
+ list_for_each_entry(cur_dev, &dev_list, list) {
+ rdma_for_each_port (cur_dev->device, p) {
+ if (!rdma_cap_af_ib(cur_dev->device, p))
+ continue;
+
+ if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index))
+ continue;
+
+ if (ib_get_cached_port_state(cur_dev->device, p, &port_state))
+ continue;
+
+ for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len;
+ ++i) {
+ ret = rdma_query_gid(cur_dev->device, p, i,
+ &gid);
+ if (ret)
+ continue;
+
+ if (!memcmp(&gid, dgid, sizeof(gid))) {
+ cma_dev = cur_dev;
+ sgid = gid;
+ id_priv->id.port_num = p;
+ goto found;
+ }
+
+ if (!cma_dev && (gid.global.subnet_prefix ==
+ dgid->global.subnet_prefix) &&
+ port_state == IB_PORT_ACTIVE) {
+ cma_dev = cur_dev;
+ sgid = gid;
+ id_priv->id.port_num = p;
+ goto found;
+ }
+ }
+ }
+ }
+ mutex_unlock(&lock);
+ return -ENODEV;
+
+found:
+ cma_attach_to_dev(id_priv, cma_dev);
+ rdma_restrack_add(&id_priv->res);
+ mutex_unlock(&lock);
+ addr = (struct sockaddr_ib *)cma_src_addr(id_priv);
+ memcpy(&addr->sib_addr, &sgid, sizeof(sgid));
+ cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr);
+ return 0;
+}
+
+static void cma_id_get(struct rdma_id_private *id_priv)
+{
+ refcount_inc(&id_priv->refcount);
+}
+
+static void cma_id_put(struct rdma_id_private *id_priv)
+{
+ if (refcount_dec_and_test(&id_priv->refcount))
+ complete(&id_priv->comp);
+}
+
+static struct rdma_id_private *
+__rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
+ void *context, enum rdma_ucm_port_space ps,
+ enum ib_qp_type qp_type, const struct rdma_id_private *parent)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL);
+ if (!id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ id_priv->state = RDMA_CM_IDLE;
+ id_priv->id.context = context;
+ id_priv->id.event_handler = event_handler;
+ id_priv->id.ps = ps;
+ id_priv->id.qp_type = qp_type;
+ id_priv->tos_set = false;
+ id_priv->timeout_set = false;
+ id_priv->min_rnr_timer_set = false;
+ id_priv->gid_type = IB_GID_TYPE_IB;
+ spin_lock_init(&id_priv->lock);
+ mutex_init(&id_priv->qp_mutex);
+ init_completion(&id_priv->comp);
+ refcount_set(&id_priv->refcount, 1);
+ mutex_init(&id_priv->handler_mutex);
+ INIT_LIST_HEAD(&id_priv->device_item);
+ INIT_LIST_HEAD(&id_priv->id_list_entry);
+ INIT_LIST_HEAD(&id_priv->listen_list);
+ INIT_LIST_HEAD(&id_priv->mc_list);
+ get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
+ id_priv->id.route.addr.dev_addr.net = get_net(net);
+ id_priv->seq_num &= 0x00ffffff;
+
+ rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID);
+ if (parent)
+ rdma_restrack_parent_name(&id_priv->res, &parent->res);
+
+ return id_priv;
+}
+
+struct rdma_cm_id *
+__rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler,
+ void *context, enum rdma_ucm_port_space ps,
+ enum ib_qp_type qp_type, const char *caller)
+{
+ struct rdma_id_private *ret;
+
+ ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL);
+ if (IS_ERR(ret))
+ return ERR_CAST(ret);
+
+ rdma_restrack_set_name(&ret->res, caller);
+ return &ret->id;
+}
+EXPORT_SYMBOL(__rdma_create_kernel_id);
+
+struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler,
+ void *context,
+ enum rdma_ucm_port_space ps,
+ enum ib_qp_type qp_type)
+{
+ struct rdma_id_private *ret;
+
+ ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context,
+ ps, qp_type, NULL);
+ if (IS_ERR(ret))
+ return ERR_CAST(ret);
+
+ rdma_restrack_set_name(&ret->res, NULL);
+ return &ret->id;
+}
+EXPORT_SYMBOL(rdma_create_user_id);
+
+static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
+
+ return ret;
+}
+
+static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ return ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+}
+
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct rdma_id_private *id_priv;
+ struct ib_qp *qp;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id->device != pd->device) {
+ ret = -EINVAL;
+ goto out_err;
+ }
+
+ qp_init_attr->port_num = id->port_num;
+ qp = ib_create_qp(pd, qp_init_attr);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto out_err;
+ }
+
+ if (id->qp_type == IB_QPT_UD)
+ ret = cma_init_ud_qp(id_priv, qp);
+ else
+ ret = cma_init_conn_qp(id_priv, qp);
+ if (ret)
+ goto out_destroy;
+
+ id->qp = qp;
+ id_priv->qp_num = qp->qp_num;
+ id_priv->srq = (qp->srq != NULL);
+ trace_cm_qp_create(id_priv, pd, qp_init_attr, 0);
+ return 0;
+out_destroy:
+ ib_destroy_qp(qp);
+out_err:
+ trace_cm_qp_create(id_priv, pd, qp_init_attr, ret);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_create_qp);
+
+void rdma_destroy_qp(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ trace_cm_qp_destroy(id_priv);
+ mutex_lock(&id_priv->qp_mutex);
+ ib_destroy_qp(id_priv->id.qp);
+ id_priv->id.qp = NULL;
+ mutex_unlock(&id_priv->qp_mutex);
+}
+EXPORT_SYMBOL(rdma_destroy_qp);
+
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Need to update QP attributes from default values. */
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ goto out;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ BUG_ON(id_priv->cma_dev->device != id_priv->id.device);
+
+ if (conn_param)
+ qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ if (conn_param)
+ qp_attr.max_rd_atomic = conn_param->initiator_depth;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
+{
+ struct ib_qp_attr qp_attr;
+ int ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int ret;
+ u16 pkey;
+
+ if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num))
+ pkey = 0xffff;
+ else
+ pkey = ib_addr_get_pkey(dev_addr);
+
+ ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
+ pkey, &qp_attr->pkey_index);
+ if (ret)
+ return ret;
+
+ qp_attr->port_num = id_priv->id.port_num;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+ if (id_priv->id.qp_type == IB_QPT_UD) {
+ ret = cma_set_default_qkey(id_priv);
+ if (ret)
+ return ret;
+
+ qp_attr->qkey = id_priv->qkey;
+ *qp_attr_mask |= IB_QP_QKEY;
+ } else {
+ qp_attr->qp_access_flags = 0;
+ *qp_attr_mask |= IB_QP_ACCESS_FLAGS;
+ }
+ return 0;
+}
+
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct rdma_id_private *id_priv;
+ int ret = 0;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
+ if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD))
+ ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
+ else
+ ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
+ qp_attr_mask);
+
+ if (qp_attr->qp_state == IB_QPS_RTR)
+ qp_attr->rq_psn = id_priv->seq_num;
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
+ if (!id_priv->cm_id.iw) {
+ qp_attr->qp_access_flags = 0;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+ } else
+ ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
+ qp_attr_mask);
+ qp_attr->port_num = id_priv->id.port_num;
+ *qp_attr_mask |= IB_QP_PORT;
+ } else {
+ ret = -ENOSYS;
+ }
+
+ if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set)
+ qp_attr->timeout = id_priv->timeout;
+
+ if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set)
+ qp_attr->min_rnr_timer = id_priv->min_rnr_timer;
+
+ return ret;
+}
+EXPORT_SYMBOL(rdma_init_qp_attr);
+
+static inline bool cma_zero_addr(const struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr);
+ case AF_INET6:
+ return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr);
+ case AF_IB:
+ return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr);
+ default:
+ return false;
+ }
+}
+
+static inline bool cma_loopback_addr(const struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ipv4_is_loopback(
+ ((struct sockaddr_in *)addr)->sin_addr.s_addr);
+ case AF_INET6:
+ return ipv6_addr_loopback(
+ &((struct sockaddr_in6 *)addr)->sin6_addr);
+ case AF_IB:
+ return ib_addr_loopback(
+ &((struct sockaddr_ib *)addr)->sib_addr);
+ default:
+ return false;
+ }
+}
+
+static inline bool cma_any_addr(const struct sockaddr *addr)
+{
+ return cma_zero_addr(addr) || cma_loopback_addr(addr);
+}
+
+static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst)
+{
+ if (src->sa_family != dst->sa_family)
+ return -1;
+
+ switch (src->sa_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *)src)->sin_addr.s_addr !=
+ ((struct sockaddr_in *)dst)->sin_addr.s_addr;
+ case AF_INET6: {
+ struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src;
+ struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst;
+ bool link_local;
+
+ if (ipv6_addr_cmp(&src_addr6->sin6_addr,
+ &dst_addr6->sin6_addr))
+ return 1;
+ link_local = ipv6_addr_type(&dst_addr6->sin6_addr) &
+ IPV6_ADDR_LINKLOCAL;
+ /* Link local must match their scope_ids */
+ return link_local ? (src_addr6->sin6_scope_id !=
+ dst_addr6->sin6_scope_id) :
+ 0;
+ }
+
+ default:
+ return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,
+ &((struct sockaddr_ib *) dst)->sib_addr);
+ }
+}
+
+static __be16 cma_port(const struct sockaddr *addr)
+{
+ struct sockaddr_ib *sib;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *) addr)->sin_port;
+ case AF_INET6:
+ return ((struct sockaddr_in6 *) addr)->sin6_port;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) addr;
+ return htons((u16) (be64_to_cpu(sib->sib_sid) &
+ be64_to_cpu(sib->sib_sid_mask)));
+ default:
+ return 0;
+ }
+}
+
+static inline int cma_any_port(const struct sockaddr *addr)
+{
+ return !cma_port(addr);
+}
+
+static void cma_save_ib_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ const struct rdma_cm_id *listen_id,
+ const struct sa_path_rec *path)
+{
+ struct sockaddr_ib *listen_ib, *ib;
+
+ listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
+ if (src_addr) {
+ ib = (struct sockaddr_ib *)src_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->sgid, 16);
+ ib->sib_sid = path->service_id;
+ ib->sib_scope_id = 0;
+ } else {
+ ib->sib_pkey = listen_ib->sib_pkey;
+ ib->sib_flowinfo = listen_ib->sib_flowinfo;
+ ib->sib_addr = listen_ib->sib_addr;
+ ib->sib_sid = listen_ib->sib_sid;
+ ib->sib_scope_id = listen_ib->sib_scope_id;
+ }
+ ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+ }
+ if (dst_addr) {
+ ib = (struct sockaddr_ib *)dst_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->dgid, 16);
+ }
+ }
+}
+
+static void cma_save_ip4_info(struct sockaddr_in *src_addr,
+ struct sockaddr_in *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
+{
+ if (src_addr) {
+ *src_addr = (struct sockaddr_in) {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = hdr->dst_addr.ip4.addr,
+ .sin_port = local_port,
+ };
+ }
+
+ if (dst_addr) {
+ *dst_addr = (struct sockaddr_in) {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = hdr->src_addr.ip4.addr,
+ .sin_port = hdr->port,
+ };
+ }
+}
+
+static void cma_save_ip6_info(struct sockaddr_in6 *src_addr,
+ struct sockaddr_in6 *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
+{
+ if (src_addr) {
+ *src_addr = (struct sockaddr_in6) {
+ .sin6_family = AF_INET6,
+ .sin6_addr = hdr->dst_addr.ip6,
+ .sin6_port = local_port,
+ };
+ }
+
+ if (dst_addr) {
+ *dst_addr = (struct sockaddr_in6) {
+ .sin6_family = AF_INET6,
+ .sin6_addr = hdr->src_addr.ip6,
+ .sin6_port = hdr->port,
+ };
+ }
+}
+
+static u16 cma_port_from_service_id(__be64 service_id)
+{
+ return (u16)be64_to_cpu(service_id);
+}
+
+static int cma_save_ip_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ const struct ib_cm_event *ib_event,
+ __be64 service_id)
+{
+ struct cma_hdr *hdr;
+ __be16 port;
+
+ hdr = ib_event->private_data;
+ if (hdr->cma_version != CMA_VERSION)
+ return -EINVAL;
+
+ port = htons(cma_port_from_service_id(service_id));
+
+ switch (cma_get_ip_ver(hdr)) {
+ case 4:
+ cma_save_ip4_info((struct sockaddr_in *)src_addr,
+ (struct sockaddr_in *)dst_addr, hdr, port);
+ break;
+ case 6:
+ cma_save_ip6_info((struct sockaddr_in6 *)src_addr,
+ (struct sockaddr_in6 *)dst_addr, hdr, port);
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ return 0;
+}
+
+static int cma_save_net_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ const struct rdma_cm_id *listen_id,
+ const struct ib_cm_event *ib_event,
+ sa_family_t sa_family, __be64 service_id)
+{
+ if (sa_family == AF_IB) {
+ if (ib_event->event == IB_CM_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id,
+ ib_event->param.req_rcvd.primary_path);
+ else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
+ return 0;
+ }
+
+ return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
+}
+
+static int cma_save_req_info(const struct ib_cm_event *ib_event,
+ struct cma_req_info *req)
+{
+ const struct ib_cm_req_event_param *req_param =
+ &ib_event->param.req_rcvd;
+ const struct ib_cm_sidr_req_event_param *sidr_param =
+ &ib_event->param.sidr_req_rcvd;
+
+ switch (ib_event->event) {
+ case IB_CM_REQ_RECEIVED:
+ req->device = req_param->listen_id->device;
+ req->port = req_param->port;
+ memcpy(&req->local_gid, &req_param->primary_path->sgid,
+ sizeof(req->local_gid));
+ req->has_gid = true;
+ req->service_id = req_param->primary_path->service_id;
+ req->pkey = be16_to_cpu(req_param->primary_path->pkey);
+ if (req->pkey != req_param->bth_pkey)
+ pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n"
+ "RDMA CMA: in the future this may cause the request to be dropped\n",
+ req_param->bth_pkey, req->pkey);
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ req->device = sidr_param->listen_id->device;
+ req->port = sidr_param->port;
+ req->has_gid = false;
+ req->service_id = sidr_param->service_id;
+ req->pkey = sidr_param->pkey;
+ if (req->pkey != sidr_param->bth_pkey)
+ pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n"
+ "RDMA CMA: in the future this may cause the request to be dropped\n",
+ sidr_param->bth_pkey, req->pkey);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static bool validate_ipv4_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in *dst_addr,
+ const struct sockaddr_in *src_addr)
+{
+ __be32 daddr = dst_addr->sin_addr.s_addr,
+ saddr = src_addr->sin_addr.s_addr;
+ struct fib_result res;
+ struct flowi4 fl4;
+ int err;
+ bool ret;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+ ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) ||
+ ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) ||
+ ipv4_is_loopback(saddr))
+ return false;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_oif = net_dev->ifindex;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+
+ rcu_read_lock();
+ err = fib_lookup(dev_net(net_dev), &fl4, &res, 0);
+ ret = err == 0 && FIB_RES_DEV(res) == net_dev;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool validate_ipv6_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in6 *dst_addr,
+ const struct sockaddr_in6 *src_addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ const int strict = ipv6_addr_type(&dst_addr->sin6_addr) &
+ IPV6_ADDR_LINKLOCAL;
+ struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr,
+ &src_addr->sin6_addr, net_dev->ifindex,
+ NULL, strict);
+ bool ret;
+
+ if (!rt)
+ return false;
+
+ ret = rt->rt6i_idev->dev == net_dev;
+ ip6_rt_put(rt);
+
+ return ret;
+#else
+ return false;
+#endif
+}
+
+static bool validate_net_dev(struct net_device *net_dev,
+ const struct sockaddr *daddr,
+ const struct sockaddr *saddr)
+{
+ const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr;
+ const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr;
+ const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+ const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr;
+
+ switch (daddr->sa_family) {
+ case AF_INET:
+ return saddr->sa_family == AF_INET &&
+ validate_ipv4_net_dev(net_dev, daddr4, saddr4);
+
+ case AF_INET6:
+ return saddr->sa_family == AF_INET6 &&
+ validate_ipv6_net_dev(net_dev, daddr6, saddr6);
+
+ default:
+ return false;
+ }
+}
+
+static struct net_device *
+roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event)
+{
+ const struct ib_gid_attr *sgid_attr = NULL;
+ struct net_device *ndev;
+
+ if (ib_event->event == IB_CM_REQ_RECEIVED)
+ sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr;
+ else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+ sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr;
+
+ if (!sgid_attr)
+ return NULL;
+
+ rcu_read_lock();
+ ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr);
+ if (IS_ERR(ndev))
+ ndev = NULL;
+ else
+ dev_hold(ndev);
+ rcu_read_unlock();
+ return ndev;
+}
+
+static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event,
+ struct cma_req_info *req)
+{
+ struct sockaddr *listen_addr =
+ (struct sockaddr *)&req->listen_addr_storage;
+ struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage;
+ struct net_device *net_dev;
+ const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
+ int err;
+
+ err = cma_save_ip_info(listen_addr, src_addr, ib_event,
+ req->service_id);
+ if (err)
+ return ERR_PTR(err);
+
+ if (rdma_protocol_roce(req->device, req->port))
+ net_dev = roce_get_net_dev_by_cm_event(ib_event);
+ else
+ net_dev = ib_get_net_dev_by_params(req->device, req->port,
+ req->pkey,
+ gid, listen_addr);
+ if (!net_dev)
+ return ERR_PTR(-ENODEV);
+
+ return net_dev;
+}
+
+static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id)
+{
+ return (be64_to_cpu(service_id) >> 16) & 0xffff;
+}
+
+static bool cma_match_private_data(struct rdma_id_private *id_priv,
+ const struct cma_hdr *hdr)
+{
+ struct sockaddr *addr = cma_src_addr(id_priv);
+ __be32 ip4_addr;
+ struct in6_addr ip6_addr;
+
+ if (cma_any_addr(addr) && !id_priv->afonly)
+ return true;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
+ if (cma_get_ip_ver(hdr) != 4)
+ return false;
+ if (!cma_any_addr(addr) &&
+ hdr->dst_addr.ip4.addr != ip4_addr)
+ return false;
+ break;
+ case AF_INET6:
+ ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
+ if (cma_get_ip_ver(hdr) != 6)
+ return false;
+ if (!cma_any_addr(addr) &&
+ memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
+ return false;
+ break;
+ case AF_IB:
+ return true;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static bool cma_protocol_roce(const struct rdma_cm_id *id)
+{
+ struct ib_device *device = id->device;
+ const u32 port_num = id->port_num ?: rdma_start_port(device);
+
+ return rdma_protocol_roce(device, port_num);
+}
+
+static bool cma_is_req_ipv6_ll(const struct cma_req_info *req)
+{
+ const struct sockaddr *daddr =
+ (const struct sockaddr *)&req->listen_addr_storage;
+ const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+
+ /* Returns true if the req is for IPv6 link local */
+ return (daddr->sa_family == AF_INET6 &&
+ (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL));
+}
+
+static bool cma_match_net_dev(const struct rdma_cm_id *id,
+ const struct net_device *net_dev,
+ const struct cma_req_info *req)
+{
+ const struct rdma_addr *addr = &id->route.addr;
+
+ if (!net_dev)
+ /* This request is an AF_IB request */
+ return (!id->port_num || id->port_num == req->port) &&
+ (addr->src_addr.ss_family == AF_IB);
+
+ /*
+ * If the request is not for IPv6 link local, allow matching
+ * request to any netdevice of the one or multiport rdma device.
+ */
+ if (!cma_is_req_ipv6_ll(req))
+ return true;
+ /*
+ * Net namespaces must match, and if the listner is listening
+ * on a specific netdevice than netdevice must match as well.
+ */
+ if (net_eq(dev_net(net_dev), addr->dev_addr.net) &&
+ (!!addr->dev_addr.bound_dev_if ==
+ (addr->dev_addr.bound_dev_if == net_dev->ifindex)))
+ return true;
+ else
+ return false;
+}
+
+static struct rdma_id_private *cma_find_listener(
+ const struct rdma_bind_list *bind_list,
+ const struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event,
+ const struct cma_req_info *req,
+ const struct net_device *net_dev)
+{
+ struct rdma_id_private *id_priv, *id_priv_dev;
+
+ lockdep_assert_held(&lock);
+
+ if (!bind_list)
+ return ERR_PTR(-EINVAL);
+
+ hlist_for_each_entry(id_priv, &bind_list->owners, node) {
+ if (cma_match_private_data(id_priv, ib_event->private_data)) {
+ if (id_priv->id.device == cm_id->device &&
+ cma_match_net_dev(&id_priv->id, net_dev, req))
+ return id_priv;
+ list_for_each_entry(id_priv_dev,
+ &id_priv->listen_list,
+ listen_item) {
+ if (id_priv_dev->id.device == cm_id->device &&
+ cma_match_net_dev(&id_priv_dev->id,
+ net_dev, req))
+ return id_priv_dev;
+ }
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct rdma_id_private *
+cma_ib_id_from_event(struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event,
+ struct cma_req_info *req,
+ struct net_device **net_dev)
+{
+ struct rdma_bind_list *bind_list;
+ struct rdma_id_private *id_priv;
+ int err;
+
+ err = cma_save_req_info(ib_event, req);
+ if (err)
+ return ERR_PTR(err);
+
+ *net_dev = cma_get_net_dev(ib_event, req);
+ if (IS_ERR(*net_dev)) {
+ if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
+ /* Assuming the protocol is AF_IB */
+ *net_dev = NULL;
+ } else {
+ return ERR_CAST(*net_dev);
+ }
+ }
+
+ mutex_lock(&lock);
+ /*
+ * Net namespace might be getting deleted while route lookup,
+ * cm_id lookup is in progress. Therefore, perform netdevice
+ * validation, cm_id lookup under rcu lock.
+ * RCU lock along with netdevice state check, synchronizes with
+ * netdevice migrating to different net namespace and also avoids
+ * case where net namespace doesn't get deleted while lookup is in
+ * progress.
+ * If the device state is not IFF_UP, its properties such as ifindex
+ * and nd_net cannot be trusted to remain valid without rcu lock.
+ * net/core/dev.c change_net_namespace() ensures to synchronize with
+ * ongoing operations on net device after device is closed using
+ * synchronize_net().
+ */
+ rcu_read_lock();
+ if (*net_dev) {
+ /*
+ * If netdevice is down, it is likely that it is administratively
+ * down or it might be migrating to different namespace.
+ * In that case avoid further processing, as the net namespace
+ * or ifindex may change.
+ */
+ if (((*net_dev)->flags & IFF_UP) == 0) {
+ id_priv = ERR_PTR(-EHOSTUNREACH);
+ goto err;
+ }
+
+ if (!validate_net_dev(*net_dev,
+ (struct sockaddr *)&req->src_addr_storage,
+ (struct sockaddr *)&req->listen_addr_storage)) {
+ id_priv = ERR_PTR(-EHOSTUNREACH);
+ goto err;
+ }
+ }
+
+ bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net,
+ rdma_ps_from_service_id(req->service_id),
+ cma_port_from_service_id(req->service_id));
+ id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev);
+err:
+ rcu_read_unlock();
+ mutex_unlock(&lock);
+ if (IS_ERR(id_priv) && *net_dev) {
+ dev_put(*net_dev);
+ *net_dev = NULL;
+ }
+ return id_priv;
+}
+
+static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv)
+{
+ return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
+}
+
+static void cma_cancel_route(struct rdma_id_private *id_priv)
+{
+ if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) {
+ if (id_priv->query)
+ ib_sa_cancel_query(id_priv->query_id, id_priv->query);
+ }
+}
+
+static void _cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *dev_id_priv;
+
+ lockdep_assert_held(&lock);
+
+ /*
+ * Remove from listen_any_list to prevent added devices from spawning
+ * additional listen requests.
+ */
+ list_del_init(&id_priv->listen_any_item);
+
+ while (!list_empty(&id_priv->listen_list)) {
+ dev_id_priv =
+ list_first_entry(&id_priv->listen_list,
+ struct rdma_id_private, listen_item);
+ /* sync with device removal to avoid duplicate destruction */
+ list_del_init(&dev_id_priv->device_item);
+ list_del_init(&dev_id_priv->listen_item);
+ mutex_unlock(&lock);
+
+ rdma_destroy_id(&dev_id_priv->id);
+ mutex_lock(&lock);
+ }
+}
+
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+ mutex_lock(&lock);
+ _cma_cancel_listens(id_priv);
+ mutex_unlock(&lock);
+}
+
+static void cma_cancel_operation(struct rdma_id_private *id_priv,
+ enum rdma_cm_state state)
+{
+ switch (state) {
+ case RDMA_CM_ADDR_QUERY:
+ /*
+ * We can avoid doing the rdma_addr_cancel() based on state,
+ * only RDMA_CM_ADDR_QUERY has a work that could still execute.
+ * Notice that the addr_handler work could still be exiting
+ * outside this state, however due to the interaction with the
+ * handler_mutex the work is guaranteed not to touch id_priv
+ * during exit.
+ */
+ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
+ break;
+ case RDMA_CM_ROUTE_QUERY:
+ cma_cancel_route(id_priv);
+ break;
+ case RDMA_CM_LISTEN:
+ if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev)
+ cma_cancel_listens(id_priv);
+ break;
+ default:
+ break;
+ }
+}
+
+static void cma_release_port(struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list = id_priv->bind_list;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
+
+ if (!bind_list)
+ return;
+
+ mutex_lock(&lock);
+ hlist_del(&id_priv->node);
+ if (hlist_empty(&bind_list->owners)) {
+ cma_ps_remove(net, bind_list->ps, bind_list->port);
+ kfree(bind_list);
+ }
+ mutex_unlock(&lock);
+}
+
+static void destroy_mc(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN);
+
+ if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num))
+ ib_sa_free_multicast(mc->sa_mc);
+
+ if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) {
+ struct rdma_dev_addr *dev_addr =
+ &id_priv->id.route.addr.dev_addr;
+ struct net_device *ndev = NULL;
+
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(dev_addr->net,
+ dev_addr->bound_dev_if);
+ if (ndev && !send_only) {
+ enum ib_gid_type gid_type;
+ union ib_gid mgid;
+
+ gid_type = id_priv->cma_dev->default_gid_type
+ [id_priv->id.port_num -
+ rdma_start_port(
+ id_priv->cma_dev->device)];
+ cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid,
+ gid_type);
+ cma_igmp_send(ndev, &mgid, false);
+ }
+ dev_put(ndev);
+
+ cancel_work_sync(&mc->iboe_join.work);
+ }
+ kfree(mc);
+}
+
+static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
+{
+ struct cma_multicast *mc;
+
+ while (!list_empty(&id_priv->mc_list)) {
+ mc = list_first_entry(&id_priv->mc_list, struct cma_multicast,
+ list);
+ list_del(&mc->list);
+ destroy_mc(id_priv, mc);
+ }
+}
+
+static void _destroy_id(struct rdma_id_private *id_priv,
+ enum rdma_cm_state state)
+{
+ cma_cancel_operation(id_priv, state);
+
+ rdma_restrack_del(&id_priv->res);
+ cma_remove_id_from_tree(id_priv);
+ if (id_priv->cma_dev) {
+ if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
+ if (id_priv->cm_id.ib)
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) {
+ if (id_priv->cm_id.iw)
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ }
+ cma_leave_mc_groups(id_priv);
+ cma_release_dev(id_priv);
+ }
+
+ cma_release_port(id_priv);
+ cma_id_put(id_priv);
+ wait_for_completion(&id_priv->comp);
+
+ if (id_priv->internal_id)
+ cma_id_put(id_priv->id.context);
+
+ kfree(id_priv->id.route.path_rec);
+ kfree(id_priv->id.route.path_rec_inbound);
+ kfree(id_priv->id.route.path_rec_outbound);
+
+ put_net(id_priv->id.route.addr.dev_addr.net);
+ kfree(id_priv);
+}
+
+/*
+ * destroy an ID from within the handler_mutex. This ensures that no other
+ * handlers can start running concurrently.
+ */
+static void destroy_id_handler_unlock(struct rdma_id_private *id_priv)
+ __releases(&idprv->handler_mutex)
+{
+ enum rdma_cm_state state;
+ unsigned long flags;
+
+ trace_cm_id_destroy(id_priv);
+
+ /*
+ * Setting the state to destroyed under the handler mutex provides a
+ * fence against calling handler callbacks. If this is invoked due to
+ * the failure of a handler callback then it guarentees that no future
+ * handlers will be called.
+ */
+ lockdep_assert_held(&id_priv->handler_mutex);
+ spin_lock_irqsave(&id_priv->lock, flags);
+ state = id_priv->state;
+ id_priv->state = RDMA_CM_DESTROYING;
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ mutex_unlock(&id_priv->handler_mutex);
+ _destroy_id(id_priv, state);
+}
+
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+
+ mutex_lock(&id_priv->handler_mutex);
+ destroy_id_handler_unlock(id_priv);
+}
+EXPORT_SYMBOL(rdma_destroy_id);
+
+static int cma_rep_recv(struct rdma_id_private *id_priv)
+{
+ int ret;
+
+ ret = cma_modify_qp_rtr(id_priv, NULL);
+ if (ret)
+ goto reject;
+
+ ret = cma_modify_qp_rts(id_priv, NULL);
+ if (ret)
+ goto reject;
+
+ trace_cm_send_rtu(id_priv);
+ ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
+ if (ret)
+ goto reject;
+
+ return 0;
+reject:
+ pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret);
+ cma_modify_qp_err(id_priv);
+ trace_cm_send_rej(id_priv);
+ ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ return ret;
+}
+
+static void cma_set_rep_event_data(struct rdma_cm_event *event,
+ const struct ib_cm_rep_event_param *rep_data,
+ void *private_data)
+{
+ event->param.conn.private_data = private_data;
+ event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+ event->param.conn.responder_resources = rep_data->responder_resources;
+ event->param.conn.initiator_depth = rep_data->initiator_depth;
+ event->param.conn.flow_control = rep_data->flow_control;
+ event->param.conn.rnr_retry_count = rep_data->rnr_retry_count;
+ event->param.conn.srq = rep_data->srq;
+ event->param.conn.qp_num = rep_data->remote_qpn;
+
+ event->ece.vendor_id = rep_data->ece.vendor_id;
+ event->ece.attr_mod = rep_data->ece.attr_mod;
+}
+
+static int cma_cm_event_handler(struct rdma_id_private *id_priv,
+ struct rdma_cm_event *event)
+{
+ int ret;
+
+ lockdep_assert_held(&id_priv->handler_mutex);
+
+ trace_cm_event_handler(id_priv, event);
+ ret = id_priv->id.event_handler(&id_priv->id, event);
+ trace_cm_event_done(id_priv, event, ret);
+ return ret;
+}
+
+static int cma_ib_handler(struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event = {};
+ enum rdma_cm_state state;
+ int ret;
+
+ mutex_lock(&id_priv->handler_mutex);
+ state = READ_ONCE(id_priv->state);
+ if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
+ state != RDMA_CM_CONNECT) ||
+ (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
+ state != RDMA_CM_DISCONNECT))
+ goto out;
+
+ switch (ib_event->event) {
+ case IB_CM_REQ_ERROR:
+ case IB_CM_REP_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+ break;
+ case IB_CM_REP_RECEIVED:
+ if (state == RDMA_CM_CONNECT &&
+ (id_priv->id.qp_type != IB_QPT_UD)) {
+ trace_cm_send_mra(id_priv);
+ ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ }
+ if (id_priv->id.qp) {
+ event.status = cma_rep_recv(id_priv);
+ event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
+ RDMA_CM_EVENT_ESTABLISHED;
+ } else {
+ event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+ }
+ cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
+ ib_event->private_data);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ case IB_CM_USER_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ break;
+ case IB_CM_DREQ_ERROR:
+ event.status = -ETIMEDOUT;
+ fallthrough;
+ case IB_CM_DREQ_RECEIVED:
+ case IB_CM_DREP_RECEIVED:
+ if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT,
+ RDMA_CM_DISCONNECT))
+ goto out;
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT;
+ break;
+ case IB_CM_MRA_RECEIVED:
+ /* ignore event */
+ goto out;
+ case IB_CM_REJ_RECEIVED:
+ pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id,
+ ib_event->param.rej_rcvd.reason));
+ cma_modify_qp_err(id_priv);
+ event.status = ib_event->param.rej_rcvd.reason;
+ event.event = RDMA_CM_EVENT_REJECTED;
+ event.param.conn.private_data = ib_event->private_data;
+ event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+ break;
+ default:
+ pr_err("RDMA CMA: unexpected IB CM event: %d\n",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = cma_cm_event_handler(id_priv, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.ib = NULL;
+ destroy_id_handler_unlock(id_priv);
+ return ret;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return 0;
+}
+
+static struct rdma_id_private *
+cma_ib_new_conn_id(const struct rdma_cm_id *listen_id,
+ const struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
+{
+ struct rdma_id_private *listen_id_priv;
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id;
+ struct rdma_route *rt;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path;
+ const __be64 service_id =
+ ib_event->param.req_rcvd.primary_path->service_id;
+ int ret;
+
+ listen_id_priv = container_of(listen_id, struct rdma_id_private, id);
+ id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net,
+ listen_id->event_handler, listen_id->context,
+ listen_id->ps,
+ ib_event->param.req_rcvd.qp_type,
+ listen_id_priv);
+ if (IS_ERR(id_priv))
+ return NULL;
+
+ id = &id_priv->id;
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family, service_id))
+ goto err;
+
+ rt = &id->route;
+ rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
+ rt->path_rec = kmalloc_array(rt->num_pri_alt_paths,
+ sizeof(*rt->path_rec), GFP_KERNEL);
+ if (!rt->path_rec)
+ goto err;
+
+ rt->path_rec[0] = *path;
+ if (rt->num_pri_alt_paths == 2)
+ rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
+
+ if (net_dev) {
+ rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev);
+ } else {
+ if (!cma_protocol_roce(listen_id) &&
+ cma_any_addr(cma_src_addr(id_priv))) {
+ rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+ ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
+ } else if (!cma_any_addr(cma_src_addr(id_priv))) {
+ ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+ if (ret)
+ goto err;
+ }
+ }
+ rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
+
+ id_priv->state = RDMA_CM_CONNECT;
+ return id_priv;
+
+err:
+ rdma_destroy_id(id);
+ return NULL;
+}
+
+static struct rdma_id_private *
+cma_ib_new_udp_id(const struct rdma_cm_id *listen_id,
+ const struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
+{
+ const struct rdma_id_private *listen_id_priv;
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ struct net *net = listen_id->route.addr.dev_addr.net;
+ int ret;
+
+ listen_id_priv = container_of(listen_id, struct rdma_id_private, id);
+ id_priv = __rdma_create_id(net, listen_id->event_handler,
+ listen_id->context, listen_id->ps, IB_QPT_UD,
+ listen_id_priv);
+ if (IS_ERR(id_priv))
+ return NULL;
+
+ id = &id_priv->id;
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family,
+ ib_event->param.sidr_req_rcvd.service_id))
+ goto err;
+
+ if (net_dev) {
+ rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev);
+ } else {
+ if (!cma_any_addr(cma_src_addr(id_priv))) {
+ ret = cma_translate_addr(cma_src_addr(id_priv),
+ &id->route.addr.dev_addr);
+ if (ret)
+ goto err;
+ }
+ }
+
+ id_priv->state = RDMA_CM_CONNECT;
+ return id_priv;
+err:
+ rdma_destroy_id(id);
+ return NULL;
+}
+
+static void cma_set_req_event_data(struct rdma_cm_event *event,
+ const struct ib_cm_req_event_param *req_data,
+ void *private_data, int offset)
+{
+ event->param.conn.private_data = private_data + offset;
+ event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
+ event->param.conn.responder_resources = req_data->responder_resources;
+ event->param.conn.initiator_depth = req_data->initiator_depth;
+ event->param.conn.flow_control = req_data->flow_control;
+ event->param.conn.retry_count = req_data->retry_count;
+ event->param.conn.rnr_retry_count = req_data->rnr_retry_count;
+ event->param.conn.srq = req_data->srq;
+ event->param.conn.qp_num = req_data->remote_qpn;
+
+ event->ece.vendor_id = req_data->ece.vendor_id;
+ event->ece.attr_mod = req_data->ece.attr_mod;
+}
+
+static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id,
+ const struct ib_cm_event *ib_event)
+{
+ return (((ib_event->event == IB_CM_REQ_RECEIVED) &&
+ (ib_event->param.req_rcvd.qp_type == id->qp_type)) ||
+ ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) &&
+ (id->qp_type == IB_QPT_UD)) ||
+ (!id->qp_type));
+}
+
+static int cma_ib_req_handler(struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *listen_id, *conn_id = NULL;
+ struct rdma_cm_event event = {};
+ struct cma_req_info req = {};
+ struct net_device *net_dev;
+ u8 offset;
+ int ret;
+
+ listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev);
+ if (IS_ERR(listen_id))
+ return PTR_ERR(listen_id);
+
+ trace_cm_req_handler(listen_id, ib_event->event);
+ if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) {
+ ret = -EINVAL;
+ goto net_dev_put;
+ }
+
+ mutex_lock(&listen_id->handler_mutex);
+ if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) {
+ ret = -ECONNABORTED;
+ goto err_unlock;
+ }
+
+ offset = cma_user_data_offset(listen_id);
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
+ conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev);
+ event.param.ud.private_data = ib_event->private_data + offset;
+ event.param.ud.private_data_len =
+ IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
+ } else {
+ conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev);
+ cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
+ ib_event->private_data, offset);
+ }
+ if (!conn_id) {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+ ret = cma_ib_acquire_dev(conn_id, listen_id, &req);
+ if (ret) {
+ destroy_id_handler_unlock(conn_id);
+ goto err_unlock;
+ }
+
+ conn_id->cm_id.ib = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_ib_handler;
+
+ ret = cma_cm_event_handler(conn_id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ conn_id->cm_id.ib = NULL;
+ mutex_unlock(&listen_id->handler_mutex);
+ destroy_id_handler_unlock(conn_id);
+ goto net_dev_put;
+ }
+
+ if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT &&
+ conn_id->id.qp_type != IB_QPT_UD) {
+ trace_cm_send_mra(cm_id->context);
+ ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ }
+ mutex_unlock(&conn_id->handler_mutex);
+
+err_unlock:
+ mutex_unlock(&listen_id->handler_mutex);
+
+net_dev_put:
+ if (net_dev)
+ dev_put(net_dev);
+
+ return ret;
+}
+
+__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ if (addr->sa_family == AF_IB)
+ return ((struct sockaddr_ib *) addr)->sib_sid;
+
+ return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr)));
+}
+EXPORT_SYMBOL(rdma_get_service_id);
+
+void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid,
+ union ib_gid *dgid)
+{
+ struct rdma_addr *addr = &cm_id->route.addr;
+
+ if (!cm_id->device) {
+ if (sgid)
+ memset(sgid, 0, sizeof(*sgid));
+ if (dgid)
+ memset(dgid, 0, sizeof(*dgid));
+ return;
+ }
+
+ if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) {
+ if (sgid)
+ rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid);
+ if (dgid)
+ rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid);
+ } else {
+ if (sgid)
+ rdma_addr_get_sgid(&addr->dev_addr, sgid);
+ if (dgid)
+ rdma_addr_get_dgid(&addr->dev_addr, dgid);
+ }
+}
+EXPORT_SYMBOL(rdma_read_gids);
+
+static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
+{
+ struct rdma_id_private *id_priv = iw_id->context;
+ struct rdma_cm_event event = {};
+ int ret = 0;
+ struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+ struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT)
+ goto out;
+
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CLOSE:
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ memcpy(cma_src_addr(id_priv), laddr,
+ rdma_addr_size(laddr));
+ memcpy(cma_dst_addr(id_priv), raddr,
+ rdma_addr_size(raddr));
+ switch (iw_event->status) {
+ case 0:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+ break;
+ case -ECONNRESET:
+ case -ECONNREFUSED:
+ event.event = RDMA_CM_EVENT_REJECTED;
+ break;
+ case -ETIMEDOUT:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ break;
+ default:
+ event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+ break;
+ }
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+ break;
+ default:
+ goto out;
+ }
+
+ event.status = iw_event->status;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ ret = cma_cm_event_handler(id_priv, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.iw = NULL;
+ destroy_id_handler_unlock(id_priv);
+ return ret;
+ }
+
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static int iw_conn_req_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct rdma_id_private *listen_id, *conn_id;
+ struct rdma_cm_event event = {};
+ int ret = -ECONNABORTED;
+ struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+ struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
+
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+
+ listen_id = cm_id->context;
+
+ mutex_lock(&listen_id->handler_mutex);
+ if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN)
+ goto out;
+
+ /* Create a new RDMA id for the new IW CM ID */
+ conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net,
+ listen_id->id.event_handler,
+ listen_id->id.context, RDMA_PS_TCP,
+ IB_QPT_RC, listen_id);
+ if (IS_ERR(conn_id)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+ conn_id->state = RDMA_CM_CONNECT;
+
+ ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr);
+ if (ret) {
+ mutex_unlock(&listen_id->handler_mutex);
+ destroy_id_handler_unlock(conn_id);
+ return ret;
+ }
+
+ ret = cma_iw_acquire_dev(conn_id, listen_id);
+ if (ret) {
+ mutex_unlock(&listen_id->handler_mutex);
+ destroy_id_handler_unlock(conn_id);
+ return ret;
+ }
+
+ conn_id->cm_id.iw = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_iw_handler;
+
+ memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr));
+ memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr));
+
+ ret = cma_cm_event_handler(conn_id, &event);
+ if (ret) {
+ /* User wants to destroy the CM ID */
+ conn_id->cm_id.iw = NULL;
+ mutex_unlock(&listen_id->handler_mutex);
+ destroy_id_handler_unlock(conn_id);
+ return ret;
+ }
+
+ mutex_unlock(&conn_id->handler_mutex);
+
+out:
+ mutex_unlock(&listen_id->handler_mutex);
+ return ret;
+}
+
+static int cma_ib_listen(struct rdma_id_private *id_priv)
+{
+ struct sockaddr *addr;
+ struct ib_cm_id *id;
+ __be64 svc_id;
+
+ addr = cma_src_addr(id_priv);
+ svc_id = rdma_get_service_id(&id_priv->id, addr);
+ id = ib_cm_insert_listen(id_priv->id.device,
+ cma_ib_req_handler, svc_id);
+ if (IS_ERR(id))
+ return PTR_ERR(id);
+ id_priv->cm_id.ib = id;
+
+ return 0;
+}
+
+static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
+{
+ int ret;
+ struct iw_cm_id *id;
+
+ id = iw_create_cm_id(id_priv->id.device,
+ iw_conn_req_handler,
+ id_priv);
+ if (IS_ERR(id))
+ return PTR_ERR(id);
+
+ mutex_lock(&id_priv->qp_mutex);
+ id->tos = id_priv->tos;
+ id->tos_set = id_priv->tos_set;
+ mutex_unlock(&id_priv->qp_mutex);
+ id->afonly = id_priv->afonly;
+ id_priv->cm_id.iw = id;
+
+ memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+
+ ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
+
+ if (ret) {
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ id_priv->cm_id.iw = NULL;
+ }
+
+ return ret;
+}
+
+static int cma_listen_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ struct rdma_id_private *id_priv = id->context;
+
+ /* Listening IDs are always destroyed on removal */
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ return -1;
+
+ id->context = id_priv->id.context;
+ id->event_handler = id_priv->id.event_handler;
+ trace_cm_event_handler(id_priv, event);
+ return id_priv->id.event_handler(id, event);
+}
+
+static int cma_listen_on_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev,
+ struct rdma_id_private **to_destroy)
+{
+ struct rdma_id_private *dev_id_priv;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
+ int ret;
+
+ lockdep_assert_held(&lock);
+
+ *to_destroy = NULL;
+ if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1))
+ return 0;
+
+ dev_id_priv =
+ __rdma_create_id(net, cma_listen_handler, id_priv,
+ id_priv->id.ps, id_priv->id.qp_type, id_priv);
+ if (IS_ERR(dev_id_priv))
+ return PTR_ERR(dev_id_priv);
+
+ dev_id_priv->state = RDMA_CM_ADDR_BOUND;
+ memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+
+ _cma_attach_to_dev(dev_id_priv, cma_dev);
+ rdma_restrack_add(&dev_id_priv->res);
+ cma_id_get(id_priv);
+ dev_id_priv->internal_id = 1;
+ dev_id_priv->afonly = id_priv->afonly;
+ mutex_lock(&id_priv->qp_mutex);
+ dev_id_priv->tos_set = id_priv->tos_set;
+ dev_id_priv->tos = id_priv->tos;
+ mutex_unlock(&id_priv->qp_mutex);
+
+ ret = rdma_listen(&dev_id_priv->id, id_priv->backlog);
+ if (ret)
+ goto err_listen;
+ list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list);
+ return 0;
+err_listen:
+ /* Caller must destroy this after releasing lock */
+ *to_destroy = dev_id_priv;
+ dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret);
+ return ret;
+}
+
+static int cma_listen_on_all(struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *to_destroy;
+ struct cma_device *cma_dev;
+ int ret;
+
+ mutex_lock(&lock);
+ list_add_tail(&id_priv->listen_any_item, &listen_any_list);
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy);
+ if (ret) {
+ /* Prevent racing with cma_process_remove() */
+ if (to_destroy)
+ list_del_init(&to_destroy->device_item);
+ goto err_listen;
+ }
+ }
+ mutex_unlock(&lock);
+ return 0;
+
+err_listen:
+ _cma_cancel_listens(id_priv);
+ mutex_unlock(&lock);
+ if (to_destroy)
+ rdma_destroy_id(&to_destroy->id);
+ return ret;
+}
+
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ mutex_lock(&id_priv->qp_mutex);
+ id_priv->tos = (u8) tos;
+ id_priv->tos_set = true;
+ mutex_unlock(&id_priv->qp_mutex);
+}
+EXPORT_SYMBOL(rdma_set_service_type);
+
+/**
+ * rdma_set_ack_timeout() - Set the ack timeout of QP associated
+ * with a connection identifier.
+ * @id: Communication identifier to associated with service type.
+ * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec.
+ *
+ * This function should be called before rdma_connect() on active side,
+ * and on passive side before rdma_accept(). It is applicable to primary
+ * path only. The timeout will affect the local side of the QP, it is not
+ * negotiated with remote side and zero disables the timer. In case it is
+ * set before rdma_resolve_route, the value will also be used to determine
+ * PacketLifeTime for RoCE.
+ *
+ * Return: 0 for success
+ */
+int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
+{
+ struct rdma_id_private *id_priv;
+
+ if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI)
+ return -EINVAL;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ mutex_lock(&id_priv->qp_mutex);
+ id_priv->timeout = timeout;
+ id_priv->timeout_set = true;
+ mutex_unlock(&id_priv->qp_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL(rdma_set_ack_timeout);
+
+/**
+ * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the
+ * QP associated with a connection identifier.
+ * @id: Communication identifier to associated with service type.
+ * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK
+ * Timer Field" in the IBTA specification.
+ *
+ * This function should be called before rdma_connect() on active
+ * side, and on passive side before rdma_accept(). The timer value
+ * will be associated with the local QP. When it receives a send it is
+ * not read to handle, typically if the receive queue is empty, an RNR
+ * Retry NAK is returned to the requester with the min_rnr_timer
+ * encoded. The requester will then wait at least the time specified
+ * in the NAK before retrying. The default is zero, which translates
+ * to a minimum RNR Timer value of 655 ms.
+ *
+ * Return: 0 for success
+ */
+int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer)
+{
+ struct rdma_id_private *id_priv;
+
+ /* It is a five-bit value */
+ if (min_rnr_timer & 0xe0)
+ return -EINVAL;
+
+ if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT))
+ return -EINVAL;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ mutex_lock(&id_priv->qp_mutex);
+ id_priv->min_rnr_timer = min_rnr_timer;
+ id_priv->min_rnr_timer_set = true;
+ mutex_unlock(&id_priv->qp_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL(rdma_set_min_rnr_timer);
+
+static void route_set_path_rec_inbound(struct cma_work *work,
+ struct sa_path_rec *path_rec)
+{
+ struct rdma_route *route = &work->id->id.route;
+
+ if (!route->path_rec_inbound) {
+ route->path_rec_inbound =
+ kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
+ if (!route->path_rec_inbound)
+ return;
+ }
+
+ *route->path_rec_inbound = *path_rec;
+}
+
+static void route_set_path_rec_outbound(struct cma_work *work,
+ struct sa_path_rec *path_rec)
+{
+ struct rdma_route *route = &work->id->id.route;
+
+ if (!route->path_rec_outbound) {
+ route->path_rec_outbound =
+ kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL);
+ if (!route->path_rec_outbound)
+ return;
+ }
+
+ *route->path_rec_outbound = *path_rec;
+}
+
+static void cma_query_handler(int status, struct sa_path_rec *path_rec,
+ int num_prs, void *context)
+{
+ struct cma_work *work = context;
+ struct rdma_route *route;
+ int i;
+
+ route = &work->id->id.route;
+
+ if (status)
+ goto fail;
+
+ for (i = 0; i < num_prs; i++) {
+ if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP))
+ *route->path_rec = path_rec[i];
+ else if (path_rec[i].flags & IB_PATH_INBOUND)
+ route_set_path_rec_inbound(work, &path_rec[i]);
+ else if (path_rec[i].flags & IB_PATH_OUTBOUND)
+ route_set_path_rec_outbound(work, &path_rec[i]);
+ }
+ if (!route->path_rec) {
+ status = -EINVAL;
+ goto fail;
+ }
+
+ route->num_pri_alt_paths = 1;
+ queue_work(cma_wq, &work->work);
+ return;
+
+fail:
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+ work->event.status = status;
+ pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
+ status);
+ queue_work(cma_wq, &work->work);
+}
+
+static int cma_query_ib_route(struct rdma_id_private *id_priv,
+ unsigned long timeout_ms, struct cma_work *work)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct sa_path_rec path_rec;
+ ib_sa_comp_mask comp_mask;
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_ib *sib;
+
+ memset(&path_rec, 0, sizeof path_rec);
+
+ if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num))
+ path_rec.rec_type = SA_PATH_REC_TYPE_OPA;
+ else
+ path_rec.rec_type = SA_PATH_REC_TYPE_IB;
+ rdma_addr_get_sgid(dev_addr, &path_rec.sgid);
+ rdma_addr_get_dgid(dev_addr, &path_rec.dgid);
+ path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ path_rec.numb_path = 1;
+ path_rec.reversible = 1;
+ path_rec.service_id = rdma_get_service_id(&id_priv->id,
+ cma_dst_addr(id_priv));
+
+ comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+ switch (cma_family(id_priv)) {
+ case AF_INET:
+ path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+ comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
+ break;
+ case AF_INET6:
+ sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+ path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+ comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+ break;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20);
+ comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+ break;
+ }
+
+ id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &path_rec,
+ comp_mask, timeout_ms,
+ GFP_KERNEL, cma_query_handler,
+ work, &id_priv->query);
+
+ return (id_priv->query_id < 0) ? id_priv->query_id : 0;
+}
+
+static void cma_iboe_join_work_handler(struct work_struct *work)
+{
+ struct cma_multicast *mc =
+ container_of(work, struct cma_multicast, iboe_join.work);
+ struct rdma_cm_event *event = &mc->iboe_join.event;
+ struct rdma_id_private *id_priv = mc->id_priv;
+ int ret;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING ||
+ READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
+ goto out_unlock;
+
+ ret = cma_cm_event_handler(id_priv, event);
+ WARN_ON(ret);
+
+out_unlock:
+ mutex_unlock(&id_priv->handler_mutex);
+ if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN)
+ rdma_destroy_ah_attr(&event->param.ud.ah_attr);
+}
+
+static void cma_work_handler(struct work_struct *_work)
+{
+ struct cma_work *work = container_of(_work, struct cma_work, work);
+ struct rdma_id_private *id_priv = work->id;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING ||
+ READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
+ goto out_unlock;
+ if (work->old_state != 0 || work->new_state != 0) {
+ if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
+ goto out_unlock;
+ }
+
+ if (cma_cm_event_handler(id_priv, &work->event)) {
+ cma_id_put(id_priv);
+ destroy_id_handler_unlock(id_priv);
+ goto out_free;
+ }
+
+out_unlock:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_id_put(id_priv);
+out_free:
+ if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN)
+ rdma_destroy_ah_attr(&work->event.param.ud.ah_attr);
+ kfree(work);
+}
+
+static void cma_init_resolve_route_work(struct cma_work *work,
+ struct rdma_id_private *id_priv)
+{
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ROUTE_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+}
+
+static void enqueue_resolve_addr_work(struct cma_work *work,
+ struct rdma_id_private *id_priv)
+{
+ /* Balances with cma_id_put() in cma_work_handler */
+ cma_id_get(id_priv);
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ADDR_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+
+ queue_work(cma_wq, &work->work);
+}
+
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv,
+ unsigned long timeout_ms)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ struct cma_work *work;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ cma_init_resolve_route_work(work, id_priv);
+
+ if (!route->path_rec)
+ route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
+ if (!route->path_rec) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ ret = cma_query_ib_route(id_priv, timeout_ms, work);
+ if (ret)
+ goto err2;
+
+ return 0;
+err2:
+ kfree(route->path_rec);
+ route->path_rec = NULL;
+err1:
+ kfree(work);
+ return ret;
+}
+
+static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type,
+ unsigned long supported_gids,
+ enum ib_gid_type default_gid)
+{
+ if ((network_type == RDMA_NETWORK_IPV4 ||
+ network_type == RDMA_NETWORK_IPV6) &&
+ test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids))
+ return IB_GID_TYPE_ROCE_UDP_ENCAP;
+
+ return default_gid;
+}
+
+/*
+ * cma_iboe_set_path_rec_l2_fields() is helper function which sets
+ * path record type based on GID type.
+ * It also sets up other L2 fields which includes destination mac address
+ * netdev ifindex, of the path record.
+ * It returns the netdev of the bound interface for this path record entry.
+ */
+static struct net_device *
+cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ enum ib_gid_type gid_type = IB_GID_TYPE_ROCE;
+ struct rdma_addr *addr = &route->addr;
+ unsigned long supported_gids;
+ struct net_device *ndev;
+
+ if (!addr->dev_addr.bound_dev_if)
+ return NULL;
+
+ ndev = dev_get_by_index(addr->dev_addr.net,
+ addr->dev_addr.bound_dev_if);
+ if (!ndev)
+ return NULL;
+
+ supported_gids = roce_gid_type_mask_support(id_priv->id.device,
+ id_priv->id.port_num);
+ gid_type = cma_route_gid_type(addr->dev_addr.network,
+ supported_gids,
+ id_priv->gid_type);
+ /* Use the hint from IP Stack to select GID Type */
+ if (gid_type < ib_network_to_gid_type(addr->dev_addr.network))
+ gid_type = ib_network_to_gid_type(addr->dev_addr.network);
+ route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type);
+
+ route->path_rec->roce.route_resolved = true;
+ sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr);
+ return ndev;
+}
+
+int rdma_set_ib_path(struct rdma_cm_id *id,
+ struct sa_path_rec *path_rec)
+{
+ struct rdma_id_private *id_priv;
+ struct net_device *ndev;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ROUTE_RESOLVED))
+ return -EINVAL;
+
+ id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec),
+ GFP_KERNEL);
+ if (!id->route.path_rec) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (rdma_protocol_roce(id->device, id->port_num)) {
+ ndev = cma_iboe_set_path_rec_l2_fields(id_priv);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto err_free;
+ }
+ dev_put(ndev);
+ }
+
+ id->route.num_pri_alt_paths = 1;
+ return 0;
+
+err_free:
+ kfree(id->route.path_rec);
+ id->route.path_rec = NULL;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_ib_path);
+
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv)
+{
+ struct cma_work *work;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ cma_init_resolve_route_work(work, id_priv);
+ queue_work(cma_wq, &work->work);
+ return 0;
+}
+
+static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio)
+{
+ struct net_device *dev;
+
+ dev = vlan_dev_real_dev(vlan_ndev);
+ if (dev->num_tc)
+ return netdev_get_prio_tc_map(dev, prio);
+
+ return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) &
+ VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+}
+
+struct iboe_prio_tc_map {
+ int input_prio;
+ int output_tc;
+ bool found;
+};
+
+static int get_lower_vlan_dev_tc(struct net_device *dev,
+ struct netdev_nested_priv *priv)
+{
+ struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data;
+
+ if (is_vlan_dev(dev))
+ map->output_tc = get_vlan_ndev_tc(dev, map->input_prio);
+ else if (dev->num_tc)
+ map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio);
+ else
+ map->output_tc = 0;
+ /* We are interested only in first level VLAN device, so always
+ * return 1 to stop iterating over next level devices.
+ */
+ map->found = true;
+ return 1;
+}
+
+static int iboe_tos_to_sl(struct net_device *ndev, int tos)
+{
+ struct iboe_prio_tc_map prio_tc_map = {};
+ int prio = rt_tos2priority(tos);
+ struct netdev_nested_priv priv;
+
+ /* If VLAN device, get it directly from the VLAN netdev */
+ if (is_vlan_dev(ndev))
+ return get_vlan_ndev_tc(ndev, prio);
+
+ prio_tc_map.input_prio = prio;
+ priv.data = (void *)&prio_tc_map;
+ rcu_read_lock();
+ netdev_walk_all_lower_dev_rcu(ndev,
+ get_lower_vlan_dev_tc,
+ &priv);
+ rcu_read_unlock();
+ /* If map is found from lower device, use it; Otherwise
+ * continue with the current netdevice to get priority to tc map.
+ */
+ if (prio_tc_map.found)
+ return prio_tc_map.output_tc;
+ else if (ndev->num_tc)
+ return netdev_get_prio_tc_map(ndev, prio);
+ else
+ return 0;
+}
+
+static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv)
+{
+ struct sockaddr_in6 *addr6;
+ u16 dport, sport;
+ u32 hash, fl;
+
+ addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv);
+ fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK;
+ if ((cma_family(id_priv) != AF_INET6) || !fl) {
+ dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv)));
+ sport = be16_to_cpu(cma_port(cma_src_addr(id_priv)));
+ hash = (u32)sport * 31 + dport;
+ fl = hash & IB_GRH_FLOWLABEL_MASK;
+ }
+
+ return cpu_to_be32(fl);
+}
+
+static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ struct rdma_addr *addr = &route->addr;
+ struct cma_work *work;
+ int ret;
+ struct net_device *ndev;
+
+ u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num -
+ rdma_start_port(id_priv->cma_dev->device)];
+ u8 tos;
+
+ mutex_lock(&id_priv->qp_mutex);
+ tos = id_priv->tos_set ? id_priv->tos : default_roce_tos;
+ mutex_unlock(&id_priv->qp_mutex);
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
+ if (!route->path_rec) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ route->num_pri_alt_paths = 1;
+
+ ndev = cma_iboe_set_path_rec_l2_fields(id_priv);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto err2;
+ }
+
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &route->path_rec->sgid);
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
+ &route->path_rec->dgid);
+
+ if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB)
+ /* TODO: get the hoplimit from the inet/inet6 device */
+ route->path_rec->hop_limit = addr->dev_addr.hoplimit;
+ else
+ route->path_rec->hop_limit = 1;
+ route->path_rec->reversible = 1;
+ route->path_rec->pkey = cpu_to_be16(0xffff);
+ route->path_rec->mtu_selector = IB_SA_EQ;
+ route->path_rec->sl = iboe_tos_to_sl(ndev, tos);
+ route->path_rec->traffic_class = tos;
+ route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
+ route->path_rec->rate_selector = IB_SA_EQ;
+ route->path_rec->rate = IB_RATE_PORT_CURRENT;
+ dev_put(ndev);
+ route->path_rec->packet_life_time_selector = IB_SA_EQ;
+ /* In case ACK timeout is set, use this value to calculate
+ * PacketLifeTime. As per IBTA 12.7.34,
+ * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay).
+ * Assuming a negligible local ACK delay, we can use
+ * PacketLifeTime = local ACK timeout/2
+ * as a reasonable approximation for RoCE networks.
+ */
+ mutex_lock(&id_priv->qp_mutex);
+ if (id_priv->timeout_set && id_priv->timeout)
+ route->path_rec->packet_life_time = id_priv->timeout - 1;
+ else
+ route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME;
+ mutex_unlock(&id_priv->qp_mutex);
+
+ if (!route->path_rec->mtu) {
+ ret = -EINVAL;
+ goto err2;
+ }
+
+ if (rdma_protocol_roce_udp_encap(id_priv->id.device,
+ id_priv->id.port_num))
+ route->path_rec->flow_label =
+ cma_get_roce_udp_flow_label(id_priv);
+
+ cma_init_resolve_route_work(work, id_priv);
+ queue_work(cma_wq, &work->work);
+
+ return 0;
+
+err2:
+ kfree(route->path_rec);
+ route->path_rec = NULL;
+ route->num_pri_alt_paths = 0;
+err1:
+ kfree(work);
+ return ret;
+}
+
+int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ if (!timeout_ms)
+ return -EINVAL;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
+ return -EINVAL;
+
+ cma_id_get(id_priv);
+ if (rdma_cap_ib_sa(id->device, id->port_num))
+ ret = cma_resolve_ib_route(id_priv, timeout_ms);
+ else if (rdma_protocol_roce(id->device, id->port_num)) {
+ ret = cma_resolve_iboe_route(id_priv);
+ if (!ret)
+ cma_add_id_to_tree(id_priv);
+ }
+ else if (rdma_protocol_iwarp(id->device, id->port_num))
+ ret = cma_resolve_iw_route(id_priv);
+ else
+ ret = -ENOSYS;
+
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
+ cma_id_put(id_priv);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_route);
+
+static void cma_set_loopback(struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ break;
+ case AF_INET6:
+ ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr,
+ 0, 0, 0, htonl(1));
+ break;
+ default:
+ ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr,
+ 0, 0, 0, htonl(1));
+ break;
+ }
+}
+
+static int cma_bind_loopback(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev, *cur_dev;
+ union ib_gid gid;
+ enum ib_port_state port_state;
+ unsigned int p;
+ u16 pkey;
+ int ret;
+
+ cma_dev = NULL;
+ mutex_lock(&lock);
+ list_for_each_entry(cur_dev, &dev_list, list) {
+ if (cma_family(id_priv) == AF_IB &&
+ !rdma_cap_ib_cm(cur_dev->device, 1))
+ continue;
+
+ if (!cma_dev)
+ cma_dev = cur_dev;
+
+ rdma_for_each_port (cur_dev->device, p) {
+ if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) &&
+ port_state == IB_PORT_ACTIVE) {
+ cma_dev = cur_dev;
+ goto port_found;
+ }
+ }
+ }
+
+ if (!cma_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ p = 1;
+
+port_found:
+ ret = rdma_query_gid(cma_dev->device, p, 0, &gid);
+ if (ret)
+ goto out;
+
+ ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey);
+ if (ret)
+ goto out;
+
+ id_priv->id.route.addr.dev_addr.dev_type =
+ (rdma_protocol_ib(cma_dev->device, p)) ?
+ ARPHRD_INFINIBAND : ARPHRD_ETHER;
+
+ rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+ ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
+ id_priv->id.port_num = p;
+ cma_attach_to_dev(id_priv, cma_dev);
+ rdma_restrack_add(&id_priv->res);
+ cma_set_loopback(cma_src_addr(id_priv));
+out:
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static void addr_handler(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *dev_addr, void *context)
+{
+ struct rdma_id_private *id_priv = context;
+ struct rdma_cm_event event = {};
+ struct sockaddr *addr;
+ struct sockaddr_storage old_addr;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
+ RDMA_CM_ADDR_RESOLVED))
+ goto out;
+
+ /*
+ * Store the previous src address, so that if we fail to acquire
+ * matching rdma device, old address can be restored back, which helps
+ * to cancel the cma listen operation correctly.
+ */
+ addr = cma_src_addr(id_priv);
+ memcpy(&old_addr, addr, rdma_addr_size(addr));
+ memcpy(addr, src_addr, rdma_addr_size(src_addr));
+ if (!status && !id_priv->cma_dev) {
+ status = cma_acquire_dev_by_src_ip(id_priv);
+ if (status)
+ pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n",
+ status);
+ rdma_restrack_add(&id_priv->res);
+ } else if (status) {
+ pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status);
+ }
+
+ if (status) {
+ memcpy(addr, &old_addr,
+ rdma_addr_size((struct sockaddr *)&old_addr));
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ADDR_BOUND))
+ goto out;
+ event.event = RDMA_CM_EVENT_ADDR_ERROR;
+ event.status = status;
+ } else
+ event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+
+ if (cma_cm_event_handler(id_priv, &event)) {
+ destroy_id_handler_unlock(id_priv);
+ return;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+}
+
+static int cma_resolve_loopback(struct rdma_id_private *id_priv)
+{
+ struct cma_work *work;
+ union ib_gid gid;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ if (!id_priv->cma_dev) {
+ ret = cma_bind_loopback(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+ rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
+
+ enqueue_resolve_addr_work(work, id_priv);
+ return 0;
+err:
+ kfree(work);
+ return ret;
+}
+
+static int cma_resolve_ib_addr(struct rdma_id_private *id_priv)
+{
+ struct cma_work *work;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ if (!id_priv->cma_dev) {
+ ret = cma_resolve_ib_dev(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *)
+ &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr));
+
+ enqueue_resolve_addr_work(work, id_priv);
+ return 0;
+err:
+ kfree(work);
+ return ret;
+}
+
+static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr)
+{
+ struct sockaddr_storage zero_sock = {};
+
+ if (src_addr && src_addr->sa_family)
+ return rdma_bind_addr(id, src_addr);
+
+ /*
+ * When the src_addr is not specified, automatically supply an any addr
+ */
+ zero_sock.ss_family = dst_addr->sa_family;
+ if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *src_addr6 =
+ (struct sockaddr_in6 *)&zero_sock;
+ struct sockaddr_in6 *dst_addr6 =
+ (struct sockaddr_in6 *)dst_addr;
+
+ src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+ if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ id->route.addr.dev_addr.bound_dev_if =
+ dst_addr6->sin6_scope_id;
+ } else if (dst_addr->sa_family == AF_IB) {
+ ((struct sockaddr_ib *)&zero_sock)->sib_pkey =
+ ((struct sockaddr_ib *)dst_addr)->sib_pkey;
+ }
+ return rdma_bind_addr(id, (struct sockaddr *)&zero_sock);
+}
+
+/*
+ * If required, resolve the source address for bind and leave the id_priv in
+ * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior
+ * calls made by ULP, a previously bound ID will not be re-bound and src_addr is
+ * ignored.
+ */
+static int resolve_prepare_src(struct rdma_id_private *id_priv,
+ struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr)
+{
+ int ret;
+
+ memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) {
+ /* For a well behaved ULP state will be RDMA_CM_IDLE */
+ ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr);
+ if (ret)
+ goto err_dst;
+ if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND,
+ RDMA_CM_ADDR_QUERY))) {
+ ret = -EINVAL;
+ goto err_dst;
+ }
+ }
+
+ if (cma_family(id_priv) != dst_addr->sa_family) {
+ ret = -EINVAL;
+ goto err_state;
+ }
+ return 0;
+
+err_state:
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
+err_dst:
+ memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr));
+ return ret;
+}
+
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr, unsigned long timeout_ms)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+ int ret;
+
+ ret = resolve_prepare_src(id_priv, src_addr, dst_addr);
+ if (ret)
+ return ret;
+
+ if (cma_any_addr(dst_addr)) {
+ ret = cma_resolve_loopback(id_priv);
+ } else {
+ if (dst_addr->sa_family == AF_IB) {
+ ret = cma_resolve_ib_addr(id_priv);
+ } else {
+ /*
+ * The FSM can return back to RDMA_CM_ADDR_BOUND after
+ * rdma_resolve_ip() is called, eg through the error
+ * path in addr_handler(). If this happens the existing
+ * request must be canceled before issuing a new one.
+ * Since canceling a request is a bit slow and this
+ * oddball path is rare, keep track once a request has
+ * been issued. The track turns out to be a permanent
+ * state since this is the only cancel as it is
+ * immediately before rdma_resolve_ip().
+ */
+ if (id_priv->used_resolve_ip)
+ rdma_addr_cancel(&id->route.addr.dev_addr);
+ else
+ id_priv->used_resolve_ip = 1;
+ ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr,
+ &id->route.addr.dev_addr,
+ timeout_ms, addr_handler,
+ false, id_priv);
+ }
+ }
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_addr);
+
+int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
+{
+ struct rdma_id_private *id_priv;
+ unsigned long flags;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if ((reuse && id_priv->state != RDMA_CM_LISTEN) ||
+ id_priv->state == RDMA_CM_IDLE) {
+ id_priv->reuseaddr = reuse;
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_reuseaddr);
+
+int rdma_set_afonly(struct rdma_cm_id *id, int afonly)
+{
+ struct rdma_id_private *id_priv;
+ unsigned long flags;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) {
+ id_priv->options |= (1 << CMA_OPTION_AFONLY);
+ id_priv->afonly = afonly;
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_afonly);
+
+static void cma_bind_port(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv)
+{
+ struct sockaddr *addr;
+ struct sockaddr_ib *sib;
+ u64 sid, mask;
+ __be16 port;
+
+ lockdep_assert_held(&lock);
+
+ addr = cma_src_addr(id_priv);
+ port = htons(bind_list->port);
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *) addr)->sin_port = port;
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *) addr)->sin6_port = port;
+ break;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) addr;
+ sid = be64_to_cpu(sib->sib_sid);
+ mask = be64_to_cpu(sib->sib_sid_mask);
+ sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port));
+ sib->sib_sid_mask = cpu_to_be64(~0ULL);
+ break;
+ }
+ id_priv->bind_list = bind_list;
+ hlist_add_head(&id_priv->node, &bind_list->owners);
+}
+
+static int cma_alloc_port(enum rdma_ucm_port_space ps,
+ struct rdma_id_private *id_priv, unsigned short snum)
+{
+ struct rdma_bind_list *bind_list;
+ int ret;
+
+ lockdep_assert_held(&lock);
+
+ bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
+ if (!bind_list)
+ return -ENOMEM;
+
+ ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list,
+ snum);
+ if (ret < 0)
+ goto err;
+
+ bind_list->ps = ps;
+ bind_list->port = snum;
+ cma_bind_port(bind_list, id_priv);
+ return 0;
+err:
+ kfree(bind_list);
+ return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
+}
+
+static int cma_port_is_unique(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *cur_id;
+ struct sockaddr *daddr = cma_dst_addr(id_priv);
+ struct sockaddr *saddr = cma_src_addr(id_priv);
+ __be16 dport = cma_port(daddr);
+
+ lockdep_assert_held(&lock);
+
+ hlist_for_each_entry(cur_id, &bind_list->owners, node) {
+ struct sockaddr *cur_daddr = cma_dst_addr(cur_id);
+ struct sockaddr *cur_saddr = cma_src_addr(cur_id);
+ __be16 cur_dport = cma_port(cur_daddr);
+
+ if (id_priv == cur_id)
+ continue;
+
+ /* different dest port -> unique */
+ if (!cma_any_port(daddr) &&
+ !cma_any_port(cur_daddr) &&
+ (dport != cur_dport))
+ continue;
+
+ /* different src address -> unique */
+ if (!cma_any_addr(saddr) &&
+ !cma_any_addr(cur_saddr) &&
+ cma_addr_cmp(saddr, cur_saddr))
+ continue;
+
+ /* different dst address -> unique */
+ if (!cma_any_addr(daddr) &&
+ !cma_any_addr(cur_daddr) &&
+ cma_addr_cmp(daddr, cur_daddr))
+ continue;
+
+ return -EADDRNOTAVAIL;
+ }
+ return 0;
+}
+
+static int cma_alloc_any_port(enum rdma_ucm_port_space ps,
+ struct rdma_id_private *id_priv)
+{
+ static unsigned int last_used_port;
+ int low, high, remaining;
+ unsigned int rover;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
+
+ lockdep_assert_held(&lock);
+
+ inet_get_local_port_range(net, &low, &high);
+ remaining = (high - low) + 1;
+ rover = prandom_u32_max(remaining) + low;
+retry:
+ if (last_used_port != rover) {
+ struct rdma_bind_list *bind_list;
+ int ret;
+
+ bind_list = cma_ps_find(net, ps, (unsigned short)rover);
+
+ if (!bind_list) {
+ ret = cma_alloc_port(ps, id_priv, rover);
+ } else {
+ ret = cma_port_is_unique(bind_list, id_priv);
+ if (!ret)
+ cma_bind_port(bind_list, id_priv);
+ }
+ /*
+ * Remember previously used port number in order to avoid
+ * re-using same port immediately after it is closed.
+ */
+ if (!ret)
+ last_used_port = rover;
+ if (ret != -EADDRNOTAVAIL)
+ return ret;
+ }
+ if (--remaining) {
+ rover++;
+ if ((rover < low) || (rover > high))
+ rover = low;
+ goto retry;
+ }
+ return -EADDRNOTAVAIL;
+}
+
+/*
+ * Check that the requested port is available. This is called when trying to
+ * bind to a specific port, or when trying to listen on a bound port. In
+ * the latter case, the provided id_priv may already be on the bind_list, but
+ * we still need to check that it's okay to start listening.
+ */
+static int cma_check_port(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv, uint8_t reuseaddr)
+{
+ struct rdma_id_private *cur_id;
+ struct sockaddr *addr, *cur_addr;
+
+ lockdep_assert_held(&lock);
+
+ addr = cma_src_addr(id_priv);
+ hlist_for_each_entry(cur_id, &bind_list->owners, node) {
+ if (id_priv == cur_id)
+ continue;
+
+ if (reuseaddr && cur_id->reuseaddr)
+ continue;
+
+ cur_addr = cma_src_addr(cur_id);
+ if (id_priv->afonly && cur_id->afonly &&
+ (addr->sa_family != cur_addr->sa_family))
+ continue;
+
+ if (cma_any_addr(addr) || cma_any_addr(cur_addr))
+ return -EADDRNOTAVAIL;
+
+ if (!cma_addr_cmp(addr, cur_addr))
+ return -EADDRINUSE;
+ }
+ return 0;
+}
+
+static int cma_use_port(enum rdma_ucm_port_space ps,
+ struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list;
+ unsigned short snum;
+ int ret;
+
+ lockdep_assert_held(&lock);
+
+ snum = ntohs(cma_port(cma_src_addr(id_priv)));
+ if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum);
+ if (!bind_list) {
+ ret = cma_alloc_port(ps, id_priv, snum);
+ } else {
+ ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr);
+ if (!ret)
+ cma_bind_port(bind_list, id_priv);
+ }
+ return ret;
+}
+
+static enum rdma_ucm_port_space
+cma_select_inet_ps(struct rdma_id_private *id_priv)
+{
+ switch (id_priv->id.ps) {
+ case RDMA_PS_TCP:
+ case RDMA_PS_UDP:
+ case RDMA_PS_IPOIB:
+ case RDMA_PS_IB:
+ return id_priv->id.ps;
+ default:
+
+ return 0;
+ }
+}
+
+static enum rdma_ucm_port_space
+cma_select_ib_ps(struct rdma_id_private *id_priv)
+{
+ enum rdma_ucm_port_space ps = 0;
+ struct sockaddr_ib *sib;
+ u64 sid_ps, mask, sid;
+
+ sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK;
+ sid = be64_to_cpu(sib->sib_sid) & mask;
+
+ if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
+ sid_ps = RDMA_IB_IP_PS_IB;
+ ps = RDMA_PS_IB;
+ } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
+ (sid == (RDMA_IB_IP_PS_TCP & mask))) {
+ sid_ps = RDMA_IB_IP_PS_TCP;
+ ps = RDMA_PS_TCP;
+ } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
+ (sid == (RDMA_IB_IP_PS_UDP & mask))) {
+ sid_ps = RDMA_IB_IP_PS_UDP;
+ ps = RDMA_PS_UDP;
+ }
+
+ if (ps) {
+ sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib)));
+ sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK |
+ be64_to_cpu(sib->sib_sid_mask));
+ }
+ return ps;
+}
+
+static int cma_get_port(struct rdma_id_private *id_priv)
+{
+ enum rdma_ucm_port_space ps;
+ int ret;
+
+ if (cma_family(id_priv) != AF_IB)
+ ps = cma_select_inet_ps(id_priv);
+ else
+ ps = cma_select_ib_ps(id_priv);
+ if (!ps)
+ return -EPROTONOSUPPORT;
+
+ mutex_lock(&lock);
+ if (cma_any_port(cma_src_addr(id_priv)))
+ ret = cma_alloc_any_port(ps, id_priv);
+ else
+ ret = cma_use_port(ps, id_priv);
+ mutex_unlock(&lock);
+
+ return ret;
+}
+
+static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
+ struct sockaddr *addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 *sin6;
+
+ if (addr->sa_family != AF_INET6)
+ return 0;
+
+ sin6 = (struct sockaddr_in6 *) addr;
+
+ if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ return 0;
+
+ if (!sin6->sin6_scope_id)
+ return -EINVAL;
+
+ dev_addr->bound_dev_if = sin6->sin6_scope_id;
+#endif
+ return 0;
+}
+
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+ int ret;
+
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) {
+ struct sockaddr_in any_in = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_ANY),
+ };
+
+ /* For a well behaved ULP state will be RDMA_CM_IDLE */
+ ret = rdma_bind_addr(id, (struct sockaddr *)&any_in);
+ if (ret)
+ return ret;
+ if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND,
+ RDMA_CM_LISTEN)))
+ return -EINVAL;
+ }
+
+ /*
+ * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable
+ * any more, and has to be unique in the bind list.
+ */
+ if (id_priv->reuseaddr) {
+ mutex_lock(&lock);
+ ret = cma_check_port(id_priv->bind_list, id_priv, 0);
+ if (!ret)
+ id_priv->reuseaddr = 0;
+ mutex_unlock(&lock);
+ if (ret)
+ goto err;
+ }
+
+ id_priv->backlog = backlog;
+ if (id_priv->cma_dev) {
+ if (rdma_cap_ib_cm(id->device, 1)) {
+ ret = cma_ib_listen(id_priv);
+ if (ret)
+ goto err;
+ } else if (rdma_cap_iw_cm(id->device, 1)) {
+ ret = cma_iw_listen(id_priv, backlog);
+ if (ret)
+ goto err;
+ } else {
+ ret = -ENOSYS;
+ goto err;
+ }
+ } else {
+ ret = cma_listen_on_all(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ return 0;
+err:
+ id_priv->backlog = 0;
+ /*
+ * All the failure paths that lead here will not allow the req_handler's
+ * to have run.
+ */
+ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_listen);
+
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+ struct sockaddr *daddr;
+
+ if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 &&
+ addr->sa_family != AF_IB)
+ return -EAFNOSUPPORT;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND))
+ return -EINVAL;
+
+ ret = cma_check_linklocal(&id->route.addr.dev_addr, addr);
+ if (ret)
+ goto err1;
+
+ memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr));
+ if (!cma_any_addr(addr)) {
+ ret = cma_translate_addr(addr, &id->route.addr.dev_addr);
+ if (ret)
+ goto err1;
+
+ ret = cma_acquire_dev_by_src_ip(id_priv);
+ if (ret)
+ goto err1;
+ }
+
+ if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) {
+ if (addr->sa_family == AF_INET)
+ id_priv->afonly = 1;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (addr->sa_family == AF_INET6) {
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
+
+ id_priv->afonly = net->ipv6.sysctl.bindv6only;
+ }
+#endif
+ }
+ daddr = cma_dst_addr(id_priv);
+ daddr->sa_family = addr->sa_family;
+
+ ret = cma_get_port(id_priv);
+ if (ret)
+ goto err2;
+
+ if (!cma_any_addr(addr))
+ rdma_restrack_add(&id_priv->res);
+ return 0;
+err2:
+ if (id_priv->cma_dev)
+ cma_release_dev(id_priv);
+err1:
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_bind_addr);
+
+static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv)
+{
+ struct cma_hdr *cma_hdr;
+
+ cma_hdr = hdr;
+ cma_hdr->cma_version = CMA_VERSION;
+ if (cma_family(id_priv) == AF_INET) {
+ struct sockaddr_in *src4, *dst4;
+
+ src4 = (struct sockaddr_in *) cma_src_addr(id_priv);
+ dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv);
+
+ cma_set_ip_ver(cma_hdr, 4);
+ cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+ cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+ cma_hdr->port = src4->sin_port;
+ } else if (cma_family(id_priv) == AF_INET6) {
+ struct sockaddr_in6 *src6, *dst6;
+
+ src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+ dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv);
+
+ cma_set_ip_ver(cma_hdr, 6);
+ cma_hdr->src_addr.ip6 = src6->sin6_addr;
+ cma_hdr->dst_addr.ip6 = dst6->sin6_addr;
+ cma_hdr->port = src6->sin6_port;
+ }
+ return 0;
+}
+
+static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event = {};
+ const struct ib_cm_sidr_rep_event_param *rep =
+ &ib_event->param.sidr_rep_rcvd;
+ int ret;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT)
+ goto out;
+
+ switch (ib_event->event) {
+ case IB_CM_SIDR_REQ_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ event.param.ud.private_data = ib_event->private_data;
+ event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+ if (rep->status != IB_SIDR_SUCCESS) {
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = ib_event->param.sidr_rep_rcvd.status;
+ pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n",
+ event.status);
+ break;
+ }
+ ret = cma_set_qkey(id_priv, rep->qkey);
+ if (ret) {
+ pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret);
+ event.event = RDMA_CM_EVENT_ADDR_ERROR;
+ event.status = ret;
+ break;
+ }
+ ib_init_ah_attr_from_path(id_priv->id.device,
+ id_priv->id.port_num,
+ id_priv->id.route.path_rec,
+ &event.param.ud.ah_attr,
+ rep->sgid_attr);
+ event.param.ud.qp_num = rep->qpn;
+ event.param.ud.qkey = rep->qkey;
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.status = 0;
+ break;
+ default:
+ pr_err("RDMA CMA: unexpected IB CM event: %d\n",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = cma_cm_event_handler(id_priv, &event);
+
+ rdma_destroy_ah_attr(&event.param.ud.ah_attr);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.ib = NULL;
+ destroy_id_handler_unlock(id_priv);
+ return ret;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return 0;
+}
+
+static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_sidr_req_param req;
+ struct ib_cm_id *id;
+ void *private_data;
+ u8 offset;
+ int ret;
+
+ memset(&req, 0, sizeof req);
+ offset = cma_user_data_offset(id_priv);
+ if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len))
+ return -EINVAL;
+
+ if (req.private_data_len) {
+ private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+ if (!private_data)
+ return -ENOMEM;
+ } else {
+ private_data = NULL;
+ }
+
+ if (conn_param->private_data && conn_param->private_data_len)
+ memcpy(private_data + offset, conn_param->private_data,
+ conn_param->private_data_len);
+
+ if (private_data) {
+ ret = cma_format_hdr(private_data, id_priv);
+ if (ret)
+ goto out;
+ req.private_data = private_data;
+ }
+
+ id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler,
+ id_priv);
+ if (IS_ERR(id)) {
+ ret = PTR_ERR(id);
+ goto out;
+ }
+ id_priv->cm_id.ib = id;
+
+ req.path = id_priv->id.route.path_rec;
+ req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr;
+ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+ req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
+ req.max_cm_retries = CMA_MAX_CM_RETRIES;
+
+ trace_cm_send_sidr_req(id_priv);
+ ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
+ if (ret) {
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ id_priv->cm_id.ib = NULL;
+ }
+out:
+ kfree(private_data);
+ return ret;
+}
+
+static int cma_connect_ib(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_req_param req;
+ struct rdma_route *route;
+ void *private_data;
+ struct ib_cm_id *id;
+ u8 offset;
+ int ret;
+
+ memset(&req, 0, sizeof req);
+ offset = cma_user_data_offset(id_priv);
+ if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len))
+ return -EINVAL;
+
+ if (req.private_data_len) {
+ private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+ if (!private_data)
+ return -ENOMEM;
+ } else {
+ private_data = NULL;
+ }
+
+ if (conn_param->private_data && conn_param->private_data_len)
+ memcpy(private_data + offset, conn_param->private_data,
+ conn_param->private_data_len);
+
+ id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv);
+ if (IS_ERR(id)) {
+ ret = PTR_ERR(id);
+ goto out;
+ }
+ id_priv->cm_id.ib = id;
+
+ route = &id_priv->id.route;
+ if (private_data) {
+ ret = cma_format_hdr(private_data, id_priv);
+ if (ret)
+ goto out;
+ req.private_data = private_data;
+ }
+
+ req.primary_path = &route->path_rec[0];
+ req.primary_path_inbound = route->path_rec_inbound;
+ req.primary_path_outbound = route->path_rec_outbound;
+ if (route->num_pri_alt_paths == 2)
+ req.alternate_path = &route->path_rec[1];
+
+ req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr;
+ /* Alternate path SGID attribute currently unsupported */
+ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+ req.qp_num = id_priv->qp_num;
+ req.qp_type = id_priv->id.qp_type;
+ req.starting_psn = id_priv->seq_num;
+ req.responder_resources = conn_param->responder_resources;
+ req.initiator_depth = conn_param->initiator_depth;
+ req.flow_control = conn_param->flow_control;
+ req.retry_count = min_t(u8, 7, conn_param->retry_count);
+ req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+ req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+ req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+ req.max_cm_retries = CMA_MAX_CM_RETRIES;
+ req.srq = id_priv->srq ? 1 : 0;
+ req.ece.vendor_id = id_priv->ece.vendor_id;
+ req.ece.attr_mod = id_priv->ece.attr_mod;
+
+ trace_cm_send_req(id_priv);
+ ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
+out:
+ if (ret && !IS_ERR(id)) {
+ ib_destroy_cm_id(id);
+ id_priv->cm_id.ib = NULL;
+ }
+
+ kfree(private_data);
+ return ret;
+}
+
+static int cma_connect_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_id *cm_id;
+ int ret;
+ struct iw_cm_conn_param iw_param;
+
+ cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+
+ mutex_lock(&id_priv->qp_mutex);
+ cm_id->tos = id_priv->tos;
+ cm_id->tos_set = id_priv->tos_set;
+ mutex_unlock(&id_priv->qp_mutex);
+
+ id_priv->cm_id.iw = cm_id;
+
+ memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+ memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv),
+ rdma_addr_size(cma_dst_addr(id_priv)));
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ if (conn_param) {
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num;
+ } else {
+ memset(&iw_param, 0, sizeof iw_param);
+ iw_param.qpn = id_priv->qp_num;
+ }
+ ret = iw_cm_connect(cm_id, &iw_param);
+out:
+ if (ret) {
+ iw_destroy_cm_id(cm_id);
+ id_priv->cm_id.iw = NULL;
+ }
+ return ret;
+}
+
+/**
+ * rdma_connect_locked - Initiate an active connection request.
+ * @id: Connection identifier to connect.
+ * @conn_param: Connection information used for connected QPs.
+ *
+ * Same as rdma_connect() but can only be called from the
+ * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback.
+ */
+int rdma_connect_locked(struct rdma_cm_id *id,
+ struct rdma_conn_param *conn_param)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+ int ret;
+
+ if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
+ return -EINVAL;
+
+ if (!id->qp) {
+ id_priv->qp_num = conn_param->qp_num;
+ id_priv->srq = conn_param->srq;
+ }
+
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
+ if (id->qp_type == IB_QPT_UD)
+ ret = cma_resolve_ib_udp(id_priv, conn_param);
+ else
+ ret = cma_connect_ib(id_priv, conn_param);
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
+ ret = cma_connect_iw(id_priv, conn_param);
+ } else {
+ ret = -ENOSYS;
+ }
+ if (ret)
+ goto err_state;
+ return 0;
+err_state:
+ cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_connect_locked);
+
+/**
+ * rdma_connect - Initiate an active connection request.
+ * @id: Connection identifier to connect.
+ * @conn_param: Connection information used for connected QPs.
+ *
+ * Users must have resolved a route for the rdma_cm_id to connect with by having
+ * called rdma_resolve_route before calling this routine.
+ *
+ * This call will either connect to a remote QP or obtain remote QP information
+ * for unconnected rdma_cm_id's. The actual operation is based on the
+ * rdma_cm_id's port space.
+ */
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+ int ret;
+
+ mutex_lock(&id_priv->handler_mutex);
+ ret = rdma_connect_locked(id, conn_param);
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_connect);
+
+/**
+ * rdma_connect_ece - Initiate an active connection request with ECE data.
+ * @id: Connection identifier to connect.
+ * @conn_param: Connection information used for connected QPs.
+ * @ece: ECE parameters
+ *
+ * See rdma_connect() explanation.
+ */
+int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param,
+ struct rdma_ucm_ece *ece)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+
+ id_priv->ece.vendor_id = ece->vendor_id;
+ id_priv->ece.attr_mod = ece->attr_mod;
+
+ return rdma_connect(id, conn_param);
+}
+EXPORT_SYMBOL(rdma_connect_ece);
+
+static int cma_accept_ib(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_rep_param rep;
+ int ret;
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ ret = cma_modify_qp_rts(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ memset(&rep, 0, sizeof rep);
+ rep.qp_num = id_priv->qp_num;
+ rep.starting_psn = id_priv->seq_num;
+ rep.private_data = conn_param->private_data;
+ rep.private_data_len = conn_param->private_data_len;
+ rep.responder_resources = conn_param->responder_resources;
+ rep.initiator_depth = conn_param->initiator_depth;
+ rep.failover_accepted = 0;
+ rep.flow_control = conn_param->flow_control;
+ rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+ rep.srq = id_priv->srq ? 1 : 0;
+ rep.ece.vendor_id = id_priv->ece.vendor_id;
+ rep.ece.attr_mod = id_priv->ece.attr_mod;
+
+ trace_cm_send_rep(id_priv);
+ ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
+out:
+ return ret;
+}
+
+static int cma_accept_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_conn_param iw_param;
+ int ret;
+
+ if (!conn_param)
+ return -EINVAL;
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ return ret;
+
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ if (id_priv->id.qp)
+ iw_param.qpn = id_priv->qp_num;
+ else
+ iw_param.qpn = conn_param->qp_num;
+
+ return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
+}
+
+static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
+ enum ib_cm_sidr_status status, u32 qkey,
+ const void *private_data, int private_data_len)
+{
+ struct ib_cm_sidr_rep_param rep;
+ int ret;
+
+ memset(&rep, 0, sizeof rep);
+ rep.status = status;
+ if (status == IB_SIDR_SUCCESS) {
+ if (qkey)
+ ret = cma_set_qkey(id_priv, qkey);
+ else
+ ret = cma_set_default_qkey(id_priv);
+ if (ret)
+ return ret;
+ rep.qp_num = id_priv->qp_num;
+ rep.qkey = id_priv->qkey;
+
+ rep.ece.vendor_id = id_priv->ece.vendor_id;
+ rep.ece.attr_mod = id_priv->ece.attr_mod;
+ }
+
+ rep.private_data = private_data;
+ rep.private_data_len = private_data_len;
+
+ trace_cm_send_sidr_rep(id_priv);
+ return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
+}
+
+/**
+ * rdma_accept - Called to accept a connection request or response.
+ * @id: Connection identifier associated with the request.
+ * @conn_param: Information needed to establish the connection. This must be
+ * provided if accepting a connection request. If accepting a connection
+ * response, this parameter must be NULL.
+ *
+ * Typically, this routine is only called by the listener to accept a connection
+ * request. It must also be called on the active side of a connection if the
+ * user is performing their own QP transitions.
+ *
+ * In the case of error, a reject message is sent to the remote side and the
+ * state of the qp associated with the id is modified to error, such that any
+ * previously posted receive buffers would be flushed.
+ *
+ * This function is for use by kernel ULPs and must be called from under the
+ * handler callback.
+ */
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+ int ret;
+
+ lockdep_assert_held(&id_priv->handler_mutex);
+
+ if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT)
+ return -EINVAL;
+
+ if (!id->qp && conn_param) {
+ id_priv->qp_num = conn_param->qp_num;
+ id_priv->srq = conn_param->srq;
+ }
+
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
+ if (id->qp_type == IB_QPT_UD) {
+ if (conn_param)
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ conn_param->qkey,
+ conn_param->private_data,
+ conn_param->private_data_len);
+ else
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ 0, NULL, 0);
+ } else {
+ if (conn_param)
+ ret = cma_accept_ib(id_priv, conn_param);
+ else
+ ret = cma_rep_recv(id_priv);
+ }
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
+ ret = cma_accept_iw(id_priv, conn_param);
+ } else {
+ ret = -ENOSYS;
+ }
+ if (ret)
+ goto reject;
+
+ return 0;
+reject:
+ cma_modify_qp_err(id_priv);
+ rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_accept);
+
+int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param,
+ struct rdma_ucm_ece *ece)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+
+ id_priv->ece.vendor_id = ece->vendor_id;
+ id_priv->ece.attr_mod = ece->attr_mod;
+
+ return rdma_accept(id, conn_param);
+}
+EXPORT_SYMBOL(rdma_accept_ece);
+
+void rdma_lock_handler(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+
+ mutex_lock(&id_priv->handler_mutex);
+}
+EXPORT_SYMBOL(rdma_lock_handler);
+
+void rdma_unlock_handler(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+
+ mutex_unlock(&id_priv->handler_mutex);
+}
+EXPORT_SYMBOL(rdma_unlock_handler);
+
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cm_id.ib)
+ return -EINVAL;
+
+ switch (id->device->node_type) {
+ case RDMA_NODE_IB_CA:
+ ret = ib_cm_notify(id_priv->cm_id.ib, event);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_notify);
+
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+ u8 private_data_len, u8 reason)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cm_id.ib)
+ return -EINVAL;
+
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
+ if (id->qp_type == IB_QPT_UD) {
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0,
+ private_data, private_data_len);
+ } else {
+ trace_cm_send_rej(id_priv);
+ ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0,
+ private_data, private_data_len);
+ }
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
+ ret = iw_cm_reject(id_priv->cm_id.iw,
+ private_data, private_data_len);
+ } else {
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(rdma_reject);
+
+int rdma_disconnect(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cm_id.ib)
+ return -EINVAL;
+
+ if (rdma_cap_ib_cm(id->device, id->port_num)) {
+ ret = cma_modify_qp_err(id_priv);
+ if (ret)
+ goto out;
+ /* Initiate or respond to a disconnect. */
+ trace_cm_disconnect(id_priv);
+ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) {
+ if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0))
+ trace_cm_sent_drep(id_priv);
+ } else {
+ trace_cm_sent_dreq(id_priv);
+ }
+ } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
+ ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
+ } else
+ ret = -EINVAL;
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(rdma_disconnect);
+
+static void cma_make_mc_event(int status, struct rdma_id_private *id_priv,
+ struct ib_sa_multicast *multicast,
+ struct rdma_cm_event *event,
+ struct cma_multicast *mc)
+{
+ struct rdma_dev_addr *dev_addr;
+ enum ib_gid_type gid_type;
+ struct net_device *ndev;
+
+ if (status)
+ pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n",
+ status);
+
+ event->status = status;
+ event->param.ud.private_data = mc->context;
+ if (status) {
+ event->event = RDMA_CM_EVENT_MULTICAST_ERROR;
+ return;
+ }
+
+ dev_addr = &id_priv->id.route.addr.dev_addr;
+ ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
+ gid_type =
+ id_priv->cma_dev
+ ->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(
+ id_priv->cma_dev->device)];
+
+ event->event = RDMA_CM_EVENT_MULTICAST_JOIN;
+ if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num,
+ &multicast->rec, ndev, gid_type,
+ &event->param.ud.ah_attr)) {
+ event->event = RDMA_CM_EVENT_MULTICAST_ERROR;
+ goto out;
+ }
+
+ event->param.ud.qp_num = 0xFFFFFF;
+ event->param.ud.qkey = id_priv->qkey;
+
+out:
+ if (ndev)
+ dev_put(ndev);
+}
+
+static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
+{
+ struct cma_multicast *mc = multicast->context;
+ struct rdma_id_private *id_priv = mc->id_priv;
+ struct rdma_cm_event event = {};
+ int ret = 0;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL ||
+ READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING)
+ goto out;
+
+ ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey));
+ if (!ret) {
+ cma_make_mc_event(status, id_priv, multicast, &event, mc);
+ ret = cma_cm_event_handler(id_priv, &event);
+ }
+ rdma_destroy_ah_attr(&event.param.ud.ah_attr);
+ WARN_ON(ret);
+
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return 0;
+}
+
+static void cma_set_mgid(struct rdma_id_private *id_priv,
+ struct sockaddr *addr, union ib_gid *mgid)
+{
+ unsigned char mc_map[MAX_ADDR_LEN];
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct sockaddr_in *sin = (struct sockaddr_in *) addr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
+
+ if (cma_any_addr(addr)) {
+ memset(mgid, 0, sizeof *mgid);
+ } else if ((addr->sa_family == AF_INET6) &&
+ ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
+ 0xFF10A01B)) {
+ /* IPv6 address is an SA assigned MGID. */
+ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+ } else if (addr->sa_family == AF_IB) {
+ memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid);
+ } else if (addr->sa_family == AF_INET6) {
+ ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc_map[7] = 0x01; /* Use RDMA CM signature */
+ *mgid = *(union ib_gid *) (mc_map + 4);
+ } else {
+ ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc_map[7] = 0x01; /* Use RDMA CM signature */
+ *mgid = *(union ib_gid *) (mc_map + 4);
+ }
+}
+
+static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct ib_sa_mcmember_rec rec;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ ib_sa_comp_mask comp_mask;
+ int ret;
+
+ ib_addr_get_mgid(dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
+ &rec.mgid, &rec);
+ if (ret)
+ return ret;
+
+ if (!id_priv->qkey) {
+ ret = cma_set_default_qkey(id_priv);
+ if (ret)
+ return ret;
+ }
+
+ cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
+ rec.qkey = cpu_to_be32(id_priv->qkey);
+ rdma_addr_get_sgid(dev_addr, &rec.port_gid);
+ rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ rec.join_state = mc->join_state;
+
+ comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
+ IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL |
+ IB_SA_MCMEMBER_REC_FLOW_LABEL |
+ IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+ if (id_priv->id.ps == RDMA_PS_IPOIB)
+ comp_mask |= IB_SA_MCMEMBER_REC_RATE |
+ IB_SA_MCMEMBER_REC_RATE_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU |
+ IB_SA_MCMEMBER_REC_HOP_LIMIT;
+
+ mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &rec, comp_mask,
+ GFP_KERNEL, cma_ib_mc_handler, mc);
+ return PTR_ERR_OR_ZERO(mc->sa_mc);
+}
+
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
+ enum ib_gid_type gid_type)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+
+ if (cma_any_addr(addr)) {
+ memset(mgid, 0, sizeof *mgid);
+ } else if (addr->sa_family == AF_INET6) {
+ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+ } else {
+ mgid->raw[0] =
+ (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff;
+ mgid->raw[1] =
+ (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e;
+ mgid->raw[2] = 0;
+ mgid->raw[3] = 0;
+ mgid->raw[4] = 0;
+ mgid->raw[5] = 0;
+ mgid->raw[6] = 0;
+ mgid->raw[7] = 0;
+ mgid->raw[8] = 0;
+ mgid->raw[9] = 0;
+ mgid->raw[10] = 0xff;
+ mgid->raw[11] = 0xff;
+ *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr;
+ }
+}
+
+static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int err = 0;
+ struct sockaddr *addr = (struct sockaddr *)&mc->addr;
+ struct net_device *ndev = NULL;
+ struct ib_sa_multicast ib = {};
+ enum ib_gid_type gid_type;
+ bool send_only;
+
+ send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN);
+
+ if (cma_zero_addr(addr))
+ return -EINVAL;
+
+ gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
+ rdma_start_port(id_priv->cma_dev->device)];
+ cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type);
+
+ ib.rec.pkey = cpu_to_be16(0xffff);
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
+ if (!ndev)
+ return -ENODEV;
+
+ ib.rec.rate = IB_RATE_PORT_CURRENT;
+ ib.rec.hop_limit = 1;
+ ib.rec.mtu = iboe_get_mtu(ndev->mtu);
+
+ if (addr->sa_family == AF_INET) {
+ if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
+ ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
+ if (!send_only) {
+ err = cma_igmp_send(ndev, &ib.rec.mgid,
+ true);
+ }
+ }
+ } else {
+ if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+ err = -ENOTSUPP;
+ }
+ dev_put(ndev);
+ if (err || !ib.rec.mtu)
+ return err ?: -EINVAL;
+
+ if (!id_priv->qkey)
+ cma_set_default_qkey(id_priv);
+
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &ib.rec.port_gid);
+ INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler);
+ cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc);
+ queue_work(cma_wq, &mc->iboe_join.work);
+ return 0;
+}
+
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+ u8 join_state, void *context)
+{
+ struct rdma_id_private *id_priv =
+ container_of(id, struct rdma_id_private, id);
+ struct cma_multicast *mc;
+ int ret;
+
+ /* Not supported for kernel QPs */
+ if (WARN_ON(id->qp))
+ return -EINVAL;
+
+ /* ULP is calling this wrong. */
+ if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND &&
+ READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED))
+ return -EINVAL;
+
+ if (id_priv->id.qp_type != IB_QPT_UD)
+ return -EINVAL;
+
+ mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+ if (!mc)
+ return -ENOMEM;
+
+ memcpy(&mc->addr, addr, rdma_addr_size(addr));
+ mc->context = context;
+ mc->id_priv = id_priv;
+ mc->join_state = join_state;
+
+ if (rdma_protocol_roce(id->device, id->port_num)) {
+ ret = cma_iboe_join_multicast(id_priv, mc);
+ if (ret)
+ goto out_err;
+ } else if (rdma_cap_ib_mcast(id->device, id->port_num)) {
+ ret = cma_join_ib_multicast(id_priv, mc);
+ if (ret)
+ goto out_err;
+ } else {
+ ret = -ENOSYS;
+ goto out_err;
+ }
+
+ spin_lock(&id_priv->lock);
+ list_add(&mc->list, &id_priv->mc_list);
+ spin_unlock(&id_priv->lock);
+
+ return 0;
+out_err:
+ kfree(mc);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_join_multicast);
+
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irq(&id_priv->lock);
+ list_for_each_entry(mc, &id_priv->mc_list, list) {
+ if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0)
+ continue;
+ list_del(&mc->list);
+ spin_unlock_irq(&id_priv->lock);
+
+ WARN_ON(id_priv->cma_dev->device != id->device);
+ destroy_mc(id_priv, mc);
+ return;
+ }
+ spin_unlock_irq(&id_priv->lock);
+}
+EXPORT_SYMBOL(rdma_leave_multicast);
+
+static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
+{
+ struct rdma_dev_addr *dev_addr;
+ struct cma_work *work;
+
+ dev_addr = &id_priv->id.route.addr.dev_addr;
+
+ if ((dev_addr->bound_dev_if == ndev->ifindex) &&
+ (net_eq(dev_net(ndev), dev_addr->net)) &&
+ memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
+ pr_info("RDMA CM addr change for ndev %s used by id %p\n",
+ ndev->name, &id_priv->id);
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ INIT_WORK(&work->work, cma_work_handler);
+ work->id = id_priv;
+ work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
+ cma_id_get(id_priv);
+ queue_work(cma_wq, &work->work);
+ }
+
+ return 0;
+}
+
+static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
+ void *ptr)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct cma_device *cma_dev;
+ struct rdma_id_private *id_priv;
+ int ret = NOTIFY_DONE;
+
+ if (event != NETDEV_BONDING_FAILOVER)
+ return NOTIFY_DONE;
+
+ if (!netif_is_bond_master(ndev))
+ return NOTIFY_DONE;
+
+ mutex_lock(&lock);
+ list_for_each_entry(cma_dev, &dev_list, list)
+ list_for_each_entry(id_priv, &cma_dev->id_list, device_item) {
+ ret = cma_netdev_change(ndev, id_priv);
+ if (ret)
+ goto out;
+ }
+
+out:
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static void cma_netevent_work_handler(struct work_struct *_work)
+{
+ struct rdma_id_private *id_priv =
+ container_of(_work, struct rdma_id_private, id.net_work);
+ struct rdma_cm_event event = {};
+
+ mutex_lock(&id_priv->handler_mutex);
+
+ if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING ||
+ READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
+ goto out_unlock;
+
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+
+ if (cma_cm_event_handler(id_priv, &event)) {
+ __acquire(&id_priv->handler_mutex);
+ id_priv->cm_id.ib = NULL;
+ cma_id_put(id_priv);
+ destroy_id_handler_unlock(id_priv);
+ return;
+ }
+
+out_unlock:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_id_put(id_priv);
+}
+
+static int cma_netevent_callback(struct notifier_block *self,
+ unsigned long event, void *ctx)
+{
+ struct id_table_entry *ips_node = NULL;
+ struct rdma_id_private *current_id;
+ struct neighbour *neigh = ctx;
+ unsigned long flags;
+
+ if (event != NETEVENT_NEIGH_UPDATE)
+ return NOTIFY_DONE;
+
+ spin_lock_irqsave(&id_table_lock, flags);
+ if (neigh->tbl->family == AF_INET6) {
+ struct sockaddr_in6 neigh_sock_6;
+
+ neigh_sock_6.sin6_family = AF_INET6;
+ neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key;
+ ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
+ (struct sockaddr *)&neigh_sock_6);
+ } else if (neigh->tbl->family == AF_INET) {
+ struct sockaddr_in neigh_sock_4;
+
+ neigh_sock_4.sin_family = AF_INET;
+ neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key);
+ ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
+ (struct sockaddr *)&neigh_sock_4);
+ } else
+ goto out;
+
+ if (!ips_node)
+ goto out;
+
+ list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) {
+ if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
+ neigh->ha, ETH_ALEN))
+ continue;
+ INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler);
+ cma_id_get(current_id);
+ queue_work(cma_wq, &current_id->id.net_work);
+ }
+out:
+ spin_unlock_irqrestore(&id_table_lock, flags);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block cma_nb = {
+ .notifier_call = cma_netdev_callback
+};
+
+static struct notifier_block cma_netevent_cb = {
+ .notifier_call = cma_netevent_callback
+};
+
+static void cma_send_device_removal_put(struct rdma_id_private *id_priv)
+{
+ struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL };
+ enum rdma_cm_state state;
+ unsigned long flags;
+
+ mutex_lock(&id_priv->handler_mutex);
+ /* Record that we want to remove the device */
+ spin_lock_irqsave(&id_priv->lock, flags);
+ state = id_priv->state;
+ if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) {
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_id_put(id_priv);
+ return;
+ }
+ id_priv->state = RDMA_CM_DEVICE_REMOVAL;
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+
+ if (cma_cm_event_handler(id_priv, &event)) {
+ /*
+ * At this point the ULP promises it won't call
+ * rdma_destroy_id() concurrently
+ */
+ cma_id_put(id_priv);
+ mutex_unlock(&id_priv->handler_mutex);
+ trace_cm_id_destroy(id_priv);
+ _destroy_id(id_priv, state);
+ return;
+ }
+ mutex_unlock(&id_priv->handler_mutex);
+
+ /*
+ * If this races with destroy then the thread that first assigns state
+ * to a destroying does the cancel.
+ */
+ cma_cancel_operation(id_priv, state);
+ cma_id_put(id_priv);
+}
+
+static void cma_process_remove(struct cma_device *cma_dev)
+{
+ mutex_lock(&lock);
+ while (!list_empty(&cma_dev->id_list)) {
+ struct rdma_id_private *id_priv = list_first_entry(
+ &cma_dev->id_list, struct rdma_id_private, device_item);
+
+ list_del_init(&id_priv->listen_item);
+ list_del_init(&id_priv->device_item);
+ cma_id_get(id_priv);
+ mutex_unlock(&lock);
+
+ cma_send_device_removal_put(id_priv);
+
+ mutex_lock(&lock);
+ }
+ mutex_unlock(&lock);
+
+ cma_dev_put(cma_dev);
+ wait_for_completion(&cma_dev->comp);
+}
+
+static bool cma_supported(struct ib_device *device)
+{
+ u32 i;
+
+ rdma_for_each_port(device, i) {
+ if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i))
+ return true;
+ }
+ return false;
+}
+
+static int cma_add_one(struct ib_device *device)
+{
+ struct rdma_id_private *to_destroy;
+ struct cma_device *cma_dev;
+ struct rdma_id_private *id_priv;
+ unsigned long supported_gids = 0;
+ int ret;
+ u32 i;
+
+ if (!cma_supported(device))
+ return -EOPNOTSUPP;
+
+ cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL);
+ if (!cma_dev)
+ return -ENOMEM;
+
+ cma_dev->device = device;
+ cma_dev->default_gid_type = kcalloc(device->phys_port_cnt,
+ sizeof(*cma_dev->default_gid_type),
+ GFP_KERNEL);
+ if (!cma_dev->default_gid_type) {
+ ret = -ENOMEM;
+ goto free_cma_dev;
+ }
+
+ cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt,
+ sizeof(*cma_dev->default_roce_tos),
+ GFP_KERNEL);
+ if (!cma_dev->default_roce_tos) {
+ ret = -ENOMEM;
+ goto free_gid_type;
+ }
+
+ rdma_for_each_port (device, i) {
+ supported_gids = roce_gid_type_mask_support(device, i);
+ WARN_ON(!supported_gids);
+ if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE))
+ cma_dev->default_gid_type[i - rdma_start_port(device)] =
+ CMA_PREFERRED_ROCE_GID_TYPE;
+ else
+ cma_dev->default_gid_type[i - rdma_start_port(device)] =
+ find_first_bit(&supported_gids, BITS_PER_LONG);
+ cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0;
+ }
+
+ init_completion(&cma_dev->comp);
+ refcount_set(&cma_dev->refcount, 1);
+ INIT_LIST_HEAD(&cma_dev->id_list);
+ ib_set_client_data(device, &cma_client, cma_dev);
+
+ mutex_lock(&lock);
+ list_add_tail(&cma_dev->list, &dev_list);
+ list_for_each_entry(id_priv, &listen_any_list, listen_any_item) {
+ ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy);
+ if (ret)
+ goto free_listen;
+ }
+ mutex_unlock(&lock);
+
+ trace_cm_add_one(device);
+ return 0;
+
+free_listen:
+ list_del(&cma_dev->list);
+ mutex_unlock(&lock);
+
+ /* cma_process_remove() will delete to_destroy */
+ cma_process_remove(cma_dev);
+ kfree(cma_dev->default_roce_tos);
+free_gid_type:
+ kfree(cma_dev->default_gid_type);
+
+free_cma_dev:
+ kfree(cma_dev);
+ return ret;
+}
+
+static void cma_remove_one(struct ib_device *device, void *client_data)
+{
+ struct cma_device *cma_dev = client_data;
+
+ trace_cm_remove_one(device);
+
+ mutex_lock(&lock);
+ list_del(&cma_dev->list);
+ mutex_unlock(&lock);
+
+ cma_process_remove(cma_dev);
+ kfree(cma_dev->default_roce_tos);
+ kfree(cma_dev->default_gid_type);
+ kfree(cma_dev);
+}
+
+static int cma_init_net(struct net *net)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ xa_init(&pernet->tcp_ps);
+ xa_init(&pernet->udp_ps);
+ xa_init(&pernet->ipoib_ps);
+ xa_init(&pernet->ib_ps);
+
+ return 0;
+}
+
+static void cma_exit_net(struct net *net)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ WARN_ON(!xa_empty(&pernet->tcp_ps));
+ WARN_ON(!xa_empty(&pernet->udp_ps));
+ WARN_ON(!xa_empty(&pernet->ipoib_ps));
+ WARN_ON(!xa_empty(&pernet->ib_ps));
+}
+
+static struct pernet_operations cma_pernet_operations = {
+ .init = cma_init_net,
+ .exit = cma_exit_net,
+ .id = &cma_pernet_id,
+ .size = sizeof(struct cma_pernet),
+};
+
+static int __init cma_init(void)
+{
+ int ret;
+
+ /*
+ * There is a rare lock ordering dependency in cma_netdev_callback()
+ * that only happens when bonding is enabled. Teach lockdep that rtnl
+ * must never be nested under lock so it can find these without having
+ * to test with bonding.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ rtnl_lock();
+ mutex_lock(&lock);
+ mutex_unlock(&lock);
+ rtnl_unlock();
+ }
+
+ cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM);
+ if (!cma_wq)
+ return -ENOMEM;
+
+ ret = register_pernet_subsys(&cma_pernet_operations);
+ if (ret)
+ goto err_wq;
+
+ ib_sa_register_client(&sa_client);
+ register_netdevice_notifier(&cma_nb);
+ register_netevent_notifier(&cma_netevent_cb);
+
+ ret = ib_register_client(&cma_client);
+ if (ret)
+ goto err;
+
+ ret = cma_configfs_init();
+ if (ret)
+ goto err_ib;
+
+ return 0;
+
+err_ib:
+ ib_unregister_client(&cma_client);
+err:
+ unregister_netevent_notifier(&cma_netevent_cb);
+ unregister_netdevice_notifier(&cma_nb);
+ ib_sa_unregister_client(&sa_client);
+ unregister_pernet_subsys(&cma_pernet_operations);
+err_wq:
+ destroy_workqueue(cma_wq);
+ return ret;
+}
+
+static void __exit cma_cleanup(void)
+{
+ cma_configfs_exit();
+ ib_unregister_client(&cma_client);
+ unregister_netevent_notifier(&cma_netevent_cb);
+ unregister_netdevice_notifier(&cma_nb);
+ ib_sa_unregister_client(&sa_client);
+ unregister_pernet_subsys(&cma_pernet_operations);
+ destroy_workqueue(cma_wq);
+}
+
+module_init(cma_init);
+module_exit(cma_cleanup);
diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c
new file mode 100644
index 000000000..f2fb2d8a6
--- /dev/null
+++ b/drivers/infiniband/core/cma_configfs.c
@@ -0,0 +1,366 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/configfs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
+#include "core_priv.h"
+#include "cma_priv.h"
+
+struct cma_device;
+
+struct cma_dev_group;
+
+struct cma_dev_port_group {
+ u32 port_num;
+ struct cma_dev_group *cma_dev_group;
+ struct config_group group;
+};
+
+struct cma_dev_group {
+ char name[IB_DEVICE_NAME_MAX];
+ struct config_group device_group;
+ struct config_group ports_group;
+ struct cma_dev_port_group *ports;
+};
+
+static struct cma_dev_port_group *to_dev_port_group(struct config_item *item)
+{
+ struct config_group *group;
+
+ if (!item)
+ return NULL;
+
+ group = container_of(item, struct config_group, cg_item);
+ return container_of(group, struct cma_dev_port_group, group);
+}
+
+static bool filter_by_name(struct ib_device *ib_dev, void *cookie)
+{
+ return !strcmp(dev_name(&ib_dev->dev), cookie);
+}
+
+static int cma_configfs_params_get(struct config_item *item,
+ struct cma_device **pcma_dev,
+ struct cma_dev_port_group **pgroup)
+{
+ struct cma_dev_port_group *group = to_dev_port_group(item);
+ struct cma_device *cma_dev;
+
+ if (!group)
+ return -ENODEV;
+
+ cma_dev = cma_enum_devices_by_ibdev(filter_by_name,
+ group->cma_dev_group->name);
+ if (!cma_dev)
+ return -ENODEV;
+
+ *pcma_dev = cma_dev;
+ *pgroup = group;
+
+ return 0;
+}
+
+static void cma_configfs_params_put(struct cma_device *cma_dev)
+{
+ cma_dev_put(cma_dev);
+}
+
+static ssize_t default_roce_mode_show(struct config_item *item,
+ char *buf)
+{
+ struct cma_device *cma_dev;
+ struct cma_dev_port_group *group;
+ int gid_type;
+ ssize_t ret;
+
+ ret = cma_configfs_params_get(item, &cma_dev, &group);
+ if (ret)
+ return ret;
+
+ gid_type = cma_get_default_gid_type(cma_dev, group->port_num);
+ cma_configfs_params_put(cma_dev);
+
+ if (gid_type < 0)
+ return gid_type;
+
+ return sysfs_emit(buf, "%s\n", ib_cache_gid_type_str(gid_type));
+}
+
+static ssize_t default_roce_mode_store(struct config_item *item,
+ const char *buf, size_t count)
+{
+ struct cma_device *cma_dev;
+ struct cma_dev_port_group *group;
+ int gid_type;
+ ssize_t ret;
+
+ ret = cma_configfs_params_get(item, &cma_dev, &group);
+ if (ret)
+ return ret;
+
+ gid_type = ib_cache_gid_parse_type_str(buf);
+ if (gid_type < 0) {
+ cma_configfs_params_put(cma_dev);
+ return -EINVAL;
+ }
+
+ ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type);
+
+ cma_configfs_params_put(cma_dev);
+
+ return !ret ? strnlen(buf, count) : ret;
+}
+
+CONFIGFS_ATTR(, default_roce_mode);
+
+static ssize_t default_roce_tos_show(struct config_item *item, char *buf)
+{
+ struct cma_device *cma_dev;
+ struct cma_dev_port_group *group;
+ ssize_t ret;
+ u8 tos;
+
+ ret = cma_configfs_params_get(item, &cma_dev, &group);
+ if (ret)
+ return ret;
+
+ tos = cma_get_default_roce_tos(cma_dev, group->port_num);
+ cma_configfs_params_put(cma_dev);
+
+ return sysfs_emit(buf, "%u\n", tos);
+}
+
+static ssize_t default_roce_tos_store(struct config_item *item,
+ const char *buf, size_t count)
+{
+ struct cma_device *cma_dev;
+ struct cma_dev_port_group *group;
+ ssize_t ret;
+ u8 tos;
+
+ ret = kstrtou8(buf, 0, &tos);
+ if (ret)
+ return ret;
+
+ ret = cma_configfs_params_get(item, &cma_dev, &group);
+ if (ret)
+ return ret;
+
+ ret = cma_set_default_roce_tos(cma_dev, group->port_num, tos);
+ cma_configfs_params_put(cma_dev);
+
+ return ret ? ret : strnlen(buf, count);
+}
+
+CONFIGFS_ATTR(, default_roce_tos);
+
+static struct configfs_attribute *cma_configfs_attributes[] = {
+ &attr_default_roce_mode,
+ &attr_default_roce_tos,
+ NULL,
+};
+
+static const struct config_item_type cma_port_group_type = {
+ .ct_attrs = cma_configfs_attributes,
+ .ct_owner = THIS_MODULE
+};
+
+static int make_cma_ports(struct cma_dev_group *cma_dev_group,
+ struct cma_device *cma_dev)
+{
+ struct cma_dev_port_group *ports;
+ struct ib_device *ibdev;
+ u32 ports_num;
+ u32 i;
+
+ ibdev = cma_get_ib_dev(cma_dev);
+
+ if (!ibdev)
+ return -ENODEV;
+
+ ports_num = ibdev->phys_port_cnt;
+ ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports),
+ GFP_KERNEL);
+
+ if (!ports)
+ return -ENOMEM;
+
+ for (i = 0; i < ports_num; i++) {
+ char port_str[11];
+
+ ports[i].port_num = i + 1;
+ snprintf(port_str, sizeof(port_str), "%u", i + 1);
+ ports[i].cma_dev_group = cma_dev_group;
+ config_group_init_type_name(&ports[i].group,
+ port_str,
+ &cma_port_group_type);
+ configfs_add_default_group(&ports[i].group,
+ &cma_dev_group->ports_group);
+
+ }
+ cma_dev_group->ports = ports;
+ return 0;
+}
+
+static void release_cma_dev(struct config_item *item)
+{
+ struct config_group *group = container_of(item, struct config_group,
+ cg_item);
+ struct cma_dev_group *cma_dev_group = container_of(group,
+ struct cma_dev_group,
+ device_group);
+
+ kfree(cma_dev_group);
+};
+
+static void release_cma_ports_group(struct config_item *item)
+{
+ struct config_group *group = container_of(item, struct config_group,
+ cg_item);
+ struct cma_dev_group *cma_dev_group = container_of(group,
+ struct cma_dev_group,
+ ports_group);
+
+ kfree(cma_dev_group->ports);
+ cma_dev_group->ports = NULL;
+};
+
+static struct configfs_item_operations cma_ports_item_ops = {
+ .release = release_cma_ports_group
+};
+
+static const struct config_item_type cma_ports_group_type = {
+ .ct_item_ops = &cma_ports_item_ops,
+ .ct_owner = THIS_MODULE
+};
+
+static struct configfs_item_operations cma_device_item_ops = {
+ .release = release_cma_dev
+};
+
+static const struct config_item_type cma_device_group_type = {
+ .ct_item_ops = &cma_device_item_ops,
+ .ct_owner = THIS_MODULE
+};
+
+static struct config_group *make_cma_dev(struct config_group *group,
+ const char *name)
+{
+ int err = -ENODEV;
+ struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name,
+ (void *)name);
+ struct cma_dev_group *cma_dev_group = NULL;
+
+ if (!cma_dev)
+ goto fail;
+
+ cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL);
+
+ if (!cma_dev_group) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ strscpy(cma_dev_group->name, name, sizeof(cma_dev_group->name));
+
+ config_group_init_type_name(&cma_dev_group->ports_group, "ports",
+ &cma_ports_group_type);
+
+ err = make_cma_ports(cma_dev_group, cma_dev);
+ if (err)
+ goto fail;
+
+ config_group_init_type_name(&cma_dev_group->device_group, name,
+ &cma_device_group_type);
+ configfs_add_default_group(&cma_dev_group->ports_group,
+ &cma_dev_group->device_group);
+
+ cma_dev_put(cma_dev);
+ return &cma_dev_group->device_group;
+
+fail:
+ if (cma_dev)
+ cma_dev_put(cma_dev);
+ kfree(cma_dev_group);
+ return ERR_PTR(err);
+}
+
+static void drop_cma_dev(struct config_group *cgroup, struct config_item *item)
+{
+ struct config_group *group =
+ container_of(item, struct config_group, cg_item);
+ struct cma_dev_group *cma_dev_group =
+ container_of(group, struct cma_dev_group, device_group);
+
+ configfs_remove_default_groups(&cma_dev_group->ports_group);
+ configfs_remove_default_groups(&cma_dev_group->device_group);
+ config_item_put(item);
+}
+
+static struct configfs_group_operations cma_subsys_group_ops = {
+ .make_group = make_cma_dev,
+ .drop_item = drop_cma_dev,
+};
+
+static const struct config_item_type cma_subsys_type = {
+ .ct_group_ops = &cma_subsys_group_ops,
+ .ct_owner = THIS_MODULE,
+};
+
+static struct configfs_subsystem cma_subsys = {
+ .su_group = {
+ .cg_item = {
+ .ci_namebuf = "rdma_cm",
+ .ci_type = &cma_subsys_type,
+ },
+ },
+};
+
+int __init cma_configfs_init(void)
+{
+ int ret;
+
+ config_group_init(&cma_subsys.su_group);
+ mutex_init(&cma_subsys.su_mutex);
+ ret = configfs_register_subsystem(&cma_subsys);
+ if (ret)
+ mutex_destroy(&cma_subsys.su_mutex);
+ return ret;
+}
+
+void __exit cma_configfs_exit(void)
+{
+ configfs_unregister_subsystem(&cma_subsys);
+ mutex_destroy(&cma_subsys.su_mutex);
+}
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
new file mode 100644
index 000000000..b7354c94c
--- /dev/null
+++ b/drivers/infiniband/core/cma_priv.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _CMA_PRIV_H
+#define _CMA_PRIV_H
+
+enum rdma_cm_state {
+ RDMA_CM_IDLE,
+ RDMA_CM_ADDR_QUERY,
+ RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ROUTE_QUERY,
+ RDMA_CM_ROUTE_RESOLVED,
+ RDMA_CM_CONNECT,
+ RDMA_CM_DISCONNECT,
+ RDMA_CM_ADDR_BOUND,
+ RDMA_CM_LISTEN,
+ RDMA_CM_DEVICE_REMOVAL,
+ RDMA_CM_DESTROYING
+};
+
+struct rdma_id_private {
+ struct rdma_cm_id id;
+
+ struct rdma_bind_list *bind_list;
+ struct hlist_node node;
+ union {
+ struct list_head device_item; /* On cma_device->id_list */
+ struct list_head listen_any_item; /* On listen_any_list */
+ };
+ union {
+ /* On rdma_id_private->listen_list */
+ struct list_head listen_item;
+ struct list_head listen_list;
+ };
+ struct list_head id_list_entry;
+ struct cma_device *cma_dev;
+ struct list_head mc_list;
+
+ int internal_id;
+ enum rdma_cm_state state;
+ spinlock_t lock;
+ struct mutex qp_mutex;
+
+ struct completion comp;
+ refcount_t refcount;
+ struct mutex handler_mutex;
+
+ int backlog;
+ int timeout_ms;
+ struct ib_sa_query *query;
+ int query_id;
+ union {
+ struct ib_cm_id *ib;
+ struct iw_cm_id *iw;
+ } cm_id;
+
+ u32 seq_num;
+ u32 qkey;
+ u32 qp_num;
+ u32 options;
+ u8 srq;
+ u8 tos;
+ u8 tos_set:1;
+ u8 timeout_set:1;
+ u8 min_rnr_timer_set:1;
+ u8 reuseaddr;
+ u8 afonly;
+ u8 timeout;
+ u8 min_rnr_timer;
+ u8 used_resolve_ip;
+ enum ib_gid_type gid_type;
+
+ /*
+ * Internal to RDMA/core, don't use in the drivers
+ */
+ struct rdma_restrack_entry res;
+ struct rdma_ucm_ece ece;
+};
+
+#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
+int cma_configfs_init(void);
+void cma_configfs_exit(void);
+#else
+static inline int cma_configfs_init(void)
+{
+ return 0;
+}
+
+static inline void cma_configfs_exit(void)
+{
+}
+#endif
+
+void cma_dev_get(struct cma_device *dev);
+void cma_dev_put(struct cma_device *dev);
+typedef bool (*cma_device_filter)(struct ib_device *, void *);
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
+ void *cookie);
+int cma_get_default_gid_type(struct cma_device *dev, u32 port);
+int cma_set_default_gid_type(struct cma_device *dev, u32 port,
+ enum ib_gid_type default_gid_type);
+int cma_get_default_roce_tos(struct cma_device *dev, u32 port);
+int cma_set_default_roce_tos(struct cma_device *dev, u32 port,
+ u8 default_roce_tos);
+struct ib_device *cma_get_ib_dev(struct cma_device *dev);
+
+#endif /* _CMA_PRIV_H */
diff --git a/drivers/infiniband/core/cma_trace.c b/drivers/infiniband/core/cma_trace.c
new file mode 100644
index 000000000..b314a281e
--- /dev/null
+++ b/drivers/infiniband/core/cma_trace.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Trace points for the RDMA Connection Manager.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#define CREATE_TRACE_POINTS
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include "cma_priv.h"
+
+#include "cma_trace.h"
diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h
new file mode 100644
index 000000000..e45264267
--- /dev/null
+++ b/drivers/infiniband/core/cma_trace.h
@@ -0,0 +1,361 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Trace point definitions for the RDMA Connect Manager.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rdma_cma
+
+#if !defined(_TRACE_RDMA_CMA_H) || defined(TRACE_HEADER_MULTI_READ)
+
+#define _TRACE_RDMA_CMA_H
+
+#include <linux/tracepoint.h>
+#include <trace/events/rdma.h>
+
+
+DECLARE_EVENT_CLASS(cma_fsm_class,
+ TP_PROTO(
+ const struct rdma_id_private *id_priv
+ ),
+
+ TP_ARGS(id_priv),
+
+ TP_STRUCT__entry(
+ __field(u32, cm_id)
+ __field(u32, tos)
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
+ ),
+
+ TP_fast_assign(
+ __entry->cm_id = id_priv->res.id;
+ __entry->tos = id_priv->tos;
+ memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+
+ TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u",
+ __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos
+ )
+);
+
+#define DEFINE_CMA_FSM_EVENT(name) \
+ DEFINE_EVENT(cma_fsm_class, cm_##name, \
+ TP_PROTO( \
+ const struct rdma_id_private *id_priv \
+ ), \
+ TP_ARGS(id_priv))
+
+DEFINE_CMA_FSM_EVENT(send_rtu);
+DEFINE_CMA_FSM_EVENT(send_rej);
+DEFINE_CMA_FSM_EVENT(send_mra);
+DEFINE_CMA_FSM_EVENT(send_sidr_req);
+DEFINE_CMA_FSM_EVENT(send_sidr_rep);
+DEFINE_CMA_FSM_EVENT(disconnect);
+DEFINE_CMA_FSM_EVENT(sent_drep);
+DEFINE_CMA_FSM_EVENT(sent_dreq);
+DEFINE_CMA_FSM_EVENT(id_destroy);
+
+TRACE_EVENT(cm_id_attach,
+ TP_PROTO(
+ const struct rdma_id_private *id_priv,
+ const struct ib_device *device
+ ),
+
+ TP_ARGS(id_priv, device),
+
+ TP_STRUCT__entry(
+ __field(u32, cm_id)
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
+ __string(devname, device->name)
+ ),
+
+ TP_fast_assign(
+ __entry->cm_id = id_priv->res.id;
+ memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
+ __assign_str(devname, device->name);
+ ),
+
+ TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s",
+ __entry->cm_id, __entry->srcaddr, __entry->dstaddr,
+ __get_str(devname)
+ )
+);
+
+DECLARE_EVENT_CLASS(cma_qp_class,
+ TP_PROTO(
+ const struct rdma_id_private *id_priv
+ ),
+
+ TP_ARGS(id_priv),
+
+ TP_STRUCT__entry(
+ __field(u32, cm_id)
+ __field(u32, tos)
+ __field(u32, qp_num)
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
+ ),
+
+ TP_fast_assign(
+ __entry->cm_id = id_priv->res.id;
+ __entry->tos = id_priv->tos;
+ __entry->qp_num = id_priv->qp_num;
+ memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+
+ TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u qp_num=%u",
+ __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos,
+ __entry->qp_num
+ )
+);
+
+#define DEFINE_CMA_QP_EVENT(name) \
+ DEFINE_EVENT(cma_qp_class, cm_##name, \
+ TP_PROTO( \
+ const struct rdma_id_private *id_priv \
+ ), \
+ TP_ARGS(id_priv))
+
+DEFINE_CMA_QP_EVENT(send_req);
+DEFINE_CMA_QP_EVENT(send_rep);
+DEFINE_CMA_QP_EVENT(qp_destroy);
+
+/*
+ * enum ib_wp_type, from include/rdma/ib_verbs.h
+ */
+#define IB_QP_TYPE_LIST \
+ ib_qp_type(SMI) \
+ ib_qp_type(GSI) \
+ ib_qp_type(RC) \
+ ib_qp_type(UC) \
+ ib_qp_type(UD) \
+ ib_qp_type(RAW_IPV6) \
+ ib_qp_type(RAW_ETHERTYPE) \
+ ib_qp_type(RAW_PACKET) \
+ ib_qp_type(XRC_INI) \
+ ib_qp_type_end(XRC_TGT)
+
+#undef ib_qp_type
+#undef ib_qp_type_end
+
+#define ib_qp_type(x) TRACE_DEFINE_ENUM(IB_QPT_##x);
+#define ib_qp_type_end(x) TRACE_DEFINE_ENUM(IB_QPT_##x);
+
+IB_QP_TYPE_LIST
+
+#undef ib_qp_type
+#undef ib_qp_type_end
+
+#define ib_qp_type(x) { IB_QPT_##x, #x },
+#define ib_qp_type_end(x) { IB_QPT_##x, #x }
+
+#define rdma_show_qp_type(x) \
+ __print_symbolic(x, IB_QP_TYPE_LIST)
+
+
+TRACE_EVENT(cm_qp_create,
+ TP_PROTO(
+ const struct rdma_id_private *id_priv,
+ const struct ib_pd *pd,
+ const struct ib_qp_init_attr *qp_init_attr,
+ int rc
+ ),
+
+ TP_ARGS(id_priv, pd, qp_init_attr, rc),
+
+ TP_STRUCT__entry(
+ __field(u32, cm_id)
+ __field(u32, pd_id)
+ __field(u32, tos)
+ __field(u32, qp_num)
+ __field(u32, send_wr)
+ __field(u32, recv_wr)
+ __field(int, rc)
+ __field(unsigned long, qp_type)
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
+ ),
+
+ TP_fast_assign(
+ __entry->cm_id = id_priv->res.id;
+ __entry->pd_id = pd->res.id;
+ __entry->tos = id_priv->tos;
+ __entry->send_wr = qp_init_attr->cap.max_send_wr;
+ __entry->recv_wr = qp_init_attr->cap.max_recv_wr;
+ __entry->rc = rc;
+ if (!rc) {
+ __entry->qp_num = id_priv->qp_num;
+ __entry->qp_type = id_priv->id.qp_type;
+ } else {
+ __entry->qp_num = 0;
+ __entry->qp_type = 0;
+ }
+ memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+
+ TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u pd.id=%u qp_type=%s"
+ " send_wr=%u recv_wr=%u qp_num=%u rc=%d",
+ __entry->cm_id, __entry->srcaddr, __entry->dstaddr,
+ __entry->tos, __entry->pd_id,
+ rdma_show_qp_type(__entry->qp_type), __entry->send_wr,
+ __entry->recv_wr, __entry->qp_num, __entry->rc
+ )
+);
+
+TRACE_EVENT(cm_req_handler,
+ TP_PROTO(
+ const struct rdma_id_private *id_priv,
+ int event
+ ),
+
+ TP_ARGS(id_priv, event),
+
+ TP_STRUCT__entry(
+ __field(u32, cm_id)
+ __field(u32, tos)
+ __field(unsigned long, event)
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
+ ),
+
+ TP_fast_assign(
+ __entry->cm_id = id_priv->res.id;
+ __entry->tos = id_priv->tos;
+ __entry->event = event;
+ memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+
+ TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu)",
+ __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos,
+ rdma_show_ib_cm_event(__entry->event), __entry->event
+ )
+);
+
+TRACE_EVENT(cm_event_handler,
+ TP_PROTO(
+ const struct rdma_id_private *id_priv,
+ const struct rdma_cm_event *event
+ ),
+
+ TP_ARGS(id_priv, event),
+
+ TP_STRUCT__entry(
+ __field(u32, cm_id)
+ __field(u32, tos)
+ __field(unsigned long, event)
+ __field(int, status)
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
+ ),
+
+ TP_fast_assign(
+ __entry->cm_id = id_priv->res.id;
+ __entry->tos = id_priv->tos;
+ __entry->event = event->event;
+ __entry->status = event->status;
+ memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+
+ TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s (%lu/%d)",
+ __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos,
+ rdma_show_cm_event(__entry->event), __entry->event,
+ __entry->status
+ )
+);
+
+TRACE_EVENT(cm_event_done,
+ TP_PROTO(
+ const struct rdma_id_private *id_priv,
+ const struct rdma_cm_event *event,
+ int result
+ ),
+
+ TP_ARGS(id_priv, event, result),
+
+ TP_STRUCT__entry(
+ __field(u32, cm_id)
+ __field(u32, tos)
+ __field(unsigned long, event)
+ __field(int, result)
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
+ ),
+
+ TP_fast_assign(
+ __entry->cm_id = id_priv->res.id;
+ __entry->tos = id_priv->tos;
+ __entry->event = event->event;
+ __entry->result = result;
+ memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+
+ TP_printk("cm.id=%u src=%pISpc dst=%pISpc tos=%u %s consumer returns %d",
+ __entry->cm_id, __entry->srcaddr, __entry->dstaddr, __entry->tos,
+ rdma_show_cm_event(__entry->event), __entry->result
+ )
+);
+
+DECLARE_EVENT_CLASS(cma_client_class,
+ TP_PROTO(
+ const struct ib_device *device
+ ),
+
+ TP_ARGS(device),
+
+ TP_STRUCT__entry(
+ __string(name, device->name)
+ ),
+
+ TP_fast_assign(
+ __assign_str(name, device->name);
+ ),
+
+ TP_printk("device name=%s",
+ __get_str(name)
+ )
+);
+
+#define DEFINE_CMA_CLIENT_EVENT(name) \
+ DEFINE_EVENT(cma_client_class, cm_##name, \
+ TP_PROTO( \
+ const struct ib_device *device \
+ ), \
+ TP_ARGS(device))
+
+DEFINE_CMA_CLIENT_EVENT(add_one);
+DEFINE_CMA_CLIENT_EVENT(remove_one);
+
+#endif /* _TRACE_RDMA_CMA_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE cma_trace
+
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
new file mode 100644
index 000000000..f66f48d86
--- /dev/null
+++ b/drivers/infiniband/core/core_priv.h
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/cgroup_rdma.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/opa_addr.h>
+#include <rdma/ib_mad.h>
+#include <rdma/restrack.h>
+#include "mad_priv.h"
+#include "restrack.h"
+
+/* Total number of ports combined across all struct ib_devices's */
+#define RDMA_MAX_PORTS 8192
+
+struct pkey_index_qp_list {
+ struct list_head pkey_index_list;
+ u16 pkey_index;
+ /* Lock to hold while iterating the qp_list. */
+ spinlock_t qp_list_lock;
+ struct list_head qp_list;
+};
+
+/**
+ * struct rdma_dev_net - rdma net namespace metadata for a net
+ * @nl_sock: Pointer to netlink socket
+ * @net: Pointer to owner net namespace
+ * @id: xarray id to identify the net namespace.
+ */
+struct rdma_dev_net {
+ struct sock *nl_sock;
+ possible_net_t net;
+ u32 id;
+};
+
+extern const struct attribute_group ib_dev_attr_group;
+extern bool ib_devices_shared_netns;
+extern unsigned int rdma_dev_net_id;
+
+static inline struct rdma_dev_net *rdma_net_to_dev_net(struct net *net)
+{
+ return net_generic(net, rdma_dev_net_id);
+}
+
+int ib_device_rename(struct ib_device *ibdev, const char *name);
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim);
+
+typedef void (*roce_netdev_callback)(struct ib_device *device, u32 port,
+ struct net_device *idev, void *cookie);
+
+typedef bool (*roce_netdev_filter)(struct ib_device *device, u32 port,
+ struct net_device *idev, void *cookie);
+
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
+ u32 port);
+
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+
+typedef int (*nldev_callback)(struct ib_device *device,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ unsigned int idx);
+
+int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
+ struct netlink_callback *cb);
+
+struct ib_client_nl_info {
+ struct sk_buff *nl_msg;
+ struct device *cdev;
+ u32 port;
+ u64 abi;
+};
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
+ struct ib_client_nl_info *res);
+
+enum ib_cache_gid_default_mode {
+ IB_CACHE_GID_DEFAULT_MODE_SET,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE
+};
+
+int ib_cache_gid_parse_type_str(const char *buf);
+
+const char *ib_cache_gid_type_str(enum ib_gid_type gid_type);
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port,
+ struct net_device *ndev,
+ unsigned long gid_type_mask,
+ enum ib_cache_gid_default_mode mode);
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u32 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u32 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port,
+ struct net_device *ndev);
+
+int roce_gid_mgmt_init(void);
+void roce_gid_mgmt_cleanup(void);
+
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port);
+
+int ib_cache_setup_one(struct ib_device *device);
+void ib_cache_cleanup_one(struct ib_device *device);
+void ib_cache_release_one(struct ib_device *device);
+void ib_dispatch_event_clients(struct ib_event *event);
+
+#ifdef CONFIG_CGROUP_RDMA
+void ib_device_register_rdmacg(struct ib_device *device);
+void ib_device_unregister_rdmacg(struct ib_device *device);
+
+int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index);
+
+void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index);
+#else
+static inline void ib_device_register_rdmacg(struct ib_device *device)
+{
+}
+
+static inline void ib_device_unregister_rdmacg(struct ib_device *device)
+{
+}
+
+static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{
+ return 0;
+}
+
+static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
+ struct ib_device *device,
+ enum rdmacg_resource_type resource_index)
+{
+}
+#endif
+
+static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
+ struct net_device *upper)
+{
+ return netdev_has_upper_dev_all_rcu(dev, upper);
+}
+
+int addr_init(void);
+void addr_cleanup(void);
+
+int ib_mad_init(void);
+void ib_mad_cleanup(void);
+
+int ib_sa_init(void);
+void ib_sa_cleanup(void);
+
+void rdma_nl_init(void);
+void rdma_nl_exit(void);
+
+int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack);
+int ib_nl_handle_set_timeout(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack);
+int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack);
+
+void ib_get_cached_subnet_prefix(struct ib_device *device,
+ u32 port_num,
+ u64 *sn_pfx);
+
+#ifdef CONFIG_SECURITY_INFINIBAND
+void ib_security_release_port_pkey_list(struct ib_device *device);
+
+void ib_security_cache_change(struct ib_device *device,
+ u32 port_num,
+ u64 subnet_prefix);
+
+int ib_security_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_udata *udata);
+
+int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev);
+void ib_destroy_qp_security_begin(struct ib_qp_security *sec);
+void ib_destroy_qp_security_abort(struct ib_qp_security *sec);
+void ib_destroy_qp_security_end(struct ib_qp_security *sec);
+int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev);
+void ib_close_shared_qp_security(struct ib_qp_security *sec);
+int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
+ enum ib_qp_type qp_type);
+void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent);
+int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index);
+void ib_mad_agent_security_change(void);
+#else
+static inline void ib_security_release_port_pkey_list(struct ib_device *device)
+{
+}
+
+static inline void ib_security_cache_change(struct ib_device *device,
+ u32 port_num,
+ u64 subnet_prefix)
+{
+}
+
+static inline int ib_security_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_udata *udata)
+{
+ return qp->device->ops.modify_qp(qp->real_qp,
+ qp_attr,
+ qp_attr_mask,
+ udata);
+}
+
+static inline int ib_create_qp_security(struct ib_qp *qp,
+ struct ib_device *dev)
+{
+ return 0;
+}
+
+static inline void ib_destroy_qp_security_begin(struct ib_qp_security *sec)
+{
+}
+
+static inline void ib_destroy_qp_security_abort(struct ib_qp_security *sec)
+{
+}
+
+static inline void ib_destroy_qp_security_end(struct ib_qp_security *sec)
+{
+}
+
+static inline int ib_open_shared_qp_security(struct ib_qp *qp,
+ struct ib_device *dev)
+{
+ return 0;
+}
+
+static inline void ib_close_shared_qp_security(struct ib_qp_security *sec)
+{
+}
+
+static inline int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
+ enum ib_qp_type qp_type)
+{
+ return 0;
+}
+
+static inline void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
+{
+}
+
+static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map,
+ u16 pkey_index)
+{
+ return 0;
+}
+
+static inline void ib_mad_agent_security_change(void)
+{
+}
+#endif
+
+struct ib_device *ib_device_get_by_index(const struct net *net, u32 index);
+
+/* RDMA device netlink */
+void nldev_init(void);
+void nldev_exit(void);
+
+struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *attr,
+ struct ib_udata *udata,
+ struct ib_uqp_object *uobj, const char *caller);
+
+void ib_qp_usecnt_inc(struct ib_qp *qp);
+void ib_qp_usecnt_dec(struct ib_qp *qp);
+
+struct rdma_dev_addr;
+int rdma_resolve_ip_route(struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr);
+
+int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
+ const union ib_gid *dgid,
+ u8 *dmac, const struct ib_gid_attr *sgid_attr,
+ int *hoplimit);
+void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+ const struct net_device *dev);
+
+struct sa_path_rec;
+int roce_resolve_route_from_path(struct sa_path_rec *rec,
+ const struct ib_gid_attr *attr);
+
+struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr);
+
+void ib_free_port_attrs(struct ib_core_device *coredev);
+int ib_setup_port_attrs(struct ib_core_device *coredev);
+struct rdma_hw_stats *ib_get_hw_stats_port(struct ib_device *ibdev, u32 port_num);
+void ib_device_release_hw_stats(struct hw_stats_device_data *data);
+int ib_setup_device_attrs(struct ib_device *ibdev);
+
+int rdma_compatdev_set(u8 enable);
+
+int ib_port_register_client_groups(struct ib_device *ibdev, u32 port_num,
+ const struct attribute_group **groups);
+void ib_port_unregister_client_groups(struct ib_device *ibdev, u32 port_num,
+ const struct attribute_group **groups);
+
+int ib_device_set_netns_put(struct sk_buff *skb,
+ struct ib_device *dev, u32 ns_fd);
+
+int rdma_nl_net_init(struct rdma_dev_net *rnet);
+void rdma_nl_net_exit(struct rdma_dev_net *rnet);
+
+struct rdma_umap_priv {
+ struct vm_area_struct *vma;
+ struct list_head list;
+ struct rdma_user_mmap_entry *entry;
+};
+
+void rdma_umap_priv_init(struct rdma_umap_priv *priv,
+ struct vm_area_struct *vma,
+ struct rdma_user_mmap_entry *entry);
+
+void ib_cq_pool_cleanup(struct ib_device *dev);
+
+#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
new file mode 100644
index 000000000..af59486fe
--- /dev/null
+++ b/drivers/infiniband/core/counters.c
@@ -0,0 +1,669 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_counter.h>
+
+#include "core_priv.h"
+#include "restrack.h"
+
+#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID)
+
+static int __counter_set_mode(struct rdma_port_counter *port_counter,
+ enum rdma_nl_counter_mode new_mode,
+ enum rdma_nl_counter_mask new_mask)
+{
+ if (new_mode == RDMA_COUNTER_MODE_AUTO) {
+ if (new_mask & (~ALL_AUTO_MODE_MASKS))
+ return -EINVAL;
+ if (port_counter->num_counters)
+ return -EBUSY;
+ }
+
+ port_counter->mode.mode = new_mode;
+ port_counter->mode.mask = new_mask;
+ return 0;
+}
+
+/*
+ * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode
+ *
+ * @dev: Device to operate
+ * @port: Port to use
+ * @mask: Mask to configure
+ * @extack: Message to the user
+ *
+ * Return 0 on success. If counter mode wasn't changed then it is considered
+ * as success as well.
+ * Return -EBUSY when changing to auto mode while there are bounded counters.
+ *
+ */
+int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port,
+ enum rdma_nl_counter_mask mask,
+ struct netlink_ext_ack *extack)
+{
+ struct rdma_port_counter *port_counter;
+ enum rdma_nl_counter_mode mode;
+ int ret;
+
+ port_counter = &dev->port_data[port].port_counter;
+ if (!port_counter->hstats)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&port_counter->lock);
+ if (mask)
+ mode = RDMA_COUNTER_MODE_AUTO;
+ else
+ mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL :
+ RDMA_COUNTER_MODE_NONE;
+
+ if (port_counter->mode.mode == mode &&
+ port_counter->mode.mask == mask) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = __counter_set_mode(port_counter, mode, mask);
+
+out:
+ mutex_unlock(&port_counter->lock);
+ if (ret == -EBUSY)
+ NL_SET_ERR_MSG(
+ extack,
+ "Modifying auto mode is not allowed when there is a bound QP");
+ return ret;
+}
+
+static void auto_mode_init_counter(struct rdma_counter *counter,
+ const struct ib_qp *qp,
+ enum rdma_nl_counter_mask new_mask)
+{
+ struct auto_mode_param *param = &counter->mode.param;
+
+ counter->mode.mode = RDMA_COUNTER_MODE_AUTO;
+ counter->mode.mask = new_mask;
+
+ if (new_mask & RDMA_COUNTER_MASK_QP_TYPE)
+ param->qp_type = qp->qp_type;
+}
+
+static int __rdma_counter_bind_qp(struct rdma_counter *counter,
+ struct ib_qp *qp)
+{
+ int ret;
+
+ if (qp->counter)
+ return -EINVAL;
+
+ if (!qp->device->ops.counter_bind_qp)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&counter->lock);
+ ret = qp->device->ops.counter_bind_qp(counter, qp);
+ mutex_unlock(&counter->lock);
+
+ return ret;
+}
+
+int rdma_counter_modify(struct ib_device *dev, u32 port,
+ unsigned int index, bool enable)
+{
+ struct rdma_hw_stats *stats;
+ int ret = 0;
+
+ if (!dev->ops.modify_hw_stat)
+ return -EOPNOTSUPP;
+
+ stats = ib_get_hw_stats_port(dev, port);
+ if (!stats || index >= stats->num_counters ||
+ !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL))
+ return -EINVAL;
+
+ mutex_lock(&stats->lock);
+
+ if (enable != test_bit(index, stats->is_disabled))
+ goto out;
+
+ ret = dev->ops.modify_hw_stat(dev, port, index, enable);
+ if (ret)
+ goto out;
+
+ if (enable)
+ clear_bit(index, stats->is_disabled);
+ else
+ set_bit(index, stats->is_disabled);
+out:
+ mutex_unlock(&stats->lock);
+ return ret;
+}
+
+static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port,
+ struct ib_qp *qp,
+ enum rdma_nl_counter_mode mode)
+{
+ struct rdma_port_counter *port_counter;
+ struct rdma_counter *counter;
+ int ret;
+
+ if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
+ return NULL;
+
+ counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+ if (!counter)
+ return NULL;
+
+ counter->device = dev;
+ counter->port = port;
+
+ rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER);
+ counter->stats = dev->ops.counter_alloc_stats(counter);
+ if (!counter->stats)
+ goto err_stats;
+
+ port_counter = &dev->port_data[port].port_counter;
+ mutex_lock(&port_counter->lock);
+ switch (mode) {
+ case RDMA_COUNTER_MODE_MANUAL:
+ ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL,
+ 0);
+ if (ret) {
+ mutex_unlock(&port_counter->lock);
+ goto err_mode;
+ }
+ break;
+ case RDMA_COUNTER_MODE_AUTO:
+ auto_mode_init_counter(counter, qp, port_counter->mode.mask);
+ break;
+ default:
+ ret = -EOPNOTSUPP;
+ mutex_unlock(&port_counter->lock);
+ goto err_mode;
+ }
+
+ port_counter->num_counters++;
+ mutex_unlock(&port_counter->lock);
+
+ counter->mode.mode = mode;
+ kref_init(&counter->kref);
+ mutex_init(&counter->lock);
+
+ ret = __rdma_counter_bind_qp(counter, qp);
+ if (ret)
+ goto err_mode;
+
+ rdma_restrack_parent_name(&counter->res, &qp->res);
+ rdma_restrack_add(&counter->res);
+ return counter;
+
+err_mode:
+ rdma_free_hw_stats_struct(counter->stats);
+err_stats:
+ rdma_restrack_put(&counter->res);
+ kfree(counter);
+ return NULL;
+}
+
+static void rdma_counter_free(struct rdma_counter *counter)
+{
+ struct rdma_port_counter *port_counter;
+
+ port_counter = &counter->device->port_data[counter->port].port_counter;
+ mutex_lock(&port_counter->lock);
+ port_counter->num_counters--;
+ if (!port_counter->num_counters &&
+ (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL))
+ __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0);
+
+ mutex_unlock(&port_counter->lock);
+
+ rdma_restrack_del(&counter->res);
+ rdma_free_hw_stats_struct(counter->stats);
+ kfree(counter);
+}
+
+static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter,
+ enum rdma_nl_counter_mask auto_mask)
+{
+ struct auto_mode_param *param = &counter->mode.param;
+ bool match = true;
+
+ if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE)
+ match &= (param->qp_type == qp->qp_type);
+
+ if (auto_mask & RDMA_COUNTER_MASK_PID)
+ match &= (task_pid_nr(counter->res.task) ==
+ task_pid_nr(qp->res.task));
+
+ return match;
+}
+
+static int __rdma_counter_unbind_qp(struct ib_qp *qp)
+{
+ struct rdma_counter *counter = qp->counter;
+ int ret;
+
+ if (!qp->device->ops.counter_unbind_qp)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&counter->lock);
+ ret = qp->device->ops.counter_unbind_qp(qp);
+ mutex_unlock(&counter->lock);
+
+ return ret;
+}
+
+static void counter_history_stat_update(struct rdma_counter *counter)
+{
+ struct ib_device *dev = counter->device;
+ struct rdma_port_counter *port_counter;
+ int i;
+
+ port_counter = &dev->port_data[counter->port].port_counter;
+ if (!port_counter->hstats)
+ return;
+
+ rdma_counter_query_stats(counter);
+
+ for (i = 0; i < counter->stats->num_counters; i++)
+ port_counter->hstats->value[i] += counter->stats->value[i];
+}
+
+/*
+ * rdma_get_counter_auto_mode - Find the counter that @qp should be bound
+ * with in auto mode
+ *
+ * Return: The counter (with ref-count increased) if found
+ */
+static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp,
+ u32 port)
+{
+ struct rdma_port_counter *port_counter;
+ struct rdma_counter *counter = NULL;
+ struct ib_device *dev = qp->device;
+ struct rdma_restrack_entry *res;
+ struct rdma_restrack_root *rt;
+ unsigned long id = 0;
+
+ port_counter = &dev->port_data[port].port_counter;
+ rt = &dev->res[RDMA_RESTRACK_COUNTER];
+ xa_lock(&rt->xa);
+ xa_for_each(&rt->xa, id, res) {
+ counter = container_of(res, struct rdma_counter, res);
+ if ((counter->device != qp->device) || (counter->port != port))
+ goto next;
+
+ if (auto_mode_match(qp, counter, port_counter->mode.mask))
+ break;
+next:
+ counter = NULL;
+ }
+
+ if (counter && !kref_get_unless_zero(&counter->kref))
+ counter = NULL;
+
+ xa_unlock(&rt->xa);
+ return counter;
+}
+
+static void counter_release(struct kref *kref)
+{
+ struct rdma_counter *counter;
+
+ counter = container_of(kref, struct rdma_counter, kref);
+ counter_history_stat_update(counter);
+ counter->device->ops.counter_dealloc(counter);
+ rdma_counter_free(counter);
+}
+
+/*
+ * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on
+ * the auto-mode rule
+ */
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port)
+{
+ struct rdma_port_counter *port_counter;
+ struct ib_device *dev = qp->device;
+ struct rdma_counter *counter;
+ int ret;
+
+ if (!rdma_restrack_is_tracked(&qp->res) || rdma_is_kernel_res(&qp->res))
+ return 0;
+
+ if (!rdma_is_port_valid(dev, port))
+ return -EINVAL;
+
+ port_counter = &dev->port_data[port].port_counter;
+ if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO)
+ return 0;
+
+ counter = rdma_get_counter_auto_mode(qp, port);
+ if (counter) {
+ ret = __rdma_counter_bind_qp(counter, qp);
+ if (ret) {
+ kref_put(&counter->kref, counter_release);
+ return ret;
+ }
+ } else {
+ counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO);
+ if (!counter)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * rdma_counter_unbind_qp - Unbind a qp from a counter
+ * @force:
+ * true - Decrease the counter ref-count anyway (e.g., qp destroy)
+ */
+int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
+{
+ struct rdma_counter *counter = qp->counter;
+ int ret;
+
+ if (!counter)
+ return -EINVAL;
+
+ ret = __rdma_counter_unbind_qp(qp);
+ if (ret && !force)
+ return ret;
+
+ kref_put(&counter->kref, counter_release);
+ return 0;
+}
+
+int rdma_counter_query_stats(struct rdma_counter *counter)
+{
+ struct ib_device *dev = counter->device;
+ int ret;
+
+ if (!dev->ops.counter_update_stats)
+ return -EINVAL;
+
+ mutex_lock(&counter->lock);
+ ret = dev->ops.counter_update_stats(counter);
+ mutex_unlock(&counter->lock);
+
+ return ret;
+}
+
+static u64 get_running_counters_hwstat_sum(struct ib_device *dev,
+ u32 port, u32 index)
+{
+ struct rdma_restrack_entry *res;
+ struct rdma_restrack_root *rt;
+ struct rdma_counter *counter;
+ unsigned long id = 0;
+ u64 sum = 0;
+
+ rt = &dev->res[RDMA_RESTRACK_COUNTER];
+ xa_lock(&rt->xa);
+ xa_for_each(&rt->xa, id, res) {
+ if (!rdma_restrack_get(res))
+ continue;
+
+ xa_unlock(&rt->xa);
+
+ counter = container_of(res, struct rdma_counter, res);
+ if ((counter->device != dev) || (counter->port != port) ||
+ rdma_counter_query_stats(counter))
+ goto next;
+
+ sum += counter->stats->value[index];
+
+next:
+ xa_lock(&rt->xa);
+ rdma_restrack_put(res);
+ }
+
+ xa_unlock(&rt->xa);
+ return sum;
+}
+
+/*
+ * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a
+ * specific port, including the running ones and history data
+ */
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index)
+{
+ struct rdma_port_counter *port_counter;
+ u64 sum;
+
+ port_counter = &dev->port_data[port].port_counter;
+ if (!port_counter->hstats)
+ return 0;
+
+ sum = get_running_counters_hwstat_sum(dev, port, index);
+ sum += port_counter->hstats->value[index];
+
+ return sum;
+}
+
+static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num)
+{
+ struct rdma_restrack_entry *res = NULL;
+ struct ib_qp *qp = NULL;
+
+ res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num);
+ if (IS_ERR(res))
+ return NULL;
+
+ qp = container_of(res, struct ib_qp, res);
+ if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+ goto err;
+
+ return qp;
+
+err:
+ rdma_restrack_put(res);
+ return NULL;
+}
+
+static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev,
+ u32 counter_id)
+{
+ struct rdma_restrack_entry *res;
+ struct rdma_counter *counter;
+
+ res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id);
+ if (IS_ERR(res))
+ return NULL;
+
+ counter = container_of(res, struct rdma_counter, res);
+ kref_get(&counter->kref);
+ rdma_restrack_put(res);
+
+ return counter;
+}
+
+/*
+ * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id
+ */
+int rdma_counter_bind_qpn(struct ib_device *dev, u32 port,
+ u32 qp_num, u32 counter_id)
+{
+ struct rdma_port_counter *port_counter;
+ struct rdma_counter *counter;
+ struct ib_qp *qp;
+ int ret;
+
+ port_counter = &dev->port_data[port].port_counter;
+ if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO)
+ return -EINVAL;
+
+ qp = rdma_counter_get_qp(dev, qp_num);
+ if (!qp)
+ return -ENOENT;
+
+ counter = rdma_get_counter_by_id(dev, counter_id);
+ if (!counter) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) {
+ ret = -EINVAL;
+ goto err_task;
+ }
+
+ if ((counter->device != qp->device) || (counter->port != qp->port)) {
+ ret = -EINVAL;
+ goto err_task;
+ }
+
+ ret = __rdma_counter_bind_qp(counter, qp);
+ if (ret)
+ goto err_task;
+
+ rdma_restrack_put(&qp->res);
+ return 0;
+
+err_task:
+ kref_put(&counter->kref, counter_release);
+err:
+ rdma_restrack_put(&qp->res);
+ return ret;
+}
+
+/*
+ * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it
+ * The id of new counter is returned in @counter_id
+ */
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port,
+ u32 qp_num, u32 *counter_id)
+{
+ struct rdma_port_counter *port_counter;
+ struct rdma_counter *counter;
+ struct ib_qp *qp;
+ int ret;
+
+ if (!rdma_is_port_valid(dev, port))
+ return -EINVAL;
+
+ port_counter = &dev->port_data[port].port_counter;
+ if (!port_counter->hstats)
+ return -EOPNOTSUPP;
+
+ if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO)
+ return -EINVAL;
+
+ qp = rdma_counter_get_qp(dev, qp_num);
+ if (!qp)
+ return -ENOENT;
+
+ if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL);
+ if (!counter) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (counter_id)
+ *counter_id = counter->id;
+
+ rdma_restrack_put(&qp->res);
+ return 0;
+
+err:
+ rdma_restrack_put(&qp->res);
+ return ret;
+}
+
+/*
+ * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter
+ */
+int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port,
+ u32 qp_num, u32 counter_id)
+{
+ struct rdma_port_counter *port_counter;
+ struct ib_qp *qp;
+ int ret;
+
+ if (!rdma_is_port_valid(dev, port))
+ return -EINVAL;
+
+ qp = rdma_counter_get_qp(dev, qp_num);
+ if (!qp)
+ return -ENOENT;
+
+ if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ port_counter = &dev->port_data[port].port_counter;
+ if (!qp->counter || qp->counter->id != counter_id ||
+ port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = rdma_counter_unbind_qp(qp, false);
+
+out:
+ rdma_restrack_put(&qp->res);
+ return ret;
+}
+
+int rdma_counter_get_mode(struct ib_device *dev, u32 port,
+ enum rdma_nl_counter_mode *mode,
+ enum rdma_nl_counter_mask *mask)
+{
+ struct rdma_port_counter *port_counter;
+
+ port_counter = &dev->port_data[port].port_counter;
+ *mode = port_counter->mode.mode;
+ *mask = port_counter->mode.mask;
+
+ return 0;
+}
+
+void rdma_counter_init(struct ib_device *dev)
+{
+ struct rdma_port_counter *port_counter;
+ u32 port, i;
+
+ if (!dev->port_data)
+ return;
+
+ rdma_for_each_port(dev, port) {
+ port_counter = &dev->port_data[port].port_counter;
+ port_counter->mode.mode = RDMA_COUNTER_MODE_NONE;
+ mutex_init(&port_counter->lock);
+
+ if (!dev->ops.alloc_hw_port_stats)
+ continue;
+
+ port_counter->hstats = dev->ops.alloc_hw_port_stats(dev, port);
+ if (!port_counter->hstats)
+ goto fail;
+ }
+
+ return;
+
+fail:
+ for (i = port; i >= rdma_start_port(dev); i--) {
+ port_counter = &dev->port_data[port].port_counter;
+ rdma_free_hw_stats_struct(port_counter->hstats);
+ port_counter->hstats = NULL;
+ mutex_destroy(&port_counter->lock);
+ }
+}
+
+void rdma_counter_release(struct ib_device *dev)
+{
+ struct rdma_port_counter *port_counter;
+ u32 port;
+
+ rdma_for_each_port(dev, port) {
+ port_counter = &dev->port_data[port].port_counter;
+ rdma_free_hw_stats_struct(port_counter->hstats);
+ mutex_destroy(&port_counter->lock);
+ }
+}
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
new file mode 100644
index 000000000..a70876a0a
--- /dev/null
+++ b/drivers/infiniband/core/cq.c
@@ -0,0 +1,507 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2015 HGST, a Western Digital Company.
+ */
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <rdma/ib_verbs.h>
+
+#include "core_priv.h"
+
+#include <trace/events/rdma_core.h>
+/* Max size for shared CQ, may require tuning */
+#define IB_MAX_SHARED_CQ_SZ 4096U
+
+/* # of WCs to poll for with a single call to ib_poll_cq */
+#define IB_POLL_BATCH 16
+#define IB_POLL_BATCH_DIRECT 8
+
+/* # of WCs to iterate over before yielding */
+#define IB_POLL_BUDGET_IRQ 256
+#define IB_POLL_BUDGET_WORKQUEUE 65536
+
+#define IB_POLL_FLAGS \
+ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+
+static const struct dim_cq_moder
+rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = {
+ {1, 0, 1, 0},
+ {1, 0, 4, 0},
+ {2, 0, 4, 0},
+ {2, 0, 8, 0},
+ {4, 0, 8, 0},
+ {16, 0, 8, 0},
+ {16, 0, 16, 0},
+ {32, 0, 16, 0},
+ {32, 0, 32, 0},
+};
+
+static void ib_cq_rdma_dim_work(struct work_struct *w)
+{
+ struct dim *dim = container_of(w, struct dim, work);
+ struct ib_cq *cq = dim->priv;
+
+ u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+ u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+
+ dim->state = DIM_START_MEASURE;
+
+ trace_cq_modify(cq, comps, usec);
+ cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static void rdma_dim_init(struct ib_cq *cq)
+{
+ struct dim *dim;
+
+ if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
+ cq->poll_ctx == IB_POLL_DIRECT)
+ return;
+
+ dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
+ if (!dim)
+ return;
+
+ dim->state = DIM_START_MEASURE;
+ dim->tune_state = DIM_GOING_RIGHT;
+ dim->profile_ix = RDMA_DIM_START_PROFILE;
+ dim->priv = cq;
+ cq->dim = dim;
+
+ INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
+}
+
+static void rdma_dim_destroy(struct ib_cq *cq)
+{
+ if (!cq->dim)
+ return;
+
+ cancel_work_sync(&cq->dim->work);
+ kfree(cq->dim);
+}
+
+static int __poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
+{
+ int rc;
+
+ rc = ib_poll_cq(cq, num_entries, wc);
+ trace_cq_poll(cq, num_entries, rc);
+ return rc;
+}
+
+static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
+ int batch)
+{
+ int i, n, completed = 0;
+
+ trace_cq_process(cq);
+
+ /*
+ * budget might be (-1) if the caller does not
+ * want to bound this call, thus we need unsigned
+ * minimum here.
+ */
+ while ((n = __poll_cq(cq, min_t(u32, batch,
+ budget - completed), wcs)) > 0) {
+ for (i = 0; i < n; i++) {
+ struct ib_wc *wc = &wcs[i];
+
+ if (wc->wr_cqe)
+ wc->wr_cqe->done(cq, wc);
+ else
+ WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
+ }
+
+ completed += n;
+
+ if (n != batch || (budget != -1 && completed >= budget))
+ break;
+ }
+
+ return completed;
+}
+
+/**
+ * ib_process_cq_direct - process a CQ in caller context
+ * @cq: CQ to process
+ * @budget: number of CQEs to poll for
+ *
+ * This function is used to process all outstanding CQ entries.
+ * It does not offload CQ processing to a different context and does
+ * not ask for completion interrupts from the HCA.
+ * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger
+ * concurrent processing.
+ *
+ * Note: do not pass -1 as %budget unless it is guaranteed that the number
+ * of completions that will be processed is small.
+ */
+int ib_process_cq_direct(struct ib_cq *cq, int budget)
+{
+ struct ib_wc wcs[IB_POLL_BATCH_DIRECT];
+
+ return __ib_process_cq(cq, budget, wcs, IB_POLL_BATCH_DIRECT);
+}
+EXPORT_SYMBOL(ib_process_cq_direct);
+
+static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
+{
+ WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
+}
+
+static int ib_poll_handler(struct irq_poll *iop, int budget)
+{
+ struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+ struct dim *dim = cq->dim;
+ int completed;
+
+ completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
+ if (completed < budget) {
+ irq_poll_complete(&cq->iop);
+ if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) {
+ trace_cq_reschedule(cq);
+ irq_poll_sched(&cq->iop);
+ }
+ }
+
+ if (dim)
+ rdma_dim(dim, completed);
+
+ return completed;
+}
+
+static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
+{
+ trace_cq_schedule(cq);
+ irq_poll_sched(&cq->iop);
+}
+
+static void ib_cq_poll_work(struct work_struct *work)
+{
+ struct ib_cq *cq = container_of(work, struct ib_cq, work);
+ int completed;
+
+ completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, cq->wc,
+ IB_POLL_BATCH);
+ if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
+ ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
+ queue_work(cq->comp_wq, &cq->work);
+ else if (cq->dim)
+ rdma_dim(cq->dim, completed);
+}
+
+static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
+{
+ trace_cq_schedule(cq);
+ queue_work(cq->comp_wq, &cq->work);
+}
+
+/**
+ * __ib_alloc_cq - allocate a completion queue
+ * @dev: device to allocate the CQ for
+ * @private: driver private data, accessible from cq->cq_context
+ * @nr_cqe: number of CQEs to allocate
+ * @comp_vector: HCA completion vectors for this CQ
+ * @poll_ctx: context to poll the CQ from.
+ * @caller: module owner name.
+ *
+ * This is the proper interface to allocate a CQ for in-kernel users. A
+ * CQ allocated with this interface will automatically be polled from the
+ * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
+ * to use this CQ abstraction.
+ */
+struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe,
+ int comp_vector, enum ib_poll_context poll_ctx,
+ const char *caller)
+{
+ struct ib_cq_init_attr cq_attr = {
+ .cqe = nr_cqe,
+ .comp_vector = comp_vector,
+ };
+ struct ib_cq *cq;
+ int ret = -ENOMEM;
+
+ cq = rdma_zalloc_drv_obj(dev, ib_cq);
+ if (!cq)
+ return ERR_PTR(ret);
+
+ cq->device = dev;
+ cq->cq_context = private;
+ cq->poll_ctx = poll_ctx;
+ atomic_set(&cq->usecnt, 0);
+ cq->comp_vector = comp_vector;
+
+ cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
+ if (!cq->wc)
+ goto out_free_cq;
+
+ rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
+ rdma_restrack_set_name(&cq->res, caller);
+
+ ret = dev->ops.create_cq(cq, &cq_attr, NULL);
+ if (ret)
+ goto out_free_wc;
+
+ rdma_dim_init(cq);
+
+ switch (cq->poll_ctx) {
+ case IB_POLL_DIRECT:
+ cq->comp_handler = ib_cq_completion_direct;
+ break;
+ case IB_POLL_SOFTIRQ:
+ cq->comp_handler = ib_cq_completion_softirq;
+
+ irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ break;
+ case IB_POLL_WORKQUEUE:
+ case IB_POLL_UNBOUND_WORKQUEUE:
+ cq->comp_handler = ib_cq_completion_workqueue;
+ INIT_WORK(&cq->work, ib_cq_poll_work);
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ?
+ ib_comp_wq : ib_comp_unbound_wq;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out_destroy_cq;
+ }
+
+ rdma_restrack_add(&cq->res);
+ trace_cq_alloc(cq, nr_cqe, comp_vector, poll_ctx);
+ return cq;
+
+out_destroy_cq:
+ rdma_dim_destroy(cq);
+ cq->device->ops.destroy_cq(cq, NULL);
+out_free_wc:
+ rdma_restrack_put(&cq->res);
+ kfree(cq->wc);
+out_free_cq:
+ kfree(cq);
+ trace_cq_alloc_error(nr_cqe, comp_vector, poll_ctx, ret);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(__ib_alloc_cq);
+
+/**
+ * __ib_alloc_cq_any - allocate a completion queue
+ * @dev: device to allocate the CQ for
+ * @private: driver private data, accessible from cq->cq_context
+ * @nr_cqe: number of CQEs to allocate
+ * @poll_ctx: context to poll the CQ from
+ * @caller: module owner name
+ *
+ * Attempt to spread ULP Completion Queues over each device's interrupt
+ * vectors. A simple best-effort mechanism is used.
+ */
+struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
+ int nr_cqe, enum ib_poll_context poll_ctx,
+ const char *caller)
+{
+ static atomic_t counter;
+ int comp_vector = 0;
+
+ if (dev->num_comp_vectors > 1)
+ comp_vector =
+ atomic_inc_return(&counter) %
+ min_t(int, dev->num_comp_vectors, num_online_cpus());
+
+ return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx,
+ caller);
+}
+EXPORT_SYMBOL(__ib_alloc_cq_any);
+
+/**
+ * ib_free_cq - free a completion queue
+ * @cq: completion queue to free.
+ */
+void ib_free_cq(struct ib_cq *cq)
+{
+ int ret;
+
+ if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
+ return;
+ if (WARN_ON_ONCE(cq->cqe_used))
+ return;
+
+ switch (cq->poll_ctx) {
+ case IB_POLL_DIRECT:
+ break;
+ case IB_POLL_SOFTIRQ:
+ irq_poll_disable(&cq->iop);
+ break;
+ case IB_POLL_WORKQUEUE:
+ case IB_POLL_UNBOUND_WORKQUEUE:
+ cancel_work_sync(&cq->work);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ rdma_dim_destroy(cq);
+ trace_cq_free(cq);
+ ret = cq->device->ops.destroy_cq(cq, NULL);
+ WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail");
+ rdma_restrack_del(&cq->res);
+ kfree(cq->wc);
+ kfree(cq);
+}
+EXPORT_SYMBOL(ib_free_cq);
+
+void ib_cq_pool_cleanup(struct ib_device *dev)
+{
+ struct ib_cq *cq, *n;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) {
+ list_for_each_entry_safe(cq, n, &dev->cq_pools[i],
+ pool_entry) {
+ WARN_ON(cq->cqe_used);
+ list_del(&cq->pool_entry);
+ cq->shared = false;
+ ib_free_cq(cq);
+ }
+ }
+}
+
+static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes,
+ enum ib_poll_context poll_ctx)
+{
+ LIST_HEAD(tmp_list);
+ unsigned int nr_cqs, i;
+ struct ib_cq *cq, *n;
+ int ret;
+
+ if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
+ WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
+ return -EINVAL;
+ }
+
+ /*
+ * Allocate at least as many CQEs as requested, and otherwise
+ * a reasonable batch size so that we can share CQs between
+ * multiple users instead of allocating a larger number of CQs.
+ */
+ nr_cqes = min_t(unsigned int, dev->attrs.max_cqe,
+ max(nr_cqes, IB_MAX_SHARED_CQ_SZ));
+ nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
+ for (i = 0; i < nr_cqs; i++) {
+ cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx);
+ if (IS_ERR(cq)) {
+ ret = PTR_ERR(cq);
+ goto out_free_cqs;
+ }
+ cq->shared = true;
+ list_add_tail(&cq->pool_entry, &tmp_list);
+ }
+
+ spin_lock_irq(&dev->cq_pools_lock);
+ list_splice(&tmp_list, &dev->cq_pools[poll_ctx]);
+ spin_unlock_irq(&dev->cq_pools_lock);
+
+ return 0;
+
+out_free_cqs:
+ list_for_each_entry_safe(cq, n, &tmp_list, pool_entry) {
+ cq->shared = false;
+ ib_free_cq(cq);
+ }
+ return ret;
+}
+
+/**
+ * ib_cq_pool_get() - Find the least used completion queue that matches
+ * a given cpu hint (or least used for wild card affinity) and fits
+ * nr_cqe.
+ * @dev: rdma device
+ * @nr_cqe: number of needed cqe entries
+ * @comp_vector_hint: completion vector hint (-1) for the driver to assign
+ * a comp vector based on internal counter
+ * @poll_ctx: cq polling context
+ *
+ * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and
+ * claim entries in it for us. In case there is no available cq, allocate
+ * a new cq with the requirements and add it to the device pool.
+ * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value
+ * for @poll_ctx.
+ */
+struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
+ int comp_vector_hint,
+ enum ib_poll_context poll_ctx)
+{
+ static unsigned int default_comp_vector;
+ unsigned int vector, num_comp_vectors;
+ struct ib_cq *cq, *found = NULL;
+ int ret;
+
+ if (poll_ctx > IB_POLL_LAST_POOL_TYPE) {
+ WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE);
+ return ERR_PTR(-EINVAL);
+ }
+
+ num_comp_vectors =
+ min_t(unsigned int, dev->num_comp_vectors, num_online_cpus());
+ /* Project the affinty to the device completion vector range */
+ if (comp_vector_hint < 0) {
+ comp_vector_hint =
+ (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors;
+ WRITE_ONCE(default_comp_vector, comp_vector_hint);
+ }
+ vector = comp_vector_hint % num_comp_vectors;
+
+ /*
+ * Find the least used CQ with correct affinity and
+ * enough free CQ entries
+ */
+ while (!found) {
+ spin_lock_irq(&dev->cq_pools_lock);
+ list_for_each_entry(cq, &dev->cq_pools[poll_ctx],
+ pool_entry) {
+ /*
+ * Check to see if we have found a CQ with the
+ * correct completion vector
+ */
+ if (vector != cq->comp_vector)
+ continue;
+ if (cq->cqe_used + nr_cqe > cq->cqe)
+ continue;
+ found = cq;
+ break;
+ }
+
+ if (found) {
+ found->cqe_used += nr_cqe;
+ spin_unlock_irq(&dev->cq_pools_lock);
+
+ return found;
+ }
+ spin_unlock_irq(&dev->cq_pools_lock);
+
+ /*
+ * Didn't find a match or ran out of CQs in the device
+ * pool, allocate a new array of CQs.
+ */
+ ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ return found;
+}
+EXPORT_SYMBOL(ib_cq_pool_get);
+
+/**
+ * ib_cq_pool_put - Return a CQ taken from a shared pool.
+ * @cq: The CQ to return.
+ * @nr_cqe: The max number of cqes that the user had requested.
+ */
+void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe)
+{
+ if (WARN_ON_ONCE(nr_cqe > cq->cqe_used))
+ return;
+
+ spin_lock_irq(&cq->device->cq_pools_lock);
+ cq->cqe_used -= nr_cqe;
+ spin_unlock_irq(&cq->device->cq_pools_lock);
+}
+EXPORT_SYMBOL(ib_cq_pool_put);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
new file mode 100644
index 000000000..3a9b9a28d
--- /dev/null
+++ b/drivers/infiniband/core/device.c
@@ -0,0 +1,2878 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/hashtable.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
+
+#include "core_priv.h"
+#include "restrack.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct workqueue_struct *ib_comp_wq;
+struct workqueue_struct *ib_comp_unbound_wq;
+struct workqueue_struct *ib_wq;
+EXPORT_SYMBOL_GPL(ib_wq);
+static struct workqueue_struct *ib_unreg_wq;
+
+/*
+ * Each of the three rwsem locks (devices, clients, client_data) protects the
+ * xarray of the same name. Specifically it allows the caller to assert that
+ * the MARK will/will not be changing under the lock, and for devices and
+ * clients, that the value in the xarray is still a valid pointer. Change of
+ * the MARK is linked to the object state, so holding the lock and testing the
+ * MARK also asserts that the contained object is in a certain state.
+ *
+ * This is used to build a two stage register/unregister flow where objects
+ * can continue to be in the xarray even though they are still in progress to
+ * register/unregister.
+ *
+ * The xarray itself provides additional locking, and restartable iteration,
+ * which is also relied on.
+ *
+ * Locks should not be nested, with the exception of client_data, which is
+ * allowed to nest under the read side of the other two locks.
+ *
+ * The devices_rwsem also protects the device name list, any change or
+ * assignment of device name must also hold the write side to guarantee unique
+ * names.
+ */
+
+/*
+ * devices contains devices that have had their names assigned. The
+ * devices may not be registered. Users that care about the registration
+ * status need to call ib_device_try_get() on the device to ensure it is
+ * registered, and keep it registered, for the required duration.
+ *
+ */
+static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(devices_rwsem);
+#define DEVICE_REGISTERED XA_MARK_1
+
+static u32 highest_client_id;
+#define CLIENT_REGISTERED XA_MARK_1
+static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
+static DECLARE_RWSEM(clients_rwsem);
+
+static void ib_client_put(struct ib_client *client)
+{
+ if (refcount_dec_and_test(&client->uses))
+ complete(&client->uses_zero);
+}
+
+/*
+ * If client_data is registered then the corresponding client must also still
+ * be registered.
+ */
+#define CLIENT_DATA_REGISTERED XA_MARK_1
+
+unsigned int rdma_dev_net_id;
+
+/*
+ * A list of net namespaces is maintained in an xarray. This is necessary
+ * because we can't get the locking right using the existing net ns list. We
+ * would require a init_net callback after the list is updated.
+ */
+static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
+/*
+ * rwsem to protect accessing the rdma_nets xarray entries.
+ */
+static DECLARE_RWSEM(rdma_nets_rwsem);
+
+bool ib_devices_shared_netns = true;
+module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
+MODULE_PARM_DESC(netns_mode,
+ "Share device among net namespaces; default=1 (shared)");
+/**
+ * rdma_dev_access_netns() - Return whether an rdma device can be accessed
+ * from a specified net namespace or not.
+ * @dev: Pointer to rdma device which needs to be checked
+ * @net: Pointer to net namesapce for which access to be checked
+ *
+ * When the rdma device is in shared mode, it ignores the net namespace.
+ * When the rdma device is exclusive to a net namespace, rdma device net
+ * namespace is checked against the specified one.
+ */
+bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
+{
+ return (ib_devices_shared_netns ||
+ net_eq(read_pnet(&dev->coredev.rdma_net), net));
+}
+EXPORT_SYMBOL(rdma_dev_access_netns);
+
+/*
+ * xarray has this behavior where it won't iterate over NULL values stored in
+ * allocated arrays. So we need our own iterator to see all values stored in
+ * the array. This does the same thing as xa_for_each except that it also
+ * returns NULL valued entries if the array is allocating. Simplified to only
+ * work on simple xarrays.
+ */
+static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
+ xa_mark_t filter)
+{
+ XA_STATE(xas, xa, *indexp);
+ void *entry;
+
+ rcu_read_lock();
+ do {
+ entry = xas_find_marked(&xas, ULONG_MAX, filter);
+ if (xa_is_zero(entry))
+ break;
+ } while (xas_retry(&xas, entry));
+ rcu_read_unlock();
+
+ if (entry) {
+ *indexp = xas.xa_index;
+ if (xa_is_zero(entry))
+ return NULL;
+ return entry;
+ }
+ return XA_ERROR(-ENOENT);
+}
+#define xan_for_each_marked(xa, index, entry, filter) \
+ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \
+ !xa_is_err(entry); \
+ (index)++, entry = xan_find_marked(xa, &(index), filter))
+
+/* RCU hash table mapping netdevice pointers to struct ib_port_data */
+static DEFINE_SPINLOCK(ndev_hash_lock);
+static DECLARE_HASHTABLE(ndev_hash, 5);
+
+static void free_netdevs(struct ib_device *ib_dev);
+static void ib_unregister_work(struct work_struct *work);
+static void __ib_unregister_device(struct ib_device *device);
+static int ib_security_change(struct notifier_block *nb, unsigned long event,
+ void *lsm_data);
+static void ib_policy_change_task(struct work_struct *work);
+static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
+
+static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
+ struct va_format *vaf)
+{
+ if (ibdev && ibdev->dev.parent)
+ dev_printk_emit(level[1] - '0',
+ ibdev->dev.parent,
+ "%s %s %s: %pV",
+ dev_driver_string(ibdev->dev.parent),
+ dev_name(ibdev->dev.parent),
+ dev_name(&ibdev->dev),
+ vaf);
+ else if (ibdev)
+ printk("%s%s: %pV",
+ level, dev_name(&ibdev->dev), vaf);
+ else
+ printk("%s(NULL ib_device): %pV", level, vaf);
+}
+
+void ibdev_printk(const char *level, const struct ib_device *ibdev,
+ const char *format, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, format);
+
+ vaf.fmt = format;
+ vaf.va = &args;
+
+ __ibdev_printk(level, ibdev, &vaf);
+
+ va_end(args);
+}
+EXPORT_SYMBOL(ibdev_printk);
+
+#define define_ibdev_printk_level(func, level) \
+void func(const struct ib_device *ibdev, const char *fmt, ...) \
+{ \
+ struct va_format vaf; \
+ va_list args; \
+ \
+ va_start(args, fmt); \
+ \
+ vaf.fmt = fmt; \
+ vaf.va = &args; \
+ \
+ __ibdev_printk(level, ibdev, &vaf); \
+ \
+ va_end(args); \
+} \
+EXPORT_SYMBOL(func);
+
+define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
+define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
+define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
+define_ibdev_printk_level(ibdev_err, KERN_ERR);
+define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
+define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
+define_ibdev_printk_level(ibdev_info, KERN_INFO);
+
+static struct notifier_block ibdev_lsm_nb = {
+ .notifier_call = ib_security_change,
+};
+
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
+ struct net *net);
+
+/* Pointer to the RCU head at the start of the ib_port_data array */
+struct ib_port_data_rcu {
+ struct rcu_head rcu_head;
+ struct ib_port_data pdata[];
+};
+
+static void ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
+ static const struct {
+ size_t offset;
+ char *name;
+ } mandatory_table[] = {
+ IB_MANDATORY_FUNC(query_device),
+ IB_MANDATORY_FUNC(query_port),
+ IB_MANDATORY_FUNC(alloc_pd),
+ IB_MANDATORY_FUNC(dealloc_pd),
+ IB_MANDATORY_FUNC(create_qp),
+ IB_MANDATORY_FUNC(modify_qp),
+ IB_MANDATORY_FUNC(destroy_qp),
+ IB_MANDATORY_FUNC(post_send),
+ IB_MANDATORY_FUNC(post_recv),
+ IB_MANDATORY_FUNC(create_cq),
+ IB_MANDATORY_FUNC(destroy_cq),
+ IB_MANDATORY_FUNC(poll_cq),
+ IB_MANDATORY_FUNC(req_notify_cq),
+ IB_MANDATORY_FUNC(get_dma_mr),
+ IB_MANDATORY_FUNC(reg_user_mr),
+ IB_MANDATORY_FUNC(dereg_mr),
+ IB_MANDATORY_FUNC(get_port_immutable)
+ };
+ int i;
+
+ device->kverbs_provider = true;
+ for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+ if (!*(void **) ((void *) &device->ops +
+ mandatory_table[i].offset)) {
+ device->kverbs_provider = false;
+ break;
+ }
+ }
+}
+
+/*
+ * Caller must perform ib_device_put() to return the device reference count
+ * when ib_device_get_by_index() returns valid device pointer.
+ */
+struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
+{
+ struct ib_device *device;
+
+ down_read(&devices_rwsem);
+ device = xa_load(&devices, index);
+ if (device) {
+ if (!rdma_dev_access_netns(device, net)) {
+ device = NULL;
+ goto out;
+ }
+
+ if (!ib_device_try_get(device))
+ device = NULL;
+ }
+out:
+ up_read(&devices_rwsem);
+ return device;
+}
+
+/**
+ * ib_device_put - Release IB device reference
+ * @device: device whose reference to be released
+ *
+ * ib_device_put() releases reference to the IB device to allow it to be
+ * unregistered and eventually free.
+ */
+void ib_device_put(struct ib_device *device)
+{
+ if (refcount_dec_and_test(&device->refcount))
+ complete(&device->unreg_completion);
+}
+EXPORT_SYMBOL(ib_device_put);
+
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+ struct ib_device *device;
+ unsigned long index;
+
+ xa_for_each (&devices, index, device)
+ if (!strcmp(name, dev_name(&device->dev)))
+ return device;
+
+ return NULL;
+}
+
+/**
+ * ib_device_get_by_name - Find an IB device by name
+ * @name: The name to look for
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
+ *
+ * Find and hold an ib_device by its name. The caller must call
+ * ib_device_put() on the returned pointer.
+ */
+struct ib_device *ib_device_get_by_name(const char *name,
+ enum rdma_driver_id driver_id)
+{
+ struct ib_device *device;
+
+ down_read(&devices_rwsem);
+ device = __ib_device_get_by_name(name);
+ if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
+ device->ops.driver_id != driver_id)
+ device = NULL;
+
+ if (device) {
+ if (!ib_device_try_get(device))
+ device = NULL;
+ }
+ up_read(&devices_rwsem);
+ return device;
+}
+EXPORT_SYMBOL(ib_device_get_by_name);
+
+static int rename_compat_devs(struct ib_device *device)
+{
+ struct ib_core_device *cdev;
+ unsigned long index;
+ int ret = 0;
+
+ mutex_lock(&device->compat_devs_mutex);
+ xa_for_each (&device->compat_devs, index, cdev) {
+ ret = device_rename(&cdev->dev, dev_name(&device->dev));
+ if (ret) {
+ dev_warn(&cdev->dev,
+ "Fail to rename compatdev to new name %s\n",
+ dev_name(&device->dev));
+ break;
+ }
+ }
+ mutex_unlock(&device->compat_devs_mutex);
+ return ret;
+}
+
+int ib_device_rename(struct ib_device *ibdev, const char *name)
+{
+ unsigned long index;
+ void *client_data;
+ int ret;
+
+ down_write(&devices_rwsem);
+ if (!strcmp(name, dev_name(&ibdev->dev))) {
+ up_write(&devices_rwsem);
+ return 0;
+ }
+
+ if (__ib_device_get_by_name(name)) {
+ up_write(&devices_rwsem);
+ return -EEXIST;
+ }
+
+ ret = device_rename(&ibdev->dev, name);
+ if (ret) {
+ up_write(&devices_rwsem);
+ return ret;
+ }
+
+ strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
+ ret = rename_compat_devs(ibdev);
+
+ downgrade_write(&devices_rwsem);
+ down_read(&ibdev->client_data_rwsem);
+ xan_for_each_marked(&ibdev->client_data, index, client_data,
+ CLIENT_DATA_REGISTERED) {
+ struct ib_client *client = xa_load(&clients, index);
+
+ if (!client || !client->rename)
+ continue;
+
+ client->rename(ibdev, client_data);
+ }
+ up_read(&ibdev->client_data_rwsem);
+ up_read(&devices_rwsem);
+ return 0;
+}
+
+int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
+{
+ if (use_dim > 1)
+ return -EINVAL;
+ ibdev->use_cq_dim = use_dim;
+
+ return 0;
+}
+
+static int alloc_name(struct ib_device *ibdev, const char *name)
+{
+ struct ib_device *device;
+ unsigned long index;
+ struct ida inuse;
+ int rc;
+ int i;
+
+ lockdep_assert_held_write(&devices_rwsem);
+ ida_init(&inuse);
+ xa_for_each (&devices, index, device) {
+ char buf[IB_DEVICE_NAME_MAX];
+
+ if (sscanf(dev_name(&device->dev), name, &i) != 1)
+ continue;
+ if (i < 0 || i >= INT_MAX)
+ continue;
+ snprintf(buf, sizeof buf, name, i);
+ if (strcmp(buf, dev_name(&device->dev)) != 0)
+ continue;
+
+ rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
+ if (rc < 0)
+ goto out;
+ }
+
+ rc = ida_alloc(&inuse, GFP_KERNEL);
+ if (rc < 0)
+ goto out;
+
+ rc = dev_set_name(&ibdev->dev, name, rc);
+out:
+ ida_destroy(&inuse);
+ return rc;
+}
+
+static void ib_device_release(struct device *device)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ free_netdevs(dev);
+ WARN_ON(refcount_read(&dev->refcount));
+ if (dev->hw_stats_data)
+ ib_device_release_hw_stats(dev->hw_stats_data);
+ if (dev->port_data) {
+ ib_cache_release_one(dev);
+ ib_security_release_port_pkey_list(dev);
+ rdma_counter_release(dev);
+ kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
+ pdata[0]),
+ rcu_head);
+ }
+
+ mutex_destroy(&dev->unregistration_lock);
+ mutex_destroy(&dev->compat_devs_mutex);
+
+ xa_destroy(&dev->compat_devs);
+ xa_destroy(&dev->client_data);
+ kfree_rcu(dev, rcu_head);
+}
+
+static int ib_device_uevent(struct device *device,
+ struct kobj_uevent_env *env)
+{
+ if (add_uevent_var(env, "NAME=%s", dev_name(device)))
+ return -ENOMEM;
+
+ /*
+ * It would be nice to pass the node GUID with the event...
+ */
+
+ return 0;
+}
+
+static const void *net_namespace(struct device *d)
+{
+ struct ib_core_device *coredev =
+ container_of(d, struct ib_core_device, dev);
+
+ return read_pnet(&coredev->rdma_net);
+}
+
+static struct class ib_class = {
+ .name = "infiniband",
+ .dev_release = ib_device_release,
+ .dev_uevent = ib_device_uevent,
+ .ns_type = &net_ns_type_operations,
+ .namespace = net_namespace,
+};
+
+static void rdma_init_coredev(struct ib_core_device *coredev,
+ struct ib_device *dev, struct net *net)
+{
+ /* This BUILD_BUG_ON is intended to catch layout change
+ * of union of ib_core_device and device.
+ * dev must be the first element as ib_core and providers
+ * driver uses it. Adding anything in ib_core_device before
+ * device will break this assumption.
+ */
+ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
+ offsetof(struct ib_device, dev));
+
+ coredev->dev.class = &ib_class;
+ coredev->dev.groups = dev->groups;
+ device_initialize(&coredev->dev);
+ coredev->owner = dev;
+ INIT_LIST_HEAD(&coredev->port_list);
+ write_pnet(&coredev->rdma_net, net);
+}
+
+/**
+ * _ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers should use ib_alloc_device() to allocate &struct
+ * ib_device. @size is the size of the structure to be allocated,
+ * including any private data used by the low-level driver.
+ * ib_dealloc_device() must be used to free structures allocated with
+ * ib_alloc_device().
+ */
+struct ib_device *_ib_alloc_device(size_t size)
+{
+ struct ib_device *device;
+ unsigned int i;
+
+ if (WARN_ON(size < sizeof(struct ib_device)))
+ return NULL;
+
+ device = kzalloc(size, GFP_KERNEL);
+ if (!device)
+ return NULL;
+
+ if (rdma_restrack_init(device)) {
+ kfree(device);
+ return NULL;
+ }
+
+ rdma_init_coredev(&device->coredev, device, &init_net);
+
+ INIT_LIST_HEAD(&device->event_handler_list);
+ spin_lock_init(&device->qp_open_list_lock);
+ init_rwsem(&device->event_handler_rwsem);
+ mutex_init(&device->unregistration_lock);
+ /*
+ * client_data needs to be alloc because we don't want our mark to be
+ * destroyed if the user stores NULL in the client data.
+ */
+ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
+ init_rwsem(&device->client_data_rwsem);
+ xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
+ mutex_init(&device->compat_devs_mutex);
+ init_completion(&device->unreg_completion);
+ INIT_WORK(&device->unregistration_work, ib_unregister_work);
+
+ spin_lock_init(&device->cq_pools_lock);
+ for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++)
+ INIT_LIST_HEAD(&device->cq_pools[i]);
+
+ rwlock_init(&device->cache_lock);
+
+ device->uverbs_cmd_mask =
+ BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) |
+ BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) |
+ BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) |
+ BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) |
+ BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) |
+ BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) |
+ BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) |
+ BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) |
+ BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) |
+ BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) |
+ BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) |
+ BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) |
+ BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) |
+ BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) |
+ BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) |
+ BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) |
+ BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) |
+ BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) |
+ BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) |
+ BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) |
+ BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) |
+ BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) |
+ BIT_ULL(IB_USER_VERBS_CMD_REG_MR) |
+ BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) |
+ BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ);
+ return device;
+}
+EXPORT_SYMBOL(_ib_alloc_device);
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Free a structure allocated with ib_alloc_device().
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+ if (device->ops.dealloc_driver)
+ device->ops.dealloc_driver(device);
+
+ /*
+ * ib_unregister_driver() requires all devices to remain in the xarray
+ * while their ops are callable. The last op we call is dealloc_driver
+ * above. This is needed to create a fence on op callbacks prior to
+ * allowing the driver module to unload.
+ */
+ down_write(&devices_rwsem);
+ if (xa_load(&devices, device->index) == device)
+ xa_erase(&devices, device->index);
+ up_write(&devices_rwsem);
+
+ /* Expedite releasing netdev references */
+ free_netdevs(device);
+
+ WARN_ON(!xa_empty(&device->compat_devs));
+ WARN_ON(!xa_empty(&device->client_data));
+ WARN_ON(refcount_read(&device->refcount));
+ rdma_restrack_clean(device);
+ /* Balances with device_initialize */
+ put_device(&device->dev);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+/*
+ * add_client_context() and remove_client_context() must be safe against
+ * parallel calls on the same device - registration/unregistration of both the
+ * device and client can be occurring in parallel.
+ *
+ * The routines need to be a fence, any caller must not return until the add
+ * or remove is fully completed.
+ */
+static int add_client_context(struct ib_device *device,
+ struct ib_client *client)
+{
+ int ret = 0;
+
+ if (!device->kverbs_provider && !client->no_kverbs_req)
+ return 0;
+
+ down_write(&device->client_data_rwsem);
+ /*
+ * So long as the client is registered hold both the client and device
+ * unregistration locks.
+ */
+ if (!refcount_inc_not_zero(&client->uses))
+ goto out_unlock;
+ refcount_inc(&device->refcount);
+
+ /*
+ * Another caller to add_client_context got here first and has already
+ * completely initialized context.
+ */
+ if (xa_get_mark(&device->client_data, client->client_id,
+ CLIENT_DATA_REGISTERED))
+ goto out;
+
+ ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
+ GFP_KERNEL));
+ if (ret)
+ goto out;
+ downgrade_write(&device->client_data_rwsem);
+ if (client->add) {
+ if (client->add(device)) {
+ /*
+ * If a client fails to add then the error code is
+ * ignored, but we won't call any more ops on this
+ * client.
+ */
+ xa_erase(&device->client_data, client->client_id);
+ up_read(&device->client_data_rwsem);
+ ib_device_put(device);
+ ib_client_put(client);
+ return 0;
+ }
+ }
+
+ /* Readers shall not see a client until add has been completed */
+ xa_set_mark(&device->client_data, client->client_id,
+ CLIENT_DATA_REGISTERED);
+ up_read(&device->client_data_rwsem);
+ return 0;
+
+out:
+ ib_device_put(device);
+ ib_client_put(client);
+out_unlock:
+ up_write(&device->client_data_rwsem);
+ return ret;
+}
+
+static void remove_client_context(struct ib_device *device,
+ unsigned int client_id)
+{
+ struct ib_client *client;
+ void *client_data;
+
+ down_write(&device->client_data_rwsem);
+ if (!xa_get_mark(&device->client_data, client_id,
+ CLIENT_DATA_REGISTERED)) {
+ up_write(&device->client_data_rwsem);
+ return;
+ }
+ client_data = xa_load(&device->client_data, client_id);
+ xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
+ client = xa_load(&clients, client_id);
+ up_write(&device->client_data_rwsem);
+
+ /*
+ * Notice we cannot be holding any exclusive locks when calling the
+ * remove callback as the remove callback can recurse back into any
+ * public functions in this module and thus try for any locks those
+ * functions take.
+ *
+ * For this reason clients and drivers should not call the
+ * unregistration functions will holdling any locks.
+ */
+ if (client->remove)
+ client->remove(device, client_data);
+
+ xa_erase(&device->client_data, client_id);
+ ib_device_put(device);
+ ib_client_put(client);
+}
+
+static int alloc_port_data(struct ib_device *device)
+{
+ struct ib_port_data_rcu *pdata_rcu;
+ u32 port;
+
+ if (device->port_data)
+ return 0;
+
+ /* This can only be called once the physical port range is defined */
+ if (WARN_ON(!device->phys_port_cnt))
+ return -EINVAL;
+
+ /* Reserve U32_MAX so the logic to go over all the ports is sane */
+ if (WARN_ON(device->phys_port_cnt == U32_MAX))
+ return -EINVAL;
+
+ /*
+ * device->port_data is indexed directly by the port number to make
+ * access to this data as efficient as possible.
+ *
+ * Therefore port_data is declared as a 1 based array with potential
+ * empty slots at the beginning.
+ */
+ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
+ size_add(rdma_end_port(device), 1)),
+ GFP_KERNEL);
+ if (!pdata_rcu)
+ return -ENOMEM;
+ /*
+ * The rcu_head is put in front of the port data array and the stored
+ * pointer is adjusted since we never need to see that member until
+ * kfree_rcu.
+ */
+ device->port_data = pdata_rcu->pdata;
+
+ rdma_for_each_port (device, port) {
+ struct ib_port_data *pdata = &device->port_data[port];
+
+ pdata->ib_dev = device;
+ spin_lock_init(&pdata->pkey_list_lock);
+ INIT_LIST_HEAD(&pdata->pkey_list);
+ spin_lock_init(&pdata->netdev_lock);
+ INIT_HLIST_NODE(&pdata->ndev_hash_link);
+ }
+ return 0;
+}
+
+static int verify_immutable(const struct ib_device *dev, u32 port)
+{
+ return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
+ rdma_max_mad_size(dev, port) != 0);
+}
+
+static int setup_port_data(struct ib_device *device)
+{
+ u32 port;
+ int ret;
+
+ ret = alloc_port_data(device);
+ if (ret)
+ return ret;
+
+ rdma_for_each_port (device, port) {
+ struct ib_port_data *pdata = &device->port_data[port];
+
+ ret = device->ops.get_port_immutable(device, port,
+ &pdata->immutable);
+ if (ret)
+ return ret;
+
+ if (verify_immutable(device, port))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * ib_port_immutable_read() - Read rdma port's immutable data
+ * @dev: IB device
+ * @port: port number whose immutable data to read. It starts with index 1 and
+ * valid upto including rdma_end_port().
+ */
+const struct ib_port_immutable*
+ib_port_immutable_read(struct ib_device *dev, unsigned int port)
+{
+ WARN_ON(!rdma_is_port_valid(dev, port));
+ return &dev->port_data[port].immutable;
+}
+EXPORT_SYMBOL(ib_port_immutable_read);
+
+void ib_get_device_fw_str(struct ib_device *dev, char *str)
+{
+ if (dev->ops.get_dev_fw_str)
+ dev->ops.get_dev_fw_str(dev, str);
+ else
+ str[0] = '\0';
+}
+EXPORT_SYMBOL(ib_get_device_fw_str);
+
+static void ib_policy_change_task(struct work_struct *work)
+{
+ struct ib_device *dev;
+ unsigned long index;
+
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+ unsigned int i;
+
+ rdma_for_each_port (dev, i) {
+ u64 sp;
+ ib_get_cached_subnet_prefix(dev, i, &sp);
+ ib_security_cache_change(dev, i, sp);
+ }
+ }
+ up_read(&devices_rwsem);
+}
+
+static int ib_security_change(struct notifier_block *nb, unsigned long event,
+ void *lsm_data)
+{
+ if (event != LSM_POLICY_CHANGE)
+ return NOTIFY_DONE;
+
+ schedule_work(&ib_policy_change_work);
+ ib_mad_agent_security_change();
+
+ return NOTIFY_OK;
+}
+
+static void compatdev_release(struct device *dev)
+{
+ struct ib_core_device *cdev =
+ container_of(dev, struct ib_core_device, dev);
+
+ kfree(cdev);
+}
+
+static int add_one_compat_dev(struct ib_device *device,
+ struct rdma_dev_net *rnet)
+{
+ struct ib_core_device *cdev;
+ int ret;
+
+ lockdep_assert_held(&rdma_nets_rwsem);
+ if (!ib_devices_shared_netns)
+ return 0;
+
+ /*
+ * Create and add compat device in all namespaces other than where it
+ * is currently bound to.
+ */
+ if (net_eq(read_pnet(&rnet->net),
+ read_pnet(&device->coredev.rdma_net)))
+ return 0;
+
+ /*
+ * The first of init_net() or ib_register_device() to take the
+ * compat_devs_mutex wins and gets to add the device. Others will wait
+ * for completion here.
+ */
+ mutex_lock(&device->compat_devs_mutex);
+ cdev = xa_load(&device->compat_devs, rnet->id);
+ if (cdev) {
+ ret = 0;
+ goto done;
+ }
+ ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
+ if (ret)
+ goto done;
+
+ cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
+ if (!cdev) {
+ ret = -ENOMEM;
+ goto cdev_err;
+ }
+
+ cdev->dev.parent = device->dev.parent;
+ rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
+ cdev->dev.release = compatdev_release;
+ ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
+ if (ret)
+ goto add_err;
+
+ ret = device_add(&cdev->dev);
+ if (ret)
+ goto add_err;
+ ret = ib_setup_port_attrs(cdev);
+ if (ret)
+ goto port_err;
+
+ ret = xa_err(xa_store(&device->compat_devs, rnet->id,
+ cdev, GFP_KERNEL));
+ if (ret)
+ goto insert_err;
+
+ mutex_unlock(&device->compat_devs_mutex);
+ return 0;
+
+insert_err:
+ ib_free_port_attrs(cdev);
+port_err:
+ device_del(&cdev->dev);
+add_err:
+ put_device(&cdev->dev);
+cdev_err:
+ xa_release(&device->compat_devs, rnet->id);
+done:
+ mutex_unlock(&device->compat_devs_mutex);
+ return ret;
+}
+
+static void remove_one_compat_dev(struct ib_device *device, u32 id)
+{
+ struct ib_core_device *cdev;
+
+ mutex_lock(&device->compat_devs_mutex);
+ cdev = xa_erase(&device->compat_devs, id);
+ mutex_unlock(&device->compat_devs_mutex);
+ if (cdev) {
+ ib_free_port_attrs(cdev);
+ device_del(&cdev->dev);
+ put_device(&cdev->dev);
+ }
+}
+
+static void remove_compat_devs(struct ib_device *device)
+{
+ struct ib_core_device *cdev;
+ unsigned long index;
+
+ xa_for_each (&device->compat_devs, index, cdev)
+ remove_one_compat_dev(device, index);
+}
+
+static int add_compat_devs(struct ib_device *device)
+{
+ struct rdma_dev_net *rnet;
+ unsigned long index;
+ int ret = 0;
+
+ lockdep_assert_held(&devices_rwsem);
+
+ down_read(&rdma_nets_rwsem);
+ xa_for_each (&rdma_nets, index, rnet) {
+ ret = add_one_compat_dev(device, rnet);
+ if (ret)
+ break;
+ }
+ up_read(&rdma_nets_rwsem);
+ return ret;
+}
+
+static void remove_all_compat_devs(void)
+{
+ struct ib_compat_device *cdev;
+ struct ib_device *dev;
+ unsigned long index;
+
+ down_read(&devices_rwsem);
+ xa_for_each (&devices, index, dev) {
+ unsigned long c_index = 0;
+
+ /* Hold nets_rwsem so that any other thread modifying this
+ * system param can sync with this thread.
+ */
+ down_read(&rdma_nets_rwsem);
+ xa_for_each (&dev->compat_devs, c_index, cdev)
+ remove_one_compat_dev(dev, c_index);
+ up_read(&rdma_nets_rwsem);
+ }
+ up_read(&devices_rwsem);
+}
+
+static int add_all_compat_devs(void)
+{
+ struct rdma_dev_net *rnet;
+ struct ib_device *dev;
+ unsigned long index;
+ int ret = 0;
+
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+ unsigned long net_index = 0;
+
+ /* Hold nets_rwsem so that any other thread modifying this
+ * system param can sync with this thread.
+ */
+ down_read(&rdma_nets_rwsem);
+ xa_for_each (&rdma_nets, net_index, rnet) {
+ ret = add_one_compat_dev(dev, rnet);
+ if (ret)
+ break;
+ }
+ up_read(&rdma_nets_rwsem);
+ }
+ up_read(&devices_rwsem);
+ if (ret)
+ remove_all_compat_devs();
+ return ret;
+}
+
+int rdma_compatdev_set(u8 enable)
+{
+ struct rdma_dev_net *rnet;
+ unsigned long index;
+ int ret = 0;
+
+ down_write(&rdma_nets_rwsem);
+ if (ib_devices_shared_netns == enable) {
+ up_write(&rdma_nets_rwsem);
+ return 0;
+ }
+
+ /* enable/disable of compat devices is not supported
+ * when more than default init_net exists.
+ */
+ xa_for_each (&rdma_nets, index, rnet) {
+ ret++;
+ break;
+ }
+ if (!ret)
+ ib_devices_shared_netns = enable;
+ up_write(&rdma_nets_rwsem);
+ if (ret)
+ return -EBUSY;
+
+ if (enable)
+ ret = add_all_compat_devs();
+ else
+ remove_all_compat_devs();
+ return ret;
+}
+
+static void rdma_dev_exit_net(struct net *net)
+{
+ struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
+ struct ib_device *dev;
+ unsigned long index;
+ int ret;
+
+ down_write(&rdma_nets_rwsem);
+ /*
+ * Prevent the ID from being re-used and hide the id from xa_for_each.
+ */
+ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
+ WARN_ON(ret);
+ up_write(&rdma_nets_rwsem);
+
+ down_read(&devices_rwsem);
+ xa_for_each (&devices, index, dev) {
+ get_device(&dev->dev);
+ /*
+ * Release the devices_rwsem so that pontentially blocking
+ * device_del, doesn't hold the devices_rwsem for too long.
+ */
+ up_read(&devices_rwsem);
+
+ remove_one_compat_dev(dev, rnet->id);
+
+ /*
+ * If the real device is in the NS then move it back to init.
+ */
+ rdma_dev_change_netns(dev, net, &init_net);
+
+ put_device(&dev->dev);
+ down_read(&devices_rwsem);
+ }
+ up_read(&devices_rwsem);
+
+ rdma_nl_net_exit(rnet);
+ xa_erase(&rdma_nets, rnet->id);
+}
+
+static __net_init int rdma_dev_init_net(struct net *net)
+{
+ struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
+ unsigned long index;
+ struct ib_device *dev;
+ int ret;
+
+ write_pnet(&rnet->net, net);
+
+ ret = rdma_nl_net_init(rnet);
+ if (ret)
+ return ret;
+
+ /* No need to create any compat devices in default init_net. */
+ if (net_eq(net, &init_net))
+ return 0;
+
+ ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
+ if (ret) {
+ rdma_nl_net_exit(rnet);
+ return ret;
+ }
+
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+ /* Hold nets_rwsem so that netlink command cannot change
+ * system configuration for device sharing mode.
+ */
+ down_read(&rdma_nets_rwsem);
+ ret = add_one_compat_dev(dev, rnet);
+ up_read(&rdma_nets_rwsem);
+ if (ret)
+ break;
+ }
+ up_read(&devices_rwsem);
+
+ if (ret)
+ rdma_dev_exit_net(net);
+
+ return ret;
+}
+
+/*
+ * Assign the unique string device name and the unique device index. This is
+ * undone by ib_dealloc_device.
+ */
+static int assign_name(struct ib_device *device, const char *name)
+{
+ static u32 last_id;
+ int ret;
+
+ down_write(&devices_rwsem);
+ /* Assign a unique name to the device */
+ if (strchr(name, '%'))
+ ret = alloc_name(device, name);
+ else
+ ret = dev_set_name(&device->dev, name);
+ if (ret)
+ goto out;
+
+ if (__ib_device_get_by_name(dev_name(&device->dev))) {
+ ret = -ENFILE;
+ goto out;
+ }
+ strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
+
+ ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
+ &last_id, GFP_KERNEL);
+ if (ret > 0)
+ ret = 0;
+
+out:
+ up_write(&devices_rwsem);
+ return ret;
+}
+
+/*
+ * setup_device() allocates memory and sets up data that requires calling the
+ * device ops, this is the only reason these actions are not done during
+ * ib_alloc_device. It is undone by ib_dealloc_device().
+ */
+static int setup_device(struct ib_device *device)
+{
+ struct ib_udata uhw = {.outlen = 0, .inlen = 0};
+ int ret;
+
+ ib_device_check_mandatory(device);
+
+ ret = setup_port_data(device);
+ if (ret) {
+ dev_warn(&device->dev, "Couldn't create per-port data\n");
+ return ret;
+ }
+
+ memset(&device->attrs, 0, sizeof(device->attrs));
+ ret = device->ops.query_device(device, &device->attrs, &uhw);
+ if (ret) {
+ dev_warn(&device->dev,
+ "Couldn't query the device attributes\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static void disable_device(struct ib_device *device)
+{
+ u32 cid;
+
+ WARN_ON(!refcount_read(&device->refcount));
+
+ down_write(&devices_rwsem);
+ xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
+ up_write(&devices_rwsem);
+
+ /*
+ * Remove clients in LIFO order, see assign_client_id. This could be
+ * more efficient if xarray learns to reverse iterate. Since no new
+ * clients can be added to this ib_device past this point we only need
+ * the maximum possible client_id value here.
+ */
+ down_read(&clients_rwsem);
+ cid = highest_client_id;
+ up_read(&clients_rwsem);
+ while (cid) {
+ cid--;
+ remove_client_context(device, cid);
+ }
+
+ ib_cq_pool_cleanup(device);
+
+ /* Pairs with refcount_set in enable_device */
+ ib_device_put(device);
+ wait_for_completion(&device->unreg_completion);
+
+ /*
+ * compat devices must be removed after device refcount drops to zero.
+ * Otherwise init_net() may add more compatdevs after removing compat
+ * devices and before device is disabled.
+ */
+ remove_compat_devs(device);
+}
+
+/*
+ * An enabled device is visible to all clients and to all the public facing
+ * APIs that return a device pointer. This always returns with a new get, even
+ * if it fails.
+ */
+static int enable_device_and_get(struct ib_device *device)
+{
+ struct ib_client *client;
+ unsigned long index;
+ int ret = 0;
+
+ /*
+ * One ref belongs to the xa and the other belongs to this
+ * thread. This is needed to guard against parallel unregistration.
+ */
+ refcount_set(&device->refcount, 2);
+ down_write(&devices_rwsem);
+ xa_set_mark(&devices, device->index, DEVICE_REGISTERED);
+
+ /*
+ * By using downgrade_write() we ensure that no other thread can clear
+ * DEVICE_REGISTERED while we are completing the client setup.
+ */
+ downgrade_write(&devices_rwsem);
+
+ if (device->ops.enable_driver) {
+ ret = device->ops.enable_driver(device);
+ if (ret)
+ goto out;
+ }
+
+ down_read(&clients_rwsem);
+ xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
+ ret = add_client_context(device, client);
+ if (ret)
+ break;
+ }
+ up_read(&clients_rwsem);
+ if (!ret)
+ ret = add_compat_devs(device);
+out:
+ up_read(&devices_rwsem);
+ return ret;
+}
+
+static void prevent_dealloc_device(struct ib_device *ib_dev)
+{
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device: Device to register
+ * @name: unique string device name. This may include a '%' which will
+ * cause a unique index to be added to the passed device name.
+ * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB
+ * device will be used. In this case the caller should fully
+ * setup the ibdev for DMA. This usually means using dma_virt_ops.
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core. All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ *
+ * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
+ * asynchronously then the device pointer may become freed as soon as this
+ * function returns.
+ */
+int ib_register_device(struct ib_device *device, const char *name,
+ struct device *dma_device)
+{
+ int ret;
+
+ ret = assign_name(device, name);
+ if (ret)
+ return ret;
+
+ /*
+ * If the caller does not provide a DMA capable device then the IB core
+ * will set up ib_sge and scatterlist structures that stash the kernel
+ * virtual address into the address field.
+ */
+ WARN_ON(dma_device && !dma_device->dma_parms);
+ device->dma_device = dma_device;
+
+ ret = setup_device(device);
+ if (ret)
+ return ret;
+
+ ret = ib_cache_setup_one(device);
+ if (ret) {
+ dev_warn(&device->dev,
+ "Couldn't set up InfiniBand P_Key/GID cache\n");
+ return ret;
+ }
+
+ device->groups[0] = &ib_dev_attr_group;
+ device->groups[1] = device->ops.device_group;
+ ret = ib_setup_device_attrs(device);
+ if (ret)
+ goto cache_cleanup;
+
+ ib_device_register_rdmacg(device);
+
+ rdma_counter_init(device);
+
+ /*
+ * Ensure that ADD uevent is not fired because it
+ * is too early amd device is not initialized yet.
+ */
+ dev_set_uevent_suppress(&device->dev, true);
+ ret = device_add(&device->dev);
+ if (ret)
+ goto cg_cleanup;
+
+ ret = ib_setup_port_attrs(&device->coredev);
+ if (ret) {
+ dev_warn(&device->dev,
+ "Couldn't register device with driver model\n");
+ goto dev_cleanup;
+ }
+
+ ret = enable_device_and_get(device);
+ if (ret) {
+ void (*dealloc_fn)(struct ib_device *);
+
+ /*
+ * If we hit this error flow then we don't want to
+ * automatically dealloc the device since the caller is
+ * expected to call ib_dealloc_device() after
+ * ib_register_device() fails. This is tricky due to the
+ * possibility for a parallel unregistration along with this
+ * error flow. Since we have a refcount here we know any
+ * parallel flow is stopped in disable_device and will see the
+ * special dealloc_driver pointer, causing the responsibility to
+ * ib_dealloc_device() to revert back to this thread.
+ */
+ dealloc_fn = device->ops.dealloc_driver;
+ device->ops.dealloc_driver = prevent_dealloc_device;
+ ib_device_put(device);
+ __ib_unregister_device(device);
+ device->ops.dealloc_driver = dealloc_fn;
+ dev_set_uevent_suppress(&device->dev, false);
+ return ret;
+ }
+ dev_set_uevent_suppress(&device->dev, false);
+ /* Mark for userspace that device is ready */
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+ ib_device_put(device);
+
+ return 0;
+
+dev_cleanup:
+ device_del(&device->dev);
+cg_cleanup:
+ dev_set_uevent_suppress(&device->dev, false);
+ ib_device_unregister_rdmacg(device);
+cache_cleanup:
+ ib_cache_cleanup_one(device);
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
+
+/* Callers must hold a get on the device. */
+static void __ib_unregister_device(struct ib_device *ib_dev)
+{
+ /*
+ * We have a registration lock so that all the calls to unregister are
+ * fully fenced, once any unregister returns the device is truely
+ * unregistered even if multiple callers are unregistering it at the
+ * same time. This also interacts with the registration flow and
+ * provides sane semantics if register and unregister are racing.
+ */
+ mutex_lock(&ib_dev->unregistration_lock);
+ if (!refcount_read(&ib_dev->refcount))
+ goto out;
+
+ disable_device(ib_dev);
+
+ /* Expedite removing unregistered pointers from the hash table */
+ free_netdevs(ib_dev);
+
+ ib_free_port_attrs(&ib_dev->coredev);
+ device_del(&ib_dev->dev);
+ ib_device_unregister_rdmacg(ib_dev);
+ ib_cache_cleanup_one(ib_dev);
+
+ /*
+ * Drivers using the new flow may not call ib_dealloc_device except
+ * in error unwind prior to registration success.
+ */
+ if (ib_dev->ops.dealloc_driver &&
+ ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
+ WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
+ ib_dealloc_device(ib_dev);
+ }
+out:
+ mutex_unlock(&ib_dev->unregistration_lock);
+}
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @ib_dev: The device to unregister
+ *
+ * Unregister an IB device. All clients will receive a remove callback.
+ *
+ * Callers should call this routine only once, and protect against races with
+ * registration. Typically it should only be called as part of a remove
+ * callback in an implementation of driver core's struct device_driver and
+ * related.
+ *
+ * If ops.dealloc_driver is used then ib_dev will be freed upon return from
+ * this function.
+ */
+void ib_unregister_device(struct ib_device *ib_dev)
+{
+ get_device(&ib_dev->dev);
+ __ib_unregister_device(ib_dev);
+ put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_unregister_device_and_put - Unregister a device while holding a 'get'
+ * @ib_dev: The device to unregister
+ *
+ * This is the same as ib_unregister_device(), except it includes an internal
+ * ib_device_put() that should match a 'get' obtained by the caller.
+ *
+ * It is safe to call this routine concurrently from multiple threads while
+ * holding the 'get'. When the function returns the device is fully
+ * unregistered.
+ *
+ * Drivers using this flow MUST use the driver_unregister callback to clean up
+ * their resources associated with the device and dealloc it.
+ */
+void ib_unregister_device_and_put(struct ib_device *ib_dev)
+{
+ WARN_ON(!ib_dev->ops.dealloc_driver);
+ get_device(&ib_dev->dev);
+ ib_device_put(ib_dev);
+ __ib_unregister_device(ib_dev);
+ put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_and_put);
+
+/**
+ * ib_unregister_driver - Unregister all IB devices for a driver
+ * @driver_id: The driver to unregister
+ *
+ * This implements a fence for device unregistration. It only returns once all
+ * devices associated with the driver_id have fully completed their
+ * unregistration and returned from ib_unregister_device*().
+ *
+ * If device's are not yet unregistered it goes ahead and starts unregistering
+ * them.
+ *
+ * This does not block creation of new devices with the given driver_id, that
+ * is the responsibility of the caller.
+ */
+void ib_unregister_driver(enum rdma_driver_id driver_id)
+{
+ struct ib_device *ib_dev;
+ unsigned long index;
+
+ down_read(&devices_rwsem);
+ xa_for_each (&devices, index, ib_dev) {
+ if (ib_dev->ops.driver_id != driver_id)
+ continue;
+
+ get_device(&ib_dev->dev);
+ up_read(&devices_rwsem);
+
+ WARN_ON(!ib_dev->ops.dealloc_driver);
+ __ib_unregister_device(ib_dev);
+
+ put_device(&ib_dev->dev);
+ down_read(&devices_rwsem);
+ }
+ up_read(&devices_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_driver);
+
+static void ib_unregister_work(struct work_struct *work)
+{
+ struct ib_device *ib_dev =
+ container_of(work, struct ib_device, unregistration_work);
+
+ __ib_unregister_device(ib_dev);
+ put_device(&ib_dev->dev);
+}
+
+/**
+ * ib_unregister_device_queued - Unregister a device using a work queue
+ * @ib_dev: The device to unregister
+ *
+ * This schedules an asynchronous unregistration using a WQ for the device. A
+ * driver should use this to avoid holding locks while doing unregistration,
+ * such as holding the RTNL lock.
+ *
+ * Drivers using this API must use ib_unregister_driver before module unload
+ * to ensure that all scheduled unregistrations have completed.
+ */
+void ib_unregister_device_queued(struct ib_device *ib_dev)
+{
+ WARN_ON(!refcount_read(&ib_dev->refcount));
+ WARN_ON(!ib_dev->ops.dealloc_driver);
+ get_device(&ib_dev->dev);
+ if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work))
+ put_device(&ib_dev->dev);
+}
+EXPORT_SYMBOL(ib_unregister_device_queued);
+
+/*
+ * The caller must pass in a device that has the kref held and the refcount
+ * released. If the device is in cur_net and still registered then it is moved
+ * into net.
+ */
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
+ struct net *net)
+{
+ int ret2 = -EINVAL;
+ int ret;
+
+ mutex_lock(&device->unregistration_lock);
+
+ /*
+ * If a device not under ib_device_get() or if the unregistration_lock
+ * is not held, the namespace can be changed, or it can be unregistered.
+ * Check again under the lock.
+ */
+ if (refcount_read(&device->refcount) == 0 ||
+ !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
+ disable_device(device);
+
+ /*
+ * At this point no one can be using the device, so it is safe to
+ * change the namespace.
+ */
+ write_pnet(&device->coredev.rdma_net, net);
+
+ down_read(&devices_rwsem);
+ /*
+ * Currently rdma devices are system wide unique. So the device name
+ * is guaranteed free in the new namespace. Publish the new namespace
+ * at the sysfs level.
+ */
+ ret = device_rename(&device->dev, dev_name(&device->dev));
+ up_read(&devices_rwsem);
+ if (ret) {
+ dev_warn(&device->dev,
+ "%s: Couldn't rename device after namespace change\n",
+ __func__);
+ /* Try and put things back and re-enable the device */
+ write_pnet(&device->coredev.rdma_net, cur_net);
+ }
+
+ ret2 = enable_device_and_get(device);
+ if (ret2) {
+ /*
+ * This shouldn't really happen, but if it does, let the user
+ * retry at later point. So don't disable the device.
+ */
+ dev_warn(&device->dev,
+ "%s: Couldn't re-enable device after namespace change\n",
+ __func__);
+ }
+ kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
+ ib_device_put(device);
+out:
+ mutex_unlock(&device->unregistration_lock);
+ if (ret)
+ return ret;
+ return ret2;
+}
+
+int ib_device_set_netns_put(struct sk_buff *skb,
+ struct ib_device *dev, u32 ns_fd)
+{
+ struct net *net;
+ int ret;
+
+ net = get_net_ns_by_fd(ns_fd);
+ if (IS_ERR(net)) {
+ ret = PTR_ERR(net);
+ goto net_err;
+ }
+
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ goto ns_err;
+ }
+
+ /*
+ * All the ib_clients, including uverbs, are reset when the namespace is
+ * changed and this cannot be blocked waiting for userspace to do
+ * something, so disassociation is mandatory.
+ */
+ if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) {
+ ret = -EOPNOTSUPP;
+ goto ns_err;
+ }
+
+ get_device(&dev->dev);
+ ib_device_put(dev);
+ ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
+ put_device(&dev->dev);
+
+ put_net(net);
+ return ret;
+
+ns_err:
+ put_net(net);
+net_err:
+ ib_device_put(dev);
+ return ret;
+}
+
+static struct pernet_operations rdma_dev_net_ops = {
+ .init = rdma_dev_init_net,
+ .exit = rdma_dev_exit_net,
+ .id = &rdma_dev_net_id,
+ .size = sizeof(struct rdma_dev_net),
+};
+
+static int assign_client_id(struct ib_client *client)
+{
+ int ret;
+
+ down_write(&clients_rwsem);
+ /*
+ * The add/remove callbacks must be called in FIFO/LIFO order. To
+ * achieve this we assign client_ids so they are sorted in
+ * registration order.
+ */
+ client->client_id = highest_client_id;
+ ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
+ if (ret)
+ goto out;
+
+ highest_client_id++;
+ xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
+
+out:
+ up_write(&clients_rwsem);
+ return ret;
+}
+
+static void remove_client_id(struct ib_client *client)
+{
+ down_write(&clients_rwsem);
+ xa_erase(&clients, client->client_id);
+ for (; highest_client_id; highest_client_id--)
+ if (xa_load(&clients, highest_client_id - 1))
+ break;
+ up_write(&clients_rwsem);
+}
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal. When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered). In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ */
+int ib_register_client(struct ib_client *client)
+{
+ struct ib_device *device;
+ unsigned long index;
+ int ret;
+
+ refcount_set(&client->uses, 1);
+ init_completion(&client->uses_zero);
+ ret = assign_client_id(client);
+ if (ret)
+ return ret;
+
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
+ ret = add_client_context(device, client);
+ if (ret) {
+ up_read(&devices_rwsem);
+ ib_unregister_client(client);
+ return ret;
+ }
+ }
+ up_read(&devices_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration. When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ *
+ * This is a full fence, once it returns no client callbacks will be called,
+ * or are running in another thread.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+ struct ib_device *device;
+ unsigned long index;
+
+ down_write(&clients_rwsem);
+ ib_client_put(client);
+ xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
+ up_write(&clients_rwsem);
+
+ /* We do not want to have locks while calling client->remove() */
+ rcu_read_lock();
+ xa_for_each (&devices, index, device) {
+ if (!ib_device_try_get(device))
+ continue;
+ rcu_read_unlock();
+
+ remove_client_context(device, client->client_id);
+
+ ib_device_put(device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ /*
+ * remove_client_context() is not a fence, it can return even though a
+ * removal is ongoing. Wait until all removals are completed.
+ */
+ wait_for_completion(&client->uses_zero);
+ remove_client_id(client);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+static int __ib_get_global_client_nl_info(const char *client_name,
+ struct ib_client_nl_info *res)
+{
+ struct ib_client *client;
+ unsigned long index;
+ int ret = -ENOENT;
+
+ down_read(&clients_rwsem);
+ xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
+ if (strcmp(client->name, client_name) != 0)
+ continue;
+ if (!client->get_global_nl_info) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ ret = client->get_global_nl_info(res);
+ if (WARN_ON(ret == -ENOENT))
+ ret = -EINVAL;
+ if (!ret && res->cdev)
+ get_device(res->cdev);
+ break;
+ }
+ up_read(&clients_rwsem);
+ return ret;
+}
+
+static int __ib_get_client_nl_info(struct ib_device *ibdev,
+ const char *client_name,
+ struct ib_client_nl_info *res)
+{
+ unsigned long index;
+ void *client_data;
+ int ret = -ENOENT;
+
+ down_read(&ibdev->client_data_rwsem);
+ xan_for_each_marked (&ibdev->client_data, index, client_data,
+ CLIENT_DATA_REGISTERED) {
+ struct ib_client *client = xa_load(&clients, index);
+
+ if (!client || strcmp(client->name, client_name) != 0)
+ continue;
+ if (!client->get_nl_info) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ ret = client->get_nl_info(ibdev, client_data, res);
+ if (WARN_ON(ret == -ENOENT))
+ ret = -EINVAL;
+
+ /*
+ * The cdev is guaranteed valid as long as we are inside the
+ * client_data_rwsem as remove_one can't be called. Keep it
+ * valid for the caller.
+ */
+ if (!ret && res->cdev)
+ get_device(res->cdev);
+ break;
+ }
+ up_read(&ibdev->client_data_rwsem);
+
+ return ret;
+}
+
+/**
+ * ib_get_client_nl_info - Fetch the nl_info from a client
+ * @ibdev: IB device
+ * @client_name: Name of the client
+ * @res: Result of the query
+ */
+int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
+ struct ib_client_nl_info *res)
+{
+ int ret;
+
+ if (ibdev)
+ ret = __ib_get_client_nl_info(ibdev, client_name, res);
+ else
+ ret = __ib_get_global_client_nl_info(client_name, res);
+#ifdef CONFIG_MODULES
+ if (ret == -ENOENT) {
+ request_module("rdma-client-%s", client_name);
+ if (ibdev)
+ ret = __ib_get_client_nl_info(ibdev, client_name, res);
+ else
+ ret = __ib_get_global_client_nl_info(client_name, res);
+ }
+#endif
+ if (ret) {
+ if (ret == -ENOENT)
+ return -EOPNOTSUPP;
+ return ret;
+ }
+
+ if (WARN_ON(!res->cdev))
+ return -EINVAL;
+ return 0;
+}
+
+/**
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context data that can be retrieved with
+ * ib_get_client_data(). This can only be called while the client is
+ * registered to the device, once the ib_client remove() callback returns this
+ * cannot be called.
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data)
+{
+ void *rc;
+
+ if (WARN_ON(IS_ERR(data)))
+ data = NULL;
+
+ rc = xa_store(&device->client_data, client->client_id, data,
+ GFP_KERNEL);
+ WARN_ON(xa_is_err(rc));
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * ib_register_event_handler() registers an event handler that will be
+ * called back when asynchronous IB events occur (as defined in
+ * chapter 11 of the InfiniBand Architecture Specification). This
+ * callback occurs in workqueue context.
+ */
+void ib_register_event_handler(struct ib_event_handler *event_handler)
+{
+ down_write(&event_handler->device->event_handler_rwsem);
+ list_add_tail(&event_handler->list,
+ &event_handler->device->event_handler_list);
+ up_write(&event_handler->device->event_handler_rwsem);
+}
+EXPORT_SYMBOL(ib_register_event_handler);
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unregister an event handler registered with
+ * ib_register_event_handler().
+ */
+void ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+ down_write(&event_handler->device->event_handler_rwsem);
+ list_del(&event_handler->list);
+ up_write(&event_handler->device->event_handler_rwsem);
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+void ib_dispatch_event_clients(struct ib_event *event)
+{
+ struct ib_event_handler *handler;
+
+ down_read(&event->device->event_handler_rwsem);
+
+ list_for_each_entry(handler, &event->device->event_handler_list, list)
+ handler->handler(handler, event);
+
+ up_read(&event->device->event_handler_rwsem);
+}
+
+static int iw_query_port(struct ib_device *device,
+ u32 port_num,
+ struct ib_port_attr *port_attr)
+{
+ struct in_device *inetdev;
+ struct net_device *netdev;
+
+ memset(port_attr, 0, sizeof(*port_attr));
+
+ netdev = ib_device_get_netdev(device, port_num);
+ if (!netdev)
+ return -ENODEV;
+
+ port_attr->max_mtu = IB_MTU_4096;
+ port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu);
+
+ if (!netif_carrier_ok(netdev)) {
+ port_attr->state = IB_PORT_DOWN;
+ port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
+ } else {
+ rcu_read_lock();
+ inetdev = __in_dev_get_rcu(netdev);
+
+ if (inetdev && inetdev->ifa_list) {
+ port_attr->state = IB_PORT_ACTIVE;
+ port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
+ } else {
+ port_attr->state = IB_PORT_INIT;
+ port_attr->phys_state =
+ IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
+ }
+
+ rcu_read_unlock();
+ }
+
+ dev_put(netdev);
+ return device->ops.query_port(device, port_num, port_attr);
+}
+
+static int __ib_query_port(struct ib_device *device,
+ u32 port_num,
+ struct ib_port_attr *port_attr)
+{
+ int err;
+
+ memset(port_attr, 0, sizeof(*port_attr));
+
+ err = device->ops.query_port(device, port_num, port_attr);
+ if (err || port_attr->subnet_prefix)
+ return err;
+
+ if (rdma_port_get_link_layer(device, port_num) !=
+ IB_LINK_LAYER_INFINIBAND)
+ return 0;
+
+ ib_get_cached_subnet_prefix(device, port_num,
+ &port_attr->subnet_prefix);
+ return 0;
+}
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * ib_query_port() returns the attributes of a port through the
+ * @port_attr pointer.
+ */
+int ib_query_port(struct ib_device *device,
+ u32 port_num,
+ struct ib_port_attr *port_attr)
+{
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ if (rdma_protocol_iwarp(device, port_num))
+ return iw_query_port(device, port_num, port_attr);
+ else
+ return __ib_query_port(device, port_num, port_attr);
+}
+EXPORT_SYMBOL(ib_query_port);
+
+static void add_ndev_hash(struct ib_port_data *pdata)
+{
+ unsigned long flags;
+
+ might_sleep();
+
+ spin_lock_irqsave(&ndev_hash_lock, flags);
+ if (hash_hashed(&pdata->ndev_hash_link)) {
+ hash_del_rcu(&pdata->ndev_hash_link);
+ spin_unlock_irqrestore(&ndev_hash_lock, flags);
+ /*
+ * We cannot do hash_add_rcu after a hash_del_rcu until the
+ * grace period
+ */
+ synchronize_rcu();
+ spin_lock_irqsave(&ndev_hash_lock, flags);
+ }
+ if (pdata->netdev)
+ hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
+ (uintptr_t)pdata->netdev);
+ spin_unlock_irqrestore(&ndev_hash_lock, flags);
+}
+
+/**
+ * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
+ * @ib_dev: Device to modify
+ * @ndev: net_device to affiliate, may be NULL
+ * @port: IB port the net_device is connected to
+ *
+ * Drivers should use this to link the ib_device to a netdev so the netdev
+ * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
+ * affiliated with any port.
+ *
+ * The caller must ensure that the given ndev is not unregistered or
+ * unregistering, and that either the ib_device is unregistered or
+ * ib_device_set_netdev() is called with NULL when the ndev sends a
+ * NETDEV_UNREGISTER event.
+ */
+int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
+ u32 port)
+{
+ struct net_device *old_ndev;
+ struct ib_port_data *pdata;
+ unsigned long flags;
+ int ret;
+
+ /*
+ * Drivers wish to call this before ib_register_driver, so we have to
+ * setup the port data early.
+ */
+ ret = alloc_port_data(ib_dev);
+ if (ret)
+ return ret;
+
+ if (!rdma_is_port_valid(ib_dev, port))
+ return -EINVAL;
+
+ pdata = &ib_dev->port_data[port];
+ spin_lock_irqsave(&pdata->netdev_lock, flags);
+ old_ndev = rcu_dereference_protected(
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+ if (old_ndev == ndev) {
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+ return 0;
+ }
+
+ if (ndev)
+ dev_hold(ndev);
+ rcu_assign_pointer(pdata->netdev, ndev);
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+
+ add_ndev_hash(pdata);
+ if (old_ndev)
+ dev_put(old_ndev);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_device_set_netdev);
+
+static void free_netdevs(struct ib_device *ib_dev)
+{
+ unsigned long flags;
+ u32 port;
+
+ if (!ib_dev->port_data)
+ return;
+
+ rdma_for_each_port (ib_dev, port) {
+ struct ib_port_data *pdata = &ib_dev->port_data[port];
+ struct net_device *ndev;
+
+ spin_lock_irqsave(&pdata->netdev_lock, flags);
+ ndev = rcu_dereference_protected(
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+ if (ndev) {
+ spin_lock(&ndev_hash_lock);
+ hash_del_rcu(&pdata->ndev_hash_link);
+ spin_unlock(&ndev_hash_lock);
+
+ /*
+ * If this is the last dev_put there is still a
+ * synchronize_rcu before the netdev is kfreed, so we
+ * can continue to rely on unlocked pointer
+ * comparisons after the put
+ */
+ rcu_assign_pointer(pdata->netdev, NULL);
+ dev_put(ndev);
+ }
+ spin_unlock_irqrestore(&pdata->netdev_lock, flags);
+ }
+}
+
+struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
+ u32 port)
+{
+ struct ib_port_data *pdata;
+ struct net_device *res;
+
+ if (!rdma_is_port_valid(ib_dev, port))
+ return NULL;
+
+ pdata = &ib_dev->port_data[port];
+
+ /*
+ * New drivers should use ib_device_set_netdev() not the legacy
+ * get_netdev().
+ */
+ if (ib_dev->ops.get_netdev)
+ res = ib_dev->ops.get_netdev(ib_dev, port);
+ else {
+ spin_lock(&pdata->netdev_lock);
+ res = rcu_dereference_protected(
+ pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
+ if (res)
+ dev_hold(res);
+ spin_unlock(&pdata->netdev_lock);
+ }
+
+ /*
+ * If we are starting to unregister expedite things by preventing
+ * propagation of an unregistering netdev.
+ */
+ if (res && res->reg_state != NETREG_REGISTERED) {
+ dev_put(res);
+ return NULL;
+ }
+
+ return res;
+}
+
+/**
+ * ib_device_get_by_netdev - Find an IB device associated with a netdev
+ * @ndev: netdev to locate
+ * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
+ *
+ * Find and hold an ib_device that is associated with a netdev via
+ * ib_device_set_netdev(). The caller must call ib_device_put() on the
+ * returned pointer.
+ */
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+ enum rdma_driver_id driver_id)
+{
+ struct ib_device *res = NULL;
+ struct ib_port_data *cur;
+
+ rcu_read_lock();
+ hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
+ (uintptr_t)ndev) {
+ if (rcu_access_pointer(cur->netdev) == ndev &&
+ (driver_id == RDMA_DRIVER_UNKNOWN ||
+ cur->ib_dev->ops.driver_id == driver_id) &&
+ ib_device_try_get(cur->ib_dev)) {
+ res = cur->ib_dev;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return res;
+}
+EXPORT_SYMBOL(ib_device_get_by_netdev);
+
+/**
+ * ib_enum_roce_netdev - enumerate all RoCE ports
+ * @ib_dev : IB device we want to query
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all of the physical RoCE ports of ib_dev
+ * which are related to netdevice and calls callback() on each
+ * device for which filter() function returns non zero.
+ */
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ u32 port;
+
+ rdma_for_each_port (ib_dev, port)
+ if (rdma_protocol_roce(ib_dev, port)) {
+ struct net_device *idev =
+ ib_device_get_netdev(ib_dev, port);
+
+ if (filter(ib_dev, port, idev, filter_cookie))
+ cb(ib_dev, port, idev, cookie);
+
+ if (idev)
+ dev_put(idev);
+ }
+}
+
+/**
+ * ib_enum_all_roce_netdevs - enumerate all RoCE devices
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all RoCE devices' physical ports which are related
+ * to netdevices and calls callback() on each device for which
+ * filter() function returns non zero.
+ */
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ struct ib_device *dev;
+ unsigned long index;
+
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
+ ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
+ up_read(&devices_rwsem);
+}
+
+/*
+ * ib_enum_all_devs - enumerate all ib_devices
+ * @cb: Callback to call for each found ib_device
+ *
+ * Enumerates all ib_devices and calls callback() on each device.
+ */
+int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ unsigned long index;
+ struct ib_device *dev;
+ unsigned int idx = 0;
+ int ret = 0;
+
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+ if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
+ continue;
+
+ ret = nldev_cb(dev, skb, cb, idx);
+ if (ret)
+ break;
+ idx++;
+ }
+ up_read(&devices_rwsem);
+ return ret;
+}
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+ u32 port_num, u16 index, u16 *pkey)
+{
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ if (!device->ops.query_pkey)
+ return -EOPNOTSUPP;
+
+ return device->ops.query_pkey(device, port_num, index, pkey);
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ if (!device->ops.modify_device)
+ return -EOPNOTSUPP;
+
+ return device->ops.modify_device(device, device_modify_mask,
+ device_modify);
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ * to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+ u32 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify)
+{
+ int rc;
+
+ if (!rdma_is_port_valid(device, port_num))
+ return -EINVAL;
+
+ if (device->ops.modify_port)
+ rc = device->ops.modify_port(device, port_num,
+ port_modify_mask,
+ port_modify);
+ else if (rdma_protocol_roce(device, port_num) &&
+ ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 ||
+ (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0))
+ rc = 0;
+ else
+ rc = -EOPNOTSUPP;
+ return rc;
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ * a specified GID value occurs. Its searches only for IB link layer.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found. This
+ * parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u32 *port_num, u16 *index)
+{
+ union ib_gid tmp_gid;
+ u32 port;
+ int ret, i;
+
+ rdma_for_each_port (device, port) {
+ if (!rdma_protocol_ib(device, port))
+ continue;
+
+ for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
+ ++i) {
+ ret = rdma_query_gid(device, port, i, &tmp_gid);
+ if (ret)
+ continue;
+
+ if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+ *port_num = port;
+ if (index)
+ *index = i;
+ return 0;
+ }
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+ u32 port_num, u16 pkey, u16 *index)
+{
+ int ret, i;
+ u16 tmp_pkey;
+ int partial_ix = -1;
+
+ for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
+ ++i) {
+ ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+ if (ret)
+ return ret;
+ if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
+ /* if there is full-member pkey take it.*/
+ if (tmp_pkey & 0x8000) {
+ *index = i;
+ return 0;
+ }
+ if (partial_ix < 0)
+ partial_ix = i;
+ }
+ }
+
+ /*no full-member, if exists take the limited*/
+ if (partial_ix >= 0) {
+ *index = partial_ix;
+ return 0;
+ }
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
+/**
+ * ib_get_net_dev_by_params() - Return the appropriate net_dev
+ * for a received CM request
+ * @dev: An RDMA device on which the request has been received.
+ * @port: Port number on the RDMA device.
+ * @pkey: The Pkey the request came on.
+ * @gid: A GID that the net_dev uses to communicate.
+ * @addr: Contains the IP address that the request specified as its
+ * destination.
+ *
+ */
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
+ u32 port,
+ u16 pkey,
+ const union ib_gid *gid,
+ const struct sockaddr *addr)
+{
+ struct net_device *net_dev = NULL;
+ unsigned long index;
+ void *client_data;
+
+ if (!rdma_protocol_ib(dev, port))
+ return NULL;
+
+ /*
+ * Holding the read side guarantees that the client will not become
+ * unregistered while we are calling get_net_dev_by_params()
+ */
+ down_read(&dev->client_data_rwsem);
+ xan_for_each_marked (&dev->client_data, index, client_data,
+ CLIENT_DATA_REGISTERED) {
+ struct ib_client *client = xa_load(&clients, index);
+
+ if (!client || !client->get_net_dev_by_params)
+ continue;
+
+ net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
+ addr, client_data);
+ if (net_dev)
+ break;
+ }
+ up_read(&dev->client_data_rwsem);
+
+ return net_dev;
+}
+EXPORT_SYMBOL(ib_get_net_dev_by_params);
+
+void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
+{
+ struct ib_device_ops *dev_ops = &dev->ops;
+#define SET_DEVICE_OP(ptr, name) \
+ do { \
+ if (ops->name) \
+ if (!((ptr)->name)) \
+ (ptr)->name = ops->name; \
+ } while (0)
+
+#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)
+
+ if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
+ WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
+ dev_ops->driver_id != ops->driver_id);
+ dev_ops->driver_id = ops->driver_id;
+ }
+ if (ops->owner) {
+ WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
+ dev_ops->owner = ops->owner;
+ }
+ if (ops->uverbs_abi_ver)
+ dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;
+
+ dev_ops->uverbs_no_driver_id_binding |=
+ ops->uverbs_no_driver_id_binding;
+
+ SET_DEVICE_OP(dev_ops, add_gid);
+ SET_DEVICE_OP(dev_ops, advise_mr);
+ SET_DEVICE_OP(dev_ops, alloc_dm);
+ SET_DEVICE_OP(dev_ops, alloc_hw_device_stats);
+ SET_DEVICE_OP(dev_ops, alloc_hw_port_stats);
+ SET_DEVICE_OP(dev_ops, alloc_mr);
+ SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
+ SET_DEVICE_OP(dev_ops, alloc_mw);
+ SET_DEVICE_OP(dev_ops, alloc_pd);
+ SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
+ SET_DEVICE_OP(dev_ops, alloc_ucontext);
+ SET_DEVICE_OP(dev_ops, alloc_xrcd);
+ SET_DEVICE_OP(dev_ops, attach_mcast);
+ SET_DEVICE_OP(dev_ops, check_mr_status);
+ SET_DEVICE_OP(dev_ops, counter_alloc_stats);
+ SET_DEVICE_OP(dev_ops, counter_bind_qp);
+ SET_DEVICE_OP(dev_ops, counter_dealloc);
+ SET_DEVICE_OP(dev_ops, counter_unbind_qp);
+ SET_DEVICE_OP(dev_ops, counter_update_stats);
+ SET_DEVICE_OP(dev_ops, create_ah);
+ SET_DEVICE_OP(dev_ops, create_counters);
+ SET_DEVICE_OP(dev_ops, create_cq);
+ SET_DEVICE_OP(dev_ops, create_flow);
+ SET_DEVICE_OP(dev_ops, create_qp);
+ SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
+ SET_DEVICE_OP(dev_ops, create_srq);
+ SET_DEVICE_OP(dev_ops, create_user_ah);
+ SET_DEVICE_OP(dev_ops, create_wq);
+ SET_DEVICE_OP(dev_ops, dealloc_dm);
+ SET_DEVICE_OP(dev_ops, dealloc_driver);
+ SET_DEVICE_OP(dev_ops, dealloc_mw);
+ SET_DEVICE_OP(dev_ops, dealloc_pd);
+ SET_DEVICE_OP(dev_ops, dealloc_ucontext);
+ SET_DEVICE_OP(dev_ops, dealloc_xrcd);
+ SET_DEVICE_OP(dev_ops, del_gid);
+ SET_DEVICE_OP(dev_ops, dereg_mr);
+ SET_DEVICE_OP(dev_ops, destroy_ah);
+ SET_DEVICE_OP(dev_ops, destroy_counters);
+ SET_DEVICE_OP(dev_ops, destroy_cq);
+ SET_DEVICE_OP(dev_ops, destroy_flow);
+ SET_DEVICE_OP(dev_ops, destroy_flow_action);
+ SET_DEVICE_OP(dev_ops, destroy_qp);
+ SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
+ SET_DEVICE_OP(dev_ops, destroy_srq);
+ SET_DEVICE_OP(dev_ops, destroy_wq);
+ SET_DEVICE_OP(dev_ops, device_group);
+ SET_DEVICE_OP(dev_ops, detach_mcast);
+ SET_DEVICE_OP(dev_ops, disassociate_ucontext);
+ SET_DEVICE_OP(dev_ops, drain_rq);
+ SET_DEVICE_OP(dev_ops, drain_sq);
+ SET_DEVICE_OP(dev_ops, enable_driver);
+ SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry);
+ SET_DEVICE_OP(dev_ops, fill_res_cq_entry);
+ SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw);
+ SET_DEVICE_OP(dev_ops, fill_res_mr_entry);
+ SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw);
+ SET_DEVICE_OP(dev_ops, fill_res_qp_entry);
+ SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw);
+ SET_DEVICE_OP(dev_ops, fill_stat_mr_entry);
+ SET_DEVICE_OP(dev_ops, get_dev_fw_str);
+ SET_DEVICE_OP(dev_ops, get_dma_mr);
+ SET_DEVICE_OP(dev_ops, get_hw_stats);
+ SET_DEVICE_OP(dev_ops, get_link_layer);
+ SET_DEVICE_OP(dev_ops, get_netdev);
+ SET_DEVICE_OP(dev_ops, get_numa_node);
+ SET_DEVICE_OP(dev_ops, get_port_immutable);
+ SET_DEVICE_OP(dev_ops, get_vector_affinity);
+ SET_DEVICE_OP(dev_ops, get_vf_config);
+ SET_DEVICE_OP(dev_ops, get_vf_guid);
+ SET_DEVICE_OP(dev_ops, get_vf_stats);
+ SET_DEVICE_OP(dev_ops, iw_accept);
+ SET_DEVICE_OP(dev_ops, iw_add_ref);
+ SET_DEVICE_OP(dev_ops, iw_connect);
+ SET_DEVICE_OP(dev_ops, iw_create_listen);
+ SET_DEVICE_OP(dev_ops, iw_destroy_listen);
+ SET_DEVICE_OP(dev_ops, iw_get_qp);
+ SET_DEVICE_OP(dev_ops, iw_reject);
+ SET_DEVICE_OP(dev_ops, iw_rem_ref);
+ SET_DEVICE_OP(dev_ops, map_mr_sg);
+ SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
+ SET_DEVICE_OP(dev_ops, mmap);
+ SET_DEVICE_OP(dev_ops, mmap_free);
+ SET_DEVICE_OP(dev_ops, modify_ah);
+ SET_DEVICE_OP(dev_ops, modify_cq);
+ SET_DEVICE_OP(dev_ops, modify_device);
+ SET_DEVICE_OP(dev_ops, modify_hw_stat);
+ SET_DEVICE_OP(dev_ops, modify_port);
+ SET_DEVICE_OP(dev_ops, modify_qp);
+ SET_DEVICE_OP(dev_ops, modify_srq);
+ SET_DEVICE_OP(dev_ops, modify_wq);
+ SET_DEVICE_OP(dev_ops, peek_cq);
+ SET_DEVICE_OP(dev_ops, poll_cq);
+ SET_DEVICE_OP(dev_ops, port_groups);
+ SET_DEVICE_OP(dev_ops, post_recv);
+ SET_DEVICE_OP(dev_ops, post_send);
+ SET_DEVICE_OP(dev_ops, post_srq_recv);
+ SET_DEVICE_OP(dev_ops, process_mad);
+ SET_DEVICE_OP(dev_ops, query_ah);
+ SET_DEVICE_OP(dev_ops, query_device);
+ SET_DEVICE_OP(dev_ops, query_gid);
+ SET_DEVICE_OP(dev_ops, query_pkey);
+ SET_DEVICE_OP(dev_ops, query_port);
+ SET_DEVICE_OP(dev_ops, query_qp);
+ SET_DEVICE_OP(dev_ops, query_srq);
+ SET_DEVICE_OP(dev_ops, query_ucontext);
+ SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
+ SET_DEVICE_OP(dev_ops, read_counters);
+ SET_DEVICE_OP(dev_ops, reg_dm_mr);
+ SET_DEVICE_OP(dev_ops, reg_user_mr);
+ SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf);
+ SET_DEVICE_OP(dev_ops, req_notify_cq);
+ SET_DEVICE_OP(dev_ops, rereg_user_mr);
+ SET_DEVICE_OP(dev_ops, resize_cq);
+ SET_DEVICE_OP(dev_ops, set_vf_guid);
+ SET_DEVICE_OP(dev_ops, set_vf_link_state);
+
+ SET_OBJ_SIZE(dev_ops, ib_ah);
+ SET_OBJ_SIZE(dev_ops, ib_counters);
+ SET_OBJ_SIZE(dev_ops, ib_cq);
+ SET_OBJ_SIZE(dev_ops, ib_mw);
+ SET_OBJ_SIZE(dev_ops, ib_pd);
+ SET_OBJ_SIZE(dev_ops, ib_qp);
+ SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table);
+ SET_OBJ_SIZE(dev_ops, ib_srq);
+ SET_OBJ_SIZE(dev_ops, ib_ucontext);
+ SET_OBJ_SIZE(dev_ops, ib_xrcd);
+}
+EXPORT_SYMBOL(ib_set_device_ops);
+
+#ifdef CONFIG_INFINIBAND_VIRT_DMA
+int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents)
+{
+ struct scatterlist *s;
+ int i;
+
+ for_each_sg(sg, s, nents, i) {
+ sg_dma_address(s) = (uintptr_t)sg_virt(s);
+ sg_dma_len(s) = s->length;
+ }
+ return nents;
+}
+EXPORT_SYMBOL(ib_dma_virt_map_sg);
+#endif /* CONFIG_INFINIBAND_VIRT_DMA */
+
+static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
+ [RDMA_NL_LS_OP_RESOLVE] = {
+ .doit = ib_nl_handle_resolve_resp,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NL_LS_OP_SET_TIMEOUT] = {
+ .doit = ib_nl_handle_set_timeout,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NL_LS_OP_IP_RESOLVE] = {
+ .doit = ib_nl_handle_ip_res_resp,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+};
+
+static int __init ib_core_init(void)
+{
+ int ret = -ENOMEM;
+
+ ib_wq = alloc_workqueue("infiniband", 0, 0);
+ if (!ib_wq)
+ return -ENOMEM;
+
+ ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND,
+ WQ_UNBOUND_MAX_ACTIVE);
+ if (!ib_unreg_wq)
+ goto err;
+
+ ib_comp_wq = alloc_workqueue("ib-comp-wq",
+ WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ if (!ib_comp_wq)
+ goto err_unbound;
+
+ ib_comp_unbound_wq =
+ alloc_workqueue("ib-comp-unb-wq",
+ WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
+ WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
+ if (!ib_comp_unbound_wq)
+ goto err_comp;
+
+ ret = class_register(&ib_class);
+ if (ret) {
+ pr_warn("Couldn't create InfiniBand device class\n");
+ goto err_comp_unbound;
+ }
+
+ rdma_nl_init();
+
+ ret = addr_init();
+ if (ret) {
+ pr_warn("Couldn't init IB address resolution\n");
+ goto err_ibnl;
+ }
+
+ ret = ib_mad_init();
+ if (ret) {
+ pr_warn("Couldn't init IB MAD\n");
+ goto err_addr;
+ }
+
+ ret = ib_sa_init();
+ if (ret) {
+ pr_warn("Couldn't init SA\n");
+ goto err_mad;
+ }
+
+ ret = register_blocking_lsm_notifier(&ibdev_lsm_nb);
+ if (ret) {
+ pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
+ goto err_sa;
+ }
+
+ ret = register_pernet_device(&rdma_dev_net_ops);
+ if (ret) {
+ pr_warn("Couldn't init compat dev. ret %d\n", ret);
+ goto err_compat;
+ }
+
+ nldev_init();
+ rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
+ ret = roce_gid_mgmt_init();
+ if (ret) {
+ pr_warn("Couldn't init RoCE GID management\n");
+ goto err_parent;
+ }
+
+ return 0;
+
+err_parent:
+ rdma_nl_unregister(RDMA_NL_LS);
+ nldev_exit();
+ unregister_pernet_device(&rdma_dev_net_ops);
+err_compat:
+ unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
+err_sa:
+ ib_sa_cleanup();
+err_mad:
+ ib_mad_cleanup();
+err_addr:
+ addr_cleanup();
+err_ibnl:
+ class_unregister(&ib_class);
+err_comp_unbound:
+ destroy_workqueue(ib_comp_unbound_wq);
+err_comp:
+ destroy_workqueue(ib_comp_wq);
+err_unbound:
+ destroy_workqueue(ib_unreg_wq);
+err:
+ destroy_workqueue(ib_wq);
+ return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+ roce_gid_mgmt_cleanup();
+ rdma_nl_unregister(RDMA_NL_LS);
+ nldev_exit();
+ unregister_pernet_device(&rdma_dev_net_ops);
+ unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
+ ib_sa_cleanup();
+ ib_mad_cleanup();
+ addr_cleanup();
+ rdma_nl_exit();
+ class_unregister(&ib_class);
+ destroy_workqueue(ib_comp_unbound_wq);
+ destroy_workqueue(ib_comp_wq);
+ /* Make sure that any pending umem accounting work is done. */
+ destroy_workqueue(ib_wq);
+ destroy_workqueue(ib_unreg_wq);
+ WARN_ON(!xa_empty(&clients));
+ WARN_ON(!xa_empty(&devices));
+}
+
+MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
+
+/* ib core relies on netdev stack to first register net_ns_type_operations
+ * ns kobject type before ib_core initialization.
+ */
+fs_initcall(ib_core_init);
+module_exit(ib_core_cleanup);
diff --git a/drivers/infiniband/core/ib_core_uverbs.c b/drivers/infiniband/core/ib_core_uverbs.c
new file mode 100644
index 000000000..b51bd7087
--- /dev/null
+++ b/drivers/infiniband/core/ib_core_uverbs.c
@@ -0,0 +1,367 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2019 Marvell. All rights reserved.
+ */
+#include <linux/xarray.h>
+#include "uverbs.h"
+#include "core_priv.h"
+
+/**
+ * rdma_umap_priv_init() - Initialize the private data of a vma
+ *
+ * @priv: The already allocated private data
+ * @vma: The vm area struct that needs private data
+ * @entry: entry into the mmap_xa that needs to be linked with
+ * this vma
+ *
+ * Each time we map IO memory into user space this keeps track of the
+ * mapping. When the device is hot-unplugged we 'zap' the mmaps in user space
+ * to point to the zero page and allow the hot unplug to proceed.
+ *
+ * This is necessary for cases like PCI physical hot unplug as the actual BAR
+ * memory may vanish after this and access to it from userspace could MCE.
+ *
+ * RDMA drivers supporting disassociation must have their user space designed
+ * to cope in some way with their IO pages going to the zero page.
+ *
+ */
+void rdma_umap_priv_init(struct rdma_umap_priv *priv,
+ struct vm_area_struct *vma,
+ struct rdma_user_mmap_entry *entry)
+{
+ struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+
+ priv->vma = vma;
+ if (entry) {
+ kref_get(&entry->ref);
+ priv->entry = entry;
+ }
+ vma->vm_private_data = priv;
+ /* vm_ops is setup in ib_uverbs_mmap() to avoid module dependencies */
+
+ mutex_lock(&ufile->umap_lock);
+ list_add(&priv->list, &ufile->umaps);
+ mutex_unlock(&ufile->umap_lock);
+}
+EXPORT_SYMBOL(rdma_umap_priv_init);
+
+/**
+ * rdma_user_mmap_io() - Map IO memory into a process
+ *
+ * @ucontext: associated user context
+ * @vma: the vma related to the current mmap call
+ * @pfn: pfn to map
+ * @size: size to map
+ * @prot: pgprot to use in remap call
+ * @entry: mmap_entry retrieved from rdma_user_mmap_entry_get(), or NULL
+ * if mmap_entry is not used by the driver
+ *
+ * This is to be called by drivers as part of their mmap() functions if they
+ * wish to send something like PCI-E BAR memory to userspace.
+ *
+ * Return -EINVAL on wrong flags or size, -EAGAIN on failure to map. 0 on
+ * success.
+ */
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+ unsigned long pfn, unsigned long size, pgprot_t prot,
+ struct rdma_user_mmap_entry *entry)
+{
+ struct ib_uverbs_file *ufile = ucontext->ufile;
+ struct rdma_umap_priv *priv;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ if (vma->vm_end - vma->vm_start != size)
+ return -EINVAL;
+
+ /* Driver is using this wrong, must be called by ib_uverbs_mmap */
+ if (WARN_ON(!vma->vm_file ||
+ vma->vm_file->private_data != ufile))
+ return -EINVAL;
+ lockdep_assert_held(&ufile->device->disassociate_srcu);
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ vma->vm_page_prot = prot;
+ if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
+ kfree(priv);
+ return -EAGAIN;
+ }
+
+ rdma_umap_priv_init(priv, vma, entry);
+ return 0;
+}
+EXPORT_SYMBOL(rdma_user_mmap_io);
+
+/**
+ * rdma_user_mmap_entry_get_pgoff() - Get an entry from the mmap_xa
+ *
+ * @ucontext: associated user context
+ * @pgoff: The mmap offset >> PAGE_SHIFT
+ *
+ * This function is called when a user tries to mmap with an offset (returned
+ * by rdma_user_mmap_get_offset()) it initially received from the driver. The
+ * rdma_user_mmap_entry was created by the function
+ * rdma_user_mmap_entry_insert(). This function increases the refcnt of the
+ * entry so that it won't be deleted from the xarray in the meantime.
+ *
+ * Return an reference to an entry if exists or NULL if there is no
+ * match. rdma_user_mmap_entry_put() must be called to put the reference.
+ */
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
+ unsigned long pgoff)
+{
+ struct rdma_user_mmap_entry *entry;
+
+ if (pgoff > U32_MAX)
+ return NULL;
+
+ xa_lock(&ucontext->mmap_xa);
+
+ entry = xa_load(&ucontext->mmap_xa, pgoff);
+
+ /*
+ * If refcount is zero, entry is already being deleted, driver_removed
+ * indicates that the no further mmaps are possible and we waiting for
+ * the active VMAs to be closed.
+ */
+ if (!entry || entry->start_pgoff != pgoff || entry->driver_removed ||
+ !kref_get_unless_zero(&entry->ref))
+ goto err;
+
+ xa_unlock(&ucontext->mmap_xa);
+
+ ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] returned\n",
+ pgoff, entry->npages);
+
+ return entry;
+
+err:
+ xa_unlock(&ucontext->mmap_xa);
+ return NULL;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get_pgoff);
+
+/**
+ * rdma_user_mmap_entry_get() - Get an entry from the mmap_xa
+ *
+ * @ucontext: associated user context
+ * @vma: the vma being mmap'd into
+ *
+ * This function is like rdma_user_mmap_entry_get_pgoff() except that it also
+ * checks that the VMA is correct.
+ */
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
+ struct vm_area_struct *vma)
+{
+ struct rdma_user_mmap_entry *entry;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return NULL;
+ entry = rdma_user_mmap_entry_get_pgoff(ucontext, vma->vm_pgoff);
+ if (!entry)
+ return NULL;
+ if (entry->npages * PAGE_SIZE != vma->vm_end - vma->vm_start) {
+ rdma_user_mmap_entry_put(entry);
+ return NULL;
+ }
+ return entry;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_get);
+
+static void rdma_user_mmap_entry_free(struct kref *kref)
+{
+ struct rdma_user_mmap_entry *entry =
+ container_of(kref, struct rdma_user_mmap_entry, ref);
+ struct ib_ucontext *ucontext = entry->ucontext;
+ unsigned long i;
+
+ /*
+ * Erase all entries occupied by this single entry, this is deferred
+ * until all VMA are closed so that the mmap offsets remain unique.
+ */
+ xa_lock(&ucontext->mmap_xa);
+ for (i = 0; i < entry->npages; i++)
+ __xa_erase(&ucontext->mmap_xa, entry->start_pgoff + i);
+ xa_unlock(&ucontext->mmap_xa);
+
+ ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#zx] removed\n",
+ entry->start_pgoff, entry->npages);
+
+ if (ucontext->device->ops.mmap_free)
+ ucontext->device->ops.mmap_free(entry);
+}
+
+/**
+ * rdma_user_mmap_entry_put() - Drop reference to the mmap entry
+ *
+ * @entry: an entry in the mmap_xa
+ *
+ * This function is called when the mapping is closed if it was
+ * an io mapping or when the driver is done with the entry for
+ * some other reason.
+ * Should be called after rdma_user_mmap_entry_get was called
+ * and entry is no longer needed. This function will erase the
+ * entry and free it if its refcnt reaches zero.
+ */
+void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry)
+{
+ kref_put(&entry->ref, rdma_user_mmap_entry_free);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_put);
+
+/**
+ * rdma_user_mmap_entry_remove() - Drop reference to entry and
+ * mark it as unmmapable
+ *
+ * @entry: the entry to insert into the mmap_xa
+ *
+ * Drivers can call this to prevent userspace from creating more mappings for
+ * entry, however existing mmaps continue to exist and ops->mmap_free() will
+ * not be called until all user mmaps are destroyed.
+ */
+void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry)
+{
+ if (!entry)
+ return;
+
+ xa_lock(&entry->ucontext->mmap_xa);
+ entry->driver_removed = true;
+ xa_unlock(&entry->ucontext->mmap_xa);
+ kref_put(&entry->ref, rdma_user_mmap_entry_free);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_remove);
+
+/**
+ * rdma_user_mmap_entry_insert_range() - Insert an entry to the mmap_xa
+ * in a given range.
+ *
+ * @ucontext: associated user context.
+ * @entry: the entry to insert into the mmap_xa
+ * @length: length of the address that will be mmapped
+ * @min_pgoff: minimum pgoff to be returned
+ * @max_pgoff: maximum pgoff to be returned
+ *
+ * This function should be called by drivers that use the rdma_user_mmap
+ * interface for implementing their mmap syscall A database of mmap offsets is
+ * handled in the core and helper functions are provided to insert entries
+ * into the database and extract entries when the user calls mmap with the
+ * given offset. The function allocates a unique page offset in a given range
+ * that should be provided to user, the user will use the offset to retrieve
+ * information such as address to be mapped and how.
+ *
+ * Return: 0 on success and -ENOMEM on failure
+ */
+int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext,
+ struct rdma_user_mmap_entry *entry,
+ size_t length, u32 min_pgoff,
+ u32 max_pgoff)
+{
+ struct ib_uverbs_file *ufile = ucontext->ufile;
+ XA_STATE(xas, &ucontext->mmap_xa, min_pgoff);
+ u32 xa_first, xa_last, npages;
+ int err;
+ u32 i;
+
+ if (!entry)
+ return -EINVAL;
+
+ kref_init(&entry->ref);
+ entry->ucontext = ucontext;
+
+ /*
+ * We want the whole allocation to be done without interruption from a
+ * different thread. The allocation requires finding a free range and
+ * storing. During the xa_insert the lock could be released, possibly
+ * allowing another thread to choose the same range.
+ */
+ mutex_lock(&ufile->umap_lock);
+
+ xa_lock(&ucontext->mmap_xa);
+
+ /* We want to find an empty range */
+ npages = (u32)DIV_ROUND_UP(length, PAGE_SIZE);
+ entry->npages = npages;
+ while (true) {
+ /* First find an empty index */
+ xas_find_marked(&xas, max_pgoff, XA_FREE_MARK);
+ if (xas.xa_node == XAS_RESTART)
+ goto err_unlock;
+
+ xa_first = xas.xa_index;
+
+ /* Is there enough room to have the range? */
+ if (check_add_overflow(xa_first, npages, &xa_last))
+ goto err_unlock;
+
+ /*
+ * Now look for the next present entry. If an entry doesn't
+ * exist, we found an empty range and can proceed.
+ */
+ xas_next_entry(&xas, xa_last - 1);
+ if (xas.xa_node == XAS_BOUNDS || xas.xa_index >= xa_last)
+ break;
+ }
+
+ for (i = xa_first; i < xa_last; i++) {
+ err = __xa_insert(&ucontext->mmap_xa, i, entry, GFP_KERNEL);
+ if (err)
+ goto err_undo;
+ }
+
+ /*
+ * Internally the kernel uses a page offset, in libc this is a byte
+ * offset. Drivers should not return pgoff to userspace.
+ */
+ entry->start_pgoff = xa_first;
+ xa_unlock(&ucontext->mmap_xa);
+ mutex_unlock(&ufile->umap_lock);
+
+ ibdev_dbg(ucontext->device, "mmap: pgoff[%#lx] npages[%#x] inserted\n",
+ entry->start_pgoff, npages);
+
+ return 0;
+
+err_undo:
+ for (; i > xa_first; i--)
+ __xa_erase(&ucontext->mmap_xa, i - 1);
+
+err_unlock:
+ xa_unlock(&ucontext->mmap_xa);
+ mutex_unlock(&ufile->umap_lock);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_insert_range);
+
+/**
+ * rdma_user_mmap_entry_insert() - Insert an entry to the mmap_xa.
+ *
+ * @ucontext: associated user context.
+ * @entry: the entry to insert into the mmap_xa
+ * @length: length of the address that will be mmapped
+ *
+ * This function should be called by drivers that use the rdma_user_mmap
+ * interface for handling user mmapped addresses. The database is handled in
+ * the core and helper functions are provided to insert entries into the
+ * database and extract entries when the user calls mmap with the given offset.
+ * The function allocates a unique page offset that should be provided to user,
+ * the user will use the offset to retrieve information such as address to
+ * be mapped and how.
+ *
+ * Return: 0 on success and -ENOMEM on failure
+ */
+int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
+ struct rdma_user_mmap_entry *entry,
+ size_t length)
+{
+ return rdma_user_mmap_entry_insert_range(ucontext, entry, length, 0,
+ U32_MAX);
+}
+EXPORT_SYMBOL(rdma_user_mmap_entry_insert);
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
new file mode 100644
index 000000000..2b47073c6
--- /dev/null
+++ b/drivers/infiniband/core/iwcm.c
@@ -0,0 +1,1223 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_addr.h>
+#include <rdma/iw_portmap.h>
+#include <rdma/rdma_netlink.h>
+
+#include "iwcm.h"
+
+MODULE_AUTHOR("Tom Tucker");
+MODULE_DESCRIPTION("iWARP CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static const char * const iwcm_rej_reason_strs[] = {
+ [ECONNRESET] = "reset by remote host",
+ [ECONNREFUSED] = "refused by remote application",
+ [ETIMEDOUT] = "setup timeout",
+};
+
+const char *__attribute_const__ iwcm_reject_msg(int reason)
+{
+ size_t index;
+
+ /* iWARP uses negative errnos */
+ index = -reason;
+
+ if (index < ARRAY_SIZE(iwcm_rej_reason_strs) &&
+ iwcm_rej_reason_strs[index])
+ return iwcm_rej_reason_strs[index];
+ else
+ return "unrecognized reason";
+}
+EXPORT_SYMBOL(iwcm_reject_msg);
+
+static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = {
+ [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
+ [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
+ [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb},
+ [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
+ [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
+ [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
+ [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb},
+ [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb}
+};
+
+static struct workqueue_struct *iwcm_wq;
+struct iwcm_work {
+ struct work_struct work;
+ struct iwcm_id_private *cm_id;
+ struct list_head list;
+ struct iw_cm_event event;
+ struct list_head free_list;
+};
+
+static unsigned int default_backlog = 256;
+
+static struct ctl_table_header *iwcm_ctl_table_hdr;
+static struct ctl_table iwcm_ctl_table[] = {
+ {
+ .procname = "default_backlog",
+ .data = &default_backlog,
+ .maxlen = sizeof(default_backlog),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+/*
+ * The following services provide a mechanism for pre-allocating iwcm_work
+ * elements. The design pre-allocates them based on the cm_id type:
+ * LISTENING IDS: Get enough elements preallocated to handle the
+ * listen backlog.
+ * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
+ * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE
+ *
+ * Allocating them in connect and listen avoids having to deal
+ * with allocation failures on the event upcall from the provider (which
+ * is called in the interrupt context).
+ *
+ * One exception is when creating the cm_id for incoming connection requests.
+ * There are two cases:
+ * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If
+ * the backlog is exceeded, then no more connection request events will
+ * be processed. cm_event_handler() returns -ENOMEM in this case. Its up
+ * to the provider to reject the connection request.
+ * 2) in the connection request workqueue handler, cm_conn_req_handler().
+ * If work elements cannot be allocated for the new connect request cm_id,
+ * then IWCM will call the provider reject method. This is ok since
+ * cm_conn_req_handler() runs in the workqueue thread context.
+ */
+
+static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
+{
+ struct iwcm_work *work;
+
+ if (list_empty(&cm_id_priv->work_free_list))
+ return NULL;
+ work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work,
+ free_list);
+ list_del_init(&work->free_list);
+ return work;
+}
+
+static void put_work(struct iwcm_work *work)
+{
+ list_add(&work->free_list, &work->cm_id->work_free_list);
+}
+
+static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
+{
+ struct list_head *e, *tmp;
+
+ list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) {
+ list_del(e);
+ kfree(list_entry(e, struct iwcm_work, free_list));
+ }
+}
+
+static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
+{
+ struct iwcm_work *work;
+
+ BUG_ON(!list_empty(&cm_id_priv->work_free_list));
+ while (count--) {
+ work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL);
+ if (!work) {
+ dealloc_work_entries(cm_id_priv);
+ return -ENOMEM;
+ }
+ work->cm_id = cm_id_priv;
+ INIT_LIST_HEAD(&work->list);
+ put_work(work);
+ }
+ return 0;
+}
+
+/*
+ * Save private data from incoming connection requests to
+ * iw_cm_event, so the low level driver doesn't have to. Adjust
+ * the event ptr to point to the local copy.
+ */
+static int copy_private_data(struct iw_cm_event *event)
+{
+ void *p;
+
+ p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC);
+ if (!p)
+ return -ENOMEM;
+ event->private_data = p;
+ return 0;
+}
+
+static void free_cm_id(struct iwcm_id_private *cm_id_priv)
+{
+ dealloc_work_entries(cm_id_priv);
+ kfree(cm_id_priv);
+}
+
+/*
+ * Release a reference on cm_id. If the last reference is being
+ * released, free the cm_id and return 1.
+ */
+static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+{
+ if (refcount_dec_and_test(&cm_id_priv->refcount)) {
+ BUG_ON(!list_empty(&cm_id_priv->work_list));
+ free_cm_id(cm_id_priv);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void add_ref(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ refcount_inc(&cm_id_priv->refcount);
+}
+
+static void rem_ref(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ (void)iwcm_deref_id(cm_id_priv);
+}
+
+static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
+
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+ iw_cm_handler cm_handler,
+ void *context)
+{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL);
+ if (!cm_id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ cm_id_priv->id.device = device;
+ cm_id_priv->id.cm_handler = cm_handler;
+ cm_id_priv->id.context = context;
+ cm_id_priv->id.event_handler = cm_event_handler;
+ cm_id_priv->id.add_ref = add_ref;
+ cm_id_priv->id.rem_ref = rem_ref;
+ spin_lock_init(&cm_id_priv->lock);
+ refcount_set(&cm_id_priv->refcount, 1);
+ init_waitqueue_head(&cm_id_priv->connect_wait);
+ init_completion(&cm_id_priv->destroy_comp);
+ INIT_LIST_HEAD(&cm_id_priv->work_list);
+ INIT_LIST_HEAD(&cm_id_priv->work_free_list);
+
+ return &cm_id_priv->id;
+}
+EXPORT_SYMBOL(iw_create_cm_id);
+
+
+static int iwcm_modify_qp_err(struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+
+ if (!qp)
+ return -EINVAL;
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * This is really the RDMAC CLOSING state. It is most similar to the
+ * IB SQD QP state.
+ */
+static int iwcm_modify_qp_sqd(struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+
+ BUG_ON(qp == NULL);
+ qp_attr.qp_state = IB_QPS_SQD;
+ return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * Block if a passive or active connection is currently being processed. Then
+ * process the event as follows:
+ * - If we are ESTABLISHED, move to CLOSING and modify the QP state
+ * based on the abrupt flag
+ * - If the connection is already in the CLOSING or IDLE state, the peer is
+ * disconnecting concurrently with us and we've already seen the
+ * DISCONNECT event -- ignore the request and return 0
+ * - Disconnect on a listening endpoint returns -EINVAL
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+ struct ib_qp *qp = NULL;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /* Wait if we're currently in a connect or accept downcall */
+ wait_event(cm_id_priv->connect_wait,
+ !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+
+ /* QP could be <nul> for user-mode client */
+ if (cm_id_priv->qp)
+ qp = cm_id_priv->qp;
+ else
+ ret = -EINVAL;
+ break;
+ case IW_CM_STATE_LISTEN:
+ ret = -EINVAL;
+ break;
+ case IW_CM_STATE_CLOSING:
+ /* remote peer closed first */
+ case IW_CM_STATE_IDLE:
+ /* accept or connect returned !0 */
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called disconnect before/without calling accept after
+ * connect_request event delivered.
+ */
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ /* Can only get here if wait above fails */
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (qp) {
+ if (abrupt)
+ ret = iwcm_modify_qp_err(qp);
+ else
+ ret = iwcm_modify_qp_sqd(qp);
+
+ /*
+ * If both sides are disconnecting the QP could
+ * already be in ERR or SQD states
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_disconnect);
+
+/*
+ * CM_ID <-- DESTROYING
+ *
+ * Clean up all resources associated with the connection and release
+ * the initial reference taken by iw_create_cm_id.
+ */
+static void destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ struct ib_qp *qp;
+ unsigned long flags;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /*
+ * Wait if we're currently in a connect or accept downcall. A
+ * listening endpoint should never block here.
+ */
+ wait_event(cm_id_priv->connect_wait,
+ !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+ /*
+ * Since we're deleting the cm_id, drop any events that
+ * might arrive before the last dereference.
+ */
+ set_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ qp = cm_id_priv->qp;
+ cm_id_priv->qp = NULL;
+
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_LISTEN:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* destroy the listening endpoint */
+ cm_id->device->ops.iw_destroy_listen(cm_id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* Abrupt close of the connection */
+ (void)iwcm_modify_qp_err(qp);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called destroy before/without calling accept after
+ * receiving connection request event notification or
+ * returned non zero from the event callback function.
+ * In either case, must tell the provider to reject.
+ */
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_id->device->ops.iw_reject(cm_id, NULL, 0);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_DESTROYING:
+ default:
+ BUG();
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ if (qp)
+ cm_id_priv->id.device->ops.iw_rem_ref(qp);
+
+ if (cm_id->mapped) {
+ iwpm_remove_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr);
+ iwpm_remove_mapping(&cm_id->local_addr, RDMA_NL_IWCM);
+ }
+
+ (void)iwcm_deref_id(cm_id_priv);
+}
+
+/*
+ * This function is only called by the application thread and cannot
+ * be called by the event thread. The function will wait for all
+ * references to be released on the cm_id and then kfree the cm_id
+ * object.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ destroy_cm_id(cm_id);
+}
+EXPORT_SYMBOL(iw_destroy_cm_id);
+
+/**
+ * iw_cm_check_wildcard - If IP address is 0 then use original
+ * @pm_addr: sockaddr containing the ip to check for wildcard
+ * @cm_addr: sockaddr containing the actual IP address
+ * @cm_outaddr: sockaddr to set IP addr which leaving port
+ *
+ * Checks the pm_addr for wildcard and then sets cm_outaddr's
+ * IP to the actual (cm_addr).
+ */
+static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr,
+ struct sockaddr_storage *cm_addr,
+ struct sockaddr_storage *cm_outaddr)
+{
+ if (pm_addr->ss_family == AF_INET) {
+ struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr;
+
+ if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ struct sockaddr_in *cm4_addr =
+ (struct sockaddr_in *)cm_addr;
+ struct sockaddr_in *cm4_outaddr =
+ (struct sockaddr_in *)cm_outaddr;
+
+ cm4_outaddr->sin_addr = cm4_addr->sin_addr;
+ }
+ } else {
+ struct sockaddr_in6 *pm6_addr = (struct sockaddr_in6 *)pm_addr;
+
+ if (ipv6_addr_type(&pm6_addr->sin6_addr) == IPV6_ADDR_ANY) {
+ struct sockaddr_in6 *cm6_addr =
+ (struct sockaddr_in6 *)cm_addr;
+ struct sockaddr_in6 *cm6_outaddr =
+ (struct sockaddr_in6 *)cm_outaddr;
+
+ cm6_outaddr->sin6_addr = cm6_addr->sin6_addr;
+ }
+ }
+}
+
+/**
+ * iw_cm_map - Use portmapper to map the ports
+ * @cm_id: connection manager pointer
+ * @active: Indicates the active side when true
+ * returns nonzero for error only if iwpm_create_mapinfo() fails
+ *
+ * Tries to add a mapping for a port using the Portmapper. If
+ * successful in mapping the IP/Port it will check the remote
+ * mapped IP address for a wildcard IP address and replace the
+ * zero IP address with the remote_addr.
+ */
+static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
+{
+ const char *devname = dev_name(&cm_id->device->dev);
+ const char *ifname = cm_id->device->iw_ifname;
+ struct iwpm_dev_data pm_reg_msg = {};
+ struct iwpm_sa_data pm_msg;
+ int status;
+
+ if (strlen(devname) >= sizeof(pm_reg_msg.dev_name) ||
+ strlen(ifname) >= sizeof(pm_reg_msg.if_name))
+ return -EINVAL;
+
+ cm_id->m_local_addr = cm_id->local_addr;
+ cm_id->m_remote_addr = cm_id->remote_addr;
+
+ strcpy(pm_reg_msg.dev_name, devname);
+ strcpy(pm_reg_msg.if_name, ifname);
+
+ if (iwpm_register_pid(&pm_reg_msg, RDMA_NL_IWCM) ||
+ !iwpm_valid_pid())
+ return 0;
+
+ cm_id->mapped = true;
+ pm_msg.loc_addr = cm_id->local_addr;
+ pm_msg.rem_addr = cm_id->remote_addr;
+ pm_msg.flags = (cm_id->device->iw_driver_flags & IW_F_NO_PORT_MAP) ?
+ IWPM_FLAGS_NO_PORT_MAP : 0;
+ if (active)
+ status = iwpm_add_and_query_mapping(&pm_msg,
+ RDMA_NL_IWCM);
+ else
+ status = iwpm_add_mapping(&pm_msg, RDMA_NL_IWCM);
+
+ if (!status) {
+ cm_id->m_local_addr = pm_msg.mapped_loc_addr;
+ if (active) {
+ cm_id->m_remote_addr = pm_msg.mapped_rem_addr;
+ iw_cm_check_wildcard(&pm_msg.mapped_rem_addr,
+ &cm_id->remote_addr,
+ &cm_id->m_remote_addr);
+ }
+ }
+
+ return iwpm_create_mapinfo(&cm_id->local_addr,
+ &cm_id->m_local_addr,
+ RDMA_NL_IWCM, pm_msg.flags);
+}
+
+/*
+ * CM_ID <-- LISTEN
+ *
+ * Start listening for connect requests. Generates one CONNECT_REQUEST
+ * event for each inbound connect request.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ if (!backlog)
+ backlog = default_backlog;
+
+ ret = alloc_work_entries(cm_id_priv, backlog);
+ if (ret)
+ return ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ cm_id_priv->state = IW_CM_STATE_LISTEN;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = iw_cm_map(cm_id, false);
+ if (!ret)
+ ret = cm_id->device->ops.iw_create_listen(cm_id,
+ backlog);
+ if (ret)
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_listen);
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * Rejects an inbound connection request. No events are generated.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = cm_id->device->ops.iw_reject(cm_id, private_data,
+ private_data_len);
+
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_reject);
+
+/*
+ * CM_ID <-- ESTABLISHED
+ *
+ * Accepts an inbound connection request and generates an ESTABLISHED
+ * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
+ * until the ESTABLISHED event is received from the provider.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *iw_param)
+{
+ struct iwcm_id_private *cm_id_priv;
+ struct ib_qp *qp;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ /* Get the ib_qp given the QPN */
+ qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);
+ if (!qp) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ cm_id->device->ops.iw_add_ref(qp);
+ cm_id_priv->qp = qp;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = cm_id->device->ops.iw_accept(cm_id, iw_param);
+ if (ret) {
+ /* An error on accept precludes provider events */
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ qp = cm_id_priv->qp;
+ cm_id_priv->qp = NULL;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ if (qp)
+ cm_id->device->ops.iw_rem_ref(qp);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_accept);
+
+/*
+ * Active Side: CM_ID <-- CONN_SENT
+ *
+ * If successful, results in the generation of a CONNECT_REPLY
+ * event. iw_cm_disconnect and iw_cm_destroy will block until the
+ * CONNECT_REPLY event is received from the provider.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+ unsigned long flags;
+ struct ib_qp *qp = NULL;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ ret = alloc_work_entries(cm_id_priv, 4);
+ if (ret)
+ return ret;
+
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ if (cm_id_priv->state != IW_CM_STATE_IDLE) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* Get the ib_qp given the QPN */
+ qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);
+ if (!qp) {
+ ret = -EINVAL;
+ goto err;
+ }
+ cm_id->device->ops.iw_add_ref(qp);
+ cm_id_priv->qp = qp;
+ cm_id_priv->state = IW_CM_STATE_CONN_SENT;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = iw_cm_map(cm_id, true);
+ if (!ret)
+ ret = cm_id->device->ops.iw_connect(cm_id, iw_param);
+ if (!ret)
+ return 0; /* success */
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ qp = cm_id_priv->qp;
+ cm_id_priv->qp = NULL;
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+err:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ if (qp)
+ cm_id->device->ops.iw_rem_ref(qp);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_connect);
+
+/*
+ * Passive Side: new CM_ID <-- CONN_RECV
+ *
+ * Handles an inbound connect request. The function creates a new
+ * iw_cm_id to represent the new connection and inherits the client
+ * callback function and other attributes from the listening parent.
+ *
+ * The work item contains a pointer to the listen_cm_id and the event. The
+ * listen_cm_id contains the client cm_handler, context and
+ * device. These are copied when the device is cloned. The event
+ * contains the new four tuple.
+ *
+ * An error on the child should not affect the parent, so this
+ * function does not return a value.
+ */
+static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ struct iw_cm_id *cm_id;
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ /*
+ * The provider should never generate a connection request
+ * event with a bad status.
+ */
+ BUG_ON(iw_event->status);
+
+ cm_id = iw_create_cm_id(listen_id_priv->id.device,
+ listen_id_priv->id.cm_handler,
+ listen_id_priv->id.context);
+ /* If the cm_id could not be created, ignore the request */
+ if (IS_ERR(cm_id))
+ goto out;
+
+ cm_id->provider_data = iw_event->provider_data;
+ cm_id->m_local_addr = iw_event->local_addr;
+ cm_id->m_remote_addr = iw_event->remote_addr;
+ cm_id->local_addr = listen_id_priv->id.local_addr;
+
+ ret = iwpm_get_remote_info(&listen_id_priv->id.m_local_addr,
+ &iw_event->remote_addr,
+ &cm_id->remote_addr,
+ RDMA_NL_IWCM);
+ if (ret) {
+ cm_id->remote_addr = iw_event->remote_addr;
+ } else {
+ iw_cm_check_wildcard(&listen_id_priv->id.m_local_addr,
+ &iw_event->local_addr,
+ &cm_id->local_addr);
+ iw_event->local_addr = cm_id->local_addr;
+ iw_event->remote_addr = cm_id->remote_addr;
+ }
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+
+ /*
+ * We could be destroying the listening id. If so, ignore this
+ * upcall.
+ */
+ spin_lock_irqsave(&listen_id_priv->lock, flags);
+ if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+
+ ret = alloc_work_entries(cm_id_priv, 3);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+
+ /* Call the client CM handler */
+ ret = cm_id->cm_handler(cm_id, iw_event);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ }
+
+out:
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+}
+
+/*
+ * Passive Side: CM_ID <-- ESTABLISHED
+ *
+ * The provider generated an ESTABLISHED event which means that
+ * the MPA negotion has completed successfully and we are now in MPA
+ * FPDU mode.
+ *
+ * This event can only be received in the CONN_RECV state. If the
+ * remote peer closed, the ESTABLISHED event would be received followed
+ * by the CLOSE event. If the app closes, it will block until we wake
+ * it up after processing this event.
+ */
+static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ /*
+ * We clear the CONNECT_WAIT bit here to allow the callback
+ * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
+ * from a callback handler is not allowed.
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * Active Side: CM_ID <-- ESTABLISHED
+ *
+ * The app has called connect and is waiting for the established event to
+ * post it's requests to the server. This event will wake up anyone
+ * blocked in iw_cm_disconnect or iw_destroy_id.
+ */
+static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ struct ib_qp *qp = NULL;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ /*
+ * Clear the connect wait bit so a callback function calling
+ * iw_cm_disconnect will not wait and deadlock this thread
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+ if (iw_event->status == 0) {
+ cm_id_priv->id.m_local_addr = iw_event->local_addr;
+ cm_id_priv->id.m_remote_addr = iw_event->remote_addr;
+ iw_event->local_addr = cm_id_priv->id.local_addr;
+ iw_event->remote_addr = cm_id_priv->id.remote_addr;
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ } else {
+ /* REJECTED or RESET */
+ qp = cm_id_priv->qp;
+ cm_id_priv->qp = NULL;
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ if (qp)
+ cm_id_priv->id.device->ops.iw_rem_ref(qp);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+
+ /* Wake up waiters on connect complete */
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * If in the ESTABLISHED state, move to CLOSING.
+ */
+static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED)
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * If in the ESTBLISHED or CLOSING states, the QP will have have been
+ * moved by the provider to the ERR state. Disassociate the CM_ID from
+ * the QP, move to IDLE, and remove the 'connected' reference.
+ *
+ * If in some other state, the cm_id was destroyed asynchronously.
+ * This is the last reference that will result in waking up
+ * the app thread blocked in iw_destroy_cm_id.
+ */
+static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ struct ib_qp *qp;
+ unsigned long flags;
+ int ret = 0, notify_event = 0;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ qp = cm_id_priv->qp;
+ cm_id_priv->qp = NULL;
+
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ notify_event = 1;
+ break;
+ case IW_CM_STATE_DESTROYING:
+ break;
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (qp)
+ cm_id_priv->id.device->ops.iw_rem_ref(qp);
+ if (notify_event)
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ return ret;
+}
+
+static int process_event(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ int ret = 0;
+
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CONNECT_REQUEST:
+ cm_conn_req_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ ret = cm_conn_rep_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ ret = cm_conn_est_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_DISCONNECT:
+ cm_disconnect_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CLOSE:
+ ret = cm_close_handler(cm_id_priv, iw_event);
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
+
+/*
+ * Process events on the work_list for the cm_id. If the callback
+ * function requests that the cm_id be deleted, a flag is set in the
+ * cm_id flags to indicate that when the last reference is
+ * removed, the cm_id is to be destroyed. This is necessary to
+ * distinguish between an object that will be destroyed by the app
+ * thread asleep on the destroy_comp list vs. an object destroyed
+ * here synchronously when the last reference is removed.
+ */
+static void cm_work_handler(struct work_struct *_work)
+{
+ struct iwcm_work *work = container_of(_work, struct iwcm_work, work);
+ struct iw_cm_event levent;
+ struct iwcm_id_private *cm_id_priv = work->cm_id;
+ unsigned long flags;
+ int empty;
+ int ret = 0;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ empty = list_empty(&cm_id_priv->work_list);
+ while (!empty) {
+ work = list_entry(cm_id_priv->work_list.next,
+ struct iwcm_work, list);
+ list_del_init(&work->list);
+ empty = list_empty(&cm_id_priv->work_list);
+ levent = work->event;
+ put_work(work);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) {
+ ret = process_event(cm_id_priv, &levent);
+ if (ret)
+ destroy_cm_id(&cm_id_priv->id);
+ } else
+ pr_debug("dropping event %d\n", levent.event);
+ if (iwcm_deref_id(cm_id_priv))
+ return;
+ if (empty)
+ return;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * This function is called on interrupt context. Schedule events on
+ * the iwcm_wq thread to allow callback functions to downcall into
+ * the CM and/or block. Events are queued to a per-CM_ID
+ * work_list. If this is the first event on the work_list, the work
+ * element is also queued on the iwcm_wq thread.
+ *
+ * Each event holds a reference on the cm_id. Until the last posted
+ * event has been delivered and processed, the cm_id cannot be
+ * deleted.
+ *
+ * Returns:
+ * 0 - the event was handled.
+ * -ENOMEM - the event was not handled due to lack of resources.
+ */
+static int cm_event_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct iwcm_work *work;
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ work = get_work(cm_id_priv);
+ if (!work) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_WORK(&work->work, cm_work_handler);
+ work->cm_id = cm_id_priv;
+ work->event = *iw_event;
+
+ if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST ||
+ work->event.event == IW_CM_EVENT_CONNECT_REPLY) &&
+ work->event.private_data_len) {
+ ret = copy_private_data(&work->event);
+ if (ret) {
+ put_work(work);
+ goto out;
+ }
+ }
+
+ refcount_inc(&cm_id_priv->refcount);
+ if (list_empty(&cm_id_priv->work_list)) {
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ queue_work(iwcm_wq, &work->work);
+ } else
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+out:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_CONN_RECV:
+ case IW_CM_STATE_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+ qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE|
+ IB_ACCESS_REMOTE_READ;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_CONN_RECV:
+ case IW_CM_STATE_ESTABLISHED:
+ *qp_attr_mask = 0;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ switch (qp_attr->qp_state) {
+ case IB_QPS_INIT:
+ case IB_QPS_RTR:
+ ret = iwcm_init_qp_init_attr(cm_id_priv,
+ qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTS:
+ ret = iwcm_init_qp_rts_attr(cm_id_priv,
+ qp_attr, qp_attr_mask);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_init_qp_attr);
+
+static int __init iw_cm_init(void)
+{
+ int ret;
+
+ ret = iwpm_init(RDMA_NL_IWCM);
+ if (ret)
+ return ret;
+
+ iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0);
+ if (!iwcm_wq)
+ goto err_alloc;
+
+ iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm",
+ iwcm_ctl_table);
+ if (!iwcm_ctl_table_hdr) {
+ pr_err("iw_cm: couldn't register sysctl paths\n");
+ goto err_sysctl;
+ }
+
+ rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table);
+ return 0;
+
+err_sysctl:
+ destroy_workqueue(iwcm_wq);
+err_alloc:
+ iwpm_exit(RDMA_NL_IWCM);
+ return -ENOMEM;
+}
+
+static void __exit iw_cm_cleanup(void)
+{
+ rdma_nl_unregister(RDMA_NL_IWCM);
+ unregister_net_sysctl_table(iwcm_ctl_table_hdr);
+ destroy_workqueue(iwcm_wq);
+ iwpm_exit(RDMA_NL_IWCM);
+}
+
+MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2);
+
+module_init(iw_cm_init);
+module_exit(iw_cm_cleanup);
diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h
new file mode 100644
index 000000000..bf74639be
--- /dev/null
+++ b/drivers/infiniband/core/iwcm.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IWCM_H
+#define IWCM_H
+
+enum iw_cm_state {
+ IW_CM_STATE_IDLE, /* unbound, inactive */
+ IW_CM_STATE_LISTEN, /* listen waiting for connect */
+ IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */
+ IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */
+ IW_CM_STATE_ESTABLISHED, /* established */
+ IW_CM_STATE_CLOSING, /* disconnect */
+ IW_CM_STATE_DESTROYING /* object being deleted */
+};
+
+struct iwcm_id_private {
+ struct iw_cm_id id;
+ enum iw_cm_state state;
+ unsigned long flags;
+ struct ib_qp *qp;
+ struct completion destroy_comp;
+ wait_queue_head_t connect_wait;
+ struct list_head work_list;
+ spinlock_t lock;
+ refcount_t refcount;
+ struct list_head work_free_list;
+};
+
+#define IWCM_F_DROP_EVENTS 1
+#define IWCM_F_CONNECT_WAIT 2
+
+#endif /* IWCM_H */
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
new file mode 100644
index 000000000..3c9a98692
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -0,0 +1,846 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser";
+u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN;
+static int iwpm_user_pid = IWPM_PID_UNDEFINED;
+static atomic_t echo_nlmsg_seq;
+
+/**
+ * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid
+ *
+ * Returns true if the pid is greater than zero, otherwise returns false
+ */
+int iwpm_valid_pid(void)
+{
+ return iwpm_user_pid > 0;
+}
+
+/**
+ * iwpm_register_pid - Send a netlink query to userspace
+ * to get the iwarp port mapper pid
+ * @pm_msg: Contains driver info to send to the userspace port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * nlmsg attributes:
+ * [IWPM_NLA_REG_PID_SEQ]
+ * [IWPM_NLA_REG_IF_NAME]
+ * [IWPM_NLA_REG_IBDEV_NAME]
+ * [IWPM_NLA_REG_ULIB_NAME]
+ */
+int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (iwpm_check_registration(nl_client, IWPM_REG_VALID) ||
+ iwpm_user_pid == IWPM_PID_UNAVAILABLE)
+ return 0;
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto pid_query_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL);
+ if (!nlmsg_request) {
+ err_str = "Unable to allocate netlink request";
+ goto pid_query_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+
+ /* fill in the pid request message */
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ);
+ if (ret)
+ goto pid_query_error;
+ ret = ibnl_put_attr(skb, nlh, IFNAMSIZ,
+ pm_msg->if_name, IWPM_NLA_REG_IF_NAME);
+ if (ret)
+ goto pid_query_error;
+ ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE,
+ pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME);
+ if (ret)
+ goto pid_query_error;
+ ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE,
+ (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME);
+ if (ret)
+ goto pid_query_error;
+
+ nlmsg_end(skb, nlh);
+
+ pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n",
+ __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name);
+
+ ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ iwpm_user_pid = IWPM_PID_UNAVAILABLE;
+ err_str = "Unable to send a nlmsg";
+ goto pid_query_error;
+ }
+ nlmsg_request->req_buffer = pm_msg;
+ ret = iwpm_wait_complete_req(nlmsg_request);
+ return ret;
+pid_query_error:
+ pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client);
+ dev_kfree_skb(skb);
+ if (nlmsg_request)
+ iwpm_free_nlmsg_request(&nlmsg_request->kref);
+ return ret;
+}
+
+/**
+ * iwpm_add_mapping - Send a netlink add mapping request to
+ * the userspace port mapper
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
+ * nlmsg attributes:
+ * [IWPM_NLA_MANAGE_MAPPING_SEQ]
+ * [IWPM_NLA_MANAGE_ADDR]
+ * [IWPM_NLA_MANAGE_FLAGS]
+ *
+ * If the request is successful, the pm_msg stores
+ * the port mapper response (mapped address info)
+ */
+int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_pid())
+ return 0;
+ if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) {
+ err_str = "Unregistered port mapper client";
+ goto add_mapping_error;
+ }
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto add_mapping_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL);
+ if (!nlmsg_request) {
+ err_str = "Unable to allocate netlink request";
+ goto add_mapping_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+ /* fill in the add mapping message */
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+ IWPM_NLA_MANAGE_MAPPING_SEQ);
+ if (ret)
+ goto add_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR);
+ if (ret)
+ goto add_mapping_error;
+
+ /* If flags are required and we're not V4, then return a quiet error */
+ if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+ ret = -EINVAL;
+ goto add_mapping_error_nowarn;
+ }
+ if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+ IWPM_NLA_MANAGE_FLAGS);
+ if (ret)
+ goto add_mapping_error;
+ }
+
+ nlmsg_end(skb, nlh);
+ nlmsg_request->req_buffer = pm_msg;
+
+ ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ iwpm_user_pid = IWPM_PID_UNDEFINED;
+ err_str = "Unable to send a nlmsg";
+ goto add_mapping_error;
+ }
+ ret = iwpm_wait_complete_req(nlmsg_request);
+ return ret;
+add_mapping_error:
+ pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client);
+add_mapping_error_nowarn:
+ dev_kfree_skb(skb);
+ if (nlmsg_request)
+ iwpm_free_nlmsg_request(&nlmsg_request->kref);
+ return ret;
+}
+
+/**
+ * iwpm_add_and_query_mapping - Process the port mapper response to
+ * iwpm_add_and_query_mapping request
+ * @pm_msg: Contains the local ip/tcp address info to send
+ * @nl_client: The index of the netlink client
+ *
+ * nlmsg attributes:
+ * [IWPM_NLA_QUERY_MAPPING_SEQ]
+ * [IWPM_NLA_QUERY_LOCAL_ADDR]
+ * [IWPM_NLA_QUERY_REMOTE_ADDR]
+ * [IWPM_NLA_QUERY_FLAGS]
+ */
+int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_pid())
+ return 0;
+ if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) {
+ err_str = "Unregistered port mapper client";
+ goto query_mapping_error;
+ }
+ ret = -ENOMEM;
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto query_mapping_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq,
+ nl_client, GFP_KERNEL);
+ if (!nlmsg_request) {
+ err_str = "Unable to allocate netlink request";
+ goto query_mapping_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+
+ /* fill in the query message */
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+ IWPM_NLA_QUERY_MAPPING_SEQ);
+ if (ret)
+ goto query_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR);
+ if (ret)
+ goto query_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR);
+ if (ret)
+ goto query_mapping_error;
+
+ /* If flags are required and we're not V4, then return a quite error */
+ if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) {
+ ret = -EINVAL;
+ goto query_mapping_error_nowarn;
+ }
+ if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags,
+ IWPM_NLA_QUERY_FLAGS);
+ if (ret)
+ goto query_mapping_error;
+ }
+
+ nlmsg_end(skb, nlh);
+ nlmsg_request->req_buffer = pm_msg;
+
+ ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ err_str = "Unable to send a nlmsg";
+ goto query_mapping_error;
+ }
+ ret = iwpm_wait_complete_req(nlmsg_request);
+ return ret;
+query_mapping_error:
+ pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client);
+query_mapping_error_nowarn:
+ dev_kfree_skb(skb);
+ if (nlmsg_request)
+ iwpm_free_nlmsg_request(&nlmsg_request->kref);
+ return ret;
+}
+
+/**
+ * iwpm_remove_mapping - Send a netlink remove mapping request
+ * to the userspace port mapper
+ *
+ * @local_addr: Local ip/tcp address to remove
+ * @nl_client: The index of the netlink client
+ *
+ * nlmsg attributes:
+ * [IWPM_NLA_MANAGE_MAPPING_SEQ]
+ * [IWPM_NLA_MANAGE_ADDR]
+ */
+int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_pid())
+ return 0;
+ if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) {
+ err_str = "Unregistered port mapper client";
+ goto remove_mapping_error;
+ }
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client);
+ if (!skb) {
+ ret = -ENOMEM;
+ err_str = "Unable to create a nlmsg";
+ goto remove_mapping_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+ IWPM_NLA_MANAGE_MAPPING_SEQ);
+ if (ret)
+ goto remove_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ local_addr, IWPM_NLA_MANAGE_ADDR);
+ if (ret)
+ goto remove_mapping_error;
+
+ nlmsg_end(skb, nlh);
+
+ ret = rdma_nl_unicast_wait(&init_net, skb, iwpm_user_pid);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ iwpm_user_pid = IWPM_PID_UNDEFINED;
+ err_str = "Unable to send a nlmsg";
+ goto remove_mapping_error;
+ }
+ iwpm_print_sockaddr(local_addr,
+ "remove_mapping: Local sockaddr:");
+ return 0;
+remove_mapping_error:
+ pr_info("%s: %s (client = %u)\n", __func__, err_str, nl_client);
+ if (skb)
+ dev_kfree_skb_any(skb);
+ return ret;
+}
+
+/* netlink attribute policy for the received response to register pid request */
+static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = {
+ [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING,
+ .len = IWPM_DEVNAME_SIZE - 1 },
+ [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING,
+ .len = IWPM_ULIBNAME_SIZE - 1 },
+ [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 },
+ [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 }
+};
+
+/**
+ * iwpm_register_pid_cb - Process the port mapper response to
+ * iwpm_register_pid query
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * If successful, the function receives the userspace port mapper pid
+ * which is used in future communication with the port mapper
+ */
+int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX];
+ struct iwpm_dev_data *pm_msg;
+ char *dev_name, *iwpm_name;
+ u32 msg_seq;
+ u8 nl_client;
+ u16 iwpm_version;
+ const char *msg_type = "Register Pid response";
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX,
+ resp_reg_policy, nltb, msg_type))
+ return -EINVAL;
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ nl_client = nlmsg_request->nl_client;
+ dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]);
+ iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]);
+ iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]);
+
+ /* check device name, ulib name and version */
+ if (strcmp(pm_msg->dev_name, dev_name) ||
+ strcmp(iwpm_ulib_name, iwpm_name) ||
+ iwpm_version < IWPM_UABI_VERSION_MIN) {
+
+ pr_info("%s: Incorrect info (dev = %s name = %s version = %u)\n",
+ __func__, dev_name, iwpm_name, iwpm_version);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto register_pid_response_exit;
+ }
+ iwpm_user_pid = cb->nlh->nlmsg_pid;
+ iwpm_ulib_version = iwpm_version;
+ if (iwpm_ulib_version < IWPM_UABI_VERSION)
+ pr_warn_once("%s: Down level iwpmd/pid %d. Continuing...",
+ __func__, iwpm_user_pid);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
+ __func__, iwpm_user_pid);
+ iwpm_set_registration(nl_client, IWPM_REG_VALID);
+register_pid_response_exit:
+ nlmsg_request->request_done = 1;
+ /* always for found nlmsg_request */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ up(&nlmsg_request->sem);
+ return 0;
+}
+
+/* netlink attribute policy for the received response to add mapping request */
+static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
+ [IWPM_NLA_RMANAGE_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_RMANAGE_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 }
+};
+
+/**
+ * iwpm_add_mapping_cb - Process the port mapper response to
+ * iwpm_add_mapping request
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ */
+int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_sa_data *pm_msg;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr;
+ struct sockaddr_storage *mapped_sockaddr;
+ const char *msg_type;
+ u32 msg_seq;
+
+ msg_type = "Add Mapping response";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX,
+ resp_add_policy, nltb, msg_type))
+ return -EINVAL;
+
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]);
+ mapped_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]);
+
+ if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) {
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto add_mapping_response_exit;
+ }
+ if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto add_mapping_response_exit;
+ }
+ memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr,
+ sizeof(*mapped_sockaddr));
+ iwpm_print_sockaddr(&pm_msg->loc_addr,
+ "add_mapping: Local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_loc_addr,
+ "add_mapping: Mapped local sockaddr:");
+
+add_mapping_response_exit:
+ nlmsg_request->request_done = 1;
+ /* always for found request */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ up(&nlmsg_request->sem);
+ return 0;
+}
+
+/* netlink attribute policy for the response to add and query mapping request
+ * and response with remote address info
+ */
+static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
+ [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_RQUERY_LOCAL_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_REMOTE_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = {
+ .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 }
+};
+
+/**
+ * iwpm_add_and_query_mapping_cb - Process the port mapper response to
+ * iwpm_add_and_query_mapping request
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ */
+int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct iwpm_sa_data *pm_msg;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr, *remote_sockaddr;
+ struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr;
+ const char *msg_type;
+ u32 msg_seq;
+ u16 err_code;
+
+ msg_type = "Query Mapping response";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX,
+ resp_query_policy, nltb, msg_type))
+ return -EINVAL;
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
+ remote_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
+ mapped_loc_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
+ mapped_rem_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]);
+
+ err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]);
+ if (err_code == IWPM_REMOTE_QUERY_REJECT) {
+ pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n",
+ __func__, cb->nlh->nlmsg_pid, msg_seq);
+ nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT;
+ }
+ if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) ||
+ iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) {
+ pr_info("%s: Incorrect local sockaddr\n", __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto query_mapping_response_exit;
+ }
+ if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family ||
+ mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto query_mapping_response_exit;
+ }
+ memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr,
+ sizeof(*mapped_loc_sockaddr));
+ memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr,
+ sizeof(*mapped_rem_sockaddr));
+
+ iwpm_print_sockaddr(&pm_msg->loc_addr,
+ "query_mapping: Local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_loc_addr,
+ "query_mapping: Mapped local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->rem_addr,
+ "query_mapping: Remote sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_rem_addr,
+ "query_mapping: Mapped remote sockaddr:");
+query_mapping_response_exit:
+ nlmsg_request->request_done = 1;
+ /* always for found request */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ up(&nlmsg_request->sem);
+ return 0;
+}
+
+/**
+ * iwpm_remote_info_cb - Process remote connecting peer address info, which
+ * the port mapper has received from the connecting peer
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Stores the IPv4/IPv6 address info in a hash table
+ */
+int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr, *remote_sockaddr;
+ struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr;
+ struct iwpm_remote_info *rem_info;
+ const char *msg_type;
+ u8 nl_client;
+ int ret = -EINVAL;
+
+ msg_type = "Remote Mapping info";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX,
+ resp_query_policy, nltb, msg_type))
+ return ret;
+
+ nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]);
+ remote_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]);
+ mapped_loc_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
+ mapped_rem_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]);
+
+ if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family ||
+ mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ return ret;
+ }
+ rem_info = kzalloc(sizeof(struct iwpm_remote_info), GFP_ATOMIC);
+ if (!rem_info) {
+ ret = -ENOMEM;
+ return ret;
+ }
+ memcpy(&rem_info->mapped_loc_sockaddr, mapped_loc_sockaddr,
+ sizeof(struct sockaddr_storage));
+ memcpy(&rem_info->remote_sockaddr, remote_sockaddr,
+ sizeof(struct sockaddr_storage));
+ memcpy(&rem_info->mapped_rem_sockaddr, mapped_rem_sockaddr,
+ sizeof(struct sockaddr_storage));
+ rem_info->nl_client = nl_client;
+
+ iwpm_add_remote_info(rem_info);
+
+ iwpm_print_sockaddr(local_sockaddr,
+ "remote_info: Local sockaddr:");
+ iwpm_print_sockaddr(mapped_loc_sockaddr,
+ "remote_info: Mapped local sockaddr:");
+ iwpm_print_sockaddr(remote_sockaddr,
+ "remote_info: Remote sockaddr:");
+ iwpm_print_sockaddr(mapped_rem_sockaddr,
+ "remote_info: Mapped remote sockaddr:");
+ return ret;
+}
+
+/* netlink attribute policy for the received request for mapping info */
+static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = {
+ [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING,
+ .len = IWPM_ULIBNAME_SIZE - 1 },
+ [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 }
+};
+
+/**
+ * iwpm_mapping_info_cb - Process a notification that the userspace
+ * port mapper daemon is started
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send all the local mapping
+ * info records to the userspace port mapper
+ */
+int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX];
+ const char *msg_type = "Mapping Info response";
+ u8 nl_client;
+ char *iwpm_name;
+ u16 iwpm_version;
+ int ret = -EINVAL;
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX,
+ resp_mapinfo_policy, nltb, msg_type)) {
+ pr_info("%s: Unable to parse nlmsg\n", __func__);
+ return ret;
+ }
+ iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]);
+ iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]);
+ if (strcmp(iwpm_ulib_name, iwpm_name) ||
+ iwpm_version < IWPM_UABI_VERSION_MIN) {
+ pr_info("%s: Invalid port mapper name = %s version = %u\n",
+ __func__, iwpm_name, iwpm_version);
+ return ret;
+ }
+ nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ iwpm_user_pid = cb->nlh->nlmsg_pid;
+
+ if (iwpm_ulib_version < IWPM_UABI_VERSION)
+ pr_warn_once("%s: Down level iwpmd/pid %d. Continuing...",
+ __func__, iwpm_user_pid);
+
+ if (!iwpm_mapinfo_available())
+ return 0;
+ pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
+ __func__, iwpm_user_pid);
+ ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid);
+ return ret;
+}
+
+/* netlink attribute policy for the received mapping info ack */
+static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = {
+ [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 },
+ [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 }
+};
+
+/**
+ * iwpm_ack_mapping_info_cb - Process the port mapper ack for
+ * the provided local mapping info records
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ */
+int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX];
+ u32 mapinfo_send, mapinfo_ack;
+ const char *msg_type = "Mapping Info Ack";
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX,
+ ack_mapinfo_policy, nltb, msg_type))
+ return -EINVAL;
+ mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]);
+ mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]);
+ if (mapinfo_ack != mapinfo_send)
+ pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n",
+ __func__, mapinfo_send, mapinfo_ack);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ return 0;
+}
+
+/* netlink attribute policy for the received port mapper error message */
+static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = {
+ [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 },
+};
+
+/**
+ * iwpm_mapping_error_cb - Process port mapper notification for error
+ *
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ */
+int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ struct nlattr *nltb[IWPM_NLA_ERR_MAX];
+ u32 msg_seq;
+ u16 err_code;
+ const char *msg_type = "Mapping Error Msg";
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX,
+ map_error_policy, nltb, msg_type))
+ return -EINVAL;
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]);
+ err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]);
+ pr_info("%s: Received msg seq = %u err code = %u client = %d\n",
+ __func__, msg_seq, err_code, nl_client);
+ /* look for nlmsg_request */
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ /* not all errors have associated requests */
+ pr_debug("Could not find matching req (seq = %u)\n", msg_seq);
+ return 0;
+ }
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ nlmsg_request->err_code = err_code;
+ nlmsg_request->request_done = 1;
+ /* always for found request */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ up(&nlmsg_request->sem);
+ return 0;
+}
+
+/* netlink attribute policy for the received hello request */
+static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = {
+ [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 }
+};
+
+/**
+ * iwpm_hello_cb - Process a hello message from iwpmd
+ *
+ * @skb: The socket buffer
+ * @cb: Contains the received message (payload and netlink header)
+ *
+ * Using the received port mapper pid, send the kernel's abi_version
+ * after adjusting it to support the iwpmd version.
+ */
+int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_HELLO_MAX];
+ const char *msg_type = "Hello request";
+ u8 nl_client;
+ u16 abi_version;
+ int ret = -EINVAL;
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb,
+ msg_type)) {
+ pr_info("%s: Unable to parse nlmsg\n", __func__);
+ return ret;
+ }
+ abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]);
+ nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version);
+ pr_debug("Using ABI version %u\n", iwpm_ulib_version);
+ iwpm_user_pid = cb->nlh->nlmsg_pid;
+ ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version);
+ return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
new file mode 100644
index 000000000..358a2db38
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -0,0 +1,793 @@
+/*
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+#define IWPM_MAPINFO_HASH_SIZE 512
+#define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1)
+#define IWPM_REMINFO_HASH_SIZE 64
+#define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1)
+#define IWPM_MSG_SIZE 512
+
+static LIST_HEAD(iwpm_nlmsg_req_list);
+static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock);
+
+static struct hlist_head *iwpm_hash_bucket;
+static DEFINE_SPINLOCK(iwpm_mapinfo_lock);
+
+static struct hlist_head *iwpm_reminfo_bucket;
+static DEFINE_SPINLOCK(iwpm_reminfo_lock);
+
+static struct iwpm_admin_data iwpm_admin;
+
+/**
+ * iwpm_init - Allocate resources for the iwarp port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * Should be called when network interface goes up.
+ */
+int iwpm_init(u8 nl_client)
+{
+ iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE,
+ sizeof(struct hlist_head), GFP_KERNEL);
+ if (!iwpm_hash_bucket)
+ return -ENOMEM;
+
+ iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE,
+ sizeof(struct hlist_head), GFP_KERNEL);
+ if (!iwpm_reminfo_bucket) {
+ kfree(iwpm_hash_bucket);
+ return -ENOMEM;
+ }
+
+ iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
+ pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__);
+ return 0;
+}
+
+static void free_hash_bucket(void);
+static void free_reminfo_bucket(void);
+
+/**
+ * iwpm_exit - Deallocate resources for the iwarp port mapper
+ * @nl_client: The index of the netlink client
+ *
+ * Should be called when network interface goes down.
+ */
+int iwpm_exit(u8 nl_client)
+{
+ free_hash_bucket();
+ free_reminfo_bucket();
+ pr_debug("%s: Resources are destroyed\n", __func__);
+ iwpm_set_registration(nl_client, IWPM_REG_UNDEF);
+ return 0;
+}
+
+static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
+ struct sockaddr_storage *);
+
+/**
+ * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address
+ * info in a hash table
+ * @local_sockaddr: Local ip/tcp address
+ * @mapped_sockaddr: Mapped local ip/tcp address
+ * @nl_client: The index of the netlink client
+ * @map_flags: IWPM mapping flags
+ */
+int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
+ struct sockaddr_storage *mapped_sockaddr,
+ u8 nl_client, u32 map_flags)
+{
+ struct hlist_head *hash_bucket_head = NULL;
+ struct iwpm_mapping_info *map_info;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL);
+ if (!map_info)
+ return -ENOMEM;
+
+ memcpy(&map_info->local_sockaddr, local_sockaddr,
+ sizeof(struct sockaddr_storage));
+ memcpy(&map_info->mapped_sockaddr, mapped_sockaddr,
+ sizeof(struct sockaddr_storage));
+ map_info->nl_client = nl_client;
+ map_info->map_flags = map_flags;
+
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ if (iwpm_hash_bucket) {
+ hash_bucket_head = get_mapinfo_hash_bucket(
+ &map_info->local_sockaddr,
+ &map_info->mapped_sockaddr);
+ if (hash_bucket_head) {
+ hlist_add_head(&map_info->hlist_node, hash_bucket_head);
+ ret = 0;
+ }
+ }
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+
+ if (!hash_bucket_head)
+ kfree(map_info);
+ return ret;
+}
+
+/**
+ * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address
+ * info from the hash table
+ * @local_sockaddr: Local ip/tcp address
+ * @mapped_local_addr: Mapped local ip/tcp address
+ *
+ * Returns err code if mapping info is not found in the hash table,
+ * otherwise returns 0
+ */
+int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
+ struct sockaddr_storage *mapped_local_addr)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct hlist_head *hash_bucket_head;
+ struct iwpm_mapping_info *map_info = NULL;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ if (iwpm_hash_bucket) {
+ hash_bucket_head = get_mapinfo_hash_bucket(
+ local_sockaddr,
+ mapped_local_addr);
+ if (!hash_bucket_head)
+ goto remove_mapinfo_exit;
+
+ hlist_for_each_entry_safe(map_info, tmp_hlist_node,
+ hash_bucket_head, hlist_node) {
+
+ if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr,
+ mapped_local_addr)) {
+
+ hlist_del_init(&map_info->hlist_node);
+ kfree(map_info);
+ ret = 0;
+ break;
+ }
+ }
+ }
+remove_mapinfo_exit:
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+ return ret;
+}
+
+static void free_hash_bucket(void)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct iwpm_mapping_info *map_info;
+ unsigned long flags;
+ int i;
+
+ /* remove all the mapinfo data from the list */
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
+ hlist_for_each_entry_safe(map_info, tmp_hlist_node,
+ &iwpm_hash_bucket[i], hlist_node) {
+
+ hlist_del_init(&map_info->hlist_node);
+ kfree(map_info);
+ }
+ }
+ /* free the hash list */
+ kfree(iwpm_hash_bucket);
+ iwpm_hash_bucket = NULL;
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+}
+
+static void free_reminfo_bucket(void)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct iwpm_remote_info *rem_info;
+ unsigned long flags;
+ int i;
+
+ /* remove all the remote info from the list */
+ spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+ for (i = 0; i < IWPM_REMINFO_HASH_SIZE; i++) {
+ hlist_for_each_entry_safe(rem_info, tmp_hlist_node,
+ &iwpm_reminfo_bucket[i], hlist_node) {
+
+ hlist_del_init(&rem_info->hlist_node);
+ kfree(rem_info);
+ }
+ }
+ /* free the hash list */
+ kfree(iwpm_reminfo_bucket);
+ iwpm_reminfo_bucket = NULL;
+ spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+}
+
+static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *,
+ struct sockaddr_storage *);
+
+void iwpm_add_remote_info(struct iwpm_remote_info *rem_info)
+{
+ struct hlist_head *hash_bucket_head;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+ if (iwpm_reminfo_bucket) {
+ hash_bucket_head = get_reminfo_hash_bucket(
+ &rem_info->mapped_loc_sockaddr,
+ &rem_info->mapped_rem_sockaddr);
+ if (hash_bucket_head)
+ hlist_add_head(&rem_info->hlist_node, hash_bucket_head);
+ }
+ spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+}
+
+/**
+ * iwpm_get_remote_info - Get the remote connecting peer address info
+ *
+ * @mapped_loc_addr: Mapped local address of the listening peer
+ * @mapped_rem_addr: Mapped remote address of the connecting peer
+ * @remote_addr: To store the remote address of the connecting peer
+ * @nl_client: The index of the netlink client
+ *
+ * The remote address info is retrieved and provided to the client in
+ * the remote_addr. After that it is removed from the hash table
+ */
+int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
+ struct sockaddr_storage *mapped_rem_addr,
+ struct sockaddr_storage *remote_addr,
+ u8 nl_client)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct hlist_head *hash_bucket_head;
+ struct iwpm_remote_info *rem_info = NULL;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+ if (iwpm_reminfo_bucket) {
+ hash_bucket_head = get_reminfo_hash_bucket(
+ mapped_loc_addr,
+ mapped_rem_addr);
+ if (!hash_bucket_head)
+ goto get_remote_info_exit;
+ hlist_for_each_entry_safe(rem_info, tmp_hlist_node,
+ hash_bucket_head, hlist_node) {
+
+ if (!iwpm_compare_sockaddr(&rem_info->mapped_loc_sockaddr,
+ mapped_loc_addr) &&
+ !iwpm_compare_sockaddr(&rem_info->mapped_rem_sockaddr,
+ mapped_rem_addr)) {
+
+ memcpy(remote_addr, &rem_info->remote_sockaddr,
+ sizeof(struct sockaddr_storage));
+ iwpm_print_sockaddr(remote_addr,
+ "get_remote_info: Remote sockaddr:");
+
+ hlist_del_init(&rem_info->hlist_node);
+ kfree(rem_info);
+ ret = 0;
+ break;
+ }
+ }
+ }
+get_remote_info_exit:
+ spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+ return ret;
+}
+
+struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
+ u8 nl_client, gfp_t gfp)
+{
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ unsigned long flags;
+
+ nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp);
+ if (!nlmsg_request)
+ return NULL;
+
+ spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+ list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list);
+ spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+
+ kref_init(&nlmsg_request->kref);
+ kref_get(&nlmsg_request->kref);
+ nlmsg_request->nlmsg_seq = nlmsg_seq;
+ nlmsg_request->nl_client = nl_client;
+ nlmsg_request->request_done = 0;
+ nlmsg_request->err_code = 0;
+ sema_init(&nlmsg_request->sem, 1);
+ down(&nlmsg_request->sem);
+ return nlmsg_request;
+}
+
+void iwpm_free_nlmsg_request(struct kref *kref)
+{
+ struct iwpm_nlmsg_request *nlmsg_request;
+ unsigned long flags;
+
+ nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref);
+
+ spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+ list_del_init(&nlmsg_request->inprocess_list);
+ spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+
+ if (!nlmsg_request->request_done)
+ pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n",
+ __func__, nlmsg_request->nlmsg_seq);
+ kfree(nlmsg_request);
+}
+
+struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq)
+{
+ struct iwpm_nlmsg_request *nlmsg_request;
+ struct iwpm_nlmsg_request *found_request = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+ list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list,
+ inprocess_list) {
+ if (nlmsg_request->nlmsg_seq == echo_seq) {
+ found_request = nlmsg_request;
+ kref_get(&nlmsg_request->kref);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+ return found_request;
+}
+
+int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request)
+{
+ int ret;
+
+ ret = down_timeout(&nlmsg_request->sem, IWPM_NL_TIMEOUT);
+ if (ret) {
+ ret = -EINVAL;
+ pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n",
+ __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq);
+ } else {
+ ret = nlmsg_request->err_code;
+ }
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ return ret;
+}
+
+int iwpm_get_nlmsg_seq(void)
+{
+ return atomic_inc_return(&iwpm_admin.nlmsg_seq);
+}
+
+/* valid client */
+u32 iwpm_get_registration(u8 nl_client)
+{
+ return iwpm_admin.reg_list[nl_client];
+}
+
+/* valid client */
+void iwpm_set_registration(u8 nl_client, u32 reg)
+{
+ iwpm_admin.reg_list[nl_client] = reg;
+}
+
+/* valid client */
+u32 iwpm_check_registration(u8 nl_client, u32 reg)
+{
+ return (iwpm_get_registration(nl_client) & reg);
+}
+
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr)
+{
+ if (a_sockaddr->ss_family != b_sockaddr->ss_family)
+ return 1;
+ if (a_sockaddr->ss_family == AF_INET) {
+ struct sockaddr_in *a4_sockaddr =
+ (struct sockaddr_in *)a_sockaddr;
+ struct sockaddr_in *b4_sockaddr =
+ (struct sockaddr_in *)b_sockaddr;
+ if (!memcmp(&a4_sockaddr->sin_addr,
+ &b4_sockaddr->sin_addr, sizeof(struct in_addr))
+ && a4_sockaddr->sin_port == b4_sockaddr->sin_port)
+ return 0;
+
+ } else if (a_sockaddr->ss_family == AF_INET6) {
+ struct sockaddr_in6 *a6_sockaddr =
+ (struct sockaddr_in6 *)a_sockaddr;
+ struct sockaddr_in6 *b6_sockaddr =
+ (struct sockaddr_in6 *)b_sockaddr;
+ if (!memcmp(&a6_sockaddr->sin6_addr,
+ &b6_sockaddr->sin6_addr, sizeof(struct in6_addr))
+ && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port)
+ return 0;
+
+ } else {
+ pr_err("%s: Invalid sockaddr family\n", __func__);
+ }
+ return 1;
+}
+
+struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
+ int nl_client)
+{
+ struct sk_buff *skb = NULL;
+
+ skb = dev_alloc_skb(IWPM_MSG_SIZE);
+ if (!skb)
+ goto create_nlmsg_exit;
+
+ if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op,
+ NLM_F_REQUEST))) {
+ pr_warn("%s: Unable to put the nlmsg header\n", __func__);
+ dev_kfree_skb(skb);
+ skb = NULL;
+ }
+create_nlmsg_exit:
+ return skb;
+}
+
+int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
+ const struct nla_policy *nlmsg_policy,
+ struct nlattr *nltb[], const char *msg_type)
+{
+ int nlh_len = 0;
+ int ret;
+ const char *err_str = "";
+
+ ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1,
+ nlmsg_policy, NULL);
+ if (ret) {
+ err_str = "Invalid attribute";
+ goto parse_nlmsg_error;
+ }
+ ret = nlmsg_parse_deprecated(cb->nlh, nlh_len, nltb, policy_max - 1,
+ nlmsg_policy, NULL);
+ if (ret) {
+ err_str = "Unable to parse the nlmsg";
+ goto parse_nlmsg_error;
+ }
+ ret = iwpm_validate_nlmsg_attr(nltb, policy_max);
+ if (ret) {
+ err_str = "Invalid NULL attribute";
+ goto parse_nlmsg_error;
+ }
+ return 0;
+parse_nlmsg_error:
+ pr_warn("%s: %s (msg type %s ret = %d)\n",
+ __func__, err_str, msg_type, ret);
+ return ret;
+}
+
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg)
+{
+ struct sockaddr_in6 *sockaddr_v6;
+ struct sockaddr_in *sockaddr_v4;
+
+ switch (sockaddr->ss_family) {
+ case AF_INET:
+ sockaddr_v4 = (struct sockaddr_in *)sockaddr;
+ pr_debug("%s IPV4 %pI4: %u(0x%04X)\n",
+ msg, &sockaddr_v4->sin_addr,
+ ntohs(sockaddr_v4->sin_port),
+ ntohs(sockaddr_v4->sin_port));
+ break;
+ case AF_INET6:
+ sockaddr_v6 = (struct sockaddr_in6 *)sockaddr;
+ pr_debug("%s IPV6 %pI6: %u(0x%04X)\n",
+ msg, &sockaddr_v6->sin6_addr,
+ ntohs(sockaddr_v6->sin6_port),
+ ntohs(sockaddr_v6->sin6_port));
+ break;
+ default:
+ break;
+ }
+}
+
+static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr)
+{
+ u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0);
+ u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0);
+ return hash;
+}
+
+static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr)
+{
+ u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0);
+ u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0);
+ return hash;
+}
+
+static int get_hash_bucket(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr, u32 *hash)
+{
+ u32 a_hash, b_hash;
+
+ if (a_sockaddr->ss_family == AF_INET) {
+ a_hash = iwpm_ipv4_jhash((struct sockaddr_in *) a_sockaddr);
+ b_hash = iwpm_ipv4_jhash((struct sockaddr_in *) b_sockaddr);
+
+ } else if (a_sockaddr->ss_family == AF_INET6) {
+ a_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) a_sockaddr);
+ b_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) b_sockaddr);
+ } else {
+ pr_err("%s: Invalid sockaddr family\n", __func__);
+ return -EINVAL;
+ }
+
+ if (a_hash == b_hash) /* if port mapper isn't available */
+ *hash = a_hash;
+ else
+ *hash = jhash_2words(a_hash, b_hash, 0);
+ return 0;
+}
+
+static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage
+ *local_sockaddr, struct sockaddr_storage
+ *mapped_sockaddr)
+{
+ u32 hash;
+ int ret;
+
+ ret = get_hash_bucket(local_sockaddr, mapped_sockaddr, &hash);
+ if (ret)
+ return NULL;
+ return &iwpm_hash_bucket[hash & IWPM_MAPINFO_HASH_MASK];
+}
+
+static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage
+ *mapped_loc_sockaddr, struct sockaddr_storage
+ *mapped_rem_sockaddr)
+{
+ u32 hash;
+ int ret;
+
+ ret = get_hash_bucket(mapped_loc_sockaddr, mapped_rem_sockaddr, &hash);
+ if (ret)
+ return NULL;
+ return &iwpm_reminfo_bucket[hash & IWPM_REMINFO_HASH_MASK];
+}
+
+static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto mapinfo_num_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ msg_seq = 0;
+ err_str = "Unable to put attribute of mapinfo number nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ);
+ if (ret)
+ goto mapinfo_num_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+ &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM);
+ if (ret)
+ goto mapinfo_num_error;
+
+ nlmsg_end(skb, nlh);
+
+ ret = rdma_nl_unicast(&init_net, skb, iwpm_pid);
+ if (ret) {
+ skb = NULL;
+ err_str = "Unable to send a nlmsg";
+ goto mapinfo_num_error;
+ }
+ pr_debug("%s: Sent mapping number = %u\n", __func__, mapping_num);
+ return 0;
+mapinfo_num_error:
+ pr_info("%s: %s\n", __func__, err_str);
+ dev_kfree_skb(skb);
+ return ret;
+}
+
+static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid)
+{
+ struct nlmsghdr *nlh = NULL;
+ int ret = 0;
+
+ if (!skb)
+ return ret;
+ if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
+ RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
+ pr_warn("%s Unable to put NLMSG_DONE\n", __func__);
+ dev_kfree_skb(skb);
+ return -ENOMEM;
+ }
+ nlh->nlmsg_type = NLMSG_DONE;
+ ret = rdma_nl_unicast(&init_net, skb, iwpm_pid);
+ if (ret)
+ pr_warn("%s Unable to send a nlmsg\n", __func__);
+ return ret;
+}
+
+int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
+{
+ struct iwpm_mapping_info *map_info;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ int skb_num = 0, mapping_num = 0;
+ int i = 0, nlmsg_bytes = 0;
+ unsigned long flags;
+ const char *err_str = "";
+ int ret;
+
+ skb = dev_alloc_skb(NLMSG_GOODSIZE);
+ if (!skb) {
+ ret = -ENOMEM;
+ err_str = "Unable to allocate skb";
+ goto send_mapping_info_exit;
+ }
+ skb_num++;
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ ret = -EINVAL;
+ for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
+ hlist_for_each_entry(map_info, &iwpm_hash_bucket[i],
+ hlist_node) {
+ if (map_info->nl_client != nl_client)
+ continue;
+ nlh = NULL;
+ if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
+ RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
+ ret = -ENOMEM;
+ err_str = "Unable to put the nlmsg header";
+ goto send_mapping_info_unlock;
+ }
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh,
+ sizeof(struct sockaddr_storage),
+ &map_info->local_sockaddr,
+ IWPM_NLA_MAPINFO_LOCAL_ADDR);
+ if (ret)
+ goto send_mapping_info_unlock;
+
+ ret = ibnl_put_attr(skb, nlh,
+ sizeof(struct sockaddr_storage),
+ &map_info->mapped_sockaddr,
+ IWPM_NLA_MAPINFO_MAPPED_ADDR);
+ if (ret)
+ goto send_mapping_info_unlock;
+
+ if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) {
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+ &map_info->map_flags,
+ IWPM_NLA_MAPINFO_FLAGS);
+ if (ret)
+ goto send_mapping_info_unlock;
+ }
+
+ nlmsg_end(skb, nlh);
+
+ iwpm_print_sockaddr(&map_info->local_sockaddr,
+ "send_mapping_info: Local sockaddr:");
+ iwpm_print_sockaddr(&map_info->mapped_sockaddr,
+ "send_mapping_info: Mapped local sockaddr:");
+ mapping_num++;
+ nlmsg_bytes += nlh->nlmsg_len;
+
+ /* check if all mappings can fit in one skb */
+ if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) {
+ /* and leave room for NLMSG_DONE */
+ nlmsg_bytes = 0;
+ skb_num++;
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock,
+ flags);
+ /* send the skb */
+ ret = send_nlmsg_done(skb, nl_client, iwpm_pid);
+ skb = NULL;
+ if (ret) {
+ err_str = "Unable to send map info";
+ goto send_mapping_info_exit;
+ }
+ if (skb_num == IWPM_MAPINFO_SKB_COUNT) {
+ ret = -ENOMEM;
+ err_str = "Insufficient skbs for map info";
+ goto send_mapping_info_exit;
+ }
+ skb = dev_alloc_skb(NLMSG_GOODSIZE);
+ if (!skb) {
+ ret = -ENOMEM;
+ err_str = "Unable to allocate skb";
+ goto send_mapping_info_exit;
+ }
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ }
+ }
+ }
+send_mapping_info_unlock:
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+send_mapping_info_exit:
+ if (ret) {
+ pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret);
+ dev_kfree_skb(skb);
+ return ret;
+ }
+ send_nlmsg_done(skb, nl_client, iwpm_pid);
+ return send_mapinfo_num(mapping_num, nl_client, iwpm_pid);
+}
+
+int iwpm_mapinfo_available(void)
+{
+ unsigned long flags;
+ int full_bucket = 0, i = 0;
+
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ if (iwpm_hash_bucket) {
+ for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
+ if (!hlist_empty(&iwpm_hash_bucket[i])) {
+ full_bucket = 1;
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+ return full_bucket;
+}
+
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ const char *err_str;
+ int ret = -EINVAL;
+
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto hello_num_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ err_str = "Unable to put attribute of abi_version into nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version,
+ IWPM_NLA_HELLO_ABI_VERSION);
+ if (ret)
+ goto hello_num_error;
+ nlmsg_end(skb, nlh);
+
+ ret = rdma_nl_unicast(&init_net, skb, iwpm_pid);
+ if (ret) {
+ skb = NULL;
+ err_str = "Unable to send a nlmsg";
+ goto hello_num_error;
+ }
+ pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version);
+ return 0;
+hello_num_error:
+ pr_info("%s: %s\n", __func__, err_str);
+ dev_kfree_skb(skb);
+ return ret;
+}
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
new file mode 100644
index 000000000..d6fc84021
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IWPM_UTIL_H
+#define _IWPM_UTIL_H
+
+#include <linux/io.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/jhash.h>
+#include <linux/kref.h>
+#include <net/netlink.h>
+#include <linux/errno.h>
+#include <rdma/iw_portmap.h>
+#include <rdma/rdma_netlink.h>
+
+
+#define IWPM_NL_RETRANS 3
+#define IWPM_NL_TIMEOUT (10*HZ)
+#define IWPM_MAPINFO_SKB_COUNT 20
+
+#define IWPM_PID_UNDEFINED -1
+#define IWPM_PID_UNAVAILABLE -2
+
+#define IWPM_REG_UNDEF 0x01
+#define IWPM_REG_VALID 0x02
+#define IWPM_REG_INCOMPL 0x04
+
+struct iwpm_nlmsg_request {
+ struct list_head inprocess_list;
+ __u32 nlmsg_seq;
+ void *req_buffer;
+ u8 nl_client;
+ u8 request_done;
+ u16 err_code;
+ struct semaphore sem;
+ struct kref kref;
+};
+
+struct iwpm_mapping_info {
+ struct hlist_node hlist_node;
+ struct sockaddr_storage local_sockaddr;
+ struct sockaddr_storage mapped_sockaddr;
+ u8 nl_client;
+ u32 map_flags;
+};
+
+struct iwpm_remote_info {
+ struct hlist_node hlist_node;
+ struct sockaddr_storage remote_sockaddr;
+ struct sockaddr_storage mapped_loc_sockaddr;
+ struct sockaddr_storage mapped_rem_sockaddr;
+ u8 nl_client;
+};
+
+struct iwpm_admin_data {
+ atomic_t nlmsg_seq;
+ u32 reg_list[RDMA_NL_NUM_CLIENTS];
+};
+
+/**
+ * iwpm_get_nlmsg_request - Allocate and initialize netlink message request
+ * @nlmsg_seq: Sequence number of the netlink message
+ * @nl_client: The index of the netlink client
+ * @gfp: Indicates how the memory for the request should be allocated
+ *
+ * Returns the newly allocated netlink request object if successful,
+ * otherwise returns NULL
+ */
+struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
+ u8 nl_client, gfp_t gfp);
+
+/**
+ * iwpm_free_nlmsg_request - Deallocate netlink message request
+ * @kref: Holds reference of netlink message request
+ */
+void iwpm_free_nlmsg_request(struct kref *kref);
+
+/**
+ * iwpm_find_nlmsg_request - Find netlink message request in the request list
+ * @echo_seq: Sequence number of the netlink request to find
+ *
+ * Returns the found netlink message request,
+ * if not found, returns NULL
+ */
+struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq);
+
+/**
+ * iwpm_wait_complete_req - Block while servicing the netlink request
+ * @nlmsg_request: Netlink message request to service
+ *
+ * Wakes up, after the request is completed or expired
+ * Returns 0 if the request is complete without error
+ */
+int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request);
+
+/**
+ * iwpm_get_nlmsg_seq - Get the sequence number for a netlink
+ * message to send to the port mapper
+ *
+ * Returns the sequence number for the netlink message.
+ */
+int iwpm_get_nlmsg_seq(void);
+
+/**
+ * iwpm_add_remote_info - Add remote address info of the connecting peer
+ * to the remote info hash table
+ * @reminfo: The remote info to be added
+ */
+void iwpm_add_remote_info(struct iwpm_remote_info *reminfo);
+
+/**
+ * iwpm_check_registration - Check if the client registration
+ * matches the given one
+ * @nl_client: The index of the netlink client
+ * @reg: The given registration type to compare with
+ *
+ * Call iwpm_register_pid() to register a client
+ * Returns true if the client registration matches reg,
+ * otherwise returns false
+ */
+u32 iwpm_check_registration(u8 nl_client, u32 reg);
+
+/**
+ * iwpm_set_registration - Set the client registration
+ * @nl_client: The index of the netlink client
+ * @reg: Registration type to set
+ */
+void iwpm_set_registration(u8 nl_client, u32 reg);
+
+/**
+ * iwpm_get_registration - Get the client registration
+ * @nl_client: The index of the netlink client
+ *
+ * Returns the client registration type
+ */
+u32 iwpm_get_registration(u8 nl_client);
+
+/**
+ * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of
+ * a client to the user space port mapper
+ * @nl_client: The index of the netlink client
+ * @iwpm_pid: The pid of the user space port mapper
+ *
+ * If successful, returns the number of sent mapping info records
+ */
+int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid);
+
+/**
+ * iwpm_mapinfo_available - Check if any mapping info records is available
+ * in the hash table
+ *
+ * Returns 1 if mapping information is available, otherwise returns 0
+ */
+int iwpm_mapinfo_available(void);
+
+/**
+ * iwpm_compare_sockaddr - Compare two sockaddr storage structs
+ * @a_sockaddr: first sockaddr to compare
+ * @b_sockaddr: second sockaddr to compare
+ *
+ * Return: 0 if they are holding the same ip/tcp address info,
+ * otherwise returns 1
+ */
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr);
+
+/**
+ * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes
+ * @nltb: Holds address of each netlink message attributes
+ * @nla_count: Number of netlink message attributes
+ *
+ * Returns error if any of the nla_count attributes is NULL
+ */
+static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[],
+ int nla_count)
+{
+ int i;
+ for (i = 1; i < nla_count; i++) {
+ if (!nltb[i])
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * iwpm_create_nlmsg - Allocate skb and form a netlink message
+ * @nl_op: Netlink message opcode
+ * @nlh: Holds address of the netlink message header in skb
+ * @nl_client: The index of the netlink client
+ *
+ * Returns the newly allcated skb, or NULL if the tailroom of the skb
+ * is insufficient to store the message header and payload
+ */
+struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
+ int nl_client);
+
+/**
+ * iwpm_parse_nlmsg - Validate and parse the received netlink message
+ * @cb: Netlink callback structure
+ * @policy_max: Maximum attribute type to be expected
+ * @nlmsg_policy: Validation policy
+ * @nltb: Array to store policy_max parsed elements
+ * @msg_type: Type of netlink message
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
+ const struct nla_policy *nlmsg_policy,
+ struct nlattr *nltb[], const char *msg_type);
+
+/**
+ * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port
+ * @sockaddr: Socket address to print
+ * @msg: Message to print
+ */
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+
+/**
+ * iwpm_send_hello - Send hello response to iwpmd
+ *
+ * @nl_client: The index of the netlink client
+ * @iwpm_pid: The pid of the user space port mapper
+ * @abi_version: The kernel's abi_version
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version);
+extern u16 iwpm_ulib_version;
+#endif
diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c
new file mode 100644
index 000000000..c77d7d255
--- /dev/null
+++ b/drivers/infiniband/core/lag.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
+ */
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/lag.h>
+
+static struct sk_buff *rdma_build_skb(struct net_device *netdev,
+ struct rdma_ah_attr *ah_attr,
+ gfp_t flags)
+{
+ struct ipv6hdr *ip6h;
+ struct sk_buff *skb;
+ struct ethhdr *eth;
+ struct iphdr *iph;
+ struct udphdr *uh;
+ u8 smac[ETH_ALEN];
+ bool is_ipv4;
+ int hdr_len;
+
+ is_ipv4 = ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw);
+ hdr_len = ETH_HLEN + sizeof(struct udphdr) + LL_RESERVED_SPACE(netdev);
+ hdr_len += is_ipv4 ? sizeof(struct iphdr) : sizeof(struct ipv6hdr);
+
+ skb = alloc_skb(hdr_len, flags);
+ if (!skb)
+ return NULL;
+
+ skb->dev = netdev;
+ skb_reserve(skb, hdr_len);
+ skb_push(skb, sizeof(struct udphdr));
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+ uh->source =
+ htons(rdma_flow_label_to_udp_sport(ah_attr->grh.flow_label));
+ uh->dest = htons(ROCE_V2_UDP_DPORT);
+ uh->len = htons(sizeof(struct udphdr));
+
+ if (is_ipv4) {
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ iph->frag_off = 0;
+ iph->version = 4;
+ iph->protocol = IPPROTO_UDP;
+ iph->ihl = 0x5;
+ iph->tot_len = htons(sizeof(struct udphdr) + sizeof(struct
+ iphdr));
+ memcpy(&iph->saddr, ah_attr->grh.sgid_attr->gid.raw + 12,
+ sizeof(struct in_addr));
+ memcpy(&iph->daddr, ah_attr->grh.dgid.raw + 12,
+ sizeof(struct in_addr));
+ } else {
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+ ip6h->version = 6;
+ ip6h->nexthdr = IPPROTO_UDP;
+ memcpy(&ip6h->flow_lbl, &ah_attr->grh.flow_label,
+ sizeof(*ip6h->flow_lbl));
+ memcpy(&ip6h->saddr, ah_attr->grh.sgid_attr->gid.raw,
+ sizeof(struct in6_addr));
+ memcpy(&ip6h->daddr, ah_attr->grh.dgid.raw,
+ sizeof(struct in6_addr));
+ }
+
+ skb_push(skb, sizeof(struct ethhdr));
+ skb_reset_mac_header(skb);
+ eth = eth_hdr(skb);
+ skb->protocol = eth->h_proto = htons(is_ipv4 ? ETH_P_IP : ETH_P_IPV6);
+ rdma_read_gid_l2_fields(ah_attr->grh.sgid_attr, NULL, smac);
+ memcpy(eth->h_source, smac, ETH_ALEN);
+ memcpy(eth->h_dest, ah_attr->roce.dmac, ETH_ALEN);
+
+ return skb;
+}
+
+static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device,
+ struct net_device *master,
+ struct rdma_ah_attr *ah_attr,
+ gfp_t flags)
+{
+ struct net_device *slave;
+ struct sk_buff *skb;
+
+ skb = rdma_build_skb(master, ah_attr, flags);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ rcu_read_lock();
+ slave = netdev_get_xmit_slave(master, skb,
+ !!(device->lag_flags &
+ RDMA_LAG_FLAGS_HASH_ALL_SLAVES));
+ if (slave)
+ dev_hold(slave);
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return slave;
+}
+
+void rdma_lag_put_ah_roce_slave(struct net_device *xmit_slave)
+{
+ if (xmit_slave)
+ dev_put(xmit_slave);
+}
+
+struct net_device *rdma_lag_get_ah_roce_slave(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr,
+ gfp_t flags)
+{
+ struct net_device *slave = NULL;
+ struct net_device *master;
+
+ if (!(ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE &&
+ ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
+ ah_attr->grh.flow_label))
+ return NULL;
+
+ rcu_read_lock();
+ master = rdma_read_gid_attr_ndev_rcu(ah_attr->grh.sgid_attr);
+ if (IS_ERR(master)) {
+ rcu_read_unlock();
+ return master;
+ }
+ dev_hold(master);
+ rcu_read_unlock();
+
+ if (!netif_is_bond_master(master))
+ goto put;
+
+ slave = rdma_get_xmit_slave_udp(device, master, ah_attr, flags);
+put:
+ dev_put(master);
+ return slave;
+}
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
new file mode 100644
index 000000000..674344eb8
--- /dev/null
+++ b/drivers/infiniband/core/mad.c
@@ -0,0 +1,3156 @@
+/*
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ * Copyright (c) 2014,2018 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/security.h>
+#include <linux/xarray.h>
+#include <rdma/ib_cache.h>
+
+#include "mad_priv.h"
+#include "core_priv.h"
+#include "mad_rmpp.h"
+#include "smi.h"
+#include "opa_smi.h"
+#include "agent.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/ib_mad.h>
+
+#ifdef CONFIG_TRACEPOINTS
+static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_qp_info *qp_info,
+ struct trace_event_raw_ib_mad_send_template *entry)
+{
+ struct ib_ud_wr *wr = &mad_send_wr->send_wr;
+ struct rdma_ah_attr attr = {};
+
+ rdma_query_ah(wr->ah, &attr);
+
+ /* These are common */
+ entry->sl = attr.sl;
+ entry->rqpn = wr->remote_qpn;
+ entry->rqkey = wr->remote_qkey;
+ entry->dlid = rdma_ah_get_dlid(&attr);
+}
+#endif
+
+static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
+static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
+
+module_param_named(send_queue_size, mad_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
+module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+
+static DEFINE_XARRAY_ALLOC1(ib_mad_clients);
+static u32 ib_mad_client_next;
+static struct list_head ib_mad_port_list;
+
+/* Port list lock */
+static DEFINE_SPINLOCK(ib_mad_port_list_lock);
+
+/* Forward declarations */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req);
+static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
+static struct ib_mad_agent_private *find_mad_agent(
+ struct ib_mad_port_private *port_priv,
+ const struct ib_mad_hdr *mad);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad);
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
+static void timeout_sends(struct work_struct *work);
+static void local_completions(struct work_struct *work);
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class);
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv);
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc);
+static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc);
+
+/*
+ * Returns a ib_mad_port_private structure or NULL for a device/port
+ * Assumes ib_mad_port_list_lock is being held
+ */
+static inline struct ib_mad_port_private *
+__ib_get_mad_port(struct ib_device *device, u32 port_num)
+{
+ struct ib_mad_port_private *entry;
+
+ list_for_each_entry(entry, &ib_mad_port_list, port_list) {
+ if (entry->device == device && entry->port_num == port_num)
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Wrapper function to return a ib_mad_port_private structure or NULL
+ * for a device/port
+ */
+static inline struct ib_mad_port_private *
+ib_get_mad_port(struct ib_device *device, u32 port_num)
+{
+ struct ib_mad_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ entry = __ib_get_mad_port(device, port_num);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ return entry;
+}
+
+static inline u8 convert_mgmt_class(u8 mgmt_class)
+{
+ /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */
+ return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ?
+ 0 : mgmt_class;
+}
+
+static int get_spl_qp_index(enum ib_qp_type qp_type)
+{
+ switch (qp_type) {
+ case IB_QPT_SMI:
+ return 0;
+ case IB_QPT_GSI:
+ return 1;
+ default:
+ return -1;
+ }
+}
+
+static int vendor_class_index(u8 mgmt_class)
+{
+ return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START;
+}
+
+static int is_vendor_class(u8 mgmt_class)
+{
+ if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) ||
+ (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return 0;
+ return 1;
+}
+
+static int is_vendor_oui(char *oui)
+{
+ if (oui[0] || oui[1] || oui[2])
+ return 1;
+ return 0;
+}
+
+static int is_vendor_method_in_use(
+ struct ib_mad_mgmt_vendor_class *vendor_class,
+ struct ib_mad_reg_req *mad_reg_req)
+{
+ struct ib_mad_mgmt_method_table *method;
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) {
+ method = vendor_class->method_table[i];
+ if (method) {
+ if (method_in_use(&method, mad_reg_req))
+ return 1;
+ else
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
+int ib_response_mad(const struct ib_mad_hdr *hdr)
+{
+ return ((hdr->method & IB_MGMT_METHOD_RESP) ||
+ (hdr->method == IB_MGMT_METHOD_TRAP_REPRESS) ||
+ ((hdr->mgmt_class == IB_MGMT_CLASS_BM) &&
+ (hdr->attr_mod & IB_BM_ATTR_MOD_RESP)));
+}
+EXPORT_SYMBOL(ib_response_mad);
+
+/*
+ * ib_register_mad_agent - Register to send/receive MADs
+ *
+ * Context: Process context.
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+ u32 port_num,
+ enum ib_qp_type qp_type,
+ struct ib_mad_reg_req *mad_reg_req,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context,
+ u32 registration_flags)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_reg_req *reg_req = NULL;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ struct ib_mad_mgmt_method_table *method;
+ int ret2, qpn;
+ u8 mgmt_class, vclass;
+
+ if ((qp_type == IB_QPT_SMI && !rdma_cap_ib_smi(device, port_num)) ||
+ (qp_type == IB_QPT_GSI && !rdma_cap_ib_cm(device, port_num)))
+ return ERR_PTR(-EPROTONOSUPPORT);
+
+ /* Validate parameters */
+ qpn = get_spl_qp_index(qp_type);
+ if (qpn == -1) {
+ dev_dbg_ratelimited(&device->dev, "%s: invalid QP Type %d\n",
+ __func__, qp_type);
+ goto error1;
+ }
+
+ if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: invalid RMPP Version %u\n",
+ __func__, rmpp_version);
+ goto error1;
+ }
+
+ /* Validate MAD registration request if supplied */
+ if (mad_reg_req) {
+ if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: invalid Class Version %u\n",
+ __func__,
+ mad_reg_req->mgmt_class_version);
+ goto error1;
+ }
+ if (!recv_handler) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: no recv_handler\n", __func__);
+ goto error1;
+ }
+ if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
+ /*
+ * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
+ * one in this range currently allowed
+ */
+ if (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: Invalid Mgmt Class 0x%x\n",
+ __func__, mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ } else if (mad_reg_req->mgmt_class == 0) {
+ /*
+ * Class 0 is reserved in IBA and is used for
+ * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ */
+ dev_dbg_ratelimited(&device->dev,
+ "%s: Invalid Mgmt Class 0\n",
+ __func__);
+ goto error1;
+ } else if (is_vendor_class(mad_reg_req->mgmt_class)) {
+ /*
+ * If class is in "new" vendor range,
+ * ensure supplied OUI is not zero
+ */
+ if (!is_vendor_oui(mad_reg_req->oui)) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: No OUI specified for class 0x%x\n",
+ __func__,
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+ /* Make sure class supplied is consistent with RMPP */
+ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
+ if (rmpp_version) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: RMPP version for non-RMPP class 0x%x\n",
+ __func__, mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+
+ /* Make sure class supplied is consistent with QP type */
+ if (qp_type == IB_QPT_SMI) {
+ if ((mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
+ (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: Invalid SM QP type: class 0x%x\n",
+ __func__, mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ } else {
+ if ((mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ dev_dbg_ratelimited(&device->dev,
+ "%s: Invalid GS QP type: class 0x%x\n",
+ __func__, mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+ } else {
+ /* No registration request supplied */
+ if (!send_handler)
+ goto error1;
+ if (registration_flags & IB_MAD_USER_RMPP)
+ goto error1;
+ }
+
+ /* Validate device and port */
+ port_priv = ib_get_mad_port(device, port_num);
+ if (!port_priv) {
+ dev_dbg_ratelimited(&device->dev, "%s: Invalid port %u\n",
+ __func__, port_num);
+ ret = ERR_PTR(-ENODEV);
+ goto error1;
+ }
+
+ /* Verify the QP requested is supported. For example, Ethernet devices
+ * will not have QP0.
+ */
+ if (!port_priv->qp_info[qpn].qp) {
+ dev_dbg_ratelimited(&device->dev, "%s: QP %d not supported\n",
+ __func__, qpn);
+ ret = ERR_PTR(-EPROTONOSUPPORT);
+ goto error1;
+ }
+
+ /* Allocate structures */
+ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
+ if (!mad_agent_priv) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error1;
+ }
+
+ if (mad_reg_req) {
+ reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
+ if (!reg_req) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error3;
+ }
+ }
+
+ /* Now, fill in the various structures */
+ mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
+ mad_agent_priv->reg_req = reg_req;
+ mad_agent_priv->agent.rmpp_version = rmpp_version;
+ mad_agent_priv->agent.device = device;
+ mad_agent_priv->agent.recv_handler = recv_handler;
+ mad_agent_priv->agent.send_handler = send_handler;
+ mad_agent_priv->agent.context = context;
+ mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
+ mad_agent_priv->agent.port_num = port_num;
+ mad_agent_priv->agent.flags = registration_flags;
+ spin_lock_init(&mad_agent_priv->lock);
+ INIT_LIST_HEAD(&mad_agent_priv->send_list);
+ INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+ INIT_LIST_HEAD(&mad_agent_priv->done_list);
+ INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+ INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
+ INIT_LIST_HEAD(&mad_agent_priv->local_list);
+ INIT_WORK(&mad_agent_priv->local_work, local_completions);
+ refcount_set(&mad_agent_priv->refcount, 1);
+ init_completion(&mad_agent_priv->comp);
+
+ ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type);
+ if (ret2) {
+ ret = ERR_PTR(ret2);
+ goto error4;
+ }
+
+ /*
+ * The mlx4 driver uses the top byte to distinguish which virtual
+ * function generated the MAD, so we must avoid using it.
+ */
+ ret2 = xa_alloc_cyclic(&ib_mad_clients, &mad_agent_priv->agent.hi_tid,
+ mad_agent_priv, XA_LIMIT(0, (1 << 24) - 1),
+ &ib_mad_client_next, GFP_KERNEL);
+ if (ret2 < 0) {
+ ret = ERR_PTR(ret2);
+ goto error5;
+ }
+
+ /*
+ * Make sure MAD registration (if supplied)
+ * is non overlapping with any existing ones
+ */
+ spin_lock_irq(&port_priv->reg_lock);
+ if (mad_reg_req) {
+ mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
+ if (!is_vendor_class(mgmt_class)) {
+ class = port_priv->version[mad_reg_req->
+ mgmt_class_version].class;
+ if (class) {
+ method = class->method_table[mgmt_class];
+ if (method) {
+ if (method_in_use(&method,
+ mad_reg_req))
+ goto error6;
+ }
+ }
+ ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
+ mgmt_class);
+ } else {
+ /* "New" vendor class range */
+ vendor = port_priv->version[mad_reg_req->
+ mgmt_class_version].vendor;
+ if (vendor) {
+ vclass = vendor_class_index(mgmt_class);
+ vendor_class = vendor->vendor_class[vclass];
+ if (vendor_class) {
+ if (is_vendor_method_in_use(
+ vendor_class,
+ mad_reg_req))
+ goto error6;
+ }
+ }
+ ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
+ }
+ if (ret2) {
+ ret = ERR_PTR(ret2);
+ goto error6;
+ }
+ }
+ spin_unlock_irq(&port_priv->reg_lock);
+
+ trace_ib_mad_create_agent(mad_agent_priv);
+ return &mad_agent_priv->agent;
+error6:
+ spin_unlock_irq(&port_priv->reg_lock);
+ xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
+error5:
+ ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
+error4:
+ kfree(reg_req);
+error3:
+ kfree(mad_agent_priv);
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_agent);
+
+static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (refcount_dec_and_test(&mad_agent_priv->refcount))
+ complete(&mad_agent_priv->comp);
+}
+
+static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+
+ /* Note that we could still be handling received MADs */
+ trace_ib_mad_unregister_agent(mad_agent_priv);
+
+ /*
+ * Canceling all sends results in dropping received response
+ * MADs, preventing us from queuing additional work
+ */
+ cancel_mads(mad_agent_priv);
+ port_priv = mad_agent_priv->qp_info->port_priv;
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+
+ spin_lock_irq(&port_priv->reg_lock);
+ remove_mad_reg_req(mad_agent_priv);
+ spin_unlock_irq(&port_priv->reg_lock);
+ xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
+
+ flush_workqueue(port_priv->wq);
+
+ deref_mad_agent(mad_agent_priv);
+ wait_for_completion(&mad_agent_priv->comp);
+ ib_cancel_rmpp_recvs(mad_agent_priv);
+
+ ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
+
+ kfree(mad_agent_priv->reg_req);
+ kfree_rcu(mad_agent_priv, rcu);
+}
+
+/*
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services
+ *
+ * Context: Process context.
+ */
+void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+
+ mad_agent_priv = container_of(mad_agent,
+ struct ib_mad_agent_private,
+ agent);
+ unregister_mad_agent(mad_agent_priv);
+}
+EXPORT_SYMBOL(ib_unregister_mad_agent);
+
+static void dequeue_mad(struct ib_mad_list_head *mad_list)
+{
+ struct ib_mad_queue *mad_queue;
+ unsigned long flags;
+
+ mad_queue = mad_list->mad_queue;
+ spin_lock_irqsave(&mad_queue->lock, flags);
+ list_del(&mad_list->list);
+ mad_queue->count--;
+ spin_unlock_irqrestore(&mad_queue->lock, flags);
+}
+
+static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid,
+ u16 pkey_index, u32 port_num, struct ib_wc *wc)
+{
+ memset(wc, 0, sizeof *wc);
+ wc->wr_cqe = cqe;
+ wc->status = IB_WC_SUCCESS;
+ wc->opcode = IB_WC_RECV;
+ wc->pkey_index = pkey_index;
+ wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
+ wc->src_qp = IB_QP0;
+ wc->qp = qp;
+ wc->slid = slid;
+ wc->sl = 0;
+ wc->dlid_path_bits = 0;
+ wc->port_num = port_num;
+}
+
+static size_t mad_priv_size(const struct ib_mad_private *mp)
+{
+ return sizeof(struct ib_mad_private) + mp->mad_size;
+}
+
+static struct ib_mad_private *alloc_mad_private(size_t mad_size, gfp_t flags)
+{
+ size_t size = sizeof(struct ib_mad_private) + mad_size;
+ struct ib_mad_private *ret = kzalloc(size, flags);
+
+ if (ret)
+ ret->mad_size = mad_size;
+
+ return ret;
+}
+
+static size_t port_mad_size(const struct ib_mad_port_private *port_priv)
+{
+ return rdma_max_mad_size(port_priv->device, port_priv->port_num);
+}
+
+static size_t mad_priv_dma_size(const struct ib_mad_private *mp)
+{
+ return sizeof(struct ib_grh) + mp->mad_size;
+}
+
+/*
+ * Return 0 if SMP is to be sent
+ * Return 1 if SMP was consumed locally (whether or not solicited)
+ * Return < 0 if error
+ */
+static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_wr_private *mad_send_wr)
+{
+ int ret = 0;
+ struct ib_smp *smp = mad_send_wr->send_buf.mad;
+ struct opa_smp *opa_smp = (struct opa_smp *)smp;
+ unsigned long flags;
+ struct ib_mad_local_private *local;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent_private *recv_mad_agent = NULL;
+ struct ib_device *device = mad_agent_priv->agent.device;
+ u32 port_num;
+ struct ib_wc mad_wc;
+ struct ib_ud_wr *send_wr = &mad_send_wr->send_wr;
+ size_t mad_size = port_mad_size(mad_agent_priv->qp_info->port_priv);
+ u16 out_mad_pkey_index = 0;
+ u16 drslid;
+ bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
+ mad_agent_priv->qp_info->port_priv->port_num);
+
+ if (rdma_cap_ib_switch(device) &&
+ smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ port_num = send_wr->port_num;
+ else
+ port_num = mad_agent_priv->agent.port_num;
+
+ /*
+ * Directed route handling starts if the initial LID routed part of
+ * a request or the ending LID routed part of a response is empty.
+ * If we are at the start of the LID routed part, don't update the
+ * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec.
+ */
+ if (opa && smp->class_version == OPA_SM_CLASS_VERSION) {
+ u32 opa_drslid;
+
+ trace_ib_mad_handle_out_opa_smi(opa_smp);
+
+ if ((opa_get_smp_direction(opa_smp)
+ ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==
+ OPA_LID_PERMISSIVE &&
+ opa_smi_handle_dr_smp_send(opa_smp,
+ rdma_cap_ib_switch(device),
+ port_num) == IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "OPA Invalid directed route\n");
+ goto out;
+ }
+ opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid);
+ if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) &&
+ opa_drslid & 0xffff0000) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n",
+ opa_drslid);
+ goto out;
+ }
+ drslid = (u16)(opa_drslid & 0x0000ffff);
+
+ /* Check to post send on QP or process locally */
+ if (opa_smi_check_local_smp(opa_smp, device) == IB_SMI_DISCARD &&
+ opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD)
+ goto out;
+ } else {
+ trace_ib_mad_handle_out_ib_smi(smp);
+
+ if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
+ IB_LID_PERMISSIVE &&
+ smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) ==
+ IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "Invalid directed route\n");
+ goto out;
+ }
+ drslid = be16_to_cpu(smp->dr_slid);
+
+ /* Check to post send on QP or process locally */
+ if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+ smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+ goto out;
+ }
+
+ local = kmalloc(sizeof *local, GFP_ATOMIC);
+ if (!local) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ local->mad_priv = NULL;
+ local->recv_mad_agent = NULL;
+ mad_priv = alloc_mad_private(mad_size, GFP_ATOMIC);
+ if (!mad_priv) {
+ ret = -ENOMEM;
+ kfree(local);
+ goto out;
+ }
+
+ build_smp_wc(mad_agent_priv->agent.qp,
+ send_wr->wr.wr_cqe, drslid,
+ send_wr->pkey_index,
+ send_wr->port_num, &mad_wc);
+
+ if (opa && smp->base_version == OPA_MGMT_BASE_VERSION) {
+ mad_wc.byte_len = mad_send_wr->send_buf.hdr_len
+ + mad_send_wr->send_buf.data_len
+ + sizeof(struct ib_grh);
+ }
+
+ /* No GRH for DR SMP */
+ ret = device->ops.process_mad(device, 0, port_num, &mad_wc, NULL,
+ (const struct ib_mad *)smp,
+ (struct ib_mad *)mad_priv->mad, &mad_size,
+ &out_mad_pkey_index);
+ switch (ret) {
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
+ if (ib_response_mad((const struct ib_mad_hdr *)mad_priv->mad) &&
+ mad_agent_priv->agent.recv_handler) {
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = mad_agent_priv;
+ /*
+ * Reference MAD agent until receive
+ * side of local completion handled
+ */
+ refcount_inc(&mad_agent_priv->refcount);
+ } else
+ kfree(mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
+ kfree(mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS:
+ /* Treat like an incoming receive MAD */
+ port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
+ mad_agent_priv->agent.port_num);
+ if (port_priv) {
+ memcpy(mad_priv->mad, smp, mad_priv->mad_size);
+ recv_mad_agent = find_mad_agent(port_priv,
+ (const struct ib_mad_hdr *)mad_priv->mad);
+ }
+ if (!port_priv || !recv_mad_agent) {
+ /*
+ * No receiving agent so drop packet and
+ * generate send completion.
+ */
+ kfree(mad_priv);
+ break;
+ }
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = recv_mad_agent;
+ break;
+ default:
+ kfree(mad_priv);
+ kfree(local);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ local->mad_send_wr = mad_send_wr;
+ if (opa) {
+ local->mad_send_wr->send_wr.pkey_index = out_mad_pkey_index;
+ local->return_wc_byte_len = mad_size;
+ }
+ /* Reference MAD agent until send side of local completion handled */
+ refcount_inc(&mad_agent_priv->refcount);
+ /* Queue local completion to local list */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ queue_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->local_work);
+ ret = 1;
+out:
+ return ret;
+}
+
+static int get_pad_size(int hdr_len, int data_len, size_t mad_size)
+{
+ int seg_size, pad;
+
+ seg_size = mad_size - hdr_len;
+ if (data_len && seg_size) {
+ pad = seg_size - data_len % seg_size;
+ return pad == seg_size ? 0 : pad;
+ } else
+ return seg_size;
+}
+
+static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_segment *s, *t;
+
+ list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) {
+ list_del(&s->list);
+ kfree(s);
+ }
+}
+
+static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
+ size_t mad_size, gfp_t gfp_mask)
+{
+ struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
+ struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
+ struct ib_rmpp_segment *seg = NULL;
+ int left, seg_size, pad;
+
+ send_buf->seg_size = mad_size - send_buf->hdr_len;
+ send_buf->seg_rmpp_size = mad_size - IB_MGMT_RMPP_HDR;
+ seg_size = send_buf->seg_size;
+ pad = send_wr->pad;
+
+ /* Allocate data segments. */
+ for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
+ seg = kmalloc(sizeof(*seg) + seg_size, gfp_mask);
+ if (!seg) {
+ free_send_rmpp_list(send_wr);
+ return -ENOMEM;
+ }
+ seg->num = ++send_buf->seg_count;
+ list_add_tail(&seg->list, &send_wr->rmpp_list);
+ }
+
+ /* Zero any padding */
+ if (pad)
+ memset(seg->data + seg_size - pad, 0, pad);
+
+ rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
+ agent.rmpp_version;
+ rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+ send_wr->cur_seg = container_of(send_wr->rmpp_list.next,
+ struct ib_rmpp_segment, list);
+ send_wr->last_ack_seg = send_wr->cur_seg;
+ return 0;
+}
+
+int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent)
+{
+ return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP);
+}
+EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent);
+
+struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
+ u32 remote_qpn, u16 pkey_index,
+ int rmpp_active, int hdr_len,
+ int data_len, gfp_t gfp_mask,
+ u8 base_version)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ int pad, message_size, ret, size;
+ void *buf;
+ size_t mad_size;
+ bool opa;
+
+ mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+ agent);
+
+ opa = rdma_cap_opa_mad(mad_agent->device, mad_agent->port_num);
+
+ if (opa && base_version == OPA_MGMT_BASE_VERSION)
+ mad_size = sizeof(struct opa_mad);
+ else
+ mad_size = sizeof(struct ib_mad);
+
+ pad = get_pad_size(hdr_len, data_len, mad_size);
+ message_size = hdr_len + data_len + pad;
+
+ if (ib_mad_kernel_rmpp_agent(mad_agent)) {
+ if (!rmpp_active && message_size > mad_size)
+ return ERR_PTR(-EINVAL);
+ } else
+ if (rmpp_active || message_size > mad_size)
+ return ERR_PTR(-EINVAL);
+
+ size = rmpp_active ? hdr_len : mad_size;
+ buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ mad_send_wr = buf + size;
+ INIT_LIST_HEAD(&mad_send_wr->rmpp_list);
+ mad_send_wr->send_buf.mad = buf;
+ mad_send_wr->send_buf.hdr_len = hdr_len;
+ mad_send_wr->send_buf.data_len = data_len;
+ mad_send_wr->pad = pad;
+
+ mad_send_wr->mad_agent_priv = mad_agent_priv;
+ mad_send_wr->sg_list[0].length = hdr_len;
+ mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;
+
+ /* OPA MADs don't have to be the full 2048 bytes */
+ if (opa && base_version == OPA_MGMT_BASE_VERSION &&
+ data_len < mad_size - hdr_len)
+ mad_send_wr->sg_list[1].length = data_len;
+ else
+ mad_send_wr->sg_list[1].length = mad_size - hdr_len;
+
+ mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
+
+ mad_send_wr->mad_list.cqe.done = ib_mad_send_done;
+
+ mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe;
+ mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list;
+ mad_send_wr->send_wr.wr.num_sge = 2;
+ mad_send_wr->send_wr.wr.opcode = IB_WR_SEND;
+ mad_send_wr->send_wr.wr.send_flags = IB_SEND_SIGNALED;
+ mad_send_wr->send_wr.remote_qpn = remote_qpn;
+ mad_send_wr->send_wr.remote_qkey = IB_QP_SET_QKEY;
+ mad_send_wr->send_wr.pkey_index = pkey_index;
+
+ if (rmpp_active) {
+ ret = alloc_send_rmpp_list(mad_send_wr, mad_size, gfp_mask);
+ if (ret) {
+ kfree(buf);
+ return ERR_PTR(ret);
+ }
+ }
+
+ mad_send_wr->send_buf.mad_agent = mad_agent;
+ refcount_inc(&mad_agent_priv->refcount);
+ return &mad_send_wr->send_buf;
+}
+EXPORT_SYMBOL(ib_create_send_mad);
+
+int ib_get_mad_data_offset(u8 mgmt_class)
+{
+ if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+ return IB_MGMT_SA_HDR;
+ else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_BIS))
+ return IB_MGMT_DEVICE_HDR;
+ else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+ (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return IB_MGMT_VENDOR_HDR;
+ else
+ return IB_MGMT_MAD_HDR;
+}
+EXPORT_SYMBOL(ib_get_mad_data_offset);
+
+int ib_is_mad_class_rmpp(u8 mgmt_class)
+{
+ if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_BIS) ||
+ ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+ (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(ib_is_mad_class_rmpp);
+
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct list_head *list;
+
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+ list = &mad_send_wr->cur_seg->list;
+
+ if (mad_send_wr->cur_seg->num < seg_num) {
+ list_for_each_entry(mad_send_wr->cur_seg, list, list)
+ if (mad_send_wr->cur_seg->num == seg_num)
+ break;
+ } else if (mad_send_wr->cur_seg->num > seg_num) {
+ list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list)
+ if (mad_send_wr->cur_seg->num == seg_num)
+ break;
+ }
+ return mad_send_wr->cur_seg->data;
+}
+EXPORT_SYMBOL(ib_get_rmpp_segment);
+
+static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ if (mad_send_wr->send_buf.seg_count)
+ return ib_get_rmpp_segment(&mad_send_wr->send_buf,
+ mad_send_wr->seg_num);
+ else
+ return mad_send_wr->send_buf.mad +
+ mad_send_wr->send_buf.hdr_len;
+}
+
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ mad_agent_priv = container_of(send_buf->mad_agent,
+ struct ib_mad_agent_private, agent);
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+
+ free_send_rmpp_list(mad_send_wr);
+ kfree(send_buf->mad);
+ deref_mad_agent(mad_agent_priv);
+}
+EXPORT_SYMBOL(ib_free_send_mad);
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_qp_info *qp_info;
+ struct list_head *list;
+ struct ib_mad_agent *mad_agent;
+ struct ib_sge *sge;
+ unsigned long flags;
+ int ret;
+
+ /* Set WR ID to find mad_send_wr upon completion */
+ qp_info = mad_send_wr->mad_agent_priv->qp_info;
+ mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+ mad_send_wr->mad_list.cqe.done = ib_mad_send_done;
+ mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe;
+
+ mad_agent = mad_send_wr->send_buf.mad_agent;
+ sge = mad_send_wr->sg_list;
+ sge[0].addr = ib_dma_map_single(mad_agent->device,
+ mad_send_wr->send_buf.mad,
+ sge[0].length,
+ DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr)))
+ return -ENOMEM;
+
+ mad_send_wr->header_mapping = sge[0].addr;
+
+ sge[1].addr = ib_dma_map_single(mad_agent->device,
+ ib_get_payload(mad_send_wr),
+ sge[1].length,
+ DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) {
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->header_mapping,
+ sge[0].length, DMA_TO_DEVICE);
+ return -ENOMEM;
+ }
+ mad_send_wr->payload_mapping = sge[1].addr;
+
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+ trace_ib_mad_ib_send_mad(mad_send_wr, qp_info);
+ ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr,
+ NULL);
+ list = &qp_info->send_queue.list;
+ } else {
+ ret = 0;
+ list = &qp_info->overflow_list;
+ }
+
+ if (!ret) {
+ qp_info->send_queue.count++;
+ list_add_tail(&mad_send_wr->mad_list.list, list);
+ }
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+ if (ret) {
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->header_mapping,
+ sge[0].length, DMA_TO_DEVICE);
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->payload_mapping,
+ sge[1].length, DMA_TO_DEVICE);
+ }
+ return ret;
+}
+
+/*
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ * with the registered client
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_buf **bad_send_buf)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_buf *next_send_buf;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ /* Walk list of send WRs and post each on send list */
+ for (; send_buf; send_buf = next_send_buf) {
+ mad_send_wr = container_of(send_buf,
+ struct ib_mad_send_wr_private,
+ send_buf);
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+
+ ret = ib_mad_enforce_security(mad_agent_priv,
+ mad_send_wr->send_wr.pkey_index);
+ if (ret)
+ goto error;
+
+ if (!send_buf->mad_agent->send_handler ||
+ (send_buf->timeout_ms &&
+ !send_buf->mad_agent->recv_handler)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) {
+ if (mad_agent_priv->agent.rmpp_version) {
+ ret = -EINVAL;
+ goto error;
+ }
+ }
+
+ /*
+ * Save pointer to next work request to post in case the
+ * current one completes, and the user modifies the work
+ * request associated with the completion
+ */
+ next_send_buf = send_buf->next;
+ mad_send_wr->send_wr.ah = send_buf->ah;
+
+ if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ ret = handle_outgoing_dr_smp(mad_agent_priv,
+ mad_send_wr);
+ if (ret < 0) /* error */
+ goto error;
+ else if (ret == 1) /* locally consumed */
+ continue;
+ }
+
+ mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+ /* Timeout will be updated after send completes */
+ mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+ mad_send_wr->max_retries = send_buf->retries;
+ mad_send_wr->retries_left = send_buf->retries;
+ send_buf->retries = 0;
+ /* Reference for work request to QP + response */
+ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+ mad_send_wr->status = IB_WC_SUCCESS;
+
+ /* Reference MAD agent until send completes */
+ refcount_inc(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ ret = ib_send_rmpp_mad(mad_send_wr);
+ if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
+ ret = ib_send_mad(mad_send_wr);
+ } else
+ ret = ib_send_mad(mad_send_wr);
+ if (ret < 0) {
+ /* Fail send request */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ deref_mad_agent(mad_agent_priv);
+ goto error;
+ }
+ }
+ return 0;
+error:
+ if (bad_send_buf)
+ *bad_send_buf = send_buf;
+ return ret;
+}
+EXPORT_SYMBOL(ib_post_send_mad);
+
+/*
+ * ib_free_recv_mad - Returns data buffers used to receive
+ * a MAD to the access layer
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *priv;
+ struct list_head free_list;
+
+ INIT_LIST_HEAD(&free_list);
+ list_splice_init(&mad_recv_wc->rmpp_list, &free_list);
+
+ list_for_each_entry_safe(mad_recv_buf, temp_recv_buf,
+ &free_list, list) {
+ mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc,
+ recv_buf);
+ mad_priv_hdr = container_of(mad_recv_wc,
+ struct ib_mad_private_header,
+ recv_wc);
+ priv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+ kfree(priv);
+ }
+}
+EXPORT_SYMBOL(ib_free_recv_mad);
+
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req)
+{
+ int i;
+
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) {
+ if ((*method)->agent[i]) {
+ pr_err("Method %d already in use\n", i);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
+{
+ /* Allocate management method table */
+ *method = kzalloc(sizeof **method, GFP_ATOMIC);
+ return (*method) ? 0 : (-ENOMEM);
+}
+
+/*
+ * Check to see if there are any methods still in use
+ */
+static int check_method_table(struct ib_mad_mgmt_method_table *method)
+{
+ int i;
+
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+ if (method->agent[i])
+ return 1;
+ return 0;
+}
+
+/*
+ * Check to see if there are any method tables for this class still in use
+ */
+static int check_class_table(struct ib_mad_mgmt_class_table *class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_CLASS; i++)
+ if (class->method_table[i])
+ return 1;
+ return 0;
+}
+
+static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++)
+ if (vendor_class->method_table[i])
+ return 1;
+ return 0;
+}
+
+static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
+ const char *oui)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++)
+ /* Is there matching OUI for this vendor class ? */
+ if (!memcmp(vendor_class->oui[i], oui, 3))
+ return i;
+
+ return -1;
+}
+
+static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++)
+ if (vendor->vendor_class[i])
+ return 1;
+
+ return 0;
+}
+
+static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
+ struct ib_mad_agent_private *agent)
+{
+ int i;
+
+ /* Remove any methods for this mad agent */
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+ if (method->agent[i] == agent)
+ method->agent[i] = NULL;
+}
+
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table **class;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret;
+
+ port_priv = agent_priv->qp_info->port_priv;
+ class = &port_priv->version[mad_reg_req->mgmt_class_version].class;
+ if (!*class) {
+ /* Allocate management class table for "new" class version */
+ *class = kzalloc(sizeof **class, GFP_ATOMIC);
+ if (!*class) {
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ /* Allocate method table for this management class */
+ method = &(*class)->method_table[mgmt_class];
+ if ((ret = allocate_method_table(method)))
+ goto error2;
+ } else {
+ method = &(*class)->method_table[mgmt_class];
+ if (!*method) {
+ /* Allocate method table for this management class */
+ if ((ret = allocate_method_table(method)))
+ goto error1;
+ }
+ }
+
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error3;
+
+ /* Finally, add in methods being registered */
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
+ (*method)->agent[i] = agent_priv;
+
+ return 0;
+
+error3:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+ goto error1;
+error2:
+ kfree(*class);
+ *class = NULL;
+error1:
+ return ret;
+}
+
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_vendor_class_table **vendor_table;
+ struct ib_mad_mgmt_vendor_class_table *vendor = NULL;
+ struct ib_mad_mgmt_vendor_class *vendor_class = NULL;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret = -ENOMEM;
+ u8 vclass;
+
+ /* "New" vendor (with OUI) class */
+ vclass = vendor_class_index(mad_reg_req->mgmt_class);
+ port_priv = agent_priv->qp_info->port_priv;
+ vendor_table = &port_priv->version[
+ mad_reg_req->mgmt_class_version].vendor;
+ if (!*vendor_table) {
+ /* Allocate mgmt vendor class table for "new" class version */
+ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC);
+ if (!vendor)
+ goto error1;
+
+ *vendor_table = vendor;
+ }
+ if (!(*vendor_table)->vendor_class[vclass]) {
+ /* Allocate table for this management vendor class */
+ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC);
+ if (!vendor_class)
+ goto error2;
+
+ (*vendor_table)->vendor_class[vclass] = vendor_class;
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* Is there matching OUI for this vendor class ? */
+ if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3)) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ if (!*method)
+ goto error3;
+ goto check_in_use;
+ }
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* OUI slot available ? */
+ if (!is_vendor_oui((*vendor_table)->vendor_class[
+ vclass]->oui[i])) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ /* Allocate method table for this OUI */
+ if (!*method) {
+ ret = allocate_method_table(method);
+ if (ret)
+ goto error3;
+ }
+ memcpy((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3);
+ goto check_in_use;
+ }
+ }
+ dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n");
+ goto error3;
+
+check_in_use:
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error4;
+
+ /* Finally, add in methods being registered */
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
+ (*method)->agent[i] = agent_priv;
+
+ return 0;
+
+error4:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+error3:
+ if (vendor_class) {
+ (*vendor_table)->vendor_class[vclass] = NULL;
+ kfree(vendor_class);
+ }
+error2:
+ if (vendor) {
+ *vendor_table = NULL;
+ kfree(vendor);
+ }
+error1:
+ return ret;
+}
+
+static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ int index;
+ u8 mgmt_class;
+
+ /*
+ * Was MAD registration request supplied
+ * with original registration ?
+ */
+ if (!agent_priv->reg_req)
+ goto out;
+
+ port_priv = agent_priv->qp_info->port_priv;
+ mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
+ class = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].class;
+ if (!class)
+ goto vendor_check;
+
+ method = class->method_table[mgmt_class];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /* Now, check to see if there are any methods still in use */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ class->method_table[mgmt_class] = NULL;
+ /* Any management classes left ? */
+ if (!check_class_table(class)) {
+ /* If not, release management class table */
+ kfree(class);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].class = NULL;
+ }
+ }
+ }
+
+vendor_check:
+ if (!is_vendor_class(mgmt_class))
+ goto out;
+
+ /* normalize mgmt_class to vendor range 2 */
+ mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class);
+ vendor = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].vendor;
+
+ if (!vendor)
+ goto out;
+
+ vendor_class = vendor->vendor_class[mgmt_class];
+ if (vendor_class) {
+ index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui);
+ if (index < 0)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /*
+ * Now, check to see if there are
+ * any methods still in use
+ */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ vendor_class->method_table[index] = NULL;
+ memset(vendor_class->oui[index], 0, 3);
+ /* Any OUIs left ? */
+ if (!check_vendor_class(vendor_class)) {
+ /* If not, release vendor class table */
+ kfree(vendor_class);
+ vendor->vendor_class[mgmt_class] = NULL;
+ /* Any other vendor classes left ? */
+ if (!check_vendor_table(vendor)) {
+ kfree(vendor);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].
+ vendor = NULL;
+ }
+ }
+ }
+ }
+ }
+
+out:
+ return;
+}
+
+static struct ib_mad_agent_private *
+find_mad_agent(struct ib_mad_port_private *port_priv,
+ const struct ib_mad_hdr *mad_hdr)
+{
+ struct ib_mad_agent_private *mad_agent = NULL;
+ unsigned long flags;
+
+ if (ib_response_mad(mad_hdr)) {
+ u32 hi_tid;
+
+ /*
+ * Routing is based on high 32 bits of transaction ID
+ * of MAD.
+ */
+ hi_tid = be64_to_cpu(mad_hdr->tid) >> 32;
+ rcu_read_lock();
+ mad_agent = xa_load(&ib_mad_clients, hi_tid);
+ if (mad_agent && !refcount_inc_not_zero(&mad_agent->refcount))
+ mad_agent = NULL;
+ rcu_read_unlock();
+ } else {
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ const struct ib_vendor_mad *vendor_mad;
+ int index;
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ /*
+ * Routing is based on version, class, and method
+ * For "newer" vendor MADs, also based on OUI
+ */
+ if (mad_hdr->class_version >= MAX_MGMT_VERSION)
+ goto out;
+ if (!is_vendor_class(mad_hdr->mgmt_class)) {
+ class = port_priv->version[
+ mad_hdr->class_version].class;
+ if (!class)
+ goto out;
+ if (convert_mgmt_class(mad_hdr->mgmt_class) >=
+ ARRAY_SIZE(class->method_table))
+ goto out;
+ method = class->method_table[convert_mgmt_class(
+ mad_hdr->mgmt_class)];
+ if (method)
+ mad_agent = method->agent[mad_hdr->method &
+ ~IB_MGMT_METHOD_RESP];
+ } else {
+ vendor = port_priv->version[
+ mad_hdr->class_version].vendor;
+ if (!vendor)
+ goto out;
+ vendor_class = vendor->vendor_class[vendor_class_index(
+ mad_hdr->mgmt_class)];
+ if (!vendor_class)
+ goto out;
+ /* Find matching OUI */
+ vendor_mad = (const struct ib_vendor_mad *)mad_hdr;
+ index = find_vendor_oui(vendor_class, vendor_mad->oui);
+ if (index == -1)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ mad_agent = method->agent[mad_hdr->method &
+ ~IB_MGMT_METHOD_RESP];
+ }
+ }
+ if (mad_agent)
+ refcount_inc(&mad_agent->refcount);
+out:
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+ }
+
+ if (mad_agent && !mad_agent->agent.recv_handler) {
+ dev_notice(&port_priv->device->dev,
+ "No receive handler for client %p on port %u\n",
+ &mad_agent->agent, port_priv->port_num);
+ deref_mad_agent(mad_agent);
+ mad_agent = NULL;
+ }
+
+ return mad_agent;
+}
+
+static int validate_mad(const struct ib_mad_hdr *mad_hdr,
+ const struct ib_mad_qp_info *qp_info,
+ bool opa)
+{
+ int valid = 0;
+ u32 qp_num = qp_info->qp->qp_num;
+
+ /* Make sure MAD base version is understood */
+ if (mad_hdr->base_version != IB_MGMT_BASE_VERSION &&
+ (!opa || mad_hdr->base_version != OPA_MGMT_BASE_VERSION)) {
+ pr_err("MAD received with unsupported base version %u %s\n",
+ mad_hdr->base_version, opa ? "(opa)" : "");
+ goto out;
+ }
+
+ /* Filter SMI packets sent to other than QP0 */
+ if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ if (qp_num == 0)
+ valid = 1;
+ } else {
+ /* CM attributes other than ClassPortInfo only use Send method */
+ if ((mad_hdr->mgmt_class == IB_MGMT_CLASS_CM) &&
+ (mad_hdr->attr_id != IB_MGMT_CLASSPORTINFO_ATTR_ID) &&
+ (mad_hdr->method != IB_MGMT_METHOD_SEND))
+ goto out;
+ /* Filter GSI packets sent to QP0 */
+ if (qp_num != 0)
+ valid = 1;
+ }
+
+out:
+ return valid;
+}
+
+static int is_rmpp_data_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_hdr *mad_hdr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_hdr;
+ return !mad_agent_priv->agent.rmpp_version ||
+ !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) ||
+ !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE) ||
+ (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
+}
+
+static inline int rcv_has_same_class(const struct ib_mad_send_wr_private *wr,
+ const struct ib_mad_recv_wc *rwc)
+{
+ return ((struct ib_mad_hdr *)(wr->send_buf.mad))->mgmt_class ==
+ rwc->recv_buf.mad->mad_hdr.mgmt_class;
+}
+
+static inline int
+rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_send_wr_private *wr,
+ const struct ib_mad_recv_wc *rwc)
+{
+ struct rdma_ah_attr attr;
+ u8 send_resp, rcv_resp;
+ union ib_gid sgid;
+ struct ib_device *device = mad_agent_priv->agent.device;
+ u32 port_num = mad_agent_priv->agent.port_num;
+ u8 lmc;
+ bool has_grh;
+
+ send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad);
+ rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr);
+
+ if (send_resp == rcv_resp)
+ /* both requests, or both responses. GIDs different */
+ return 0;
+
+ if (rdma_query_ah(wr->send_buf.ah, &attr))
+ /* Assume not equal, to avoid false positives. */
+ return 0;
+
+ has_grh = !!(rdma_ah_get_ah_flags(&attr) & IB_AH_GRH);
+ if (has_grh != !!(rwc->wc->wc_flags & IB_WC_GRH))
+ /* one has GID, other does not. Assume different */
+ return 0;
+
+ if (!send_resp && rcv_resp) {
+ /* is request/response. */
+ if (!has_grh) {
+ if (ib_get_cached_lmc(device, port_num, &lmc))
+ return 0;
+ return (!lmc || !((rdma_ah_get_path_bits(&attr) ^
+ rwc->wc->dlid_path_bits) &
+ ((1 << lmc) - 1)));
+ } else {
+ const struct ib_global_route *grh =
+ rdma_ah_read_grh(&attr);
+
+ if (rdma_query_gid(device, port_num,
+ grh->sgid_index, &sgid))
+ return 0;
+ return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
+ 16);
+ }
+ }
+
+ if (!has_grh)
+ return rdma_ah_get_dlid(&attr) == rwc->wc->slid;
+ else
+ return !memcmp(rdma_ah_read_grh(&attr)->dgid.raw,
+ rwc->recv_buf.grh->sgid.raw,
+ 16);
+}
+
+static inline int is_direct(u8 class)
+{
+ return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE);
+}
+
+struct ib_mad_send_wr_private*
+ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_recv_wc *wc)
+{
+ struct ib_mad_send_wr_private *wr;
+ const struct ib_mad_hdr *mad_hdr;
+
+ mad_hdr = &wc->recv_buf.mad->mad_hdr;
+
+ list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
+ if ((wr->tid == mad_hdr->tid) &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(mad_hdr->mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ }
+
+ /*
+ * It's possible to receive the response before we've
+ * been notified that the send has completed
+ */
+ list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+ if (is_rmpp_data_mad(mad_agent_priv, wr->send_buf.mad) &&
+ wr->tid == mad_hdr->tid &&
+ wr->timeout &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(mad_hdr->mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ /* Verify request has not been canceled */
+ return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ }
+ return NULL;
+}
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ mad_send_wr->timeout = 0;
+ if (mad_send_wr->refcount == 1)
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->done_list);
+}
+
+static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+ int ret;
+
+ INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
+ ret = ib_mad_enforce_security(mad_agent_priv,
+ mad_recv_wc->wc->pkey_index);
+ if (ret) {
+ ib_free_recv_mad(mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+
+ list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
+ mad_recv_wc);
+ if (!mad_recv_wc) {
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ }
+
+ /* Complete corresponding request */
+ if (ib_response_mad(&mad_recv_wc->recv_buf.mad->mad_hdr)) {
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
+ if (!mad_send_wr) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)
+ && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr)
+ & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ /* user rmpp is in effect
+ * and this is an active RMPP MAD
+ */
+ mad_agent_priv->agent.recv_handler(
+ &mad_agent_priv->agent, NULL,
+ mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ } else {
+ /* not user rmpp, revert to normal behavior and
+ * drop the mad
+ */
+ ib_free_recv_mad(mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ } else {
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Defined behavior is to complete response before request */
+ mad_agent_priv->agent.recv_handler(
+ &mad_agent_priv->agent,
+ &mad_send_wr->send_buf,
+ mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ }
+ } else {
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL,
+ mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ }
+}
+
+static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv,
+ const struct ib_mad_qp_info *qp_info,
+ const struct ib_wc *wc,
+ u32 port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response)
+{
+ enum smi_forward_action retsmi;
+ struct ib_smp *smp = (struct ib_smp *)recv->mad;
+
+ trace_ib_mad_handle_ib_smi(smp);
+
+ if (smi_handle_dr_smp_recv(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ retsmi = smi_check_forward_dr_smp(smp);
+ if (retsmi == IB_SMI_LOCAL)
+ return IB_SMI_HANDLE;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (smi_handle_dr_smp_send(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+ } else if (rdma_cap_ib_switch(port_priv->device)) {
+ /* forward case for switches */
+ memcpy(response, recv, mad_priv_size(response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
+ &response->grh, wc,
+ port_priv->device,
+ smi_get_fwd_port(smp),
+ qp_info->qp->qp_num,
+ response->mad_size,
+ false);
+
+ return IB_SMI_DISCARD;
+ }
+ return IB_SMI_HANDLE;
+}
+
+static bool generate_unmatched_resp(const struct ib_mad_private *recv,
+ struct ib_mad_private *response,
+ size_t *resp_len, bool opa)
+{
+ const struct ib_mad_hdr *recv_hdr = (const struct ib_mad_hdr *)recv->mad;
+ struct ib_mad_hdr *resp_hdr = (struct ib_mad_hdr *)response->mad;
+
+ if (recv_hdr->method == IB_MGMT_METHOD_GET ||
+ recv_hdr->method == IB_MGMT_METHOD_SET) {
+ memcpy(response, recv, mad_priv_size(response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = (struct ib_mad *)response->mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+ resp_hdr->method = IB_MGMT_METHOD_GET_RESP;
+ resp_hdr->status = cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB);
+ if (recv_hdr->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ resp_hdr->status |= IB_SMP_DIRECTION;
+
+ if (opa && recv_hdr->base_version == OPA_MGMT_BASE_VERSION) {
+ if (recv_hdr->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ recv_hdr->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ *resp_len = opa_get_smp_header_size(
+ (struct opa_smp *)recv->mad);
+ else
+ *resp_len = sizeof(struct ib_mad_hdr);
+ }
+
+ return true;
+ } else {
+ return false;
+ }
+}
+
+static enum smi_action
+handle_opa_smi(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info,
+ struct ib_wc *wc,
+ u32 port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response)
+{
+ enum smi_forward_action retsmi;
+ struct opa_smp *smp = (struct opa_smp *)recv->mad;
+
+ trace_ib_mad_handle_opa_smi(smp);
+
+ if (opa_smi_handle_dr_smp_recv(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ retsmi = opa_smi_check_forward_dr_smp(smp);
+ if (retsmi == IB_SMI_LOCAL)
+ return IB_SMI_HANDLE;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (opa_smi_handle_dr_smp_send(smp,
+ rdma_cap_ib_switch(port_priv->device),
+ port_num) == IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ if (opa_smi_check_local_smp(smp, port_priv->device) ==
+ IB_SMI_DISCARD)
+ return IB_SMI_DISCARD;
+
+ } else if (rdma_cap_ib_switch(port_priv->device)) {
+ /* forward case for switches */
+ memcpy(response, recv, mad_priv_size(response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.opa_mad =
+ (struct opa_mad *)response->mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
+ &response->grh, wc,
+ port_priv->device,
+ opa_smi_get_fwd_port(smp),
+ qp_info->qp->qp_num,
+ recv->header.wc.byte_len,
+ true);
+
+ return IB_SMI_DISCARD;
+ }
+
+ return IB_SMI_HANDLE;
+}
+
+static enum smi_action
+handle_smi(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info,
+ struct ib_wc *wc,
+ u32 port_num,
+ struct ib_mad_private *recv,
+ struct ib_mad_private *response,
+ bool opa)
+{
+ struct ib_mad_hdr *mad_hdr = (struct ib_mad_hdr *)recv->mad;
+
+ if (opa && mad_hdr->base_version == OPA_MGMT_BASE_VERSION &&
+ mad_hdr->class_version == OPA_SM_CLASS_VERSION)
+ return handle_opa_smi(port_priv, qp_info, wc, port_num, recv,
+ response);
+
+ return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response);
+}
+
+static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv, *response = NULL;
+ struct ib_mad_agent_private *mad_agent;
+ u32 port_num;
+ int ret = IB_MAD_RESULT_SUCCESS;
+ size_t mad_size;
+ u16 resp_mad_pkey_index = 0;
+ bool opa;
+
+ if (list_empty_careful(&port_priv->port_list))
+ return;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ /*
+ * Receive errors indicate that the QP has entered the error
+ * state - error handling/shutdown code will cleanup
+ */
+ return;
+ }
+
+ qp_info = mad_list->mad_queue->qp_info;
+ dequeue_mad(mad_list);
+
+ opa = rdma_cap_opa_mad(qp_info->port_priv->device,
+ qp_info->port_priv->port_num);
+
+ mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+ ib_dma_unmap_single(port_priv->device,
+ recv->header.mapping,
+ mad_priv_dma_size(recv),
+ DMA_FROM_DEVICE);
+
+ /* Setup MAD receive work completion from "normal" work completion */
+ recv->header.wc = *wc;
+ recv->header.recv_wc.wc = &recv->header.wc;
+
+ if (opa && ((struct ib_mad_hdr *)(recv->mad))->base_version == OPA_MGMT_BASE_VERSION) {
+ recv->header.recv_wc.mad_len = wc->byte_len - sizeof(struct ib_grh);
+ recv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
+ } else {
+ recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ recv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
+ }
+
+ recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad;
+ recv->header.recv_wc.recv_buf.grh = &recv->grh;
+
+ /* Validate MAD */
+ if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa))
+ goto out;
+
+ trace_ib_mad_recv_done_handler(qp_info, wc,
+ (struct ib_mad_hdr *)recv->mad);
+
+ mad_size = recv->mad_size;
+ response = alloc_mad_private(mad_size, GFP_KERNEL);
+ if (!response)
+ goto out;
+
+ if (rdma_cap_ib_switch(port_priv->device))
+ port_num = wc->port_num;
+ else
+ port_num = port_priv->port_num;
+
+ if (((struct ib_mad_hdr *)recv->mad)->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (handle_smi(port_priv, qp_info, wc, port_num, recv,
+ response, opa)
+ == IB_SMI_DISCARD)
+ goto out;
+ }
+
+ /* Give driver "right of first refusal" on incoming MAD */
+ if (port_priv->device->ops.process_mad) {
+ ret = port_priv->device->ops.process_mad(
+ port_priv->device, 0, port_priv->port_num, wc,
+ &recv->grh, (const struct ib_mad *)recv->mad,
+ (struct ib_mad *)response->mad, &mad_size,
+ &resp_mad_pkey_index);
+
+ if (opa)
+ wc->pkey_index = resp_mad_pkey_index;
+
+ if (ret & IB_MAD_RESULT_SUCCESS) {
+ if (ret & IB_MAD_RESULT_CONSUMED)
+ goto out;
+ if (ret & IB_MAD_RESULT_REPLY) {
+ agent_send_response((const struct ib_mad_hdr *)response->mad,
+ &recv->grh, wc,
+ port_priv->device,
+ port_num,
+ qp_info->qp->qp_num,
+ mad_size, opa);
+ goto out;
+ }
+ }
+ }
+
+ mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad);
+ if (mad_agent) {
+ trace_ib_mad_recv_done_agent(mad_agent);
+ ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
+ /*
+ * recv is freed up in error cases in ib_mad_complete_recv
+ * or via recv_handler in ib_mad_complete_recv()
+ */
+ recv = NULL;
+ } else if ((ret & IB_MAD_RESULT_SUCCESS) &&
+ generate_unmatched_resp(recv, response, &mad_size, opa)) {
+ agent_send_response((const struct ib_mad_hdr *)response->mad, &recv->grh, wc,
+ port_priv->device, port_num,
+ qp_info->qp->qp_num, mad_size, opa);
+ }
+
+out:
+ /* Post another receive request for this QP */
+ if (response) {
+ ib_mad_post_receive_mads(qp_info, response);
+ kfree(recv);
+ } else
+ ib_mad_post_receive_mads(qp_info, recv);
+}
+
+static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long delay;
+
+ if (list_empty(&mad_agent_priv->wait_list)) {
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+ } else {
+ mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ if (time_after(mad_agent_priv->timeout,
+ mad_send_wr->timeout)) {
+ mad_agent_priv->timeout = mad_send_wr->timeout;
+ delay = mad_send_wr->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+ }
+ }
+}
+
+static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *temp_mad_send_wr;
+ struct list_head *list_item;
+ unsigned long delay;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ list_del(&mad_send_wr->agent_list);
+
+ delay = mad_send_wr->timeout;
+ mad_send_wr->timeout += jiffies;
+
+ if (delay) {
+ list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
+ temp_mad_send_wr = list_entry(list_item,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ if (time_after(mad_send_wr->timeout,
+ temp_mad_send_wr->timeout))
+ break;
+ }
+ } else {
+ list_item = &mad_agent_priv->wait_list;
+ }
+
+ list_add(&mad_send_wr->agent_list, list_item);
+
+ /* Reschedule a work item if we have a shorter timeout */
+ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
+ mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+}
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+ unsigned long timeout_ms)
+{
+ mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+ wait_for_response(mad_send_wr);
+}
+
+/*
+ * Process a send work completion
+ */
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ unsigned long flags;
+ int ret;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc);
+ if (ret == IB_RMPP_RESULT_CONSUMED)
+ goto done;
+ } else
+ ret = IB_RMPP_RESULT_UNHANDLED;
+
+ if (mad_send_wc->status != IB_WC_SUCCESS &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = mad_send_wc->status;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+
+ if (--mad_send_wr->refcount > 0) {
+ if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ wait_for_response(mad_send_wr);
+ }
+ goto done;
+ }
+
+ /* Remove send from MAD agent and notify client of completion */
+ list_del(&mad_send_wr->agent_list);
+ adjust_timeout(mad_agent_priv);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_send_wr->status != IB_WC_SUCCESS)
+ mad_send_wc->status = mad_send_wr->status;
+ if (ret == IB_RMPP_RESULT_INTERNAL)
+ ib_rmpp_send_handler(mad_send_wc);
+ else
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ mad_send_wc);
+
+ /* Release reference on agent taken when sending */
+ deref_mad_agent(mad_agent_priv);
+ return;
+done:
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
+ struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr;
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_queue *send_queue;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+ int ret;
+
+ if (list_empty_careful(&port_priv->port_list))
+ return;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ if (!ib_mad_send_error(port_priv, wc))
+ return;
+ }
+
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ send_queue = mad_list->mad_queue;
+ qp_info = send_queue->qp_info;
+
+ trace_ib_mad_send_done_agent(mad_send_wr->mad_agent_priv);
+ trace_ib_mad_send_done_handler(mad_send_wr, wc);
+
+retry:
+ ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+ mad_send_wr->header_mapping,
+ mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+ ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+ mad_send_wr->payload_mapping,
+ mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
+ queued_send_wr = NULL;
+ spin_lock_irqsave(&send_queue->lock, flags);
+ list_del(&mad_list->list);
+
+ /* Move queued send to the send queue */
+ if (send_queue->count-- > send_queue->max_active) {
+ mad_list = container_of(qp_info->overflow_list.next,
+ struct ib_mad_list_head, list);
+ queued_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ list_move_tail(&mad_list->list, &send_queue->list);
+ }
+ spin_unlock_irqrestore(&send_queue->lock, flags);
+
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_send_wc.status = wc->status;
+ mad_send_wc.vendor_err = wc->vendor_err;
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+
+ if (queued_send_wr) {
+ trace_ib_mad_send_done_resend(queued_send_wr, qp_info);
+ ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr,
+ NULL);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "ib_post_send failed: %d\n", ret);
+ mad_send_wr = queued_send_wr;
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ goto retry;
+ }
+ }
+}
+
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_list_head *mad_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ list_for_each_entry(mad_list, &qp_info->send_queue.list, list) {
+ mad_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ mad_send_wr->retry = 1;
+ }
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+}
+
+static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_list_head *mad_list =
+ container_of(wc->wr_cqe, struct ib_mad_list_head, cqe);
+ struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ int ret;
+
+ /*
+ * Send errors will transition the QP to SQE - move
+ * QP to RTS and repost flushed work requests
+ */
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ if (wc->status == IB_WC_WR_FLUSH_ERR) {
+ if (mad_send_wr->retry) {
+ /* Repost send */
+ mad_send_wr->retry = 0;
+ trace_ib_mad_error_handler(mad_send_wr, qp_info);
+ ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr,
+ NULL);
+ if (!ret)
+ return false;
+ }
+ } else {
+ struct ib_qp_attr *attr;
+
+ /* Transition QP to RTS and fail offending send */
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (attr) {
+ attr->qp_state = IB_QPS_RTS;
+ attr->cur_qp_state = IB_QPS_SQE;
+ ret = ib_modify_qp(qp_info->qp, attr,
+ IB_QP_STATE | IB_QP_CUR_STATE);
+ kfree(attr);
+ if (ret)
+ dev_err(&port_priv->device->dev,
+ "%s - ib_modify_qp to RTS: %d\n",
+ __func__, ret);
+ else
+ mark_sends_for_retry(qp_info);
+ }
+ }
+
+ return true;
+}
+
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+ unsigned long flags;
+ struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ struct list_head cancel_list;
+
+ INIT_LIST_HEAD(&cancel_list);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &mad_agent_priv->send_list, agent_list) {
+ if (mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+ }
+
+ /* Empty wait list to prevent receives from finding a request */
+ list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Report all cancelled requests */
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wc.vendor_err = 0;
+
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &cancel_list, agent_list) {
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ list_del(&mad_send_wr->agent_list);
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+ deref_mad_agent(mad_agent_priv);
+ }
+}
+
+static struct ib_mad_send_wr_private*
+find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_buf *send_buf)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+ agent_list) {
+ if (&mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
+ agent_list) {
+ if (is_rmpp_data_mad(mad_agent_priv,
+ mad_send_wr->send_buf.mad) &&
+ &mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+ return NULL;
+}
+
+int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long flags;
+ int active;
+
+ if (!send_buf)
+ return -EINVAL;
+
+ mad_agent_priv = container_of(send_buf->mad_agent,
+ struct ib_mad_agent_private, agent);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
+ if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return -EINVAL;
+ }
+
+ active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
+ if (!timeout_ms) {
+ mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+
+ mad_send_wr->send_buf.timeout_ms = timeout_ms;
+ if (active)
+ mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+ else
+ ib_reset_mad_timeout(mad_send_wr, timeout_ms);
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(ib_modify_mad);
+
+static void local_completions(struct work_struct *work)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_local_private *local;
+ struct ib_mad_agent_private *recv_mad_agent;
+ unsigned long flags;
+ int free_mad;
+ struct ib_wc wc;
+ struct ib_mad_send_wc mad_send_wc;
+ bool opa;
+
+ mad_agent_priv =
+ container_of(work, struct ib_mad_agent_private, local_work);
+
+ opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,
+ mad_agent_priv->qp_info->port_priv->port_num);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->local_list)) {
+ local = list_entry(mad_agent_priv->local_list.next,
+ struct ib_mad_local_private,
+ completion_list);
+ list_del(&local->completion_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ free_mad = 0;
+ if (local->mad_priv) {
+ u8 base_version;
+ recv_mad_agent = local->recv_mad_agent;
+ if (!recv_mad_agent) {
+ dev_err(&mad_agent_priv->agent.device->dev,
+ "No receive MAD agent for local completion\n");
+ free_mad = 1;
+ goto local_send_completion;
+ }
+
+ /*
+ * Defined behavior is to complete response
+ * before request
+ */
+ build_smp_wc(recv_mad_agent->agent.qp,
+ local->mad_send_wr->send_wr.wr.wr_cqe,
+ be16_to_cpu(IB_LID_PERMISSIVE),
+ local->mad_send_wr->send_wr.pkey_index,
+ recv_mad_agent->agent.port_num, &wc);
+
+ local->mad_priv->header.recv_wc.wc = &wc;
+
+ base_version = ((struct ib_mad_hdr *)(local->mad_priv->mad))->base_version;
+ if (opa && base_version == OPA_MGMT_BASE_VERSION) {
+ local->mad_priv->header.recv_wc.mad_len = local->return_wc_byte_len;
+ local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct opa_mad);
+ } else {
+ local->mad_priv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ local->mad_priv->header.recv_wc.mad_seg_size = sizeof(struct ib_mad);
+ }
+
+ INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
+ list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
+ &local->mad_priv->header.recv_wc.rmpp_list);
+ local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
+ local->mad_priv->header.recv_wc.recv_buf.mad =
+ (struct ib_mad *)local->mad_priv->mad;
+ recv_mad_agent->agent.recv_handler(
+ &recv_mad_agent->agent,
+ &local->mad_send_wr->send_buf,
+ &local->mad_priv->header.recv_wc);
+ spin_lock_irqsave(&recv_mad_agent->lock, flags);
+ deref_mad_agent(recv_mad_agent);
+ spin_unlock_irqrestore(&recv_mad_agent->lock, flags);
+ }
+
+local_send_completion:
+ /* Complete send */
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &local->mad_send_wr->send_buf;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ deref_mad_agent(mad_agent_priv);
+ if (free_mad)
+ kfree(local->mad_priv);
+ kfree(local);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ int ret;
+
+ if (!mad_send_wr->retries_left)
+ return -ETIMEDOUT;
+
+ mad_send_wr->retries_left--;
+ mad_send_wr->send_buf.retries++;
+
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+
+ if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
+ ret = ib_retry_rmpp(mad_send_wr);
+ switch (ret) {
+ case IB_RMPP_RESULT_UNHANDLED:
+ ret = ib_send_mad(mad_send_wr);
+ break;
+ case IB_RMPP_RESULT_CONSUMED:
+ ret = 0;
+ break;
+ default:
+ ret = -ECOMM;
+ break;
+ }
+ } else
+ ret = ib_send_mad(mad_send_wr);
+
+ if (!ret) {
+ mad_send_wr->refcount++;
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->send_list);
+ }
+ return ret;
+}
+
+static void timeout_sends(struct work_struct *work)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags, delay;
+
+ mad_agent_priv = container_of(work, struct ib_mad_agent_private,
+ timed_work.work);
+ mad_send_wc.vendor_err = 0;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->wait_list)) {
+ mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ if (time_after(mad_send_wr->timeout, jiffies)) {
+ delay = mad_send_wr->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(mad_agent_priv->qp_info->
+ port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+ break;
+ }
+
+ list_del(&mad_send_wr->agent_list);
+ if (mad_send_wr->status == IB_WC_SUCCESS &&
+ !retry_send(mad_send_wr))
+ continue;
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_send_wr->status == IB_WC_SUCCESS)
+ mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+ else
+ mad_send_wc.status = mad_send_wr->status;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ deref_mad_agent(mad_agent_priv);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+/*
+ * Allocate receive MADs and post receive WRs for them
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad)
+{
+ unsigned long flags;
+ int post, ret;
+ struct ib_mad_private *mad_priv;
+ struct ib_sge sg_list;
+ struct ib_recv_wr recv_wr;
+ struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+
+ /* Initialize common scatter list fields */
+ sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
+
+ /* Initialize common receive WR fields */
+ recv_wr.next = NULL;
+ recv_wr.sg_list = &sg_list;
+ recv_wr.num_sge = 1;
+
+ do {
+ /* Allocate and map receive buffer */
+ if (mad) {
+ mad_priv = mad;
+ mad = NULL;
+ } else {
+ mad_priv = alloc_mad_private(port_mad_size(qp_info->port_priv),
+ GFP_ATOMIC);
+ if (!mad_priv) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+ sg_list.length = mad_priv_dma_size(mad_priv);
+ sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
+ &mad_priv->grh,
+ mad_priv_dma_size(mad_priv),
+ DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
+ sg_list.addr))) {
+ kfree(mad_priv);
+ ret = -ENOMEM;
+ break;
+ }
+ mad_priv->header.mapping = sg_list.addr;
+ mad_priv->header.mad_list.mad_queue = recv_queue;
+ mad_priv->header.mad_list.cqe.done = ib_mad_recv_done;
+ recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe;
+
+ /* Post receive WR */
+ spin_lock_irqsave(&recv_queue->lock, flags);
+ post = (++recv_queue->count < recv_queue->max_active);
+ list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ ret = ib_post_recv(qp_info->qp, &recv_wr, NULL);
+ if (ret) {
+ spin_lock_irqsave(&recv_queue->lock, flags);
+ list_del(&mad_priv->header.mad_list.list);
+ recv_queue->count--;
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ mad_priv->header.mapping,
+ mad_priv_dma_size(mad_priv),
+ DMA_FROM_DEVICE);
+ kfree(mad_priv);
+ dev_err(&qp_info->port_priv->device->dev,
+ "ib_post_recv failed: %d\n", ret);
+ break;
+ }
+ } while (post);
+
+ return ret;
+}
+
+/*
+ * Return all the posted receive MADs
+ */
+static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
+{
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv;
+ struct ib_mad_list_head *mad_list;
+
+ if (!qp_info->qp)
+ return;
+
+ while (!list_empty(&qp_info->recv_queue.list)) {
+
+ mad_list = list_entry(qp_info->recv_queue.list.next,
+ struct ib_mad_list_head, list);
+ mad_priv_hdr = container_of(mad_list,
+ struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+
+ /* Remove from posted receive MAD list */
+ list_del(&mad_list->list);
+
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ recv->header.mapping,
+ mad_priv_dma_size(recv),
+ DMA_FROM_DEVICE);
+ kfree(recv);
+ }
+
+ qp_info->recv_queue.count = 0;
+}
+
+/*
+ * Start the port
+ */
+static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
+{
+ int ret, i;
+ struct ib_qp_attr *attr;
+ struct ib_qp *qp;
+ u16 pkey_index;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ ret = ib_find_pkey(port_priv->device, port_priv->port_num,
+ IB_DEFAULT_PKEY_FULL, &pkey_index);
+ if (ret)
+ pkey_index = 0;
+
+ for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+ qp = port_priv->qp_info[i].qp;
+ if (!qp)
+ continue;
+
+ /*
+ * PKey index for QP1 is irrelevant but
+ * one is needed for the Reset to Init transition
+ */
+ attr->qp_state = IB_QPS_INIT;
+ attr->pkey_index = pkey_index;
+ attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE |
+ IB_QP_PKEY_INDEX | IB_QP_QKEY);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to INIT: %d\n",
+ i, ret);
+ goto out;
+ }
+
+ attr->qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to RTR: %d\n",
+ i, ret);
+ goto out;
+ }
+
+ attr->qp_state = IB_QPS_RTS;
+ attr->sq_psn = IB_MAD_SEND_Q_PSN;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to RTS: %d\n",
+ i, ret);
+ goto out;
+ }
+ }
+
+ ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Failed to request completion notification: %d\n",
+ ret);
+ goto out;
+ }
+
+ for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+ if (!port_priv->qp_info[i].qp)
+ continue;
+
+ ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't post receive WRs\n");
+ goto out;
+ }
+ }
+out:
+ kfree(attr);
+ return ret;
+}
+
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+ struct ib_mad_qp_info *qp_info = qp_context;
+
+ /* It's worse than that! He's dead, Jim! */
+ dev_err(&qp_info->port_priv->device->dev,
+ "Fatal error (%d) on MAD QP (%u)\n",
+ event->event, qp_info->qp->qp_num);
+}
+
+static void init_mad_queue(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_queue *mad_queue)
+{
+ mad_queue->qp_info = qp_info;
+ mad_queue->count = 0;
+ spin_lock_init(&mad_queue->lock);
+ INIT_LIST_HEAD(&mad_queue->list);
+}
+
+static void init_mad_qp(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info)
+{
+ qp_info->port_priv = port_priv;
+ init_mad_queue(qp_info, &qp_info->send_queue);
+ init_mad_queue(qp_info, &qp_info->recv_queue);
+ INIT_LIST_HEAD(&qp_info->overflow_list);
+}
+
+static int create_mad_qp(struct ib_mad_qp_info *qp_info,
+ enum ib_qp_type qp_type)
+{
+ struct ib_qp_init_attr qp_init_attr;
+ int ret;
+
+ memset(&qp_init_attr, 0, sizeof qp_init_attr);
+ qp_init_attr.send_cq = qp_info->port_priv->cq;
+ qp_init_attr.recv_cq = qp_info->port_priv->cq;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.cap.max_send_wr = mad_sendq_size;
+ qp_init_attr.cap.max_recv_wr = mad_recvq_size;
+ qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
+ qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
+ qp_init_attr.qp_type = qp_type;
+ qp_init_attr.port_num = qp_info->port_priv->port_num;
+ qp_init_attr.qp_context = qp_info;
+ qp_init_attr.event_handler = qp_event_handler;
+ qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
+ if (IS_ERR(qp_info->qp)) {
+ dev_err(&qp_info->port_priv->device->dev,
+ "Couldn't create ib_mad QP%d\n",
+ get_spl_qp_index(qp_type));
+ ret = PTR_ERR(qp_info->qp);
+ goto error;
+ }
+ /* Use minimum queue sizes unless the CQ is resized */
+ qp_info->send_queue.max_active = mad_sendq_size;
+ qp_info->recv_queue.max_active = mad_recvq_size;
+ return 0;
+
+error:
+ return ret;
+}
+
+static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
+{
+ if (!qp_info->qp)
+ return;
+
+ ib_destroy_qp(qp_info->qp);
+}
+
+/*
+ * Open the port
+ * Create the QP, PD, MR, and CQ if needed
+ */
+static int ib_mad_port_open(struct ib_device *device,
+ u32 port_num)
+{
+ int ret, cq_size;
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+ char name[sizeof "ib_mad123"];
+ int has_smi;
+
+ if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE))
+ return -EFAULT;
+
+ if (WARN_ON(rdma_cap_opa_mad(device, port_num) &&
+ rdma_max_mad_size(device, port_num) < OPA_MGMT_MAD_SIZE))
+ return -EFAULT;
+
+ /* Create new device info */
+ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv)
+ return -ENOMEM;
+
+ port_priv->device = device;
+ port_priv->port_num = port_num;
+ spin_lock_init(&port_priv->reg_lock);
+ init_mad_qp(port_priv, &port_priv->qp_info[0]);
+ init_mad_qp(port_priv, &port_priv->qp_info[1]);
+
+ cq_size = mad_sendq_size + mad_recvq_size;
+ has_smi = rdma_cap_ib_smi(device, port_num);
+ if (has_smi)
+ cq_size *= 2;
+
+ port_priv->pd = ib_alloc_pd(device, 0);
+ if (IS_ERR(port_priv->pd)) {
+ dev_err(&device->dev, "Couldn't create ib_mad PD\n");
+ ret = PTR_ERR(port_priv->pd);
+ goto error3;
+ }
+
+ port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0,
+ IB_POLL_UNBOUND_WORKQUEUE);
+ if (IS_ERR(port_priv->cq)) {
+ dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
+ ret = PTR_ERR(port_priv->cq);
+ goto error4;
+ }
+
+ if (has_smi) {
+ ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
+ if (ret)
+ goto error6;
+ }
+ ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+ if (ret)
+ goto error7;
+
+ snprintf(name, sizeof(name), "ib_mad%u", port_num);
+ port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+ if (!port_priv->wq) {
+ ret = -ENOMEM;
+ goto error8;
+ }
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_mad_port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ ret = ib_mad_port_start(port_priv);
+ if (ret) {
+ dev_err(&device->dev, "Couldn't start port\n");
+ goto error9;
+ }
+
+ return 0;
+
+error9:
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ list_del_init(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ destroy_workqueue(port_priv->wq);
+error8:
+ destroy_mad_qp(&port_priv->qp_info[1]);
+error7:
+ destroy_mad_qp(&port_priv->qp_info[0]);
+error6:
+ ib_free_cq(port_priv->cq);
+ cleanup_recv_queue(&port_priv->qp_info[1]);
+ cleanup_recv_queue(&port_priv->qp_info[0]);
+error4:
+ ib_dealloc_pd(port_priv->pd);
+error3:
+ kfree(port_priv);
+
+ return ret;
+}
+
+/*
+ * Close the port
+ * If there are no classes using the port, free the port
+ * resources (CQ, MR, PD, QP) and remove the port's info structure
+ */
+static int ib_mad_port_close(struct ib_device *device, u32 port_num)
+{
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ port_priv = __ib_get_mad_port(device, port_num);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+ dev_err(&device->dev, "Port %u not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del_init(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ destroy_workqueue(port_priv->wq);
+ destroy_mad_qp(&port_priv->qp_info[1]);
+ destroy_mad_qp(&port_priv->qp_info[0]);
+ ib_free_cq(port_priv->cq);
+ ib_dealloc_pd(port_priv->pd);
+ cleanup_recv_queue(&port_priv->qp_info[1]);
+ cleanup_recv_queue(&port_priv->qp_info[0]);
+ /* XXX: Handle deallocation of MAD registration tables */
+
+ kfree(port_priv);
+
+ return 0;
+}
+
+static int ib_mad_init_device(struct ib_device *device)
+{
+ int start, i;
+ unsigned int count = 0;
+ int ret;
+
+ start = rdma_start_port(device);
+
+ for (i = start; i <= rdma_end_port(device); i++) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
+ ret = ib_mad_port_open(device, i);
+ if (ret) {
+ dev_err(&device->dev, "Couldn't open port %d\n", i);
+ goto error;
+ }
+ ret = ib_agent_port_open(device, i);
+ if (ret) {
+ dev_err(&device->dev,
+ "Couldn't open port %d for agents\n", i);
+ goto error_agent;
+ }
+ count++;
+ }
+ if (!count)
+ return -EOPNOTSUPP;
+
+ return 0;
+
+error_agent:
+ if (ib_mad_port_close(device, i))
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
+
+error:
+ while (--i >= start) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
+ if (ib_agent_port_close(device, i))
+ dev_err(&device->dev,
+ "Couldn't close port %d for agents\n", i);
+ if (ib_mad_port_close(device, i))
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
+ }
+ return ret;
+}
+
+static void ib_mad_remove_device(struct ib_device *device, void *client_data)
+{
+ unsigned int i;
+
+ rdma_for_each_port (device, i) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
+ if (ib_agent_port_close(device, i))
+ dev_err(&device->dev,
+ "Couldn't close port %u for agents\n", i);
+ if (ib_mad_port_close(device, i))
+ dev_err(&device->dev, "Couldn't close port %u\n", i);
+ }
+}
+
+static struct ib_client mad_client = {
+ .name = "mad",
+ .add = ib_mad_init_device,
+ .remove = ib_mad_remove_device
+};
+
+int ib_mad_init(void)
+{
+ mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
+ mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
+
+ mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
+ mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
+
+ INIT_LIST_HEAD(&ib_mad_port_list);
+
+ if (ib_register_client(&mad_client)) {
+ pr_err("Couldn't register ib_mad client\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void ib_mad_cleanup(void)
+{
+ ib_unregister_client(&mad_client);
+}
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
new file mode 100644
index 000000000..1b7445a6f
--- /dev/null
+++ b/drivers/infiniband/core/mad_priv.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __IB_MAD_PRIV_H__
+#define __IB_MAD_PRIV_H__
+
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/opa_smi.h>
+
+#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */
+
+/* QP and CQ parameters */
+#define IB_MAD_QP_SEND_SIZE 128
+#define IB_MAD_QP_RECV_SIZE 512
+#define IB_MAD_QP_MIN_SIZE 64
+#define IB_MAD_QP_MAX_SIZE 8192
+#define IB_MAD_SEND_REQ_MAX_SG 2
+#define IB_MAD_RECV_REQ_MAX_SG 1
+
+#define IB_MAD_SEND_Q_PSN 0
+
+/* Registration table sizes */
+#define MAX_MGMT_CLASS 80
+#define MAX_MGMT_VERSION 0x83
+#define MAX_MGMT_OUI 8
+#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \
+ IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
+
+struct ib_mad_list_head {
+ struct list_head list;
+ struct ib_cqe cqe;
+ struct ib_mad_queue *mad_queue;
+};
+
+struct ib_mad_private_header {
+ struct ib_mad_list_head mad_list;
+ struct ib_mad_recv_wc recv_wc;
+ struct ib_wc wc;
+ u64 mapping;
+} __packed;
+
+struct ib_mad_private {
+ struct ib_mad_private_header header;
+ size_t mad_size;
+ struct ib_grh grh;
+ u8 mad[];
+} __packed;
+
+struct ib_rmpp_segment {
+ struct list_head list;
+ u32 num;
+ u8 data[];
+};
+
+struct ib_mad_agent_private {
+ struct ib_mad_agent agent;
+ struct ib_mad_reg_req *reg_req;
+ struct ib_mad_qp_info *qp_info;
+
+ spinlock_t lock;
+ struct list_head send_list;
+ struct list_head wait_list;
+ struct list_head done_list;
+ struct delayed_work timed_work;
+ unsigned long timeout;
+ struct list_head local_list;
+ struct work_struct local_work;
+ struct list_head rmpp_list;
+
+ refcount_t refcount;
+ union {
+ struct completion comp;
+ struct rcu_head rcu;
+ };
+};
+
+struct ib_mad_snoop_private {
+ struct ib_mad_agent agent;
+ struct ib_mad_qp_info *qp_info;
+ int snoop_index;
+ int mad_snoop_flags;
+ struct completion comp;
+};
+
+struct ib_mad_send_wr_private {
+ struct ib_mad_list_head mad_list;
+ struct list_head agent_list;
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_buf send_buf;
+ u64 header_mapping;
+ u64 payload_mapping;
+ struct ib_ud_wr send_wr;
+ struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+ __be64 tid;
+ unsigned long timeout;
+ int max_retries;
+ int retries_left;
+ int retry;
+ int refcount;
+ enum ib_wc_status status;
+
+ /* RMPP control */
+ struct list_head rmpp_list;
+ struct ib_rmpp_segment *last_ack_seg;
+ struct ib_rmpp_segment *cur_seg;
+ int last_ack;
+ int seg_num;
+ int newwin;
+ int pad;
+};
+
+struct ib_mad_local_private {
+ struct list_head completion_list;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_agent_private *recv_mad_agent;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ size_t return_wc_byte_len;
+};
+
+struct ib_mad_mgmt_method_table {
+ struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS];
+};
+
+struct ib_mad_mgmt_class_table {
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
+};
+
+struct ib_mad_mgmt_vendor_class {
+ u8 oui[MAX_MGMT_OUI][3];
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI];
+};
+
+struct ib_mad_mgmt_vendor_class_table {
+ struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2];
+};
+
+struct ib_mad_mgmt_version_table {
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+};
+
+struct ib_mad_queue {
+ spinlock_t lock;
+ struct list_head list;
+ int count;
+ int max_active;
+ struct ib_mad_qp_info *qp_info;
+};
+
+struct ib_mad_qp_info {
+ struct ib_mad_port_private *port_priv;
+ struct ib_qp *qp;
+ struct ib_mad_queue send_queue;
+ struct ib_mad_queue recv_queue;
+ struct list_head overflow_list;
+ spinlock_t snoop_lock;
+ struct ib_mad_snoop_private **snoop_table;
+ int snoop_table_size;
+ atomic_t snoop_count;
+};
+
+struct ib_mad_port_private {
+ struct list_head port_list;
+ struct ib_device *device;
+ int port_num;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+
+ spinlock_t reg_lock;
+ struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
+ struct workqueue_struct *wq;
+ struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+};
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_send_wr_private *
+ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv,
+ const struct ib_mad_recv_wc *mad_recv_wc);
+
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+ unsigned long timeout_ms);
+
+#endif /* __IB_MAD_PRIV_H__ */
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
new file mode 100644
index 000000000..8af0619a3
--- /dev/null
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -0,0 +1,960 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+
+enum rmpp_state {
+ RMPP_STATE_ACTIVE,
+ RMPP_STATE_TIMEOUT,
+ RMPP_STATE_COMPLETE
+};
+
+struct mad_rmpp_recv {
+ struct ib_mad_agent_private *agent;
+ struct list_head list;
+ struct delayed_work timeout_work;
+ struct delayed_work cleanup_work;
+ struct completion comp;
+ enum rmpp_state state;
+ spinlock_t lock;
+ refcount_t refcount;
+
+ struct ib_ah *ah;
+ struct ib_mad_recv_wc *rmpp_wc;
+ struct ib_mad_recv_buf *cur_seg_buf;
+ int last_ack;
+ int seg_num;
+ int newwin;
+ int repwin;
+
+ __be64 tid;
+ u32 src_qp;
+ u32 slid;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ u8 base_version;
+};
+
+static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+ if (refcount_dec_and_test(&rmpp_recv->refcount))
+ complete(&rmpp_recv->comp);
+}
+
+static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+ deref_rmpp_recv(rmpp_recv);
+ wait_for_completion(&rmpp_recv->comp);
+ rdma_destroy_ah(rmpp_recv->ah, RDMA_DESTROY_AH_SLEEPABLE);
+ kfree(rmpp_recv);
+}
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent)
+{
+ struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ cancel_delayed_work(&rmpp_recv->timeout_work);
+ cancel_delayed_work(&rmpp_recv->cleanup_work);
+ }
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ flush_workqueue(agent->qp_info->port_priv->wq);
+
+ list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv,
+ &agent->rmpp_list, list) {
+ list_del(&rmpp_recv->list);
+ if (rmpp_recv->state != RMPP_STATE_COMPLETE)
+ ib_free_recv_mad(rmpp_recv->rmpp_wc);
+ destroy_rmpp_recv(rmpp_recv);
+ }
+}
+
+static void format_ack(struct ib_mad_send_buf *msg,
+ struct ib_rmpp_mad *data,
+ struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_rmpp_mad *ack = msg->mad;
+ unsigned long flags;
+
+ memcpy(ack, &data->mad_hdr, msg->hdr_len);
+
+ ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK;
+ ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+ spin_lock_irqsave(&rmpp_recv->lock, flags);
+ rmpp_recv->last_ack = rmpp_recv->seg_num;
+ ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num);
+ ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin);
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+}
+
+static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ int ret, hdr_len;
+
+ hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+ msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp,
+ recv_wc->wc->pkey_index, 1, hdr_len,
+ 0, GFP_KERNEL,
+ IB_MGMT_BASE_VERSION);
+ if (IS_ERR(msg))
+ return;
+
+ format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv);
+ msg->ah = rmpp_recv->ah;
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ ib_free_send_mad(msg);
+}
+
+static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_ah *ah;
+ int hdr_len;
+
+ ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
+ recv_wc->recv_buf.grh, agent->port_num);
+ if (IS_ERR(ah))
+ return (void *) ah;
+
+ hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+ msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
+ recv_wc->wc->pkey_index, 1,
+ hdr_len, 0, GFP_KERNEL,
+ IB_MGMT_BASE_VERSION);
+ if (IS_ERR(msg))
+ rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
+ else {
+ msg->ah = ah;
+ msg->context[0] = ah;
+ }
+
+ return msg;
+}
+
+static void ack_ds_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ msg = alloc_response_msg(&agent->agent, recv_wc);
+ if (IS_ERR(msg))
+ return;
+
+ rmpp_mad = msg->mad;
+ memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+ rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.seg_num = 0;
+ rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ rdma_destroy_ah(msg->ah, RDMA_DESTROY_AH_SLEEPABLE);
+ ib_free_send_mad(msg);
+ }
+}
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc)
+{
+ if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah)
+ rdma_destroy_ah(mad_send_wc->send_buf->ah,
+ RDMA_DESTROY_AH_SLEEPABLE);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+static void nack_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *recv_wc, u8 rmpp_status)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ msg = alloc_response_msg(&agent->agent, recv_wc);
+ if (IS_ERR(msg))
+ return;
+
+ rmpp_mad = msg->mad;
+ memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+ rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION;
+ rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status;
+ rmpp_mad->rmpp_hdr.seg_num = 0;
+ rmpp_mad->rmpp_hdr.paylen_newwin = 0;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ rdma_destroy_ah(msg->ah, RDMA_DESTROY_AH_SLEEPABLE);
+ ib_free_send_mad(msg);
+ }
+}
+
+static void recv_timeout_handler(struct work_struct *work)
+{
+ struct mad_rmpp_recv *rmpp_recv =
+ container_of(work, struct mad_rmpp_recv, timeout_work.work);
+ struct ib_mad_recv_wc *rmpp_wc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+ if (rmpp_recv->state != RMPP_STATE_ACTIVE) {
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ return;
+ }
+ rmpp_recv->state = RMPP_STATE_TIMEOUT;
+ list_del(&rmpp_recv->list);
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+
+ rmpp_wc = rmpp_recv->rmpp_wc;
+ nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L);
+ destroy_rmpp_recv(rmpp_recv);
+ ib_free_recv_mad(rmpp_wc);
+}
+
+static void recv_cleanup_handler(struct work_struct *work)
+{
+ struct mad_rmpp_recv *rmpp_recv =
+ container_of(work, struct mad_rmpp_recv, cleanup_work.work);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+ list_del(&rmpp_recv->list);
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ destroy_rmpp_recv(rmpp_recv);
+}
+
+static struct mad_rmpp_recv *
+create_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_hdr *mad_hdr;
+
+ rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL);
+ if (!rmpp_recv)
+ return NULL;
+
+ rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd,
+ mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh,
+ agent->agent.port_num);
+ if (IS_ERR(rmpp_recv->ah))
+ goto error;
+
+ rmpp_recv->agent = agent;
+ init_completion(&rmpp_recv->comp);
+ INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler);
+ INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler);
+ spin_lock_init(&rmpp_recv->lock);
+ rmpp_recv->state = RMPP_STATE_ACTIVE;
+ refcount_set(&rmpp_recv->refcount, 1);
+
+ rmpp_recv->rmpp_wc = mad_recv_wc;
+ rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf;
+ rmpp_recv->newwin = 1;
+ rmpp_recv->seg_num = 1;
+ rmpp_recv->last_ack = 0;
+ rmpp_recv->repwin = 1;
+
+ mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+ rmpp_recv->tid = mad_hdr->tid;
+ rmpp_recv->src_qp = mad_recv_wc->wc->src_qp;
+ rmpp_recv->slid = mad_recv_wc->wc->slid;
+ rmpp_recv->mgmt_class = mad_hdr->mgmt_class;
+ rmpp_recv->class_version = mad_hdr->class_version;
+ rmpp_recv->method = mad_hdr->method;
+ rmpp_recv->base_version = mad_hdr->base_version;
+ return rmpp_recv;
+
+error: kfree(rmpp_recv);
+ return NULL;
+}
+
+static struct mad_rmpp_recv *
+find_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->tid == mad_hdr->tid &&
+ rmpp_recv->src_qp == mad_recv_wc->wc->src_qp &&
+ rmpp_recv->slid == mad_recv_wc->wc->slid &&
+ rmpp_recv->mgmt_class == mad_hdr->mgmt_class &&
+ rmpp_recv->class_version == mad_hdr->class_version &&
+ rmpp_recv->method == mad_hdr->method)
+ return rmpp_recv;
+ }
+ return NULL;
+}
+
+static struct mad_rmpp_recv *
+acquire_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+ if (rmpp_recv)
+ refcount_inc(&rmpp_recv->refcount);
+ spin_unlock_irqrestore(&agent->lock, flags);
+ return rmpp_recv;
+}
+
+static struct mad_rmpp_recv *
+insert_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct mad_rmpp_recv *rmpp_recv)
+{
+ struct mad_rmpp_recv *cur_rmpp_recv;
+
+ cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc);
+ if (!cur_rmpp_recv)
+ list_add_tail(&rmpp_recv->list, &agent->rmpp_list);
+
+ return cur_rmpp_recv;
+}
+
+static inline int get_last_flag(struct ib_mad_recv_buf *seg)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+ return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST;
+}
+
+static inline int get_seg_num(struct ib_mad_recv_buf *seg)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+ return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+}
+
+static inline struct ib_mad_recv_buf *get_next_seg(struct list_head *rmpp_list,
+ struct ib_mad_recv_buf *seg)
+{
+ if (seg->list.next == rmpp_list)
+ return NULL;
+
+ return container_of(seg->list.next, struct ib_mad_recv_buf, list);
+}
+
+static inline int window_size(struct ib_mad_agent_private *agent)
+{
+ return max(agent->qp_info->recv_queue.max_active >> 3, 1);
+}
+
+static struct ib_mad_recv_buf *find_seg_location(struct list_head *rmpp_list,
+ int seg_num)
+{
+ struct ib_mad_recv_buf *seg_buf;
+ int cur_seg_num;
+
+ list_for_each_entry_reverse(seg_buf, rmpp_list, list) {
+ cur_seg_num = get_seg_num(seg_buf);
+ if (seg_num > cur_seg_num)
+ return seg_buf;
+ if (seg_num == cur_seg_num)
+ break;
+ }
+ return NULL;
+}
+
+static void update_seg_num(struct mad_rmpp_recv *rmpp_recv,
+ struct ib_mad_recv_buf *new_buf)
+{
+ struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list;
+
+ while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) {
+ rmpp_recv->cur_seg_buf = new_buf;
+ rmpp_recv->seg_num++;
+ new_buf = get_next_seg(rmpp_list, new_buf);
+ }
+}
+
+static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int hdr_size, data_size, pad;
+ bool opa = rdma_cap_opa_mad(rmpp_recv->agent->qp_info->port_priv->device,
+ rmpp_recv->agent->qp_info->port_priv->port_num);
+
+ rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
+
+ hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+ if (opa && rmpp_recv->base_version == OPA_MGMT_BASE_VERSION) {
+ data_size = sizeof(struct opa_rmpp_mad) - hdr_size;
+ pad = OPA_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (pad > OPA_MGMT_RMPP_DATA || pad < 0)
+ pad = 0;
+ } else {
+ data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+ pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (pad > IB_MGMT_RMPP_DATA || pad < 0)
+ pad = 0;
+ }
+
+ return hdr_size + rmpp_recv->seg_num * data_size - pad;
+}
+
+static struct ib_mad_recv_wc *complete_rmpp(struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_mad_recv_wc *rmpp_wc;
+
+ ack_recv(rmpp_recv, rmpp_recv->rmpp_wc);
+ if (rmpp_recv->seg_num > 1)
+ cancel_delayed_work(&rmpp_recv->timeout_work);
+
+ rmpp_wc = rmpp_recv->rmpp_wc;
+ rmpp_wc->mad_len = get_mad_len(rmpp_recv);
+ /* 10 seconds until we can find the packet lifetime */
+ queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq,
+ &rmpp_recv->cleanup_work, msecs_to_jiffies(10000));
+ return rmpp_wc;
+}
+
+static struct ib_mad_recv_wc *
+continue_rmpp(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_recv_buf *prev_buf;
+ struct ib_mad_recv_wc *done_wc;
+ int seg_num;
+ unsigned long flags;
+
+ rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc);
+ if (!rmpp_recv)
+ goto drop1;
+
+ seg_num = get_seg_num(&mad_recv_wc->recv_buf);
+
+ spin_lock_irqsave(&rmpp_recv->lock, flags);
+ if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) ||
+ (seg_num > rmpp_recv->newwin))
+ goto drop3;
+
+ if ((seg_num <= rmpp_recv->last_ack) ||
+ (rmpp_recv->state == RMPP_STATE_COMPLETE)) {
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ goto drop2;
+ }
+
+ prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num);
+ if (!prev_buf)
+ goto drop3;
+
+ done_wc = NULL;
+ list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list);
+ if (rmpp_recv->cur_seg_buf == prev_buf) {
+ update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf);
+ if (get_last_flag(rmpp_recv->cur_seg_buf)) {
+ rmpp_recv->state = RMPP_STATE_COMPLETE;
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ done_wc = complete_rmpp(rmpp_recv);
+ goto out;
+ } else if (rmpp_recv->seg_num == rmpp_recv->newwin) {
+ rmpp_recv->newwin += window_size(agent);
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ goto out;
+ }
+ }
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+out:
+ deref_rmpp_recv(rmpp_recv);
+ return done_wc;
+
+drop3: spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+drop2: deref_rmpp_recv(rmpp_recv);
+drop1: ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static struct ib_mad_recv_wc *
+start_rmpp(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ unsigned long flags;
+
+ rmpp_recv = create_rmpp_recv(agent, mad_recv_wc);
+ if (!rmpp_recv) {
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+ }
+
+ spin_lock_irqsave(&agent->lock, flags);
+ if (insert_rmpp_recv(agent, rmpp_recv)) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* duplicate first MAD */
+ destroy_rmpp_recv(rmpp_recv);
+ return continue_rmpp(agent, mad_recv_wc);
+ }
+ refcount_inc(&rmpp_recv->refcount);
+
+ if (get_last_flag(&mad_recv_wc->recv_buf)) {
+ rmpp_recv->state = RMPP_STATE_COMPLETE;
+ spin_unlock_irqrestore(&agent->lock, flags);
+ complete_rmpp(rmpp_recv);
+ } else {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* 40 seconds until we can find the packet lifetimes */
+ queue_delayed_work(agent->qp_info->port_priv->wq,
+ &rmpp_recv->timeout_work,
+ msecs_to_jiffies(40000));
+ rmpp_recv->newwin += window_size(agent);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ mad_recv_wc = NULL;
+ }
+ deref_rmpp_recv(rmpp_recv);
+ return mad_recv_wc;
+}
+
+static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int timeout;
+ u32 paylen = 0;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num);
+
+ if (mad_send_wr->seg_num == 1) {
+ rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST;
+ paylen = (mad_send_wr->send_buf.seg_count *
+ mad_send_wr->send_buf.seg_rmpp_size) -
+ mad_send_wr->pad;
+ }
+
+ if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) {
+ rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST;
+ paylen = mad_send_wr->send_buf.seg_rmpp_size - mad_send_wr->pad;
+ }
+ rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
+
+ /* 2 seconds for an ACK until we can find the packet lifetime */
+ timeout = mad_send_wr->send_buf.timeout_ms;
+ if (!timeout || timeout > 2000)
+ mad_send_wr->timeout = msecs_to_jiffies(2000);
+
+ return ib_send_mad(mad_send_wr);
+}
+
+static void abort_send(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc wc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+ if (!mad_send_wr)
+ goto out; /* Unmatched send */
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+ (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ goto out; /* Send is already done */
+
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ wc.status = IB_WC_REM_ABORT_ERR;
+ wc.vendor_err = rmpp_status;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+out:
+ spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr,
+ int seg_num)
+{
+ struct list_head *list;
+
+ wr->last_ack = seg_num;
+ list = &wr->last_ack_seg->list;
+ list_for_each_entry(wr->last_ack_seg, list, list)
+ if (wr->last_ack_seg->num == seg_num)
+ break;
+}
+
+static void process_ds_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc, int newwin)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+
+ rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+ if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE)
+ rmpp_recv->repwin = newwin;
+}
+
+static void process_rmpp_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_rmpp_mad *rmpp_mad;
+ unsigned long flags;
+ int seg_num, newwin, ret;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+ if (rmpp_mad->rmpp_hdr.rmpp_status) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ return;
+ }
+
+ seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+ newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (newwin < seg_num) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+ return;
+ }
+
+ spin_lock_irqsave(&agent->lock, flags);
+ mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+ if (!mad_send_wr) {
+ if (!seg_num)
+ process_ds_ack(agent, mad_recv_wc, newwin);
+ goto out; /* Unmatched or DS RMPP ACK */
+ }
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) &&
+ (mad_send_wr->timeout)) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ ack_ds_ack(agent, mad_recv_wc);
+ return; /* Repeated ACK for DS RMPP transaction */
+ }
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+ (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ goto out; /* Send is already done */
+
+ if (seg_num > mad_send_wr->send_buf.seg_count ||
+ seg_num > mad_send_wr->newwin) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+ return;
+ }
+
+ if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack)
+ goto out; /* Old ACK */
+
+ if (seg_num > mad_send_wr->last_ack) {
+ adjust_last_ack(mad_send_wr, seg_num);
+ mad_send_wr->retries_left = mad_send_wr->max_retries;
+ }
+ mad_send_wr->newwin = newwin;
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+ /* If no response is expected, the ACK completes the send */
+ if (!mad_send_wr->send_buf.timeout_ms) {
+ struct ib_mad_send_wc wc;
+
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ wc.status = IB_WC_SUCCESS;
+ wc.vendor_err = 0;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
+ if (mad_send_wr->refcount == 1)
+ ib_reset_mad_timeout(mad_send_wr,
+ mad_send_wr->send_buf.timeout_ms);
+ spin_unlock_irqrestore(&agent->lock, flags);
+ ack_ds_ack(agent, mad_recv_wc);
+ return;
+ } else if (mad_send_wr->refcount == 1 &&
+ mad_send_wr->seg_num < mad_send_wr->newwin &&
+ mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
+ /* Send failure will just result in a timeout/retry */
+ ret = send_next_seg(mad_send_wr);
+ if (ret)
+ goto out;
+
+ mad_send_wr->refcount++;
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->send_list);
+ }
+out:
+ spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static struct ib_mad_recv_wc *
+process_rmpp_data(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_hdr *rmpp_hdr;
+ u8 rmpp_status;
+
+ rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+
+ if (rmpp_hdr->rmpp_status) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS;
+ goto bad;
+ }
+
+ if (rmpp_hdr->seg_num == cpu_to_be32(1)) {
+ if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+ goto bad;
+ }
+ return start_rmpp(agent, mad_recv_wc);
+ } else {
+ if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+ goto bad;
+ }
+ return continue_rmpp(agent, mad_recv_wc);
+ }
+bad:
+ nack_recv(agent, mad_recv_wc, rmpp_status);
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static void process_rmpp_stop(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ } else
+ abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+static void process_rmpp_abort(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN ||
+ rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ } else
+ abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+ if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE))
+ return mad_recv_wc;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+ goto out;
+ }
+
+ switch (rmpp_mad->rmpp_hdr.rmpp_type) {
+ case IB_MGMT_RMPP_TYPE_DATA:
+ return process_rmpp_data(agent, mad_recv_wc);
+ case IB_MGMT_RMPP_TYPE_ACK:
+ process_rmpp_ack(agent, mad_recv_wc);
+ break;
+ case IB_MGMT_RMPP_TYPE_STOP:
+ process_rmpp_stop(agent, mad_recv_wc);
+ break;
+ case IB_MGMT_RMPP_TYPE_ABORT:
+ process_rmpp_abort(agent, mad_recv_wc);
+ break;
+ default:
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+ break;
+ }
+out:
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv;
+ struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad;
+ struct mad_rmpp_recv *rmpp_recv;
+ struct rdma_ah_attr ah_attr;
+ unsigned long flags;
+ int newwin = 1;
+
+ if (!(mad_hdr->method & IB_MGMT_METHOD_RESP))
+ goto out;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->tid != mad_hdr->tid ||
+ rmpp_recv->mgmt_class != mad_hdr->mgmt_class ||
+ rmpp_recv->class_version != mad_hdr->class_version ||
+ (rmpp_recv->method & IB_MGMT_METHOD_RESP))
+ continue;
+
+ if (rdma_query_ah(mad_send_wr->send_buf.ah, &ah_attr))
+ continue;
+
+ if (rmpp_recv->slid == rdma_ah_get_dlid(&ah_attr)) {
+ newwin = rmpp_recv->repwin;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&agent->lock, flags);
+out:
+ return newwin;
+}
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) {
+ mad_send_wr->seg_num = 1;
+ return IB_RMPP_RESULT_INTERNAL;
+ }
+
+ mad_send_wr->newwin = init_newwin(mad_send_wr);
+
+ /* We need to wait for the final ACK even if there isn't a response */
+ mad_send_wr->refcount += (mad_send_wr->timeout == 0);
+ ret = send_next_seg(mad_send_wr);
+ if (!ret)
+ return IB_RMPP_RESULT_CONSUMED;
+ return ret;
+}
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
+ return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */
+
+ if (mad_send_wc->status != IB_WC_SUCCESS ||
+ mad_send_wr->status != IB_WC_SUCCESS)
+ return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */
+
+ if (!mad_send_wr->timeout)
+ return IB_RMPP_RESULT_PROCESSED; /* Response received */
+
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+ mad_send_wr->timeout =
+ msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ return IB_RMPP_RESULT_PROCESSED; /* Send done */
+ }
+
+ if (mad_send_wr->seg_num == mad_send_wr->newwin ||
+ mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count)
+ return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */
+
+ ret = send_next_seg(mad_send_wr);
+ if (ret) {
+ mad_send_wc->status = IB_WC_GENERAL_ERR;
+ return IB_RMPP_RESULT_PROCESSED;
+ }
+ return IB_RMPP_RESULT_CONSUMED;
+}
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count)
+ return IB_RMPP_RESULT_PROCESSED;
+
+ mad_send_wr->seg_num = mad_send_wr->last_ack;
+ mad_send_wr->cur_seg = mad_send_wr->last_ack_seg;
+
+ ret = send_next_seg(mad_send_wr);
+ if (ret)
+ return IB_RMPP_RESULT_PROCESSED;
+
+ return IB_RMPP_RESULT_CONSUMED;
+}
diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h
new file mode 100644
index 000000000..3d336bff1
--- /dev/null
+++ b/drivers/infiniband/core/mad_rmpp.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MAD_RMPP_H__
+#define __MAD_RMPP_H__
+
+enum {
+ IB_RMPP_RESULT_PROCESSED,
+ IB_RMPP_RESULT_CONSUMED,
+ IB_RMPP_RESULT_INTERNAL,
+ IB_RMPP_RESULT_UNHANDLED
+};
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc);
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent);
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr);
+
+#endif /* __MAD_RMPP_H__ */
diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c
new file mode 100644
index 000000000..c0e2df128
--- /dev/null
+++ b/drivers/infiniband/core/mr_pool.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/mr_pool.h>
+
+struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ mr = list_first_entry_or_null(list, struct ib_mr, qp_entry);
+ if (mr) {
+ list_del(&mr->qp_entry);
+ qp->mrs_used++;
+ }
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_mr_pool_get);
+
+void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ list_add(&mr->qp_entry, list);
+ qp->mrs_used--;
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_put);
+
+int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
+ enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+ int ret, i;
+
+ for (i = 0; i < nr; i++) {
+ if (type == IB_MR_TYPE_INTEGRITY)
+ mr = ib_alloc_mr_integrity(qp->pd, max_num_sg,
+ max_num_meta_sg);
+ else
+ mr = ib_alloc_mr(qp->pd, type, max_num_sg);
+ if (IS_ERR(mr)) {
+ ret = PTR_ERR(mr);
+ goto out;
+ }
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ list_add_tail(&mr->qp_entry, list);
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+ }
+
+ return 0;
+out:
+ ib_mr_pool_destroy(qp, list);
+ return ret;
+}
+EXPORT_SYMBOL(ib_mr_pool_init);
+
+void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ while (!list_empty(list)) {
+ mr = list_first_entry(list, struct ib_mr, qp_entry);
+ list_del(&mr->qp_entry);
+
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+ ib_dereg_mr(mr);
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_destroy);
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
new file mode 100644
index 000000000..a236532a9
--- /dev/null
+++ b/drivers/infiniband/core/multicast.c
@@ -0,0 +1,906 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+static int mcast_add_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device, void *client_data);
+
+static struct ib_client mcast_client = {
+ .name = "ib_multicast",
+ .add = mcast_add_one,
+ .remove = mcast_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static struct workqueue_struct *mcast_wq;
+static union ib_gid mgid0;
+
+struct mcast_device;
+
+struct mcast_port {
+ struct mcast_device *dev;
+ spinlock_t lock;
+ struct rb_root table;
+ refcount_t refcount;
+ struct completion comp;
+ u32 port_num;
+};
+
+struct mcast_device {
+ struct ib_device *device;
+ struct ib_event_handler event_handler;
+ int start_port;
+ int end_port;
+ struct mcast_port port[];
+};
+
+enum mcast_state {
+ MCAST_JOINING,
+ MCAST_MEMBER,
+ MCAST_ERROR,
+};
+
+enum mcast_group_state {
+ MCAST_IDLE,
+ MCAST_BUSY,
+ MCAST_GROUP_ERROR,
+ MCAST_PKEY_EVENT
+};
+
+enum {
+ MCAST_INVALID_PKEY_INDEX = 0xFFFF
+};
+
+struct mcast_member;
+
+struct mcast_group {
+ struct ib_sa_mcmember_rec rec;
+ struct rb_node node;
+ struct mcast_port *port;
+ spinlock_t lock;
+ struct work_struct work;
+ struct list_head pending_list;
+ struct list_head active_list;
+ struct mcast_member *last_join;
+ int members[NUM_JOIN_MEMBERSHIP_TYPES];
+ atomic_t refcount;
+ enum mcast_group_state state;
+ struct ib_sa_query *query;
+ u16 pkey_index;
+ u8 leave_state;
+ int retries;
+};
+
+struct mcast_member {
+ struct ib_sa_multicast multicast;
+ struct ib_sa_client *client;
+ struct mcast_group *group;
+ struct list_head list;
+ enum mcast_state state;
+ refcount_t refcount;
+ struct completion comp;
+};
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context);
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context);
+
+static struct mcast_group *mcast_find(struct mcast_port *port,
+ union ib_gid *mgid)
+{
+ struct rb_node *node = port->table.rb_node;
+ struct mcast_group *group;
+ int ret;
+
+ while (node) {
+ group = rb_entry(node, struct mcast_group, node);
+ ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+ if (!ret)
+ return group;
+
+ if (ret < 0)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mcast_port *port,
+ struct mcast_group *group,
+ int allow_duplicates)
+{
+ struct rb_node **link = &port->table.rb_node;
+ struct rb_node *parent = NULL;
+ struct mcast_group *cur_group;
+ int ret;
+
+ while (*link) {
+ parent = *link;
+ cur_group = rb_entry(parent, struct mcast_group, node);
+
+ ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+ sizeof group->rec.mgid);
+ if (ret < 0)
+ link = &(*link)->rb_left;
+ else if (ret > 0)
+ link = &(*link)->rb_right;
+ else if (allow_duplicates)
+ link = &(*link)->rb_left;
+ else
+ return cur_group;
+ }
+ rb_link_node(&group->node, parent, link);
+ rb_insert_color(&group->node, &port->table);
+ return NULL;
+}
+
+static void deref_port(struct mcast_port *port)
+{
+ if (refcount_dec_and_test(&port->refcount))
+ complete(&port->comp);
+}
+
+static void release_group(struct mcast_group *group)
+{
+ struct mcast_port *port = group->port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ if (atomic_dec_and_test(&group->refcount)) {
+ rb_erase(&group->node, &port->table);
+ spin_unlock_irqrestore(&port->lock, flags);
+ kfree(group);
+ deref_port(port);
+ } else
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void deref_member(struct mcast_member *member)
+{
+ if (refcount_dec_and_test(&member->refcount))
+ complete(&member->comp);
+}
+
+static void queue_join(struct mcast_member *member)
+{
+ struct mcast_group *group = member->group;
+ unsigned long flags;
+
+ spin_lock_irqsave(&group->lock, flags);
+ list_add_tail(&member->list, &group->pending_list);
+ if (group->state == MCAST_IDLE) {
+ group->state = MCAST_BUSY;
+ atomic_inc(&group->refcount);
+ queue_work(mcast_wq, &group->work);
+ }
+ spin_unlock_irqrestore(&group->lock, flags);
+}
+
+/*
+ * A multicast group has four types of members: full member, non member,
+ * sendonly non member and sendonly full member.
+ * We need to keep track of the number of members of each
+ * type based on their join state. Adjust the number of members the belong to
+ * the specified join states.
+ */
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+ int i;
+
+ for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1)
+ if (join_state & 0x1)
+ group->members[i] += inc;
+}
+
+/*
+ * If a multicast group has zero members left for a particular join state, but
+ * the group is still a member with the SA, we need to leave that join state.
+ * Determine which join states we still belong to, but that do not have any
+ * active members.
+ */
+static u8 get_leave_state(struct mcast_group *group)
+{
+ u8 leave_state = 0;
+ int i;
+
+ for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++)
+ if (!group->members[i])
+ leave_state |= (0x1 << i);
+
+ return leave_state & group->rec.join_state;
+}
+
+static int check_selector(ib_sa_comp_mask comp_mask,
+ ib_sa_comp_mask selector_mask,
+ ib_sa_comp_mask value_mask,
+ u8 selector, u8 src_value, u8 dst_value)
+{
+ int err;
+
+ if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+ return 0;
+
+ switch (selector) {
+ case IB_SA_GT:
+ err = (src_value <= dst_value);
+ break;
+ case IB_SA_LT:
+ err = (src_value >= dst_value);
+ break;
+ case IB_SA_EQ:
+ err = (src_value != dst_value);
+ break;
+ default:
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static int cmp_rec(struct ib_sa_mcmember_rec *src,
+ struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
+{
+ /* MGID must already match */
+
+ if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
+ memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+ return -EINVAL;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+ IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
+ src->mtu, dst->mtu))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+ src->traffic_class != dst->traffic_class)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+ return -EINVAL;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+ IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
+ src->rate, dst->rate))
+ return -EINVAL;
+ if (check_selector(comp_mask,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+ dst->packet_life_time_selector,
+ src->packet_life_time, dst->packet_life_time))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+ src->flow_label != dst->flow_label)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+ src->hop_limit != dst->hop_limit)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
+ return -EINVAL;
+
+ /* join_state checked separately, proxy_join ignored */
+
+ return 0;
+}
+
+static int send_join(struct mcast_group *group, struct mcast_member *member)
+{
+ struct mcast_port *port = group->port;
+ int ret;
+
+ group->last_join = member;
+ ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+ port->port_num, IB_MGMT_METHOD_SET,
+ &member->multicast.rec,
+ member->multicast.comp_mask,
+ 3000, GFP_KERNEL, join_handler, group,
+ &group->query);
+ return (ret > 0) ? 0 : ret;
+}
+
+static int send_leave(struct mcast_group *group, u8 leave_state)
+{
+ struct mcast_port *port = group->port;
+ struct ib_sa_mcmember_rec rec;
+ int ret;
+
+ rec = group->rec;
+ rec.join_state = leave_state;
+ group->leave_state = leave_state;
+
+ ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+ port->port_num, IB_SA_METHOD_DELETE, &rec,
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_JOIN_STATE,
+ 3000, GFP_KERNEL, leave_handler,
+ group, &group->query);
+ return (ret > 0) ? 0 : ret;
+}
+
+static void join_group(struct mcast_group *group, struct mcast_member *member,
+ u8 join_state)
+{
+ member->state = MCAST_MEMBER;
+ adjust_membership(group, join_state, 1);
+ group->rec.join_state |= join_state;
+ member->multicast.rec = group->rec;
+ member->multicast.rec.join_state = join_state;
+ list_move(&member->list, &group->active_list);
+}
+
+static int fail_join(struct mcast_group *group, struct mcast_member *member,
+ int status)
+{
+ spin_lock_irq(&group->lock);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ return member->multicast.callback(status, &member->multicast);
+}
+
+static void process_group_error(struct mcast_group *group)
+{
+ struct mcast_member *member;
+ int ret = 0;
+ u16 pkey_index;
+
+ if (group->state == MCAST_PKEY_EVENT)
+ ret = ib_find_pkey(group->port->dev->device,
+ group->port->port_num,
+ be16_to_cpu(group->rec.pkey), &pkey_index);
+
+ spin_lock_irq(&group->lock);
+ if (group->state == MCAST_PKEY_EVENT && !ret &&
+ group->pkey_index == pkey_index)
+ goto out;
+
+ while (!list_empty(&group->active_list)) {
+ member = list_entry(group->active_list.next,
+ struct mcast_member, list);
+ refcount_inc(&member->refcount);
+ list_del_init(&member->list);
+ adjust_membership(group, member->multicast.rec.join_state, -1);
+ member->state = MCAST_ERROR;
+ spin_unlock_irq(&group->lock);
+
+ ret = member->multicast.callback(-ENETRESET,
+ &member->multicast);
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ group->rec.join_state = 0;
+out:
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+}
+
+static void mcast_work_handler(struct work_struct *work)
+{
+ struct mcast_group *group;
+ struct mcast_member *member;
+ struct ib_sa_multicast *multicast;
+ int status, ret;
+ u8 join_state;
+
+ group = container_of(work, typeof(*group), work);
+retest:
+ spin_lock_irq(&group->lock);
+ while (!list_empty(&group->pending_list) ||
+ (group->state != MCAST_BUSY)) {
+
+ if (group->state != MCAST_BUSY) {
+ spin_unlock_irq(&group->lock);
+ process_group_error(group);
+ goto retest;
+ }
+
+ member = list_entry(group->pending_list.next,
+ struct mcast_member, list);
+ multicast = &member->multicast;
+ join_state = multicast->rec.join_state;
+ refcount_inc(&member->refcount);
+
+ if (join_state == (group->rec.join_state & join_state)) {
+ status = cmp_rec(&group->rec, &multicast->rec,
+ multicast->comp_mask);
+ if (!status)
+ join_group(group, member, join_state);
+ else
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = multicast->callback(status, multicast);
+ } else {
+ spin_unlock_irq(&group->lock);
+ status = send_join(group, member);
+ if (!status) {
+ deref_member(member);
+ return;
+ }
+ ret = fail_join(group, member, status);
+ }
+
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ join_state = get_leave_state(group);
+ if (join_state) {
+ group->rec.join_state &= ~join_state;
+ spin_unlock_irq(&group->lock);
+ if (send_leave(group, join_state))
+ goto retest;
+ } else {
+ group->state = MCAST_IDLE;
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+}
+
+/*
+ * Fail a join request if it is still active - at the head of the pending queue.
+ */
+static void process_join_error(struct mcast_group *group, int status)
+{
+ struct mcast_member *member;
+ int ret;
+
+ spin_lock_irq(&group->lock);
+ member = list_entry(group->pending_list.next,
+ struct mcast_member, list);
+ if (group->last_join == member) {
+ refcount_inc(&member->refcount);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = member->multicast.callback(status, &member->multicast);
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ } else
+ spin_unlock_irq(&group->lock);
+}
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context)
+{
+ struct mcast_group *group = context;
+ u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
+
+ if (status)
+ process_join_error(group, status);
+ else {
+ int mgids_changed, is_mgid0;
+
+ if (ib_find_pkey(group->port->dev->device,
+ group->port->port_num, be16_to_cpu(rec->pkey),
+ &pkey_index))
+ pkey_index = MCAST_INVALID_PKEY_INDEX;
+
+ spin_lock_irq(&group->port->lock);
+ if (group->state == MCAST_BUSY &&
+ group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+ group->pkey_index = pkey_index;
+ mgids_changed = memcmp(&rec->mgid, &group->rec.mgid,
+ sizeof(group->rec.mgid));
+ group->rec = *rec;
+ if (mgids_changed) {
+ rb_erase(&group->node, &group->port->table);
+ is_mgid0 = !memcmp(&mgid0, &group->rec.mgid,
+ sizeof(mgid0));
+ mcast_insert(group->port, group, is_mgid0);
+ }
+ spin_unlock_irq(&group->port->lock);
+ }
+ mcast_work_handler(&group->work);
+}
+
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context)
+{
+ struct mcast_group *group = context;
+
+ if (status && group->retries > 0 &&
+ !send_leave(group, group->leave_state))
+ group->retries--;
+ else
+ mcast_work_handler(&group->work);
+}
+
+static struct mcast_group *acquire_group(struct mcast_port *port,
+ union ib_gid *mgid, gfp_t gfp_mask)
+{
+ struct mcast_group *group, *cur_group;
+ unsigned long flags;
+ int is_mgid0;
+
+ is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+ if (!is_mgid0) {
+ spin_lock_irqsave(&port->lock, flags);
+ group = mcast_find(port, mgid);
+ if (group)
+ goto found;
+ spin_unlock_irqrestore(&port->lock, flags);
+ }
+
+ group = kzalloc(sizeof *group, gfp_mask);
+ if (!group)
+ return NULL;
+
+ group->retries = 3;
+ group->port = port;
+ group->rec.mgid = *mgid;
+ group->pkey_index = MCAST_INVALID_PKEY_INDEX;
+ INIT_LIST_HEAD(&group->pending_list);
+ INIT_LIST_HEAD(&group->active_list);
+ INIT_WORK(&group->work, mcast_work_handler);
+ spin_lock_init(&group->lock);
+
+ spin_lock_irqsave(&port->lock, flags);
+ cur_group = mcast_insert(port, group, is_mgid0);
+ if (cur_group) {
+ kfree(group);
+ group = cur_group;
+ } else
+ refcount_inc(&port->refcount);
+found:
+ atomic_inc(&group->refcount);
+ spin_unlock_irqrestore(&port->lock, flags);
+ return group;
+}
+
+/*
+ * We serialize all join requests to a single group to make our lives much
+ * easier. Otherwise, two users could try to join the same group
+ * simultaneously, with different configurations, one could leave while the
+ * join is in progress, etc., which makes locking around error recovery
+ * difficult.
+ */
+struct ib_sa_multicast *
+ib_sa_join_multicast(struct ib_sa_client *client,
+ struct ib_device *device, u32 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+ int (*callback)(int status,
+ struct ib_sa_multicast *multicast),
+ void *context)
+{
+ struct mcast_device *dev;
+ struct mcast_member *member;
+ struct ib_sa_multicast *multicast;
+ int ret;
+
+ dev = ib_get_client_data(device, &mcast_client);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ member = kmalloc(sizeof *member, gfp_mask);
+ if (!member)
+ return ERR_PTR(-ENOMEM);
+
+ ib_sa_client_get(client);
+ member->client = client;
+ member->multicast.rec = *rec;
+ member->multicast.comp_mask = comp_mask;
+ member->multicast.callback = callback;
+ member->multicast.context = context;
+ init_completion(&member->comp);
+ refcount_set(&member->refcount, 1);
+ member->state = MCAST_JOINING;
+
+ member->group = acquire_group(&dev->port[port_num - dev->start_port],
+ &rec->mgid, gfp_mask);
+ if (!member->group) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ /*
+ * The user will get the multicast structure in their callback. They
+ * could then free the multicast structure before we can return from
+ * this routine. So we save the pointer to return before queuing
+ * any callback.
+ */
+ multicast = &member->multicast;
+ queue_join(member);
+ return multicast;
+
+err:
+ ib_sa_client_put(client);
+ kfree(member);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_sa_join_multicast);
+
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
+{
+ struct mcast_member *member;
+ struct mcast_group *group;
+
+ member = container_of(multicast, struct mcast_member, multicast);
+ group = member->group;
+
+ spin_lock_irq(&group->lock);
+ if (member->state == MCAST_MEMBER)
+ adjust_membership(group, multicast->rec.join_state, -1);
+
+ list_del_init(&member->list);
+
+ if (group->state == MCAST_IDLE) {
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+ /* Continue to hold reference on group until callback */
+ queue_work(mcast_wq, &group->work);
+ } else {
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+
+ deref_member(member);
+ wait_for_completion(&member->comp);
+ ib_sa_client_put(member->client);
+ kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_free_multicast);
+
+int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num,
+ union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
+{
+ struct mcast_device *dev;
+ struct mcast_port *port;
+ struct mcast_group *group;
+ unsigned long flags;
+ int ret = 0;
+
+ dev = ib_get_client_data(device, &mcast_client);
+ if (!dev)
+ return -ENODEV;
+
+ port = &dev->port[port_num - dev->start_port];
+ spin_lock_irqsave(&port->lock, flags);
+ group = mcast_find(port, mgid);
+ if (group)
+ *rec = group->rec;
+ else
+ ret = -EADDRNOTAVAIL;
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
+
+/**
+ * ib_init_ah_from_mcmember - Initialize AH attribute from multicast
+ * member record and gid of the device.
+ * @device: RDMA device
+ * @port_num: Port of the rdma device to consider
+ * @rec: Multicast member record to use
+ * @ndev: Optional netdevice, applicable only for RoCE
+ * @gid_type: GID type to consider
+ * @ah_attr: AH attribute to fillup on successful completion
+ *
+ * ib_init_ah_from_mcmember() initializes AH attribute based on multicast
+ * member record and other device properties. On success the caller is
+ * responsible to call rdma_destroy_ah_attr on the ah_attr. Returns 0 on
+ * success or appropriate error code.
+ *
+ */
+int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ struct net_device *ndev,
+ enum ib_gid_type gid_type,
+ struct rdma_ah_attr *ah_attr)
+{
+ const struct ib_gid_attr *sgid_attr;
+
+ /* GID table is not based on the netdevice for IB link layer,
+ * so ignore ndev during search.
+ */
+ if (rdma_protocol_ib(device, port_num))
+ ndev = NULL;
+ else if (!rdma_protocol_roce(device, port_num))
+ return -EINVAL;
+
+ sgid_attr = rdma_find_gid_by_port(device, &rec->port_gid,
+ gid_type, port_num, ndev);
+ if (IS_ERR(sgid_attr))
+ return PTR_ERR(sgid_attr);
+
+ memset(ah_attr, 0, sizeof(*ah_attr));
+ ah_attr->type = rdma_ah_find_type(device, port_num);
+
+ rdma_ah_set_dlid(ah_attr, be16_to_cpu(rec->mlid));
+ rdma_ah_set_sl(ah_attr, rec->sl);
+ rdma_ah_set_port_num(ah_attr, port_num);
+ rdma_ah_set_static_rate(ah_attr, rec->rate);
+ rdma_move_grh_sgid_attr(ah_attr, &rec->mgid,
+ be32_to_cpu(rec->flow_label),
+ rec->hop_limit, rec->traffic_class,
+ sgid_attr);
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_mcmember);
+
+static void mcast_groups_event(struct mcast_port *port,
+ enum mcast_group_state state)
+{
+ struct mcast_group *group;
+ struct rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ for (node = rb_first(&port->table); node; node = rb_next(node)) {
+ group = rb_entry(node, struct mcast_group, node);
+ spin_lock(&group->lock);
+ if (group->state == MCAST_IDLE) {
+ atomic_inc(&group->refcount);
+ queue_work(mcast_wq, &group->work);
+ }
+ if (group->state != MCAST_GROUP_ERROR)
+ group->state = state;
+ spin_unlock(&group->lock);
+ }
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void mcast_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct mcast_device *dev;
+ int index;
+
+ dev = container_of(handler, struct mcast_device, event_handler);
+ if (!rdma_cap_ib_mcast(dev->device, event->element.port_num))
+ return;
+
+ index = event->element.port_num - dev->start_port;
+
+ switch (event->event) {
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_LID_CHANGE:
+ case IB_EVENT_CLIENT_REREGISTER:
+ mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
+ break;
+ case IB_EVENT_PKEY_CHANGE:
+ mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT);
+ break;
+ default:
+ break;
+ }
+}
+
+static int mcast_add_one(struct ib_device *device)
+{
+ struct mcast_device *dev;
+ struct mcast_port *port;
+ int i;
+ int count = 0;
+
+ dev = kmalloc(struct_size(dev, port, device->phys_port_cnt),
+ GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->start_port = rdma_start_port(device);
+ dev->end_port = rdma_end_port(device);
+
+ for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+ if (!rdma_cap_ib_mcast(device, dev->start_port + i))
+ continue;
+ port = &dev->port[i];
+ port->dev = dev;
+ port->port_num = dev->start_port + i;
+ spin_lock_init(&port->lock);
+ port->table = RB_ROOT;
+ init_completion(&port->comp);
+ refcount_set(&port->refcount, 1);
+ ++count;
+ }
+
+ if (!count) {
+ kfree(dev);
+ return -EOPNOTSUPP;
+ }
+
+ dev->device = device;
+ ib_set_client_data(device, &mcast_client, dev);
+
+ INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
+ ib_register_event_handler(&dev->event_handler);
+ return 0;
+}
+
+static void mcast_remove_one(struct ib_device *device, void *client_data)
+{
+ struct mcast_device *dev = client_data;
+ struct mcast_port *port;
+ int i;
+
+ ib_unregister_event_handler(&dev->event_handler);
+ flush_workqueue(mcast_wq);
+
+ for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+ if (rdma_cap_ib_mcast(device, dev->start_port + i)) {
+ port = &dev->port[i];
+ deref_port(port);
+ wait_for_completion(&port->comp);
+ }
+ }
+
+ kfree(dev);
+}
+
+int mcast_init(void)
+{
+ int ret;
+
+ mcast_wq = alloc_ordered_workqueue("ib_mcast", WQ_MEM_RECLAIM);
+ if (!mcast_wq)
+ return -ENOMEM;
+
+ ib_sa_register_client(&sa_client);
+
+ ret = ib_register_client(&mcast_client);
+ if (ret)
+ goto err;
+ return 0;
+
+err:
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(mcast_wq);
+ return ret;
+}
+
+void mcast_cleanup(void)
+{
+ ib_unregister_client(&mcast_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(mcast_wq);
+}
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
new file mode 100644
index 000000000..1b2cc9e45
--- /dev/null
+++ b/drivers/infiniband/core/netlink.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies Inc. All rights reserved.
+ * Copyright (c) 2010 Voltaire Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/export.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+#include <rdma/rdma_netlink.h>
+#include <linux/module.h>
+#include "core_priv.h"
+
+static struct {
+ const struct rdma_nl_cbs *cb_table;
+ /* Synchronizes between ongoing netlink commands and netlink client
+ * unregistration.
+ */
+ struct rw_semaphore sem;
+} rdma_nl_types[RDMA_NL_NUM_CLIENTS];
+
+bool rdma_nl_chk_listeners(unsigned int group)
+{
+ struct rdma_dev_net *rnet = rdma_net_to_dev_net(&init_net);
+
+ return netlink_has_listeners(rnet->nl_sock, group);
+}
+EXPORT_SYMBOL(rdma_nl_chk_listeners);
+
+static bool is_nl_msg_valid(unsigned int type, unsigned int op)
+{
+ static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = {
+ [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS,
+ [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS,
+ [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS,
+ };
+
+ /*
+ * This BUILD_BUG_ON is intended to catch addition of new
+ * RDMA netlink protocol without updating the array above.
+ */
+ BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6);
+
+ if (type >= RDMA_NL_NUM_CLIENTS)
+ return false;
+
+ return (op < max_num_ops[type]) ? true : false;
+}
+
+static const struct rdma_nl_cbs *
+get_cb_table(const struct sk_buff *skb, unsigned int type, unsigned int op)
+{
+ const struct rdma_nl_cbs *cb_table;
+
+ /*
+ * Currently only NLDEV client is supporting netlink commands in
+ * non init_net net namespace.
+ */
+ if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV)
+ return NULL;
+
+ cb_table = READ_ONCE(rdma_nl_types[type].cb_table);
+ if (!cb_table) {
+ /*
+ * Didn't get valid reference of the table, attempt module
+ * load once.
+ */
+ up_read(&rdma_nl_types[type].sem);
+
+ request_module("rdma-netlink-subsys-%u", type);
+
+ down_read(&rdma_nl_types[type].sem);
+ cb_table = READ_ONCE(rdma_nl_types[type].cb_table);
+ }
+ if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit))
+ return NULL;
+ return cb_table;
+}
+
+void rdma_nl_register(unsigned int index,
+ const struct rdma_nl_cbs cb_table[])
+{
+ if (WARN_ON(!is_nl_msg_valid(index, 0)) ||
+ WARN_ON(READ_ONCE(rdma_nl_types[index].cb_table)))
+ return;
+
+ /* Pairs with the READ_ONCE in is_nl_valid() */
+ smp_store_release(&rdma_nl_types[index].cb_table, cb_table);
+}
+EXPORT_SYMBOL(rdma_nl_register);
+
+void rdma_nl_unregister(unsigned int index)
+{
+ down_write(&rdma_nl_types[index].sem);
+ rdma_nl_types[index].cb_table = NULL;
+ up_write(&rdma_nl_types[index].sem);
+}
+EXPORT_SYMBOL(rdma_nl_unregister);
+
+void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
+ int len, int client, int op, int flags)
+{
+ *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op), len, flags);
+ if (!*nlh)
+ return NULL;
+ return nlmsg_data(*nlh);
+}
+EXPORT_SYMBOL(ibnl_put_msg);
+
+int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
+ int len, void *data, int type)
+{
+ if (nla_put(skb, type, len, data)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ibnl_put_attr);
+
+static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ int type = nlh->nlmsg_type;
+ unsigned int index = RDMA_NL_GET_CLIENT(type);
+ unsigned int op = RDMA_NL_GET_OP(type);
+ const struct rdma_nl_cbs *cb_table;
+ int err = -EINVAL;
+
+ if (!is_nl_msg_valid(index, op))
+ return -EINVAL;
+
+ down_read(&rdma_nl_types[index].sem);
+ cb_table = get_cb_table(skb, index, op);
+ if (!cb_table)
+ goto done;
+
+ if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) &&
+ !netlink_capable(skb, CAP_NET_ADMIN)) {
+ err = -EPERM;
+ goto done;
+ }
+
+ /*
+ * LS responses overload the 0x100 (NLM_F_ROOT) flag. Don't
+ * mistakenly call the .dump() function.
+ */
+ if (index == RDMA_NL_LS) {
+ if (cb_table[op].doit)
+ err = cb_table[op].doit(skb, nlh, extack);
+ goto done;
+ }
+ /* FIXME: Convert IWCM to properly handle doit callbacks */
+ if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) {
+ struct netlink_dump_control c = {
+ .dump = cb_table[op].dump,
+ };
+ if (c.dump)
+ err = netlink_dump_start(skb->sk, skb, nlh, &c);
+ goto done;
+ }
+
+ if (cb_table[op].doit)
+ err = cb_table[op].doit(skb, nlh, extack);
+done:
+ up_read(&rdma_nl_types[index].sem);
+ return err;
+}
+
+/*
+ * This function is similar to netlink_rcv_skb with one exception:
+ * It calls to the callback for the netlink messages without NLM_F_REQUEST
+ * flag. These messages are intended for RDMA_NL_LS consumer, so it is allowed
+ * for that consumer only.
+ */
+static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
+ struct nlmsghdr *,
+ struct netlink_ext_ack *))
+{
+ struct netlink_ext_ack extack = {};
+ struct nlmsghdr *nlh;
+ int err;
+
+ while (skb->len >= nlmsg_total_size(0)) {
+ int msglen;
+
+ nlh = nlmsg_hdr(skb);
+ err = 0;
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+ return 0;
+
+ /*
+ * Generally speaking, the only requests are handled
+ * by the kernel, but RDMA_NL_LS is different, because it
+ * runs backward netlink scheme. Kernel initiates messages
+ * and waits for reply with data to keep pathrecord cache
+ * in sync.
+ */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST) &&
+ (RDMA_NL_GET_CLIENT(nlh->nlmsg_type) != RDMA_NL_LS))
+ goto ack;
+
+ /* Skip control messages */
+ if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
+ goto ack;
+
+ err = cb(skb, nlh, &extack);
+ if (err == -EINTR)
+ goto skip;
+
+ack:
+ if (nlh->nlmsg_flags & NLM_F_ACK || err)
+ netlink_ack(skb, nlh, err, &extack);
+
+skip:
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+ skb_pull(skb, msglen);
+ }
+
+ return 0;
+}
+
+static void rdma_nl_rcv(struct sk_buff *skb)
+{
+ rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg);
+}
+
+int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid)
+{
+ struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
+ int err;
+
+ err = netlink_unicast(rnet->nl_sock, skb, pid, MSG_DONTWAIT);
+ return (err < 0) ? err : 0;
+}
+EXPORT_SYMBOL(rdma_nl_unicast);
+
+int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid)
+{
+ struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
+ int err;
+
+ err = netlink_unicast(rnet->nl_sock, skb, pid, 0);
+ return (err < 0) ? err : 0;
+}
+EXPORT_SYMBOL(rdma_nl_unicast_wait);
+
+int rdma_nl_multicast(struct net *net, struct sk_buff *skb,
+ unsigned int group, gfp_t flags)
+{
+ struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
+
+ return nlmsg_multicast(rnet->nl_sock, skb, 0, group, flags);
+}
+EXPORT_SYMBOL(rdma_nl_multicast);
+
+void rdma_nl_init(void)
+{
+ int idx;
+
+ for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++)
+ init_rwsem(&rdma_nl_types[idx].sem);
+}
+
+void rdma_nl_exit(void)
+{
+ int idx;
+
+ for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++)
+ WARN(rdma_nl_types[idx].cb_table,
+ "Netlink client %d wasn't released prior to unloading %s\n",
+ idx, KBUILD_MODNAME);
+}
+
+int rdma_nl_net_init(struct rdma_dev_net *rnet)
+{
+ struct net *net = read_pnet(&rnet->net);
+ struct netlink_kernel_cfg cfg = {
+ .input = rdma_nl_rcv,
+ };
+ struct sock *nls;
+
+ nls = netlink_kernel_create(net, NETLINK_RDMA, &cfg);
+ if (!nls)
+ return -ENOMEM;
+
+ nls->sk_sndtimeo = 10 * HZ;
+ rnet->nl_sock = nls;
+ return 0;
+}
+
+void rdma_nl_net_exit(struct rdma_dev_net *rnet)
+{
+ netlink_kernel_release(rnet->nl_sock);
+}
+
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_RDMA);
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
new file mode 100644
index 000000000..1adf20198
--- /dev/null
+++ b/drivers/infiniband/core/nldev.c
@@ -0,0 +1,2548 @@
+/*
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
+#include <linux/mutex.h>
+#include <net/netlink.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_netlink.h>
+
+#include "core_priv.h"
+#include "cma_priv.h"
+#include "restrack.h"
+#include "uverbs.h"
+
+typedef int (*res_fill_func_t)(struct sk_buff*, bool,
+ struct rdma_restrack_entry*, uint32_t);
+
+/*
+ * Sort array elements by the netlink attribute name
+ */
+static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
+ [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+ [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE },
+ [RDMA_NLDEV_ATTR_DEV_DIM] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = IB_DEVICE_NAME_MAX },
+ [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+ [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+ [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 },
+ [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 },
+ [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+ [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ },
+ [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ },
+ [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_CTX] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_CTX_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_DST_ADDR] = {
+ .len = sizeof(struct __kernel_sockaddr_storage) },
+ [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+ [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_RAW] = { .type = NLA_BINARY },
+ [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = {
+ .len = sizeof(struct __kernel_sockaddr_storage) },
+ [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING,
+ .len = RDMA_NLDEV_ATTR_EMPTY_STRING },
+ [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_RES_SRQ] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_RES_SRQN] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_RES_SRQ_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_MIN_RANGE] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_MAX_RANGE] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_STAT_COUNTER] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 },
+ [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 },
+ [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 },
+};
+
+static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
+ enum rdma_nldev_print_type print_type)
+{
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, name))
+ return -EMSGSIZE;
+ if (print_type != RDMA_NLDEV_PRINT_TYPE_UNSPEC &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE, print_type))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int _rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name,
+ enum rdma_nldev_print_type print_type,
+ u32 value)
+{
+ if (put_driver_name_print_type(msg, name, print_type))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DRIVER_U32, value))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int _rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name,
+ enum rdma_nldev_print_type print_type,
+ u64 value)
+{
+ if (put_driver_name_print_type(msg, name, print_type))
+ return -EMSGSIZE;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_DRIVER_U64, value,
+ RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+int rdma_nl_put_driver_string(struct sk_buff *msg, const char *name,
+ const char *str)
+{
+ if (put_driver_name_print_type(msg, name,
+ RDMA_NLDEV_PRINT_TYPE_UNSPEC))
+ return -EMSGSIZE;
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, str))
+ return -EMSGSIZE;
+
+ return 0;
+}
+EXPORT_SYMBOL(rdma_nl_put_driver_string);
+
+int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value)
+{
+ return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC,
+ value);
+}
+EXPORT_SYMBOL(rdma_nl_put_driver_u32);
+
+int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name,
+ u32 value)
+{
+ return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX,
+ value);
+}
+EXPORT_SYMBOL(rdma_nl_put_driver_u32_hex);
+
+int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value)
+{
+ return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC,
+ value);
+}
+EXPORT_SYMBOL(rdma_nl_put_driver_u64);
+
+int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value)
+{
+ return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX,
+ value);
+}
+EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex);
+
+static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device)
+{
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index))
+ return -EMSGSIZE;
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME,
+ dev_name(&device->dev)))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
+{
+ char fw[IB_FW_VERSION_NAME_MAX];
+ int ret = 0;
+ u32 port;
+
+ if (fill_nldev_handle(msg, device))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device)))
+ return -EMSGSIZE;
+
+ BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64));
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS,
+ device->attrs.device_cap_flags,
+ RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+
+ ib_get_device_fw_str(device, fw);
+ /* Device without FW has strlen(fw) = 0 */
+ if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw))
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID,
+ be64_to_cpu(device->node_guid),
+ RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID,
+ be64_to_cpu(device->attrs.sys_image_guid),
+ RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim))
+ return -EMSGSIZE;
+
+ /*
+ * Link type is determined on first port and mlx4 device
+ * which can potentially have two different link type for the same
+ * IB device is considered as better to be avoided in the future,
+ */
+ port = rdma_start_port(device);
+ if (rdma_cap_opa_mad(device, port))
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa");
+ else if (rdma_protocol_ib(device, port))
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib");
+ else if (rdma_protocol_iwarp(device, port))
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw");
+ else if (rdma_protocol_roce(device, port))
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce");
+ else if (rdma_protocol_usnic(device, port))
+ ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL,
+ "usnic");
+ return ret;
+}
+
+static int fill_port_info(struct sk_buff *msg,
+ struct ib_device *device, u32 port,
+ const struct net *net)
+{
+ struct net_device *netdev = NULL;
+ struct ib_port_attr attr;
+ int ret;
+ u64 cap_flags = 0;
+
+ if (fill_nldev_handle(msg, device))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port))
+ return -EMSGSIZE;
+
+ ret = ib_query_port(device, port, &attr);
+ if (ret)
+ return ret;
+
+ if (rdma_protocol_ib(device, port)) {
+ BUILD_BUG_ON((sizeof(attr.port_cap_flags) +
+ sizeof(attr.port_cap_flags2)) > sizeof(u64));
+ cap_flags = attr.port_cap_flags |
+ ((u64)attr.port_cap_flags2 << 32);
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS,
+ cap_flags, RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX,
+ attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc))
+ return -EMSGSIZE;
+ }
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state))
+ return -EMSGSIZE;
+
+ netdev = ib_device_get_netdev(device, port);
+ if (netdev && net_eq(dev_net(netdev), net)) {
+ ret = nla_put_u32(msg,
+ RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex);
+ if (ret)
+ goto out;
+ ret = nla_put_string(msg,
+ RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name);
+ }
+
+out:
+ if (netdev)
+ dev_put(netdev);
+ return ret;
+}
+
+static int fill_res_info_entry(struct sk_buff *msg,
+ const char *name, u64 curr)
+{
+ struct nlattr *entry_attr;
+
+ entry_attr = nla_nest_start_noflag(msg,
+ RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name))
+ goto err;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr,
+ RDMA_NLDEV_ATTR_PAD))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+ return -EMSGSIZE;
+}
+
+static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
+{
+ static const char * const names[RDMA_RESTRACK_MAX] = {
+ [RDMA_RESTRACK_PD] = "pd",
+ [RDMA_RESTRACK_CQ] = "cq",
+ [RDMA_RESTRACK_QP] = "qp",
+ [RDMA_RESTRACK_CM_ID] = "cm_id",
+ [RDMA_RESTRACK_MR] = "mr",
+ [RDMA_RESTRACK_CTX] = "ctx",
+ [RDMA_RESTRACK_SRQ] = "srq",
+ };
+
+ struct nlattr *table_attr;
+ int ret, i, curr;
+
+ if (fill_nldev_handle(msg, device))
+ return -EMSGSIZE;
+
+ table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY);
+ if (!table_attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i < RDMA_RESTRACK_MAX; i++) {
+ if (!names[i])
+ continue;
+ curr = rdma_restrack_count(device, i);
+ ret = fill_res_info_entry(msg, names[i], curr);
+ if (ret)
+ goto err;
+ }
+
+ nla_nest_end(msg, table_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, table_attr);
+ return ret;
+}
+
+static int fill_res_name_pid(struct sk_buff *msg,
+ struct rdma_restrack_entry *res)
+{
+ int err = 0;
+
+ /*
+ * For user resources, user is should read /proc/PID/comm to get the
+ * name of the task file.
+ */
+ if (rdma_is_kernel_res(res)) {
+ err = nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME,
+ res->kern_name);
+ } else {
+ pid_t pid;
+
+ pid = task_pid_vnr(res->task);
+ /*
+ * Task is dead and in zombie state.
+ * There is no need to print PID anymore.
+ */
+ if (pid)
+ /*
+ * This part is racy, task can be killed and PID will
+ * be zero right here but it is ok, next query won't
+ * return PID. We don't promise real-time reflection
+ * of SW objects.
+ */
+ err = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, pid);
+ }
+
+ return err ? -EMSGSIZE : 0;
+}
+
+static int fill_res_qp_entry_query(struct sk_buff *msg,
+ struct rdma_restrack_entry *res,
+ struct ib_device *dev,
+ struct ib_qp *qp)
+{
+ struct ib_qp_init_attr qp_init_attr;
+ struct ib_qp_attr qp_attr;
+ int ret;
+
+ ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr);
+ if (ret)
+ return ret;
+
+ if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN,
+ qp_attr.dest_qp_num))
+ goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN,
+ qp_attr.rq_psn))
+ goto err;
+ }
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn))
+ goto err;
+
+ if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC ||
+ qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT) {
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE,
+ qp_attr.path_mig_state))
+ goto err;
+ }
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type))
+ goto err;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state))
+ goto err;
+
+ if (dev->ops.fill_res_qp_entry)
+ return dev->ops.fill_res_qp_entry(msg, qp);
+ return 0;
+
+err: return -EMSGSIZE;
+}
+
+static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_qp *qp = container_of(res, struct ib_qp, res);
+ struct ib_device *dev = qp->device;
+ int ret;
+
+ if (port && port != qp->port)
+ return -EAGAIN;
+
+ /* In create_qp() port is not set yet */
+ if (qp->port && nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp->port))
+ return -EMSGSIZE;
+
+ ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num);
+ if (ret)
+ return -EMSGSIZE;
+
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id))
+ return -EMSGSIZE;
+
+ ret = fill_res_name_pid(msg, res);
+ if (ret)
+ return -EMSGSIZE;
+
+ return fill_res_qp_entry_query(msg, res, dev, qp);
+}
+
+static int fill_res_qp_raw_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_qp *qp = container_of(res, struct ib_qp, res);
+ struct ib_device *dev = qp->device;
+
+ if (port && port != qp->port)
+ return -EAGAIN;
+ if (!dev->ops.fill_res_qp_entry_raw)
+ return -EINVAL;
+ return dev->ops.fill_res_qp_entry_raw(msg, qp);
+}
+
+static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct rdma_id_private *id_priv =
+ container_of(res, struct rdma_id_private, res);
+ struct ib_device *dev = id_priv->id.device;
+ struct rdma_cm_id *cm_id = &id_priv->id;
+
+ if (port && port != cm_id->port_num)
+ return -EAGAIN;
+
+ if (cm_id->port_num &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, cm_id->port_num))
+ goto err;
+
+ if (id_priv->qp_num) {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, id_priv->qp_num))
+ goto err;
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, cm_id->qp_type))
+ goto err;
+ }
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PS, cm_id->ps))
+ goto err;
+
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, id_priv->state))
+ goto err;
+
+ if (cm_id->route.addr.src_addr.ss_family &&
+ nla_put(msg, RDMA_NLDEV_ATTR_RES_SRC_ADDR,
+ sizeof(cm_id->route.addr.src_addr),
+ &cm_id->route.addr.src_addr))
+ goto err;
+ if (cm_id->route.addr.dst_addr.ss_family &&
+ nla_put(msg, RDMA_NLDEV_ATTR_RES_DST_ADDR,
+ sizeof(cm_id->route.addr.dst_addr),
+ &cm_id->route.addr.dst_addr))
+ goto err;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id))
+ goto err;
+
+ if (fill_res_name_pid(msg, res))
+ goto err;
+
+ if (dev->ops.fill_res_cm_id_entry)
+ return dev->ops.fill_res_cm_id_entry(msg, cm_id);
+ return 0;
+
+err: return -EMSGSIZE;
+}
+
+static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_cq *cq = container_of(res, struct ib_cq, res);
+ struct ib_device *dev = cq->device;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe))
+ return -EMSGSIZE;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT,
+ atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+
+ /* Poll context is only valid for kernel CQs */
+ if (rdma_is_kernel_res(res) &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx))
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL)))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id))
+ return -EMSGSIZE;
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+ cq->uobject->uevent.uobject.context->res.id))
+ return -EMSGSIZE;
+
+ if (fill_res_name_pid(msg, res))
+ return -EMSGSIZE;
+
+ return (dev->ops.fill_res_cq_entry) ?
+ dev->ops.fill_res_cq_entry(msg, cq) : 0;
+}
+
+static int fill_res_cq_raw_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_cq *cq = container_of(res, struct ib_cq, res);
+ struct ib_device *dev = cq->device;
+
+ if (!dev->ops.fill_res_cq_entry_raw)
+ return -EINVAL;
+ return dev->ops.fill_res_cq_entry_raw(msg, cq);
+}
+
+static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_mr *mr = container_of(res, struct ib_mr, res);
+ struct ib_device *dev = mr->pd->device;
+
+ if (has_cap_net_admin) {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RKEY, mr->rkey))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey))
+ return -EMSGSIZE;
+ }
+
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length,
+ RDMA_NLDEV_ATTR_PAD))
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
+ return -EMSGSIZE;
+
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id))
+ return -EMSGSIZE;
+
+ if (fill_res_name_pid(msg, res))
+ return -EMSGSIZE;
+
+ return (dev->ops.fill_res_mr_entry) ?
+ dev->ops.fill_res_mr_entry(msg, mr) :
+ 0;
+}
+
+static int fill_res_mr_raw_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_mr *mr = container_of(res, struct ib_mr, res);
+ struct ib_device *dev = mr->pd->device;
+
+ if (!dev->ops.fill_res_mr_entry_raw)
+ return -EINVAL;
+ return dev->ops.fill_res_mr_entry_raw(msg, mr);
+}
+
+static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_pd *pd = container_of(res, struct ib_pd, res);
+
+ if (has_cap_net_admin) {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY,
+ pd->local_dma_lkey))
+ goto err;
+ if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY,
+ pd->unsafe_global_rkey))
+ goto err;
+ }
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT,
+ atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD))
+ goto err;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id))
+ goto err;
+
+ if (!rdma_is_kernel_res(res) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN,
+ pd->uobject->context->res.id))
+ goto err;
+
+ return fill_res_name_pid(msg, res);
+
+err: return -EMSGSIZE;
+}
+
+static int fill_res_ctx_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_ucontext *ctx = container_of(res, struct ib_ucontext, res);
+
+ if (rdma_is_kernel_res(res))
+ return 0;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, ctx->res.id))
+ return -EMSGSIZE;
+
+ return fill_res_name_pid(msg, res);
+}
+
+static int fill_res_range_qp_entry(struct sk_buff *msg, uint32_t min_range,
+ uint32_t max_range)
+{
+ struct nlattr *entry_attr;
+
+ if (!min_range)
+ return 0;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (min_range == max_range) {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, min_range))
+ goto err;
+ } else {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MIN_RANGE, min_range))
+ goto err;
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_MAX_RANGE, max_range))
+ goto err;
+ }
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+ return -EMSGSIZE;
+}
+
+static int fill_res_srq_qps(struct sk_buff *msg, struct ib_srq *srq)
+{
+ uint32_t min_range = 0, prev = 0;
+ struct rdma_restrack_entry *res;
+ struct rdma_restrack_root *rt;
+ struct nlattr *table_attr;
+ struct ib_qp *qp = NULL;
+ unsigned long id = 0;
+
+ table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP);
+ if (!table_attr)
+ return -EMSGSIZE;
+
+ rt = &srq->device->res[RDMA_RESTRACK_QP];
+ xa_lock(&rt->xa);
+ xa_for_each(&rt->xa, id, res) {
+ if (!rdma_restrack_get(res))
+ continue;
+
+ qp = container_of(res, struct ib_qp, res);
+ if (!qp->srq || (qp->srq->res.id != srq->res.id)) {
+ rdma_restrack_put(res);
+ continue;
+ }
+
+ if (qp->qp_num < prev)
+ /* qp_num should be ascending */
+ goto err_loop;
+
+ if (min_range == 0) {
+ min_range = qp->qp_num;
+ } else if (qp->qp_num > (prev + 1)) {
+ if (fill_res_range_qp_entry(msg, min_range, prev))
+ goto err_loop;
+
+ min_range = qp->qp_num;
+ }
+ prev = qp->qp_num;
+ rdma_restrack_put(res);
+ }
+
+ xa_unlock(&rt->xa);
+
+ if (fill_res_range_qp_entry(msg, min_range, prev))
+ goto err;
+
+ nla_nest_end(msg, table_attr);
+ return 0;
+
+err_loop:
+ rdma_restrack_put(res);
+ xa_unlock(&rt->xa);
+err:
+ nla_nest_cancel(msg, table_attr);
+ return -EMSGSIZE;
+}
+
+static int fill_res_srq_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_srq *srq = container_of(res, struct ib_srq, res);
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SRQN, srq->res.id))
+ goto err;
+
+ if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, srq->srq_type))
+ goto err;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, srq->pd->res.id))
+ goto err;
+
+ if (ib_srq_has_cq(srq->srq_type)) {
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN,
+ srq->ext.cq->res.id))
+ goto err;
+ }
+
+ if (fill_res_srq_qps(msg, srq))
+ goto err;
+
+ return fill_res_name_pid(msg, res);
+
+err:
+ return -EMSGSIZE;
+}
+
+static int fill_stat_counter_mode(struct sk_buff *msg,
+ struct rdma_counter *counter)
+{
+ struct rdma_counter_mode *m = &counter->mode;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode))
+ return -EMSGSIZE;
+
+ if (m->mode == RDMA_COUNTER_MODE_AUTO) {
+ if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type))
+ return -EMSGSIZE;
+
+ if ((m->mask & RDMA_COUNTER_MASK_PID) &&
+ fill_res_name_pid(msg, &counter->res))
+ return -EMSGSIZE;
+ }
+
+ return 0;
+}
+
+static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn)
+{
+ struct nlattr *entry_attr;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+ return -EMSGSIZE;
+}
+
+static int fill_stat_counter_qps(struct sk_buff *msg,
+ struct rdma_counter *counter)
+{
+ struct rdma_restrack_entry *res;
+ struct rdma_restrack_root *rt;
+ struct nlattr *table_attr;
+ struct ib_qp *qp = NULL;
+ unsigned long id = 0;
+ int ret = 0;
+
+ table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP);
+ if (!table_attr)
+ return -EMSGSIZE;
+
+ rt = &counter->device->res[RDMA_RESTRACK_QP];
+ xa_lock(&rt->xa);
+ xa_for_each(&rt->xa, id, res) {
+ qp = container_of(res, struct ib_qp, res);
+ if (!qp->counter || (qp->counter->id != counter->id))
+ continue;
+
+ ret = fill_stat_counter_qp_entry(msg, qp->qp_num);
+ if (ret)
+ goto err;
+ }
+
+ xa_unlock(&rt->xa);
+ nla_nest_end(msg, table_attr);
+ return 0;
+
+err:
+ xa_unlock(&rt->xa);
+ nla_nest_cancel(msg, table_attr);
+ return ret;
+}
+
+int rdma_nl_stat_hwcounter_entry(struct sk_buff *msg, const char *name,
+ u64 value)
+{
+ struct nlattr *entry_attr;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,
+ name))
+ goto err;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,
+ value, RDMA_NLDEV_ATTR_PAD))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL(rdma_nl_stat_hwcounter_entry);
+
+static int fill_stat_mr_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res, uint32_t port)
+{
+ struct ib_mr *mr = container_of(res, struct ib_mr, res);
+ struct ib_device *dev = mr->pd->device;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id))
+ goto err;
+
+ if (dev->ops.fill_stat_mr_entry)
+ return dev->ops.fill_stat_mr_entry(msg, mr);
+ return 0;
+
+err:
+ return -EMSGSIZE;
+}
+
+static int fill_stat_counter_hwcounters(struct sk_buff *msg,
+ struct rdma_counter *counter)
+{
+ struct rdma_hw_stats *st = counter->stats;
+ struct nlattr *table_attr;
+ int i;
+
+ table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+ if (!table_attr)
+ return -EMSGSIZE;
+
+ mutex_lock(&st->lock);
+ for (i = 0; i < st->num_counters; i++) {
+ if (test_bit(i, st->is_disabled))
+ continue;
+ if (rdma_nl_stat_hwcounter_entry(msg, st->descs[i].name,
+ st->value[i]))
+ goto err;
+ }
+ mutex_unlock(&st->lock);
+
+ nla_nest_end(msg, table_attr);
+ return 0;
+
+err:
+ mutex_unlock(&st->lock);
+ nla_nest_cancel(msg, table_attr);
+ return -EMSGSIZE;
+}
+
+static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res,
+ uint32_t port)
+{
+ struct rdma_counter *counter =
+ container_of(res, struct rdma_counter, res);
+
+ if (port && port != counter->port)
+ return -EAGAIN;
+
+ /* Dump it even query failed */
+ rdma_counter_query_stats(counter);
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) ||
+ fill_stat_counter_mode(msg, counter) ||
+ fill_stat_counter_qps(msg, counter) ||
+ fill_stat_counter_hwcounters(msg, counter))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index;
+ int err;
+
+ err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+ 0, 0);
+
+ err = fill_dev_info(msg, device);
+ if (err)
+ goto err_free;
+
+ nlmsg_end(msg, nlh);
+
+ ib_device_put(device);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_free:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return err;
+}
+
+static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ u32 index;
+ int err;
+
+ err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) {
+ char name[IB_DEVICE_NAME_MAX] = {};
+
+ nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+ IB_DEVICE_NAME_MAX);
+ if (strlen(name) == 0) {
+ err = -EINVAL;
+ goto done;
+ }
+ err = ib_device_rename(device, name);
+ goto done;
+ }
+
+ if (tb[RDMA_NLDEV_NET_NS_FD]) {
+ u32 ns_fd;
+
+ ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]);
+ err = ib_device_set_netns_put(skb, device, ns_fd);
+ goto put_done;
+ }
+
+ if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) {
+ u8 use_dim;
+
+ use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]);
+ err = ib_device_set_dim(device, use_dim);
+ goto done;
+ }
+
+done:
+ ib_device_put(device);
+put_done:
+ return err;
+}
+
+static int _nldev_get_dumpit(struct ib_device *device,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ unsigned int idx)
+{
+ int start = cb->args[0];
+ struct nlmsghdr *nlh;
+
+ if (idx < start)
+ return 0;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+ 0, NLM_F_MULTI);
+
+ if (fill_dev_info(skb, device)) {
+ nlmsg_cancel(skb, nlh);
+ goto out;
+ }
+
+ nlmsg_end(skb, nlh);
+
+ idx++;
+
+out: cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ /*
+ * There is no need to take lock, because
+ * we are relying on ib_core's locking.
+ */
+ return ib_enum_all_devs(_nldev_get_dumpit, skb, cb);
+}
+
+static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index;
+ u32 port;
+ int err;
+
+ err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err ||
+ !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+ !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
+ 0, 0);
+
+ err = fill_port_info(msg, device, port, sock_net(skb->sk));
+ if (err)
+ goto err_free;
+
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_free:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return err;
+}
+
+static int nldev_port_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ int start = cb->args[0];
+ struct nlmsghdr *nlh;
+ u32 idx = 0;
+ u32 ifindex;
+ int err;
+ unsigned int p;
+
+ err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NULL);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), ifindex);
+ if (!device)
+ return -EINVAL;
+
+ rdma_for_each_port (device, p) {
+ /*
+ * The dumpit function returns all information from specific
+ * index. This specific index is taken from the netlink
+ * messages request sent by user and it is available
+ * in cb->args[0].
+ *
+ * Usually, the user doesn't fill this field and it causes
+ * to return everything.
+ *
+ */
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_PORT_GET),
+ 0, NLM_F_MULTI);
+
+ if (fill_port_info(skb, device, p, sock_net(skb->sk))) {
+ nlmsg_cancel(skb, nlh);
+ goto out;
+ }
+ idx++;
+ nlmsg_end(skb, nlh);
+ }
+
+out:
+ ib_device_put(device);
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index;
+ int ret;
+
+ ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
+ 0, 0);
+
+ ret = fill_res_info(msg, device);
+ if (ret)
+ goto err_free;
+
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_free:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
+static int _nldev_res_get_dumpit(struct ib_device *device,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ unsigned int idx)
+{
+ int start = cb->args[0];
+ struct nlmsghdr *nlh;
+
+ if (idx < start)
+ return 0;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
+ 0, NLM_F_MULTI);
+
+ if (fill_res_info(skb, device)) {
+ nlmsg_cancel(skb, nlh);
+ goto out;
+ }
+ nlmsg_end(skb, nlh);
+
+ idx++;
+
+out:
+ cb->args[0] = idx;
+ return skb->len;
+}
+
+static int nldev_res_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb);
+}
+
+struct nldev_fill_res_entry {
+ enum rdma_nldev_attr nldev_attr;
+ u8 flags;
+ u32 entry;
+ u32 id;
+};
+
+enum nldev_res_flags {
+ NLDEV_PER_DEV = 1 << 0,
+};
+
+static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
+ [RDMA_RESTRACK_QP] = {
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_QP,
+ .entry = RDMA_NLDEV_ATTR_RES_QP_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_LQPN,
+ },
+ [RDMA_RESTRACK_CM_ID] = {
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID,
+ .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_CM_IDN,
+ },
+ [RDMA_RESTRACK_CQ] = {
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_CQN,
+ },
+ [RDMA_RESTRACK_MR] = {
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_MR,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_MRN,
+ },
+ [RDMA_RESTRACK_PD] = {
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_PD,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_PDN,
+ },
+ [RDMA_RESTRACK_COUNTER] = {
+ .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER,
+ .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,
+ .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID,
+ },
+ [RDMA_RESTRACK_CTX] = {
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_CTX,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_CTX_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_CTXN,
+ },
+ [RDMA_RESTRACK_SRQ] = {
+ .nldev_attr = RDMA_NLDEV_ATTR_RES_SRQ,
+ .flags = NLDEV_PER_DEV,
+ .entry = RDMA_NLDEV_ATTR_RES_SRQ_ENTRY,
+ .id = RDMA_NLDEV_ATTR_RES_SRQN,
+ },
+
+};
+
+static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ enum rdma_restrack_type res_type,
+ res_fill_func_t fill_func)
+{
+ const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct rdma_restrack_entry *res;
+ struct ib_device *device;
+ u32 index, id, port = 0;
+ bool has_cap_net_admin;
+ struct sk_buff *msg;
+ int ret;
+
+ ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ if ((port && fe->flags & NLDEV_PER_DEV) ||
+ (!port && ~fe->flags & NLDEV_PER_DEV)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ id = nla_get_u32(tb[fe->id]);
+ res = rdma_restrack_get_byid(device, res_type, id);
+ if (IS_ERR(res)) {
+ ret = PTR_ERR(res);
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err_get;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NL_GET_OP(nlh->nlmsg_type)),
+ 0, 0);
+
+ if (fill_nldev_handle(msg, device)) {
+ ret = -EMSGSIZE;
+ goto err_free;
+ }
+
+ has_cap_net_admin = netlink_capable(skb, CAP_NET_ADMIN);
+
+ ret = fill_func(msg, has_cap_net_admin, res, port);
+ if (ret)
+ goto err_free;
+
+ rdma_restrack_put(res);
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_free:
+ nlmsg_free(msg);
+err_get:
+ rdma_restrack_put(res);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
+static int res_get_common_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb,
+ enum rdma_restrack_type res_type,
+ res_fill_func_t fill_func)
+{
+ const struct nldev_fill_res_entry *fe = &fill_entries[res_type];
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct rdma_restrack_entry *res;
+ struct rdma_restrack_root *rt;
+ int err, ret = 0, idx = 0;
+ struct nlattr *table_attr;
+ struct nlattr *entry_attr;
+ struct ib_device *device;
+ int start = cb->args[0];
+ bool has_cap_net_admin;
+ struct nlmsghdr *nlh;
+ unsigned long id;
+ u32 index, port = 0;
+ bool filled = false;
+
+ err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NULL);
+ /*
+ * Right now, we are expecting the device index to get res information,
+ * but it is possible to extend this code to return all devices in
+ * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX.
+ * if it doesn't exist, we will iterate over all devices.
+ *
+ * But it is not needed for now.
+ */
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ /*
+ * If no PORT_INDEX is supplied, we will return all QPs from that device
+ */
+ if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err_index;
+ }
+ }
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NL_GET_OP(cb->nlh->nlmsg_type)),
+ 0, NLM_F_MULTI);
+
+ if (fill_nldev_handle(skb, device)) {
+ ret = -EMSGSIZE;
+ goto err;
+ }
+
+ table_attr = nla_nest_start_noflag(skb, fe->nldev_attr);
+ if (!table_attr) {
+ ret = -EMSGSIZE;
+ goto err;
+ }
+
+ has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN);
+
+ rt = &device->res[res_type];
+ xa_lock(&rt->xa);
+ /*
+ * FIXME: if the skip ahead is something common this loop should
+ * use xas_for_each & xas_pause to optimize, we can have a lot of
+ * objects.
+ */
+ xa_for_each(&rt->xa, id, res) {
+ if (idx < start || !rdma_restrack_get(res))
+ goto next;
+
+ xa_unlock(&rt->xa);
+
+ filled = true;
+
+ entry_attr = nla_nest_start_noflag(skb, fe->entry);
+ if (!entry_attr) {
+ ret = -EMSGSIZE;
+ rdma_restrack_put(res);
+ goto msg_full;
+ }
+
+ ret = fill_func(skb, has_cap_net_admin, res, port);
+
+ rdma_restrack_put(res);
+
+ if (ret) {
+ nla_nest_cancel(skb, entry_attr);
+ if (ret == -EMSGSIZE)
+ goto msg_full;
+ if (ret == -EAGAIN)
+ goto again;
+ goto res_err;
+ }
+ nla_nest_end(skb, entry_attr);
+again: xa_lock(&rt->xa);
+next: idx++;
+ }
+ xa_unlock(&rt->xa);
+
+msg_full:
+ nla_nest_end(skb, table_attr);
+ nlmsg_end(skb, nlh);
+ cb->args[0] = idx;
+
+ /*
+ * No more entries to fill, cancel the message and
+ * return 0 to mark end of dumpit.
+ */
+ if (!filled)
+ goto err;
+
+ ib_device_put(device);
+ return skb->len;
+
+res_err:
+ nla_nest_cancel(skb, table_attr);
+
+err:
+ nlmsg_cancel(skb, nlh);
+
+err_index:
+ ib_device_put(device);
+ return ret;
+}
+
+#define RES_GET_FUNCS(name, type) \
+ static int nldev_res_get_##name##_dumpit(struct sk_buff *skb, \
+ struct netlink_callback *cb) \
+ { \
+ return res_get_common_dumpit(skb, cb, type, \
+ fill_res_##name##_entry); \
+ } \
+ static int nldev_res_get_##name##_doit(struct sk_buff *skb, \
+ struct nlmsghdr *nlh, \
+ struct netlink_ext_ack *extack) \
+ { \
+ return res_get_common_doit(skb, nlh, extack, type, \
+ fill_res_##name##_entry); \
+ }
+
+RES_GET_FUNCS(qp, RDMA_RESTRACK_QP);
+RES_GET_FUNCS(qp_raw, RDMA_RESTRACK_QP);
+RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID);
+RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
+RES_GET_FUNCS(cq_raw, RDMA_RESTRACK_CQ);
+RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
+RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
+RES_GET_FUNCS(mr_raw, RDMA_RESTRACK_MR);
+RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER);
+RES_GET_FUNCS(ctx, RDMA_RESTRACK_CTX);
+RES_GET_FUNCS(srq, RDMA_RESTRACK_SRQ);
+
+static LIST_HEAD(link_ops);
+static DECLARE_RWSEM(link_ops_rwsem);
+
+static const struct rdma_link_ops *link_ops_get(const char *type)
+{
+ const struct rdma_link_ops *ops;
+
+ list_for_each_entry(ops, &link_ops, list) {
+ if (!strcmp(ops->type, type))
+ goto out;
+ }
+ ops = NULL;
+out:
+ return ops;
+}
+
+void rdma_link_register(struct rdma_link_ops *ops)
+{
+ down_write(&link_ops_rwsem);
+ if (WARN_ON_ONCE(link_ops_get(ops->type)))
+ goto out;
+ list_add(&ops->list, &link_ops);
+out:
+ up_write(&link_ops_rwsem);
+}
+EXPORT_SYMBOL(rdma_link_register);
+
+void rdma_link_unregister(struct rdma_link_ops *ops)
+{
+ down_write(&link_ops_rwsem);
+ list_del(&ops->list);
+ up_write(&link_ops_rwsem);
+}
+EXPORT_SYMBOL(rdma_link_unregister);
+
+static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ char ibdev_name[IB_DEVICE_NAME_MAX];
+ const struct rdma_link_ops *ops;
+ char ndev_name[IFNAMSIZ];
+ struct net_device *ndev;
+ char type[IFNAMSIZ];
+ int err;
+
+ err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||
+ !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])
+ return -EINVAL;
+
+ nla_strscpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+ sizeof(ibdev_name));
+ if (strchr(ibdev_name, '%') || strlen(ibdev_name) == 0)
+ return -EINVAL;
+
+ nla_strscpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type));
+ nla_strscpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME],
+ sizeof(ndev_name));
+
+ ndev = dev_get_by_name(sock_net(skb->sk), ndev_name);
+ if (!ndev)
+ return -ENODEV;
+
+ down_read(&link_ops_rwsem);
+ ops = link_ops_get(type);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ up_read(&link_ops_rwsem);
+ request_module("rdma-link-%s", type);
+ down_read(&link_ops_rwsem);
+ ops = link_ops_get(type);
+ }
+#endif
+ err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL;
+ up_read(&link_ops_rwsem);
+ dev_put(ndev);
+
+ return err;
+}
+
+static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ u32 index;
+ int err;
+
+ err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ if (!(device->attrs.kernel_cap_flags & IBK_ALLOW_USER_UNREG)) {
+ ib_device_put(device);
+ return -EINVAL;
+ }
+
+ ib_unregister_device_and_put(device);
+ return 0;
+}
+
+static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE];
+ struct ib_client_nl_info data = {};
+ struct ib_device *ibdev = NULL;
+ struct sk_buff *msg;
+ u32 index;
+ int err;
+
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+ extack);
+ if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE])
+ return -EINVAL;
+
+ nla_strscpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE],
+ sizeof(client_name));
+
+ if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) {
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ ibdev = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!ibdev)
+ return -EINVAL;
+
+ if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+ data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(ibdev, data.port)) {
+ err = -EINVAL;
+ goto out_put;
+ }
+ } else {
+ data.port = -1;
+ }
+ } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) {
+ return -EINVAL;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ err = -ENOMEM;
+ goto out_put;
+ }
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_GET_CHARDEV),
+ 0, 0);
+
+ data.nl_msg = msg;
+ err = ib_get_client_nl_info(ibdev, client_name, &data);
+ if (err)
+ goto out_nlmsg;
+
+ err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV,
+ huge_encode_dev(data.cdev->devt),
+ RDMA_NLDEV_ATTR_PAD);
+ if (err)
+ goto out_data;
+ err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi,
+ RDMA_NLDEV_ATTR_PAD);
+ if (err)
+ goto out_data;
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME,
+ dev_name(data.cdev))) {
+ err = -EMSGSIZE;
+ goto out_data;
+ }
+
+ nlmsg_end(msg, nlh);
+ put_device(data.cdev);
+ if (ibdev)
+ ib_device_put(ibdev);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+out_data:
+ put_device(data.cdev);
+out_nlmsg:
+ nlmsg_free(msg);
+out_put:
+ if (ibdev)
+ ib_device_put(ibdev);
+ return err;
+}
+
+static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct sk_buff *msg;
+ int err;
+
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_SYS_GET),
+ 0, 0);
+
+ err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE,
+ (u8)ib_devices_shared_netns);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ /*
+ * Copy-on-fork is supported.
+ * See commits:
+ * 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes")
+ * 4eae4efa2c29 ("hugetlb: do early cow when page pinned on src mm")
+ * for more details. Don't backport this without them.
+ *
+ * Return value ignored on purpose, assume copy-on-fork is not
+ * supported in case of failure.
+ */
+ nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK, 1);
+
+ nlmsg_end(msg, nlh);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+}
+
+static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ u8 enable;
+ int err;
+
+ err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (err || !tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE])
+ return -EINVAL;
+
+ enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]);
+ /* Only 0 and 1 are supported */
+ if (enable > 1)
+ return -EINVAL;
+
+ err = rdma_compatdev_set(enable);
+ return err;
+}
+
+static int nldev_stat_set_mode_doit(struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ struct nlattr *tb[],
+ struct ib_device *device, u32 port)
+{
+ u32 mode, mask = 0, qpn, cntn = 0;
+ int ret;
+
+ /* Currently only counter for QP is supported */
+ if (!tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+ nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+ return -EINVAL;
+
+ mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
+ if (mode == RDMA_COUNTER_MODE_AUTO) {
+ if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
+ mask = nla_get_u32(
+ tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+ return rdma_counter_set_auto_mode(device, port, mask, extack);
+ }
+
+ if (!tb[RDMA_NLDEV_ATTR_RES_LQPN])
+ return -EINVAL;
+
+ qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+ if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) {
+ cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+ ret = rdma_counter_bind_qpn(device, port, qpn, cntn);
+ if (ret)
+ return ret;
+ } else {
+ ret = rdma_counter_bind_qpn_alloc(device, port, qpn, &cntn);
+ if (ret)
+ return ret;
+ }
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+ ret = -EMSGSIZE;
+ goto err_fill;
+ }
+
+ return 0;
+
+err_fill:
+ rdma_counter_unbind_qpn(device, port, qpn, cntn);
+ return ret;
+}
+
+static int nldev_stat_set_counter_dynamic_doit(struct nlattr *tb[],
+ struct ib_device *device,
+ u32 port)
+{
+ struct rdma_hw_stats *stats;
+ struct nlattr *entry_attr;
+ unsigned long *target;
+ int rem, i, ret = 0;
+ u32 index;
+
+ stats = ib_get_hw_stats_port(device, port);
+ if (!stats)
+ return -EINVAL;
+
+ target = kcalloc(BITS_TO_LONGS(stats->num_counters),
+ sizeof(*stats->is_disabled), GFP_KERNEL);
+ if (!target)
+ return -ENOMEM;
+
+ nla_for_each_nested(entry_attr, tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS],
+ rem) {
+ index = nla_get_u32(entry_attr);
+ if ((index >= stats->num_counters) ||
+ !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ set_bit(index, target);
+ }
+
+ for (i = 0; i < stats->num_counters; i++) {
+ if (!(stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL))
+ continue;
+
+ ret = rdma_counter_modify(device, port, i, test_bit(i, target));
+ if (ret)
+ goto out;
+ }
+
+out:
+ kfree(target);
+ return ret;
+}
+
+static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index, port;
+ int ret;
+
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+ extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+ !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err_put_device;
+ }
+
+ if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] &&
+ !tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) {
+ ret = -EINVAL;
+ goto err_put_device;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err_put_device;
+ }
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_STAT_SET),
+ 0, 0);
+ if (fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
+ ret = -EMSGSIZE;
+ goto err_free_msg;
+ }
+
+ if (tb[RDMA_NLDEV_ATTR_STAT_MODE]) {
+ ret = nldev_stat_set_mode_doit(msg, extack, tb, device, port);
+ if (ret)
+ goto err_free_msg;
+ }
+
+ if (tb[RDMA_NLDEV_ATTR_STAT_HWCOUNTERS]) {
+ ret = nldev_stat_set_counter_dynamic_doit(tb, device, port);
+ if (ret)
+ goto err_free_msg;
+ }
+
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_free_msg:
+ nlmsg_free(msg);
+err_put_device:
+ ib_device_put(device);
+ return ret;
+}
+
+static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index, port, qpn, cntn;
+ int ret;
+
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+ !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] ||
+ !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] ||
+ !tb[RDMA_NLDEV_ATTR_RES_LQPN])
+ return -EINVAL;
+
+ if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_STAT_SET),
+ 0, 0);
+
+ cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+ qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+ if (fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+ ret = -EMSGSIZE;
+ goto err_fill;
+ }
+
+ ret = rdma_counter_unbind_qpn(device, port, qpn, cntn);
+ if (ret)
+ goto err_fill;
+
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_fill:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
+static int stat_get_doit_default_counter(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct nlattr *tb[])
+{
+ struct rdma_hw_stats *stats;
+ struct nlattr *table_attr;
+ struct ib_device *device;
+ int ret, num_cnts, i;
+ struct sk_buff *msg;
+ u32 index, port;
+ u64 v;
+
+ if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ if (!device->ops.alloc_hw_port_stats || !device->ops.get_hw_stats) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ stats = ib_get_hw_stats_port(device, port);
+ if (!stats) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_STAT_GET),
+ 0, 0);
+
+ if (fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
+ ret = -EMSGSIZE;
+ goto err_msg;
+ }
+
+ mutex_lock(&stats->lock);
+
+ num_cnts = device->ops.get_hw_stats(device, stats, port, 0);
+ if (num_cnts < 0) {
+ ret = -EINVAL;
+ goto err_stats;
+ }
+
+ table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+ if (!table_attr) {
+ ret = -EMSGSIZE;
+ goto err_stats;
+ }
+ for (i = 0; i < num_cnts; i++) {
+ if (test_bit(i, stats->is_disabled))
+ continue;
+
+ v = stats->value[i] +
+ rdma_counter_get_hwstat_value(device, port, i);
+ if (rdma_nl_stat_hwcounter_entry(msg,
+ stats->descs[i].name, v)) {
+ ret = -EMSGSIZE;
+ goto err_table;
+ }
+ }
+ nla_nest_end(msg, table_attr);
+
+ mutex_unlock(&stats->lock);
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_table:
+ nla_nest_cancel(msg, table_attr);
+err_stats:
+ mutex_unlock(&stats->lock);
+err_msg:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
+static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, struct nlattr *tb[])
+
+{
+ static enum rdma_nl_counter_mode mode;
+ static enum rdma_nl_counter_mask mask;
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index, port;
+ int ret;
+
+ if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID])
+ return nldev_res_get_counter_doit(skb, nlh, extack);
+
+ if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] ||
+ !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_STAT_GET),
+ 0, 0);
+
+ ret = rdma_counter_get_mode(device, port, &mode, &mask);
+ if (ret)
+ goto err_msg;
+
+ if (fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) {
+ ret = -EMSGSIZE;
+ goto err_msg;
+ }
+
+ if ((mode == RDMA_COUNTER_MODE_AUTO) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) {
+ ret = -EMSGSIZE;
+ goto err_msg;
+ }
+
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_msg:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
+static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ int ret;
+
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret)
+ return -EINVAL;
+
+ if (!tb[RDMA_NLDEV_ATTR_STAT_RES])
+ return stat_get_doit_default_counter(skb, nlh, extack, tb);
+
+ switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+ case RDMA_NLDEV_ATTR_RES_QP:
+ ret = stat_get_doit_qp(skb, nlh, extack, tb);
+ break;
+ case RDMA_NLDEV_ATTR_RES_MR:
+ ret = res_get_common_doit(skb, nlh, extack, RDMA_RESTRACK_MR,
+ fill_stat_mr_entry);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int nldev_stat_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ int ret;
+
+ ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NULL);
+ if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
+ return -EINVAL;
+
+ switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+ case RDMA_NLDEV_ATTR_RES_QP:
+ ret = nldev_res_get_counter_dumpit(skb, cb);
+ break;
+ case RDMA_NLDEV_ATTR_RES_MR:
+ ret = res_get_common_dumpit(skb, cb, RDMA_RESTRACK_MR,
+ fill_stat_mr_entry);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int nldev_stat_get_counter_status_doit(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX], *table, *entry;
+ struct rdma_hw_stats *stats;
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 devid, port;
+ int ret, i;
+
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+ !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), devid);
+ if (!device)
+ return -EINVAL;
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ stats = ib_get_hw_stats_port(device, port);
+ if (!stats) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(
+ msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET_STATUS),
+ 0, 0);
+
+ ret = -EMSGSIZE;
+ if (fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port))
+ goto err_msg;
+
+ table = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+ if (!table)
+ goto err_msg;
+
+ mutex_lock(&stats->lock);
+ for (i = 0; i < stats->num_counters; i++) {
+ entry = nla_nest_start(msg,
+ RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY);
+ if (!entry)
+ goto err_msg_table;
+
+ if (nla_put_string(msg,
+ RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,
+ stats->descs[i].name) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX, i))
+ goto err_msg_entry;
+
+ if ((stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) &&
+ (nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC,
+ !test_bit(i, stats->is_disabled))))
+ goto err_msg_entry;
+
+ nla_nest_end(msg, entry);
+ }
+ mutex_unlock(&stats->lock);
+
+ nla_nest_end(msg, table);
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid);
+
+err_msg_entry:
+ nla_nest_cancel(msg, entry);
+err_msg_table:
+ mutex_unlock(&stats->lock);
+ nla_nest_cancel(msg, table);
+err_msg:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
+static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
+ [RDMA_NLDEV_CMD_GET] = {
+ .doit = nldev_get_doit,
+ .dump = nldev_get_dumpit,
+ },
+ [RDMA_NLDEV_CMD_GET_CHARDEV] = {
+ .doit = nldev_get_chardev,
+ },
+ [RDMA_NLDEV_CMD_SET] = {
+ .doit = nldev_set_doit,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_NEWLINK] = {
+ .doit = nldev_newlink,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_DELLINK] = {
+ .doit = nldev_dellink,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_PORT_GET] = {
+ .doit = nldev_port_get_doit,
+ .dump = nldev_port_get_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_GET] = {
+ .doit = nldev_res_get_doit,
+ .dump = nldev_res_get_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_QP_GET] = {
+ .doit = nldev_res_get_qp_doit,
+ .dump = nldev_res_get_qp_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_CM_ID_GET] = {
+ .doit = nldev_res_get_cm_id_doit,
+ .dump = nldev_res_get_cm_id_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_CQ_GET] = {
+ .doit = nldev_res_get_cq_doit,
+ .dump = nldev_res_get_cq_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_MR_GET] = {
+ .doit = nldev_res_get_mr_doit,
+ .dump = nldev_res_get_mr_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_PD_GET] = {
+ .doit = nldev_res_get_pd_doit,
+ .dump = nldev_res_get_pd_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_CTX_GET] = {
+ .doit = nldev_res_get_ctx_doit,
+ .dump = nldev_res_get_ctx_dumpit,
+ },
+ [RDMA_NLDEV_CMD_RES_SRQ_GET] = {
+ .doit = nldev_res_get_srq_doit,
+ .dump = nldev_res_get_srq_dumpit,
+ },
+ [RDMA_NLDEV_CMD_SYS_GET] = {
+ .doit = nldev_sys_get_doit,
+ },
+ [RDMA_NLDEV_CMD_SYS_SET] = {
+ .doit = nldev_set_sys_set_doit,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_STAT_SET] = {
+ .doit = nldev_stat_set_doit,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_STAT_GET] = {
+ .doit = nldev_stat_get_doit,
+ .dump = nldev_stat_get_dumpit,
+ },
+ [RDMA_NLDEV_CMD_STAT_DEL] = {
+ .doit = nldev_stat_del_doit,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_RES_QP_GET_RAW] = {
+ .doit = nldev_res_get_qp_raw_doit,
+ .dump = nldev_res_get_qp_raw_dumpit,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_RES_CQ_GET_RAW] = {
+ .doit = nldev_res_get_cq_raw_doit,
+ .dump = nldev_res_get_cq_raw_dumpit,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_RES_MR_GET_RAW] = {
+ .doit = nldev_res_get_mr_raw_doit,
+ .dump = nldev_res_get_mr_raw_dumpit,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
+ [RDMA_NLDEV_CMD_STAT_GET_STATUS] = {
+ .doit = nldev_stat_get_counter_status_doit,
+ },
+};
+
+void __init nldev_init(void)
+{
+ rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table);
+}
+
+void nldev_exit(void)
+{
+ rdma_nl_unregister(RDMA_NL_NLDEV);
+}
+
+MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5);
diff --git a/drivers/infiniband/core/opa_smi.h b/drivers/infiniband/core/opa_smi.h
new file mode 100644
index 000000000..64e2822af
--- /dev/null
+++ b/drivers/infiniband/core/opa_smi.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __OPA_SMI_H_
+#define __OPA_SMI_H_
+
+#include <rdma/ib_smi.h>
+#include <rdma/opa_smi.h>
+
+#include "smi.h"
+
+enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,
+ u32 port_num, int phys_port_cnt);
+int opa_smi_get_fwd_port(struct opa_smp *smp);
+extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp);
+extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
+ bool is_switch, u32 port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action opa_smi_check_local_smp(struct opa_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+ return (device->ops.process_mad &&
+ !opa_get_smp_direction(smp) &&
+ (smp->hop_ptr == smp->hop_cnt + 1)) ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD;
+}
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action opa_smi_check_local_returning_smp(struct opa_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ /* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+ return (device->ops.process_mad &&
+ opa_get_smp_direction(smp) &&
+ !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD;
+}
+
+#endif /* __OPA_SMI_H_ */
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
new file mode 100644
index 000000000..19b1ee327
--- /dev/null
+++ b/drivers/infiniband/core/packer.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <linux/string.h>
+
+#include <rdma/ib_pack.h>
+
+static u64 value_read(int offset, int size, void *structure)
+{
+ switch (size) {
+ case 1: return *(u8 *) (structure + offset);
+ case 2: return be16_to_cpup((__be16 *) (structure + offset));
+ case 4: return be32_to_cpup((__be32 *) (structure + offset));
+ case 8: return be64_to_cpup((__be64 *) (structure + offset));
+ default:
+ pr_warn("Field size %d bits not handled\n", size * 8);
+ return 0;
+ }
+}
+
+/**
+ * ib_pack - Pack a structure into a buffer
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @structure:Structure to pack from
+ * @buf:Buffer to pack into
+ *
+ * ib_pack() packs a list of structure fields into a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_pack(const struct ib_field *desc,
+ int desc_len,
+ void *structure,
+ void *buf)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (desc[i].size_bits <= 32) {
+ int shift;
+ u32 val;
+ __be32 mask;
+ __be32 *addr;
+
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift);
+ addr = (__be32 *) buf + desc[i].offset_words;
+ *addr = (*addr & ~mask) | (cpu_to_be32(val) & mask);
+ } else if (desc[i].size_bits <= 64) {
+ int shift;
+ u64 val;
+ __be64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift);
+ addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+ *addr = (*addr & ~mask) | (cpu_to_be64(val) & mask);
+ } else {
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ pr_warn("Structure field %s of size %d bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ if (desc[i].struct_size_bytes)
+ memcpy(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ structure + desc[i].struct_offset_bytes,
+ desc[i].size_bits / 8);
+ else
+ memset(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ 0,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_pack);
+
+static void value_write(int offset, int size, u64 val, void *structure)
+{
+ switch (size * 8) {
+ case 8: *( u8 *) (structure + offset) = val; break;
+ case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break;
+ case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break;
+ case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break;
+ default:
+ pr_warn("Field size %d bits not handled\n", size * 8);
+ }
+}
+
+/**
+ * ib_unpack - Unpack a buffer into a structure
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @buf:Buffer to unpack from
+ * @structure:Structure to unpack into
+ *
+ * ib_pack() unpacks a list of structure fields from a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_unpack(const struct ib_field *desc,
+ int desc_len,
+ void *buf,
+ void *structure)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (!desc[i].struct_size_bytes)
+ continue;
+
+ if (desc[i].size_bits <= 32) {
+ int shift;
+ u32 val;
+ u32 mask;
+ __be32 *addr;
+
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ mask = ((1ull << desc[i].size_bits) - 1) << shift;
+ addr = (__be32 *) buf + desc[i].offset_words;
+ val = (be32_to_cpup(addr) & mask) >> shift;
+ value_write(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ val,
+ structure);
+ } else if (desc[i].size_bits <= 64) {
+ int shift;
+ u64 val;
+ u64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ mask = (~0ull >> (64 - desc[i].size_bits)) << shift;
+ addr = (__be64 *) buf + desc[i].offset_words;
+ val = (be64_to_cpup(addr) & mask) >> shift;
+ value_write(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ val,
+ structure);
+ } else {
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ pr_warn("Structure field %s of size %d bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ memcpy(structure + desc[i].struct_offset_bytes,
+ buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_unpack);
diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
new file mode 100644
index 000000000..29b1ab1d5
--- /dev/null
+++ b/drivers/infiniband/core/rdma_core.c
@@ -0,0 +1,1015 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/mm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/uverbs_types.h>
+#include <linux/rcupdate.h>
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/rdma_user_ioctl.h>
+#include "uverbs.h"
+#include "core_priv.h"
+#include "rdma_core.h"
+
+static void uverbs_uobject_free(struct kref *ref)
+{
+ kfree_rcu(container_of(ref, struct ib_uobject, ref), rcu);
+}
+
+/*
+ * In order to indicate we no longer needs this uobject, uverbs_uobject_put
+ * is called. When the reference count is decreased, the uobject is freed.
+ * For example, this is used when attaching a completion channel to a CQ.
+ */
+void uverbs_uobject_put(struct ib_uobject *uobject)
+{
+ kref_put(&uobject->ref, uverbs_uobject_free);
+}
+EXPORT_SYMBOL(uverbs_uobject_put);
+
+static int uverbs_try_lock_object(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
+{
+ /*
+ * When a shared access is required, we use a positive counter. Each
+ * shared access request checks that the value != -1 and increment it.
+ * Exclusive access is required for operations like write or destroy.
+ * In exclusive access mode, we check that the counter is zero (nobody
+ * claimed this object) and we set it to -1. Releasing a shared access
+ * lock is done simply by decreasing the counter. As for exclusive
+ * access locks, since only a single one of them is allowed
+ * concurrently, setting the counter to zero is enough for releasing
+ * this lock.
+ */
+ switch (mode) {
+ case UVERBS_LOOKUP_READ:
+ return atomic_fetch_add_unless(&uobj->usecnt, 1, -1) == -1 ?
+ -EBUSY : 0;
+ case UVERBS_LOOKUP_WRITE:
+ /* lock is exclusive */
+ return atomic_cmpxchg(&uobj->usecnt, 0, -1) == 0 ? 0 : -EBUSY;
+ case UVERBS_LOOKUP_DESTROY:
+ return 0;
+ }
+ return 0;
+}
+
+static void assert_uverbs_usecnt(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
+{
+#ifdef CONFIG_LOCKDEP
+ switch (mode) {
+ case UVERBS_LOOKUP_READ:
+ WARN_ON(atomic_read(&uobj->usecnt) <= 0);
+ break;
+ case UVERBS_LOOKUP_WRITE:
+ WARN_ON(atomic_read(&uobj->usecnt) != -1);
+ break;
+ case UVERBS_LOOKUP_DESTROY:
+ break;
+ }
+#endif
+}
+
+/*
+ * This must be called with the hw_destroy_rwsem locked for read or write,
+ * also the uobject itself must be locked for write.
+ *
+ * Upon return the HW object is guaranteed to be destroyed.
+ *
+ * For RDMA_REMOVE_ABORT, the hw_destroy_rwsem is not required to be held,
+ * however the type's allocat_commit function cannot have been called and the
+ * uobject cannot be on the uobjects_lists
+ *
+ * For RDMA_REMOVE_DESTROY the caller should be holding a kref (eg via
+ * rdma_lookup_get_uobject) and the object is left in a state where the caller
+ * needs to call rdma_lookup_put_uobject.
+ *
+ * For all other destroy modes this function internally unlocks the uobject
+ * and consumes the kref on the uobj.
+ */
+static int uverbs_destroy_uobject(struct ib_uobject *uobj,
+ enum rdma_remove_reason reason,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_file *ufile = attrs->ufile;
+ unsigned long flags;
+ int ret;
+
+ lockdep_assert_held(&ufile->hw_destroy_rwsem);
+ assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE);
+
+ if (reason == RDMA_REMOVE_ABORT) {
+ WARN_ON(!list_empty(&uobj->list));
+ WARN_ON(!uobj->context);
+ uobj->uapi_object->type_class->alloc_abort(uobj);
+ } else if (uobj->object) {
+ ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason,
+ attrs);
+ if (ret)
+ /* Nothing to be done, wait till ucontext will clean it */
+ return ret;
+
+ uobj->object = NULL;
+ }
+
+ uobj->context = NULL;
+
+ /*
+ * For DESTROY the usecnt is not changed, the caller is expected to
+ * manage it via uobj_put_destroy(). Only DESTROY can remove the IDR
+ * handle.
+ */
+ if (reason != RDMA_REMOVE_DESTROY)
+ atomic_set(&uobj->usecnt, 0);
+ else
+ uobj->uapi_object->type_class->remove_handle(uobj);
+
+ if (!list_empty(&uobj->list)) {
+ spin_lock_irqsave(&ufile->uobjects_lock, flags);
+ list_del_init(&uobj->list);
+ spin_unlock_irqrestore(&ufile->uobjects_lock, flags);
+
+ /*
+ * Pairs with the get in rdma_alloc_commit_uobject(), could
+ * destroy uobj.
+ */
+ uverbs_uobject_put(uobj);
+ }
+
+ /*
+ * When aborting the stack kref remains owned by the core code, and is
+ * not transferred into the type. Pairs with the get in alloc_uobj
+ */
+ if (reason == RDMA_REMOVE_ABORT)
+ uverbs_uobject_put(uobj);
+
+ return 0;
+}
+
+/*
+ * This calls uverbs_destroy_uobject() using the RDMA_REMOVE_DESTROY
+ * sequence. It should only be used from command callbacks. On success the
+ * caller must pair this with uobj_put_destroy(). This
+ * version requires the caller to have already obtained an
+ * LOOKUP_DESTROY uobject kref.
+ */
+int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_file *ufile = attrs->ufile;
+ int ret;
+
+ down_read(&ufile->hw_destroy_rwsem);
+
+ /*
+ * Once the uobject is destroyed by RDMA_REMOVE_DESTROY then it is left
+ * write locked as the callers put it back with UVERBS_LOOKUP_DESTROY.
+ * This is because any other concurrent thread can still see the object
+ * in the xarray due to RCU. Leaving it locked ensures nothing else will
+ * touch it.
+ */
+ ret = uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE);
+ if (ret)
+ goto out_unlock;
+
+ ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY, attrs);
+ if (ret) {
+ atomic_set(&uobj->usecnt, 0);
+ goto out_unlock;
+ }
+
+out_unlock:
+ up_read(&ufile->hw_destroy_rwsem);
+ return ret;
+}
+
+/*
+ * uobj_get_destroy destroys the HW object and returns a handle to the uobj
+ * with a NULL object pointer. The caller must pair this with
+ * uobj_put_destroy().
+ */
+struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
+ u32 id, struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj;
+ int ret;
+
+ uobj = rdma_lookup_get_uobject(obj, attrs->ufile, id,
+ UVERBS_LOOKUP_DESTROY, attrs);
+ if (IS_ERR(uobj))
+ return uobj;
+
+ ret = uobj_destroy(uobj, attrs);
+ if (ret) {
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY);
+ return ERR_PTR(ret);
+ }
+
+ return uobj;
+}
+
+/*
+ * Does both uobj_get_destroy() and uobj_put_destroy(). Returns 0 on success
+ * (negative errno on failure). For use by callers that do not need the uobj.
+ */
+int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj;
+
+ uobj = __uobj_get_destroy(obj, id, attrs);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+ uobj_put_destroy(uobj);
+ return 0;
+}
+
+/* alloc_uobj must be undone by uverbs_destroy_uobject() */
+static struct ib_uobject *alloc_uobj(struct uverbs_attr_bundle *attrs,
+ const struct uverbs_api_object *obj)
+{
+ struct ib_uverbs_file *ufile = attrs->ufile;
+ struct ib_uobject *uobj;
+
+ if (!attrs->context) {
+ struct ib_ucontext *ucontext =
+ ib_uverbs_get_ucontext_file(ufile);
+
+ if (IS_ERR(ucontext))
+ return ERR_CAST(ucontext);
+ attrs->context = ucontext;
+ }
+
+ uobj = kzalloc(obj->type_attrs->obj_size, GFP_KERNEL);
+ if (!uobj)
+ return ERR_PTR(-ENOMEM);
+ /*
+ * user_handle should be filled by the handler,
+ * The object is added to the list in the commit stage.
+ */
+ uobj->ufile = ufile;
+ uobj->context = attrs->context;
+ INIT_LIST_HEAD(&uobj->list);
+ uobj->uapi_object = obj;
+ /*
+ * Allocated objects start out as write locked to deny any other
+ * syscalls from accessing them until they are committed. See
+ * rdma_alloc_commit_uobject
+ */
+ atomic_set(&uobj->usecnt, -1);
+ kref_init(&uobj->ref);
+
+ return uobj;
+}
+
+static int idr_add_uobj(struct ib_uobject *uobj)
+{
+ /*
+ * We start with allocating an idr pointing to NULL. This represents an
+ * object which isn't initialized yet. We'll replace it later on with
+ * the real object once we commit.
+ */
+ return xa_alloc(&uobj->ufile->idr, &uobj->id, NULL, xa_limit_32b,
+ GFP_KERNEL);
+}
+
+/* Returns the ib_uobject or an error. The caller should check for IS_ERR. */
+static struct ib_uobject *
+lookup_get_idr_uobject(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile, s64 id,
+ enum rdma_lookup_mode mode)
+{
+ struct ib_uobject *uobj;
+
+ if (id < 0 || id > ULONG_MAX)
+ return ERR_PTR(-EINVAL);
+
+ rcu_read_lock();
+ /*
+ * The idr_find is guaranteed to return a pointer to something that
+ * isn't freed yet, or NULL, as the free after idr_remove goes through
+ * kfree_rcu(). However the object may still have been released and
+ * kfree() could be called at any time.
+ */
+ uobj = xa_load(&ufile->idr, id);
+ if (!uobj || !kref_get_unless_zero(&uobj->ref))
+ uobj = ERR_PTR(-ENOENT);
+ rcu_read_unlock();
+ return uobj;
+}
+
+static struct ib_uobject *
+lookup_get_fd_uobject(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile, s64 id,
+ enum rdma_lookup_mode mode)
+{
+ const struct uverbs_obj_fd_type *fd_type;
+ struct file *f;
+ struct ib_uobject *uobject;
+ int fdno = id;
+
+ if (fdno != id)
+ return ERR_PTR(-EINVAL);
+
+ if (mode != UVERBS_LOOKUP_READ)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ if (!obj->type_attrs)
+ return ERR_PTR(-EIO);
+ fd_type =
+ container_of(obj->type_attrs, struct uverbs_obj_fd_type, type);
+
+ f = fget(fdno);
+ if (!f)
+ return ERR_PTR(-EBADF);
+
+ uobject = f->private_data;
+ /*
+ * fget(id) ensures we are not currently running
+ * uverbs_uobject_fd_release(), and the caller is expected to ensure
+ * that release is never done while a call to lookup is possible.
+ */
+ if (f->f_op != fd_type->fops || uobject->ufile != ufile) {
+ fput(f);
+ return ERR_PTR(-EBADF);
+ }
+
+ uverbs_uobject_get(uobject);
+ return uobject;
+}
+
+struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile, s64 id,
+ enum rdma_lookup_mode mode,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj;
+ int ret;
+
+ if (obj == ERR_PTR(-ENOMSG)) {
+ /* must be UVERBS_IDR_ANY_OBJECT, see uapi_get_object() */
+ uobj = lookup_get_idr_uobject(NULL, ufile, id, mode);
+ if (IS_ERR(uobj))
+ return uobj;
+ } else {
+ if (IS_ERR(obj))
+ return ERR_PTR(-EINVAL);
+
+ uobj = obj->type_class->lookup_get(obj, ufile, id, mode);
+ if (IS_ERR(uobj))
+ return uobj;
+
+ if (uobj->uapi_object != obj) {
+ ret = -EINVAL;
+ goto free;
+ }
+ }
+
+ /*
+ * If we have been disassociated block every command except for
+ * DESTROY based commands.
+ */
+ if (mode != UVERBS_LOOKUP_DESTROY &&
+ !srcu_dereference(ufile->device->ib_dev,
+ &ufile->device->disassociate_srcu)) {
+ ret = -EIO;
+ goto free;
+ }
+
+ ret = uverbs_try_lock_object(uobj, mode);
+ if (ret)
+ goto free;
+ if (attrs)
+ attrs->context = uobj->context;
+
+ return uobj;
+free:
+ uobj->uapi_object->type_class->lookup_put(uobj, mode);
+ uverbs_uobject_put(uobj);
+ return ERR_PTR(ret);
+}
+
+static struct ib_uobject *
+alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
+ struct uverbs_attr_bundle *attrs)
+{
+ int ret;
+ struct ib_uobject *uobj;
+
+ uobj = alloc_uobj(attrs, obj);
+ if (IS_ERR(uobj))
+ return uobj;
+
+ ret = idr_add_uobj(uobj);
+ if (ret)
+ goto uobj_put;
+
+ ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
+ if (ret)
+ goto remove;
+
+ return uobj;
+
+remove:
+ xa_erase(&attrs->ufile->idr, uobj->id);
+uobj_put:
+ uverbs_uobject_put(uobj);
+ return ERR_PTR(ret);
+}
+
+static struct ib_uobject *
+alloc_begin_fd_uobject(const struct uverbs_api_object *obj,
+ struct uverbs_attr_bundle *attrs)
+{
+ const struct uverbs_obj_fd_type *fd_type;
+ int new_fd;
+ struct ib_uobject *uobj, *ret;
+ struct file *filp;
+
+ uobj = alloc_uobj(attrs, obj);
+ if (IS_ERR(uobj))
+ return uobj;
+
+ fd_type =
+ container_of(obj->type_attrs, struct uverbs_obj_fd_type, type);
+ if (WARN_ON(fd_type->fops->release != &uverbs_uobject_fd_release &&
+ fd_type->fops->release != &uverbs_async_event_release)) {
+ ret = ERR_PTR(-EINVAL);
+ goto err_fd;
+ }
+
+ new_fd = get_unused_fd_flags(O_CLOEXEC);
+ if (new_fd < 0) {
+ ret = ERR_PTR(new_fd);
+ goto err_fd;
+ }
+
+ /* Note that uverbs_uobject_fd_release() is called during abort */
+ filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL,
+ fd_type->flags);
+ if (IS_ERR(filp)) {
+ ret = ERR_CAST(filp);
+ goto err_getfile;
+ }
+ uobj->object = filp;
+
+ uobj->id = new_fd;
+ return uobj;
+
+err_getfile:
+ put_unused_fd(new_fd);
+err_fd:
+ uverbs_uobject_put(uobj);
+ return ret;
+}
+
+struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_file *ufile = attrs->ufile;
+ struct ib_uobject *ret;
+
+ if (IS_ERR(obj))
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * The hw_destroy_rwsem is held across the entire object creation and
+ * released during rdma_alloc_commit_uobject or
+ * rdma_alloc_abort_uobject
+ */
+ if (!down_read_trylock(&ufile->hw_destroy_rwsem))
+ return ERR_PTR(-EIO);
+
+ ret = obj->type_class->alloc_begin(obj, attrs);
+ if (IS_ERR(ret)) {
+ up_read(&ufile->hw_destroy_rwsem);
+ return ret;
+ }
+ return ret;
+}
+
+static void alloc_abort_idr_uobject(struct ib_uobject *uobj)
+{
+ ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
+
+ xa_erase(&uobj->ufile->idr, uobj->id);
+}
+
+static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ const struct uverbs_obj_idr_type *idr_type =
+ container_of(uobj->uapi_object->type_attrs,
+ struct uverbs_obj_idr_type, type);
+ int ret = idr_type->destroy_object(uobj, why, attrs);
+
+ if (ret)
+ return ret;
+
+ if (why == RDMA_REMOVE_ABORT)
+ return 0;
+
+ ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
+ RDMACG_RESOURCE_HCA_OBJECT);
+
+ return 0;
+}
+
+static void remove_handle_idr_uobject(struct ib_uobject *uobj)
+{
+ xa_erase(&uobj->ufile->idr, uobj->id);
+ /* Matches the kref in alloc_commit_idr_uobject */
+ uverbs_uobject_put(uobj);
+}
+
+static void alloc_abort_fd_uobject(struct ib_uobject *uobj)
+{
+ struct file *filp = uobj->object;
+
+ fput(filp);
+ put_unused_fd(uobj->id);
+}
+
+static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ const struct uverbs_obj_fd_type *fd_type = container_of(
+ uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type);
+
+ fd_type->destroy_object(uobj, why);
+ return 0;
+}
+
+static void remove_handle_fd_uobject(struct ib_uobject *uobj)
+{
+}
+
+static void alloc_commit_idr_uobject(struct ib_uobject *uobj)
+{
+ struct ib_uverbs_file *ufile = uobj->ufile;
+ void *old;
+
+ /*
+ * We already allocated this IDR with a NULL object, so
+ * this shouldn't fail.
+ *
+ * NOTE: Storing the uobj transfers our kref on uobj to the XArray.
+ * It will be put by remove_commit_idr_uobject()
+ */
+ old = xa_store(&ufile->idr, uobj->id, uobj, GFP_KERNEL);
+ WARN_ON(old != NULL);
+}
+
+static void swap_idr_uobjects(struct ib_uobject *obj_old,
+ struct ib_uobject *obj_new)
+{
+ struct ib_uverbs_file *ufile = obj_old->ufile;
+ void *old;
+
+ /*
+ * New must be an object that been allocated but not yet committed, this
+ * moves the pre-committed state to obj_old, new still must be comitted.
+ */
+ old = xa_cmpxchg(&ufile->idr, obj_old->id, obj_old, XA_ZERO_ENTRY,
+ GFP_KERNEL);
+ if (WARN_ON(old != obj_old))
+ return;
+
+ swap(obj_old->id, obj_new->id);
+
+ old = xa_cmpxchg(&ufile->idr, obj_old->id, NULL, obj_old, GFP_KERNEL);
+ WARN_ON(old != NULL);
+}
+
+static void alloc_commit_fd_uobject(struct ib_uobject *uobj)
+{
+ int fd = uobj->id;
+ struct file *filp = uobj->object;
+
+ /* Matching put will be done in uverbs_uobject_fd_release() */
+ kref_get(&uobj->ufile->ref);
+
+ /* This shouldn't be used anymore. Use the file object instead */
+ uobj->id = 0;
+
+ /*
+ * NOTE: Once we install the file we loose ownership of our kref on
+ * uobj. It will be put by uverbs_uobject_fd_release()
+ */
+ filp->private_data = uobj;
+ fd_install(fd, filp);
+}
+
+/*
+ * In all cases rdma_alloc_commit_uobject() consumes the kref to uobj and the
+ * caller can no longer assume uobj is valid. If this function fails it
+ * destroys the uboject, including the attached HW object.
+ */
+void rdma_alloc_commit_uobject(struct ib_uobject *uobj,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_file *ufile = attrs->ufile;
+
+ /* kref is held so long as the uobj is on the uobj list. */
+ uverbs_uobject_get(uobj);
+ spin_lock_irq(&ufile->uobjects_lock);
+ list_add(&uobj->list, &ufile->uobjects);
+ spin_unlock_irq(&ufile->uobjects_lock);
+
+ /* matches atomic_set(-1) in alloc_uobj */
+ atomic_set(&uobj->usecnt, 0);
+
+ /* alloc_commit consumes the uobj kref */
+ uobj->uapi_object->type_class->alloc_commit(uobj);
+
+ /* Matches the down_read in rdma_alloc_begin_uobject */
+ up_read(&ufile->hw_destroy_rwsem);
+}
+
+/*
+ * new_uobj will be assigned to the handle currently used by to_uobj, and
+ * to_uobj will be destroyed.
+ *
+ * Upon return the caller must do:
+ * rdma_alloc_commit_uobject(new_uobj)
+ * uobj_put_destroy(to_uobj)
+ *
+ * to_uobj must have a write get but the put mode switches to destroy once
+ * this is called.
+ */
+void rdma_assign_uobject(struct ib_uobject *to_uobj, struct ib_uobject *new_uobj,
+ struct uverbs_attr_bundle *attrs)
+{
+ assert_uverbs_usecnt(new_uobj, UVERBS_LOOKUP_WRITE);
+
+ if (WARN_ON(to_uobj->uapi_object != new_uobj->uapi_object ||
+ !to_uobj->uapi_object->type_class->swap_uobjects))
+ return;
+
+ to_uobj->uapi_object->type_class->swap_uobjects(to_uobj, new_uobj);
+
+ /*
+ * If this fails then the uobject is still completely valid (though with
+ * a new ID) and we leak it until context close.
+ */
+ uverbs_destroy_uobject(to_uobj, RDMA_REMOVE_DESTROY, attrs);
+}
+
+/*
+ * This consumes the kref for uobj. It is up to the caller to unwind the HW
+ * object and anything else connected to uobj before calling this.
+ */
+void rdma_alloc_abort_uobject(struct ib_uobject *uobj,
+ struct uverbs_attr_bundle *attrs,
+ bool hw_obj_valid)
+{
+ struct ib_uverbs_file *ufile = uobj->ufile;
+ int ret;
+
+ if (hw_obj_valid) {
+ ret = uobj->uapi_object->type_class->destroy_hw(
+ uobj, RDMA_REMOVE_ABORT, attrs);
+ /*
+ * If the driver couldn't destroy the object then go ahead and
+ * commit it. Leaking objects that can't be destroyed is only
+ * done during FD close after the driver has a few more tries to
+ * destroy it.
+ */
+ if (WARN_ON(ret))
+ return rdma_alloc_commit_uobject(uobj, attrs);
+ }
+
+ uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs);
+
+ /* Matches the down_read in rdma_alloc_begin_uobject */
+ up_read(&ufile->hw_destroy_rwsem);
+}
+
+static void lookup_put_idr_uobject(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
+{
+}
+
+static void lookup_put_fd_uobject(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
+{
+ struct file *filp = uobj->object;
+
+ WARN_ON(mode != UVERBS_LOOKUP_READ);
+ /*
+ * This indirectly calls uverbs_uobject_fd_release() and free the
+ * object
+ */
+ fput(filp);
+}
+
+void rdma_lookup_put_uobject(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode)
+{
+ assert_uverbs_usecnt(uobj, mode);
+ /*
+ * In order to unlock an object, either decrease its usecnt for
+ * read access or zero it in case of exclusive access. See
+ * uverbs_try_lock_object for locking schema information.
+ */
+ switch (mode) {
+ case UVERBS_LOOKUP_READ:
+ atomic_dec(&uobj->usecnt);
+ break;
+ case UVERBS_LOOKUP_WRITE:
+ atomic_set(&uobj->usecnt, 0);
+ break;
+ case UVERBS_LOOKUP_DESTROY:
+ break;
+ }
+
+ uobj->uapi_object->type_class->lookup_put(uobj, mode);
+ /* Pairs with the kref obtained by type->lookup_get */
+ uverbs_uobject_put(uobj);
+}
+
+void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile)
+{
+ xa_init_flags(&ufile->idr, XA_FLAGS_ALLOC);
+}
+
+void release_ufile_idr_uobject(struct ib_uverbs_file *ufile)
+{
+ struct ib_uobject *entry;
+ unsigned long id;
+
+ /*
+ * At this point uverbs_cleanup_ufile() is guaranteed to have run, and
+ * there are no HW objects left, however the xarray is still populated
+ * with anything that has not been cleaned up by userspace. Since the
+ * kref on ufile is 0, nothing is allowed to call lookup_get.
+ *
+ * This is an optimized equivalent to remove_handle_idr_uobject
+ */
+ xa_for_each(&ufile->idr, id, entry) {
+ WARN_ON(entry->object);
+ uverbs_uobject_put(entry);
+ }
+
+ xa_destroy(&ufile->idr);
+}
+
+const struct uverbs_obj_type_class uverbs_idr_class = {
+ .alloc_begin = alloc_begin_idr_uobject,
+ .lookup_get = lookup_get_idr_uobject,
+ .alloc_commit = alloc_commit_idr_uobject,
+ .alloc_abort = alloc_abort_idr_uobject,
+ .lookup_put = lookup_put_idr_uobject,
+ .destroy_hw = destroy_hw_idr_uobject,
+ .remove_handle = remove_handle_idr_uobject,
+ .swap_uobjects = swap_idr_uobjects,
+};
+EXPORT_SYMBOL(uverbs_idr_class);
+
+/*
+ * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct
+ * file_operations release method.
+ */
+int uverbs_uobject_fd_release(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_file *ufile;
+ struct ib_uobject *uobj;
+
+ /*
+ * This can only happen if the fput came from alloc_abort_fd_uobject()
+ */
+ if (!filp->private_data)
+ return 0;
+ uobj = filp->private_data;
+ ufile = uobj->ufile;
+
+ if (down_read_trylock(&ufile->hw_destroy_rwsem)) {
+ struct uverbs_attr_bundle attrs = {
+ .context = uobj->context,
+ .ufile = ufile,
+ };
+
+ /*
+ * lookup_get_fd_uobject holds the kref on the struct file any
+ * time a FD uobj is locked, which prevents this release
+ * method from being invoked. Meaning we can always get the
+ * write lock here, or we have a kernel bug.
+ */
+ WARN_ON(uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE));
+ uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE, &attrs);
+ up_read(&ufile->hw_destroy_rwsem);
+ }
+
+ /* Matches the get in alloc_commit_fd_uobject() */
+ kref_put(&ufile->ref, ib_uverbs_release_file);
+
+ /* Pairs with filp->private_data in alloc_begin_fd_uobject */
+ uverbs_uobject_put(uobj);
+ return 0;
+}
+EXPORT_SYMBOL(uverbs_uobject_fd_release);
+
+/*
+ * Drop the ucontext off the ufile and completely disconnect it from the
+ * ib_device
+ */
+static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
+ enum rdma_remove_reason reason)
+{
+ struct ib_ucontext *ucontext = ufile->ucontext;
+ struct ib_device *ib_dev = ucontext->device;
+
+ /*
+ * If we are closing the FD then the user mmap VMAs must have
+ * already been destroyed as they hold on to the filep, otherwise
+ * they need to be zap'd.
+ */
+ if (reason == RDMA_REMOVE_DRIVER_REMOVE) {
+ uverbs_user_mmap_disassociate(ufile);
+ if (ib_dev->ops.disassociate_ucontext)
+ ib_dev->ops.disassociate_ucontext(ucontext);
+ }
+
+ ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev,
+ RDMACG_RESOURCE_HCA_HANDLE);
+
+ rdma_restrack_del(&ucontext->res);
+
+ ib_dev->ops.dealloc_ucontext(ucontext);
+ WARN_ON(!xa_empty(&ucontext->mmap_xa));
+ kfree(ucontext);
+
+ ufile->ucontext = NULL;
+}
+
+static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,
+ enum rdma_remove_reason reason)
+{
+ struct ib_uobject *obj, *next_obj;
+ int ret = -EINVAL;
+ struct uverbs_attr_bundle attrs = { .ufile = ufile };
+
+ /*
+ * This shouldn't run while executing other commands on this
+ * context. Thus, the only thing we should take care of is
+ * releasing a FD while traversing this list. The FD could be
+ * closed and released from the _release fop of this FD.
+ * In order to mitigate this, we add a lock.
+ * We take and release the lock per traversal in order to let
+ * other threads (which might still use the FDs) chance to run.
+ */
+ list_for_each_entry_safe(obj, next_obj, &ufile->uobjects, list) {
+ attrs.context = obj->context;
+ /*
+ * if we hit this WARN_ON, that means we are
+ * racing with a lookup_get.
+ */
+ WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE));
+ if (reason == RDMA_REMOVE_DRIVER_FAILURE)
+ obj->object = NULL;
+ if (!uverbs_destroy_uobject(obj, reason, &attrs))
+ ret = 0;
+ else
+ atomic_set(&obj->usecnt, 0);
+ }
+
+ if (reason == RDMA_REMOVE_DRIVER_FAILURE) {
+ WARN_ON(!list_empty(&ufile->uobjects));
+ return 0;
+ }
+ return ret;
+}
+
+/*
+ * Destroy the ucontext and every uobject associated with it.
+ *
+ * This is internally locked and can be called in parallel from multiple
+ * contexts.
+ */
+void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile,
+ enum rdma_remove_reason reason)
+{
+ down_write(&ufile->hw_destroy_rwsem);
+
+ /*
+ * If a ucontext was never created then we can't have any uobjects to
+ * cleanup, nothing to do.
+ */
+ if (!ufile->ucontext)
+ goto done;
+
+ while (!list_empty(&ufile->uobjects) &&
+ !__uverbs_cleanup_ufile(ufile, reason)) {
+ }
+
+ if (WARN_ON(!list_empty(&ufile->uobjects)))
+ __uverbs_cleanup_ufile(ufile, RDMA_REMOVE_DRIVER_FAILURE);
+ ufile_destroy_ucontext(ufile, reason);
+
+done:
+ up_write(&ufile->hw_destroy_rwsem);
+}
+
+const struct uverbs_obj_type_class uverbs_fd_class = {
+ .alloc_begin = alloc_begin_fd_uobject,
+ .lookup_get = lookup_get_fd_uobject,
+ .alloc_commit = alloc_commit_fd_uobject,
+ .alloc_abort = alloc_abort_fd_uobject,
+ .lookup_put = lookup_put_fd_uobject,
+ .destroy_hw = destroy_hw_fd_uobject,
+ .remove_handle = remove_handle_fd_uobject,
+};
+EXPORT_SYMBOL(uverbs_fd_class);
+
+struct ib_uobject *
+uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access,
+ s64 id, struct uverbs_attr_bundle *attrs)
+{
+ const struct uverbs_api_object *obj =
+ uapi_get_object(attrs->ufile->device->uapi, object_id);
+
+ switch (access) {
+ case UVERBS_ACCESS_READ:
+ return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+ UVERBS_LOOKUP_READ, attrs);
+ case UVERBS_ACCESS_DESTROY:
+ /* Actual destruction is done inside uverbs_handle_method */
+ return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+ UVERBS_LOOKUP_DESTROY, attrs);
+ case UVERBS_ACCESS_WRITE:
+ return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+ UVERBS_LOOKUP_WRITE, attrs);
+ case UVERBS_ACCESS_NEW:
+ return rdma_alloc_begin_uobject(obj, attrs);
+ default:
+ WARN_ON(true);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+}
+
+void uverbs_finalize_object(struct ib_uobject *uobj,
+ enum uverbs_obj_access access, bool hw_obj_valid,
+ bool commit, struct uverbs_attr_bundle *attrs)
+{
+ /*
+ * refcounts should be handled at the object level and not at the
+ * uobject level. Refcounts of the objects themselves are done in
+ * handlers.
+ */
+
+ switch (access) {
+ case UVERBS_ACCESS_READ:
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);
+ break;
+ case UVERBS_ACCESS_WRITE:
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE);
+ break;
+ case UVERBS_ACCESS_DESTROY:
+ if (uobj)
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY);
+ break;
+ case UVERBS_ACCESS_NEW:
+ if (commit)
+ rdma_alloc_commit_uobject(uobj, attrs);
+ else
+ rdma_alloc_abort_uobject(uobj, attrs, hw_obj_valid);
+ break;
+ default:
+ WARN_ON(true);
+ }
+}
diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h
new file mode 100644
index 000000000..33706dad6
--- /dev/null
+++ b/drivers/infiniband/core/rdma_core.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005-2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef RDMA_CORE_H
+#define RDMA_CORE_H
+
+#include <linux/idr.h>
+#include <rdma/uverbs_types.h>
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/ib_verbs.h>
+#include <linux/mutex.h>
+
+struct ib_uverbs_device;
+
+void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile,
+ enum rdma_remove_reason reason);
+
+int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs);
+
+/*
+ * Get an ib_uobject that corresponds to the given id from ufile, assuming
+ * the object is from the given type. Lock it to the required access when
+ * applicable.
+ * This function could create (access == NEW), destroy (access == DESTROY)
+ * or unlock (access == READ || access == WRITE) objects if required.
+ * The action will be finalized only when uverbs_finalize_object or
+ * uverbs_finalize_objects are called.
+ */
+struct ib_uobject *
+uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access,
+ s64 id, struct uverbs_attr_bundle *attrs);
+
+void uverbs_finalize_object(struct ib_uobject *uobj,
+ enum uverbs_obj_access access, bool hw_obj_valid,
+ bool commit, struct uverbs_attr_bundle *attrs);
+
+int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx);
+
+void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile);
+void release_ufile_idr_uobject(struct ib_uverbs_file *ufile);
+
+struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs);
+
+/*
+ * This is the runtime description of the uverbs API, used by the syscall
+ * machinery to validate and dispatch calls.
+ */
+
+/*
+ * Depending on ID the slot pointer in the radix tree points at one of these
+ * structs.
+ */
+
+struct uverbs_api_ioctl_method {
+ int(__rcu *handler)(struct uverbs_attr_bundle *attrs);
+ DECLARE_BITMAP(attr_mandatory, UVERBS_API_ATTR_BKEY_LEN);
+ u16 bundle_size;
+ u8 use_stack:1;
+ u8 driver_method:1;
+ u8 disabled:1;
+ u8 has_udata:1;
+ u8 key_bitmap_len;
+ u8 destroy_bkey;
+};
+
+struct uverbs_api_write_method {
+ int (*handler)(struct uverbs_attr_bundle *attrs);
+ u8 disabled:1;
+ u8 is_ex:1;
+ u8 has_udata:1;
+ u8 has_resp:1;
+ u8 req_size;
+ u8 resp_size;
+};
+
+struct uverbs_api_attr {
+ struct uverbs_attr_spec spec;
+};
+
+struct uverbs_api {
+ /* radix tree contains struct uverbs_api_* pointers */
+ struct radix_tree_root radix;
+ enum rdma_driver_id driver_id;
+
+ unsigned int num_write;
+ unsigned int num_write_ex;
+ struct uverbs_api_write_method notsupp_method;
+ const struct uverbs_api_write_method **write_methods;
+ const struct uverbs_api_write_method **write_ex_methods;
+};
+
+/*
+ * Get an uverbs_api_object that corresponds to the given object_id.
+ * Note:
+ * -ENOMSG means that any object is allowed to match during lookup.
+ */
+static inline const struct uverbs_api_object *
+uapi_get_object(struct uverbs_api *uapi, u16 object_id)
+{
+ const struct uverbs_api_object *res;
+
+ if (object_id == UVERBS_IDR_ANY_OBJECT)
+ return ERR_PTR(-ENOMSG);
+
+ res = radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id));
+ if (!res)
+ return ERR_PTR(-ENOENT);
+
+ return res;
+}
+
+char *uapi_key_format(char *S, unsigned int key);
+struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev);
+void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev);
+void uverbs_disassociate_api(struct uverbs_api *uapi);
+void uverbs_destroy_api(struct uverbs_api *uapi);
+void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
+ unsigned int num_attrs);
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile);
+
+extern const struct uapi_definition uverbs_def_obj_async_fd[];
+extern const struct uapi_definition uverbs_def_obj_counters[];
+extern const struct uapi_definition uverbs_def_obj_cq[];
+extern const struct uapi_definition uverbs_def_obj_device[];
+extern const struct uapi_definition uverbs_def_obj_dm[];
+extern const struct uapi_definition uverbs_def_obj_flow_action[];
+extern const struct uapi_definition uverbs_def_obj_intf[];
+extern const struct uapi_definition uverbs_def_obj_mr[];
+extern const struct uapi_definition uverbs_def_obj_qp[];
+extern const struct uapi_definition uverbs_def_obj_srq[];
+extern const struct uapi_definition uverbs_def_obj_wq[];
+extern const struct uapi_definition uverbs_def_write_intf[];
+
+static inline const struct uverbs_api_write_method *
+uapi_get_method(const struct uverbs_api *uapi, u32 command)
+{
+ u32 cmd_idx = command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+ if (command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED |
+ IB_USER_VERBS_CMD_COMMAND_MASK))
+ return ERR_PTR(-EINVAL);
+
+ if (command & IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+ if (cmd_idx >= uapi->num_write_ex)
+ return ERR_PTR(-EOPNOTSUPP);
+ return uapi->write_ex_methods[cmd_idx];
+ }
+
+ if (cmd_idx >= uapi->num_write)
+ return ERR_PTR(-EOPNOTSUPP);
+ return uapi->write_methods[cmd_idx];
+}
+
+void uverbs_fill_udata(struct uverbs_attr_bundle *bundle,
+ struct ib_udata *udata, unsigned int attr_in,
+ unsigned int attr_out);
+
+#endif /* RDMA_CORE_H */
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
new file mode 100644
index 000000000..01a499a8b
--- /dev/null
+++ b/drivers/infiniband/core/restrack.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved.
+ */
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/restrack.h>
+#include <rdma/rdma_counter.h>
+#include <linux/mutex.h>
+#include <linux/sched/task.h>
+#include <linux/pid_namespace.h>
+
+#include "cma_priv.h"
+#include "restrack.h"
+
+/**
+ * rdma_restrack_init() - initialize and allocate resource tracking
+ * @dev: IB device
+ *
+ * Return: 0 on success
+ */
+int rdma_restrack_init(struct ib_device *dev)
+{
+ struct rdma_restrack_root *rt;
+ int i;
+
+ dev->res = kcalloc(RDMA_RESTRACK_MAX, sizeof(*rt), GFP_KERNEL);
+ if (!dev->res)
+ return -ENOMEM;
+
+ rt = dev->res;
+
+ for (i = 0; i < RDMA_RESTRACK_MAX; i++)
+ xa_init_flags(&rt[i].xa, XA_FLAGS_ALLOC);
+
+ return 0;
+}
+
+static const char *type2str(enum rdma_restrack_type type)
+{
+ static const char * const names[RDMA_RESTRACK_MAX] = {
+ [RDMA_RESTRACK_PD] = "PD",
+ [RDMA_RESTRACK_CQ] = "CQ",
+ [RDMA_RESTRACK_QP] = "QP",
+ [RDMA_RESTRACK_CM_ID] = "CM_ID",
+ [RDMA_RESTRACK_MR] = "MR",
+ [RDMA_RESTRACK_CTX] = "CTX",
+ [RDMA_RESTRACK_COUNTER] = "COUNTER",
+ [RDMA_RESTRACK_SRQ] = "SRQ",
+ };
+
+ return names[type];
+};
+
+/**
+ * rdma_restrack_clean() - clean resource tracking
+ * @dev: IB device
+ */
+void rdma_restrack_clean(struct ib_device *dev)
+{
+ struct rdma_restrack_root *rt = dev->res;
+ struct rdma_restrack_entry *e;
+ char buf[TASK_COMM_LEN];
+ bool found = false;
+ const char *owner;
+ int i;
+
+ for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) {
+ struct xarray *xa = &dev->res[i].xa;
+
+ if (!xa_empty(xa)) {
+ unsigned long index;
+
+ if (!found) {
+ pr_err("restrack: %s", CUT_HERE);
+ dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n");
+ }
+ xa_for_each(xa, index, e) {
+ if (rdma_is_kernel_res(e)) {
+ owner = e->kern_name;
+ } else {
+ /*
+ * There is no need to call get_task_struct here,
+ * because we can be here only if there are more
+ * get_task_struct() call than put_task_struct().
+ */
+ get_task_comm(buf, e->task);
+ owner = buf;
+ }
+
+ pr_err("restrack: %s %s object allocated by %s is not freed\n",
+ rdma_is_kernel_res(e) ? "Kernel" :
+ "User",
+ type2str(e->type), owner);
+ }
+ found = true;
+ }
+ xa_destroy(xa);
+ }
+ if (found)
+ pr_err("restrack: %s", CUT_HERE);
+
+ kfree(rt);
+}
+
+/**
+ * rdma_restrack_count() - the current usage of specific object
+ * @dev: IB device
+ * @type: actual type of object to operate
+ */
+int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type)
+{
+ struct rdma_restrack_root *rt = &dev->res[type];
+ struct rdma_restrack_entry *e;
+ XA_STATE(xas, &rt->xa, 0);
+ u32 cnt = 0;
+
+ xa_lock(&rt->xa);
+ xas_for_each(&xas, e, U32_MAX)
+ cnt++;
+ xa_unlock(&rt->xa);
+ return cnt;
+}
+EXPORT_SYMBOL(rdma_restrack_count);
+
+static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
+{
+ switch (res->type) {
+ case RDMA_RESTRACK_PD:
+ return container_of(res, struct ib_pd, res)->device;
+ case RDMA_RESTRACK_CQ:
+ return container_of(res, struct ib_cq, res)->device;
+ case RDMA_RESTRACK_QP:
+ return container_of(res, struct ib_qp, res)->device;
+ case RDMA_RESTRACK_CM_ID:
+ return container_of(res, struct rdma_id_private,
+ res)->id.device;
+ case RDMA_RESTRACK_MR:
+ return container_of(res, struct ib_mr, res)->device;
+ case RDMA_RESTRACK_CTX:
+ return container_of(res, struct ib_ucontext, res)->device;
+ case RDMA_RESTRACK_COUNTER:
+ return container_of(res, struct rdma_counter, res)->device;
+ case RDMA_RESTRACK_SRQ:
+ return container_of(res, struct ib_srq, res)->device;
+ default:
+ WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
+ return NULL;
+ }
+}
+
+/**
+ * rdma_restrack_attach_task() - attach the task onto this resource,
+ * valid for user space restrack entries.
+ * @res: resource entry
+ * @task: the task to attach
+ */
+static void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
+ struct task_struct *task)
+{
+ if (WARN_ON_ONCE(!task))
+ return;
+
+ if (res->task)
+ put_task_struct(res->task);
+ get_task_struct(task);
+ res->task = task;
+ res->user = true;
+}
+
+/**
+ * rdma_restrack_set_name() - set the task for this resource
+ * @res: resource entry
+ * @caller: kernel name, the current task will be used if the caller is NULL.
+ */
+void rdma_restrack_set_name(struct rdma_restrack_entry *res, const char *caller)
+{
+ if (caller) {
+ res->kern_name = caller;
+ return;
+ }
+
+ rdma_restrack_attach_task(res, current);
+}
+EXPORT_SYMBOL(rdma_restrack_set_name);
+
+/**
+ * rdma_restrack_parent_name() - set the restrack name properties based
+ * on parent restrack
+ * @dst: destination resource entry
+ * @parent: parent resource entry
+ */
+void rdma_restrack_parent_name(struct rdma_restrack_entry *dst,
+ const struct rdma_restrack_entry *parent)
+{
+ if (rdma_is_kernel_res(parent))
+ dst->kern_name = parent->kern_name;
+ else
+ rdma_restrack_attach_task(dst, parent->task);
+}
+EXPORT_SYMBOL(rdma_restrack_parent_name);
+
+/**
+ * rdma_restrack_new() - Initializes new restrack entry to allow _put() interface
+ * to release memory in fully automatic way.
+ * @res: Entry to initialize
+ * @type: REstrack type
+ */
+void rdma_restrack_new(struct rdma_restrack_entry *res,
+ enum rdma_restrack_type type)
+{
+ kref_init(&res->kref);
+ init_completion(&res->comp);
+ res->type = type;
+}
+EXPORT_SYMBOL(rdma_restrack_new);
+
+/**
+ * rdma_restrack_add() - add object to the reource tracking database
+ * @res: resource entry
+ */
+void rdma_restrack_add(struct rdma_restrack_entry *res)
+{
+ struct ib_device *dev = res_to_dev(res);
+ struct rdma_restrack_root *rt;
+ int ret = 0;
+
+ if (!dev)
+ return;
+
+ if (res->no_track)
+ goto out;
+
+ rt = &dev->res[res->type];
+
+ if (res->type == RDMA_RESTRACK_QP) {
+ /* Special case to ensure that LQPN points to right QP */
+ struct ib_qp *qp = container_of(res, struct ib_qp, res);
+
+ WARN_ONCE(qp->qp_num >> 24 || qp->port >> 8,
+ "QP number 0x%0X and port 0x%0X", qp->qp_num,
+ qp->port);
+ res->id = qp->qp_num;
+ if (qp->qp_type == IB_QPT_SMI || qp->qp_type == IB_QPT_GSI)
+ res->id |= qp->port << 24;
+ ret = xa_insert(&rt->xa, res->id, res, GFP_KERNEL);
+ if (ret)
+ res->id = 0;
+ } else if (res->type == RDMA_RESTRACK_COUNTER) {
+ /* Special case to ensure that cntn points to right counter */
+ struct rdma_counter *counter;
+
+ counter = container_of(res, struct rdma_counter, res);
+ ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL);
+ res->id = ret ? 0 : counter->id;
+ } else {
+ ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
+ &rt->next_id, GFP_KERNEL);
+ ret = (ret < 0) ? ret : 0;
+ }
+
+out:
+ if (!ret)
+ res->valid = true;
+}
+EXPORT_SYMBOL(rdma_restrack_add);
+
+int __must_check rdma_restrack_get(struct rdma_restrack_entry *res)
+{
+ return kref_get_unless_zero(&res->kref);
+}
+EXPORT_SYMBOL(rdma_restrack_get);
+
+/**
+ * rdma_restrack_get_byid() - translate from ID to restrack object
+ * @dev: IB device
+ * @type: resource track type
+ * @id: ID to take a look
+ *
+ * Return: Pointer to restrack entry or -ENOENT in case of error.
+ */
+struct rdma_restrack_entry *
+rdma_restrack_get_byid(struct ib_device *dev,
+ enum rdma_restrack_type type, u32 id)
+{
+ struct rdma_restrack_root *rt = &dev->res[type];
+ struct rdma_restrack_entry *res;
+
+ xa_lock(&rt->xa);
+ res = xa_load(&rt->xa, id);
+ if (!res || !rdma_restrack_get(res))
+ res = ERR_PTR(-ENOENT);
+ xa_unlock(&rt->xa);
+
+ return res;
+}
+EXPORT_SYMBOL(rdma_restrack_get_byid);
+
+static void restrack_release(struct kref *kref)
+{
+ struct rdma_restrack_entry *res;
+
+ res = container_of(kref, struct rdma_restrack_entry, kref);
+ if (res->task) {
+ put_task_struct(res->task);
+ res->task = NULL;
+ }
+ complete(&res->comp);
+}
+
+int rdma_restrack_put(struct rdma_restrack_entry *res)
+{
+ return kref_put(&res->kref, restrack_release);
+}
+EXPORT_SYMBOL(rdma_restrack_put);
+
+/**
+ * rdma_restrack_del() - delete object from the reource tracking database
+ * @res: resource entry
+ */
+void rdma_restrack_del(struct rdma_restrack_entry *res)
+{
+ struct rdma_restrack_entry *old;
+ struct rdma_restrack_root *rt;
+ struct ib_device *dev;
+
+ if (!res->valid) {
+ if (res->task) {
+ put_task_struct(res->task);
+ res->task = NULL;
+ }
+ return;
+ }
+
+ if (res->no_track)
+ goto out;
+
+ dev = res_to_dev(res);
+ if (WARN_ON(!dev))
+ return;
+
+ rt = &dev->res[res->type];
+
+ old = xa_erase(&rt->xa, res->id);
+ WARN_ON(old != res);
+
+out:
+ res->valid = false;
+ rdma_restrack_put(res);
+ wait_for_completion(&res->comp);
+}
+EXPORT_SYMBOL(rdma_restrack_del);
diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h
new file mode 100644
index 000000000..6a04fc41f
--- /dev/null
+++ b/drivers/infiniband/core/restrack.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_CORE_RESTRACK_H_
+#define _RDMA_CORE_RESTRACK_H_
+
+#include <linux/mutex.h>
+
+/**
+ * struct rdma_restrack_root - main resource tracking management
+ * entity, per-device
+ */
+struct rdma_restrack_root {
+ /**
+ * @xa: Array of XArray structure to hold restrack entries.
+ */
+ struct xarray xa;
+ /**
+ * @next_id: Next ID to support cyclic allocation
+ */
+ u32 next_id;
+};
+
+int rdma_restrack_init(struct ib_device *dev);
+void rdma_restrack_clean(struct ib_device *dev);
+void rdma_restrack_add(struct rdma_restrack_entry *res);
+void rdma_restrack_del(struct rdma_restrack_entry *res);
+void rdma_restrack_new(struct rdma_restrack_entry *res,
+ enum rdma_restrack_type type);
+void rdma_restrack_set_name(struct rdma_restrack_entry *res,
+ const char *caller);
+void rdma_restrack_parent_name(struct rdma_restrack_entry *dst,
+ const struct rdma_restrack_entry *parent);
+#endif /* _RDMA_CORE_RESTRACK_H_ */
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
new file mode 100644
index 000000000..e958c43dd
--- /dev/null
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -0,0 +1,929 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/in.h>
+#include <linux/in6.h>
+
+/* For in6_dev_get/in6_dev_put */
+#include <net/addrconf.h>
+#include <net/bonding.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+static struct workqueue_struct *gid_cache_wq;
+
+enum gid_op_type {
+ GID_DEL = 0,
+ GID_ADD
+};
+
+struct update_gid_event_work {
+ struct work_struct work;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+ enum gid_op_type gid_op;
+};
+
+#define ROCE_NETDEV_CALLBACK_SZ 3
+struct netdev_event_work_cmd {
+ roce_netdev_callback cb;
+ roce_netdev_filter filter;
+ struct net_device *ndev;
+ struct net_device *filter_ndev;
+};
+
+struct netdev_event_work {
+ struct work_struct work;
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ];
+};
+
+static const struct {
+ bool (*is_supported)(const struct ib_device *device, u32 port_num);
+ enum ib_gid_type gid_type;
+} PORT_CAP_TO_GID_TYPE[] = {
+ {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE},
+ {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP},
+};
+
+#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE)
+
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port)
+{
+ int i;
+ unsigned int ret_flags = 0;
+
+ if (!rdma_protocol_roce(ib_dev, port))
+ return 1UL << IB_GID_TYPE_IB;
+
+ for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++)
+ if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port))
+ ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type;
+
+ return ret_flags;
+}
+EXPORT_SYMBOL(roce_gid_type_mask_support);
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+ u32 port, union ib_gid *gid,
+ struct ib_gid_attr *gid_attr)
+{
+ int i;
+ unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+ for (i = 0; i < IB_GID_TYPE_SIZE; i++) {
+ if ((1UL << i) & gid_type_mask) {
+ gid_attr->gid_type = i;
+ switch (gid_op) {
+ case GID_ADD:
+ ib_cache_gid_add(ib_dev, port,
+ gid, gid_attr);
+ break;
+ case GID_DEL:
+ ib_cache_gid_del(ib_dev, port,
+ gid, gid_attr);
+ break;
+ }
+ }
+ }
+}
+
+enum bonding_slave_state {
+ BONDING_SLAVE_STATE_ACTIVE = 1UL << 0,
+ BONDING_SLAVE_STATE_INACTIVE = 1UL << 1,
+ /* No primary slave or the device isn't a slave in bonding */
+ BONDING_SLAVE_STATE_NA = 1UL << 2,
+};
+
+static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev,
+ struct net_device *upper)
+{
+ if (upper && netif_is_bond_master(upper)) {
+ struct net_device *pdev =
+ bond_option_active_slave_get_rcu(netdev_priv(upper));
+
+ if (pdev)
+ return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE :
+ BONDING_SLAVE_STATE_INACTIVE;
+ }
+
+ return BONDING_SLAVE_STATE_NA;
+}
+
+#define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \
+ BONDING_SLAVE_STATE_NA)
+static bool
+is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *real_dev;
+ bool res;
+
+ if (!rdma_ndev)
+ return false;
+
+ rcu_read_lock();
+ real_dev = rdma_vlan_dev_real_dev(cookie);
+ if (!real_dev)
+ real_dev = cookie;
+
+ res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) &&
+ (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
+ REQUIRED_BOND_STATES)) ||
+ real_dev == rdma_ndev);
+
+ rcu_read_unlock();
+ return res;
+}
+
+static bool
+is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *master_dev;
+ bool res;
+
+ if (!rdma_ndev)
+ return false;
+
+ rcu_read_lock();
+ master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+ res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) ==
+ BONDING_SLAVE_STATE_INACTIVE;
+ rcu_read_unlock();
+
+ return res;
+}
+
+/**
+ * is_ndev_for_default_gid_filter - Check if a given netdevice
+ * can be considered for default GIDs or not.
+ * @ib_dev: IB device to check
+ * @port: Port to consider for adding default GID
+ * @rdma_ndev: rdma netdevice pointer
+ * @cookie: Netdevice to consider to form a default GID
+ *
+ * is_ndev_for_default_gid_filter() returns true if a given netdevice can be
+ * considered for deriving default RoCE GID, returns false otherwise.
+ */
+static bool
+is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *cookie_ndev = cookie;
+ bool res;
+
+ if (!rdma_ndev)
+ return false;
+
+ rcu_read_lock();
+
+ /*
+ * When rdma netdevice is used in bonding, bonding master netdevice
+ * should be considered for default GIDs. Therefore, ignore slave rdma
+ * netdevices when bonding is considered.
+ * Additionally when event(cookie) netdevice is bond master device,
+ * make sure that it the upper netdevice of rdma netdevice.
+ */
+ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) ||
+ (netif_is_bond_master(cookie_ndev) &&
+ rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)));
+
+ rcu_read_unlock();
+ return res;
+}
+
+static bool pass_all_filter(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ return true;
+}
+
+static bool upper_device_filter(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ bool res;
+
+ if (!rdma_ndev)
+ return false;
+
+ if (rdma_ndev == cookie)
+ return true;
+
+ rcu_read_lock();
+ res = rdma_is_upper_dev_rcu(rdma_ndev, cookie);
+ rcu_read_unlock();
+
+ return res;
+}
+
+/**
+ * is_upper_ndev_bond_master_filter - Check if a given netdevice
+ * is bond master device of netdevice of the RDMA device of port.
+ * @ib_dev: IB device to check
+ * @port: Port to consider for adding default GID
+ * @rdma_ndev: Pointer to rdma netdevice
+ * @cookie: Netdevice to consider to form a default GID
+ *
+ * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev
+ * is bond master device and rdma_ndev is its lower netdevice. It might
+ * not have been established as slave device yet.
+ */
+static bool
+is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net_device *cookie_ndev = cookie;
+ bool match = false;
+
+ if (!rdma_ndev)
+ return false;
+
+ rcu_read_lock();
+ if (netif_is_bond_master(cookie_ndev) &&
+ rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))
+ match = true;
+ rcu_read_unlock();
+ return match;
+}
+
+static void update_gid_ip(enum gid_op_type gid_op,
+ struct ib_device *ib_dev,
+ u32 port, struct net_device *ndev,
+ struct sockaddr *addr)
+{
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+
+ rdma_ip2gid(addr, &gid);
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
+}
+
+static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
+ u32 port,
+ struct net_device *rdma_ndev,
+ struct net_device *event_ndev)
+{
+ struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev);
+ unsigned long gid_type_mask;
+
+ if (!rdma_ndev)
+ return;
+
+ if (!real_dev)
+ real_dev = event_ndev;
+
+ rcu_read_lock();
+
+ if (((rdma_ndev != event_ndev &&
+ !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
+ is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev)
+ ==
+ BONDING_SLAVE_STATE_INACTIVE)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ rcu_read_unlock();
+
+ gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+ ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+ gid_type_mask,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE);
+}
+
+static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
+ u32 port, struct net_device *ndev)
+{
+ const struct in_ifaddr *ifa;
+ struct in_device *in_dev;
+ struct sin_list {
+ struct list_head list;
+ struct sockaddr_in ip;
+ };
+ struct sin_list *sin_iter;
+ struct sin_list *sin_temp;
+
+ LIST_HEAD(sin_list);
+ if (ndev->reg_state >= NETREG_UNREGISTERING)
+ return;
+
+ rcu_read_lock();
+ in_dev = __in_dev_get_rcu(ndev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ return;
+ }
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ struct sin_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+ if (!entry)
+ continue;
+
+ entry->ip.sin_family = AF_INET;
+ entry->ip.sin_addr.s_addr = ifa->ifa_address;
+ list_add_tail(&entry->list, &sin_list);
+ }
+
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) {
+ update_gid_ip(GID_ADD, ib_dev, port, ndev,
+ (struct sockaddr *)&sin_iter->ip);
+ list_del(&sin_iter->list);
+ kfree(sin_iter);
+ }
+}
+
+static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
+ u32 port, struct net_device *ndev)
+{
+ struct inet6_ifaddr *ifp;
+ struct inet6_dev *in6_dev;
+ struct sin6_list {
+ struct list_head list;
+ struct sockaddr_in6 sin6;
+ };
+ struct sin6_list *sin6_iter;
+ struct sin6_list *sin6_temp;
+ struct ib_gid_attr gid_attr = {.ndev = ndev};
+ LIST_HEAD(sin6_list);
+
+ if (ndev->reg_state >= NETREG_UNREGISTERING)
+ return;
+
+ in6_dev = in6_dev_get(ndev);
+ if (!in6_dev)
+ return;
+
+ read_lock_bh(&in6_dev->lock);
+ list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+ struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+ if (!entry)
+ continue;
+
+ entry->sin6.sin6_family = AF_INET6;
+ entry->sin6.sin6_addr = ifp->addr;
+ list_add_tail(&entry->list, &sin6_list);
+ }
+ read_unlock_bh(&in6_dev->lock);
+
+ in6_dev_put(in6_dev);
+
+ list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) {
+ union ib_gid gid;
+
+ rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid);
+ update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
+ list_del(&sin6_iter->list);
+ kfree(sin6_iter);
+ }
+}
+
+static void _add_netdev_ips(struct ib_device *ib_dev, u32 port,
+ struct net_device *ndev)
+{
+ enum_netdev_ipv4_ips(ib_dev, port, ndev);
+ if (IS_ENABLED(CONFIG_IPV6))
+ enum_netdev_ipv6_ips(ib_dev, port, ndev);
+}
+
+static void add_netdev_ips(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ _add_netdev_ips(ib_dev, port, cookie);
+}
+
+static void del_netdev_ips(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie);
+}
+
+/**
+ * del_default_gids - Delete default GIDs of the event/cookie netdevice
+ * @ib_dev: RDMA device pointer
+ * @port: Port of the RDMA device whose GID table to consider
+ * @rdma_ndev: Unused rdma netdevice
+ * @cookie: Pointer to event netdevice
+ *
+ * del_default_gids() deletes the default GIDs of the event/cookie netdevice.
+ */
+static void del_default_gids(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *cookie_ndev = cookie;
+ unsigned long gid_type_mask;
+
+ gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+ ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE);
+}
+
+static void add_default_gids(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = cookie;
+ unsigned long gid_type_mask;
+
+ gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+ ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask,
+ IB_CACHE_GID_DEFAULT_MODE_SET);
+}
+
+static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
+ u32 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net *net;
+ struct net_device *ndev;
+
+ /* Lock the rtnl to make sure the netdevs does not move under
+ * our feet
+ */
+ rtnl_lock();
+ down_read(&net_rwsem);
+ for_each_net(net)
+ for_each_netdev(net, ndev) {
+ /*
+ * Filter and add default GIDs of the primary netdevice
+ * when not in bonding mode, or add default GIDs
+ * of bond master device, when in bonding mode.
+ */
+ if (is_ndev_for_default_gid_filter(ib_dev, port,
+ rdma_ndev, ndev))
+ add_default_gids(ib_dev, port, rdma_ndev, ndev);
+
+ if (is_eth_port_of_netdev_filter(ib_dev, port,
+ rdma_ndev, ndev))
+ _add_netdev_ips(ib_dev, port, ndev);
+ }
+ up_read(&net_rwsem);
+ rtnl_unlock();
+}
+
+/**
+ * rdma_roce_rescan_device - Rescan all of the network devices in the system
+ * and add their gids, as needed, to the relevant RoCE devices.
+ *
+ * @ib_dev: the rdma device
+ */
+void rdma_roce_rescan_device(struct ib_device *ib_dev)
+{
+ ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
+ enum_all_gids_of_dev_cb, NULL);
+}
+EXPORT_SYMBOL(rdma_roce_rescan_device);
+
+static void callback_for_addr_gid_device_scan(struct ib_device *device,
+ u32 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct update_gid_event_work *parsed = cookie;
+
+ return update_gid(parsed->gid_op, device,
+ port, &parsed->gid,
+ &parsed->gid_attr);
+}
+
+struct upper_list {
+ struct list_head list;
+ struct net_device *upper;
+};
+
+static int netdev_upper_walk(struct net_device *upper,
+ struct netdev_nested_priv *priv)
+{
+ struct upper_list *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ struct list_head *upper_list = (struct list_head *)priv->data;
+
+ if (!entry)
+ return 0;
+
+ list_add_tail(&entry->list, upper_list);
+ dev_hold(upper);
+ entry->upper = upper;
+
+ return 0;
+}
+
+static void handle_netdev_upper(struct ib_device *ib_dev, u32 port,
+ void *cookie,
+ void (*handle_netdev)(struct ib_device *ib_dev,
+ u32 port,
+ struct net_device *ndev))
+{
+ struct net_device *ndev = cookie;
+ struct netdev_nested_priv priv;
+ struct upper_list *upper_iter;
+ struct upper_list *upper_temp;
+ LIST_HEAD(upper_list);
+
+ priv.data = &upper_list;
+ rcu_read_lock();
+ netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv);
+ rcu_read_unlock();
+
+ handle_netdev(ib_dev, port, ndev);
+ list_for_each_entry_safe(upper_iter, upper_temp, &upper_list,
+ list) {
+ handle_netdev(ib_dev, port, upper_iter->upper);
+ dev_put(upper_iter->upper);
+ list_del(&upper_iter->list);
+ kfree(upper_iter);
+ }
+}
+
+static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port,
+ struct net_device *event_ndev)
+{
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
+}
+
+static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips);
+}
+
+static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net_device *master_ndev;
+
+ rcu_read_lock();
+ master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+ if (master_ndev)
+ dev_hold(master_ndev);
+ rcu_read_unlock();
+
+ if (master_ndev) {
+ bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev,
+ master_ndev);
+ dev_put(master_ndev);
+ }
+}
+
+/* The following functions operate on all IB devices. netdevice_event and
+ * addr_event execute ib_enum_all_roce_netdevs through a work.
+ * ib_enum_all_roce_netdevs iterates through all IB devices.
+ */
+
+static void netdevice_event_work_handler(struct work_struct *_work)
+{
+ struct netdev_event_work *work =
+ container_of(_work, struct netdev_event_work, work);
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
+ ib_enum_all_roce_netdevs(work->cmds[i].filter,
+ work->cmds[i].filter_ndev,
+ work->cmds[i].cb,
+ work->cmds[i].ndev);
+ dev_put(work->cmds[i].ndev);
+ dev_put(work->cmds[i].filter_ndev);
+ }
+
+ kfree(work);
+}
+
+static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
+ struct net_device *ndev)
+{
+ unsigned int i;
+ struct netdev_event_work *ndev_work =
+ kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+
+ if (!ndev_work)
+ return NOTIFY_DONE;
+
+ memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
+ for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
+ if (!ndev_work->cmds[i].ndev)
+ ndev_work->cmds[i].ndev = ndev;
+ if (!ndev_work->cmds[i].filter_ndev)
+ ndev_work->cmds[i].filter_ndev = ndev;
+ dev_hold(ndev_work->cmds[i].ndev);
+ dev_hold(ndev_work->cmds[i].filter_ndev);
+ }
+ INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
+
+ queue_work(gid_cache_wq, &ndev_work->work);
+
+ return NOTIFY_DONE;
+}
+
+static const struct netdev_event_work_cmd add_cmd = {
+ .cb = add_netdev_ips,
+ .filter = is_eth_port_of_netdev_filter
+};
+
+static const struct netdev_event_work_cmd add_cmd_upper_ips = {
+ .cb = add_netdev_upper_ips,
+ .filter = is_eth_port_of_netdev_filter
+};
+
+static void
+ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info,
+ struct netdev_event_work_cmd *cmds)
+{
+ static const struct netdev_event_work_cmd
+ upper_ips_del_cmd = {
+ .cb = del_netdev_upper_ips,
+ .filter = upper_device_filter
+ };
+
+ cmds[0] = upper_ips_del_cmd;
+ cmds[0].ndev = changeupper_info->upper_dev;
+ cmds[1] = add_cmd;
+}
+
+static const struct netdev_event_work_cmd bonding_default_add_cmd = {
+ .cb = add_default_gids,
+ .filter = is_upper_ndev_bond_master_filter
+};
+
+static void
+ndev_event_link(struct net_device *event_ndev,
+ struct netdev_notifier_changeupper_info *changeupper_info,
+ struct netdev_event_work_cmd *cmds)
+{
+ static const struct netdev_event_work_cmd
+ bonding_default_del_cmd = {
+ .cb = del_default_gids,
+ .filter = is_upper_ndev_bond_master_filter
+ };
+ /*
+ * When a lower netdev is linked to its upper bonding
+ * netdev, delete lower slave netdev's default GIDs.
+ */
+ cmds[0] = bonding_default_del_cmd;
+ cmds[0].ndev = event_ndev;
+ cmds[0].filter_ndev = changeupper_info->upper_dev;
+
+ /* Now add bonding upper device default GIDs */
+ cmds[1] = bonding_default_add_cmd;
+ cmds[1].ndev = changeupper_info->upper_dev;
+ cmds[1].filter_ndev = changeupper_info->upper_dev;
+
+ /* Now add bonding upper device IP based GIDs */
+ cmds[2] = add_cmd_upper_ips;
+ cmds[2].ndev = changeupper_info->upper_dev;
+ cmds[2].filter_ndev = changeupper_info->upper_dev;
+}
+
+static void netdevice_event_changeupper(struct net_device *event_ndev,
+ struct netdev_notifier_changeupper_info *changeupper_info,
+ struct netdev_event_work_cmd *cmds)
+{
+ if (changeupper_info->linking)
+ ndev_event_link(event_ndev, changeupper_info, cmds);
+ else
+ ndev_event_unlink(changeupper_info, cmds);
+}
+
+static const struct netdev_event_work_cmd add_default_gid_cmd = {
+ .cb = add_default_gids,
+ .filter = is_ndev_for_default_gid_filter,
+};
+
+static int netdevice_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ static const struct netdev_event_work_cmd del_cmd = {
+ .cb = del_netdev_ips, .filter = pass_all_filter};
+ static const struct netdev_event_work_cmd
+ bonding_default_del_cmd_join = {
+ .cb = del_netdev_default_ips_join,
+ .filter = is_eth_port_inactive_slave_filter
+ };
+ static const struct netdev_event_work_cmd
+ netdev_del_cmd = {
+ .cb = del_netdev_ips,
+ .filter = is_eth_port_of_netdev_filter
+ };
+ static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
+ .cb = del_netdev_upper_ips, .filter = upper_device_filter};
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
+
+ if (ndev->type != ARPHRD_ETHER)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ case NETDEV_UP:
+ cmds[0] = bonding_default_del_cmd_join;
+ cmds[1] = add_default_gid_cmd;
+ cmds[2] = add_cmd;
+ break;
+
+ case NETDEV_UNREGISTER:
+ if (ndev->reg_state < NETREG_UNREGISTERED)
+ cmds[0] = del_cmd;
+ else
+ return NOTIFY_DONE;
+ break;
+
+ case NETDEV_CHANGEADDR:
+ cmds[0] = netdev_del_cmd;
+ if (ndev->reg_state == NETREG_REGISTERED) {
+ cmds[1] = add_default_gid_cmd;
+ cmds[2] = add_cmd;
+ }
+ break;
+
+ case NETDEV_CHANGEUPPER:
+ netdevice_event_changeupper(ndev,
+ container_of(ptr, struct netdev_notifier_changeupper_info, info),
+ cmds);
+ break;
+
+ case NETDEV_BONDING_FAILOVER:
+ cmds[0] = bonding_event_ips_del_cmd;
+ /* Add default GIDs of the bond device */
+ cmds[1] = bonding_default_add_cmd;
+ /* Add IP based GIDs of the bond device */
+ cmds[2] = add_cmd_upper_ips;
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return netdevice_queue_work(cmds, ndev);
+}
+
+static void update_gid_event_work_handler(struct work_struct *_work)
+{
+ struct update_gid_event_work *work =
+ container_of(_work, struct update_gid_event_work, work);
+
+ ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter,
+ work->gid_attr.ndev,
+ callback_for_addr_gid_device_scan, work);
+
+ dev_put(work->gid_attr.ndev);
+ kfree(work);
+}
+
+static int addr_event(struct notifier_block *this, unsigned long event,
+ struct sockaddr *sa, struct net_device *ndev)
+{
+ struct update_gid_event_work *work;
+ enum gid_op_type gid_op;
+
+ if (ndev->type != ARPHRD_ETHER)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ gid_op = GID_ADD;
+ break;
+
+ case NETDEV_DOWN:
+ gid_op = GID_DEL;
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return NOTIFY_DONE;
+
+ INIT_WORK(&work->work, update_gid_event_work_handler);
+
+ rdma_ip2gid(sa, &work->gid);
+ work->gid_op = gid_op;
+
+ memset(&work->gid_attr, 0, sizeof(work->gid_attr));
+ dev_hold(ndev);
+ work->gid_attr.ndev = ndev;
+
+ queue_work(gid_cache_wq, &work->work);
+
+ return NOTIFY_DONE;
+}
+
+static int inetaddr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct sockaddr_in in;
+ struct net_device *ndev;
+ struct in_ifaddr *ifa = ptr;
+
+ in.sin_family = AF_INET;
+ in.sin_addr.s_addr = ifa->ifa_address;
+ ndev = ifa->ifa_dev->dev;
+
+ return addr_event(this, event, (struct sockaddr *)&in, ndev);
+}
+
+static int inet6addr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct sockaddr_in6 in6;
+ struct net_device *ndev;
+ struct inet6_ifaddr *ifa6 = ptr;
+
+ in6.sin6_family = AF_INET6;
+ in6.sin6_addr = ifa6->addr;
+ ndev = ifa6->idev->dev;
+
+ return addr_event(this, event, (struct sockaddr *)&in6, ndev);
+}
+
+static struct notifier_block nb_netdevice = {
+ .notifier_call = netdevice_event
+};
+
+static struct notifier_block nb_inetaddr = {
+ .notifier_call = inetaddr_event
+};
+
+static struct notifier_block nb_inet6addr = {
+ .notifier_call = inet6addr_event
+};
+
+int __init roce_gid_mgmt_init(void)
+{
+ gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0);
+ if (!gid_cache_wq)
+ return -ENOMEM;
+
+ register_inetaddr_notifier(&nb_inetaddr);
+ if (IS_ENABLED(CONFIG_IPV6))
+ register_inet6addr_notifier(&nb_inet6addr);
+ /* We relay on the netdevice notifier to enumerate all
+ * existing devices in the system. Register to this notifier
+ * last to make sure we will not miss any IP add/del
+ * callbacks.
+ */
+ register_netdevice_notifier(&nb_netdevice);
+
+ return 0;
+}
+
+void __exit roce_gid_mgmt_cleanup(void)
+{
+ if (IS_ENABLED(CONFIG_IPV6))
+ unregister_inet6addr_notifier(&nb_inet6addr);
+ unregister_inetaddr_notifier(&nb_inetaddr);
+ unregister_netdevice_notifier(&nb_netdevice);
+ /* Ensure all gid deletion tasks complete before we go down,
+ * to avoid any reference to free'd memory. By the time
+ * ib-core is removed, all physical devices have been removed,
+ * so no issue with remaining hardware contexts.
+ */
+ destroy_workqueue(gid_cache_wq);
+}
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
new file mode 100644
index 000000000..8367974b7
--- /dev/null
+++ b/drivers/infiniband/core/rw.c
@@ -0,0 +1,734 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ */
+#include <linux/memremap.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/pci-p2pdma.h>
+#include <rdma/mr_pool.h>
+#include <rdma/rw.h>
+
+enum {
+ RDMA_RW_SINGLE_WR,
+ RDMA_RW_MULTI_WR,
+ RDMA_RW_MR,
+ RDMA_RW_SIG_MR,
+};
+
+static bool rdma_rw_force_mr;
+module_param_named(force_mr, rdma_rw_force_mr, bool, 0);
+MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations");
+
+/*
+ * Report whether memory registration should be used. Memory registration must
+ * be used for iWarp devices because of iWARP-specific limitations. Memory
+ * registration is also enabled if registering memory might yield better
+ * performance than using multiple SGE entries, see rdma_rw_io_needs_mr()
+ */
+static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u32 port_num)
+{
+ if (rdma_protocol_iwarp(dev, port_num))
+ return true;
+ if (dev->attrs.max_sgl_rd)
+ return true;
+ if (unlikely(rdma_rw_force_mr))
+ return true;
+ return false;
+}
+
+/*
+ * Check if the device will use memory registration for this RW operation.
+ * For RDMA READs we must use MRs on iWarp and can optionally use them as an
+ * optimization otherwise. Additionally we have a debug option to force usage
+ * of MRs to help testing this code path.
+ */
+static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u32 port_num,
+ enum dma_data_direction dir, int dma_nents)
+{
+ if (dir == DMA_FROM_DEVICE) {
+ if (rdma_protocol_iwarp(dev, port_num))
+ return true;
+ if (dev->attrs.max_sgl_rd && dma_nents > dev->attrs.max_sgl_rd)
+ return true;
+ }
+ if (unlikely(rdma_rw_force_mr))
+ return true;
+ return false;
+}
+
+static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev,
+ bool pi_support)
+{
+ u32 max_pages;
+
+ if (pi_support)
+ max_pages = dev->attrs.max_pi_fast_reg_page_list_len;
+ else
+ max_pages = dev->attrs.max_fast_reg_page_list_len;
+
+ /* arbitrary limit to avoid allocating gigantic resources */
+ return min_t(u32, max_pages, 256);
+}
+
+static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg)
+{
+ int count = 0;
+
+ if (reg->mr->need_inval) {
+ reg->inv_wr.opcode = IB_WR_LOCAL_INV;
+ reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
+ reg->inv_wr.next = &reg->reg_wr.wr;
+ count++;
+ } else {
+ reg->inv_wr.next = NULL;
+ }
+
+ return count;
+}
+
+/* Caller must have zero-initialized *reg. */
+static int rdma_rw_init_one_mr(struct ib_qp *qp, u32 port_num,
+ struct rdma_rw_reg_ctx *reg, struct scatterlist *sg,
+ u32 sg_cnt, u32 offset)
+{
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+ qp->integrity_en);
+ u32 nents = min(sg_cnt, pages_per_mr);
+ int count = 0, ret;
+
+ reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs);
+ if (!reg->mr)
+ return -EAGAIN;
+
+ count += rdma_rw_inv_key(reg);
+
+ ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE);
+ if (ret < 0 || ret < nents) {
+ ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr);
+ return -EINVAL;
+ }
+
+ reg->reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg->reg_wr.mr = reg->mr;
+ reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+ if (rdma_protocol_iwarp(qp->device, port_num))
+ reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+ count++;
+
+ reg->sge.addr = reg->mr->iova;
+ reg->sge.length = reg->mr->length;
+ return count;
+}
+
+static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct rdma_rw_reg_ctx *prev = NULL;
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+ qp->integrity_en);
+ int i, j, ret = 0, count = 0;
+
+ ctx->nr_ops = DIV_ROUND_UP(sg_cnt, pages_per_mr);
+ ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL);
+ if (!ctx->reg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < ctx->nr_ops; i++) {
+ struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
+ u32 nents = min(sg_cnt, pages_per_mr);
+
+ ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt,
+ offset);
+ if (ret < 0)
+ goto out_free;
+ count += ret;
+
+ if (prev) {
+ if (reg->mr->need_inval)
+ prev->wr.wr.next = &reg->inv_wr;
+ else
+ prev->wr.wr.next = &reg->reg_wr.wr;
+ }
+
+ reg->reg_wr.wr.next = &reg->wr.wr;
+
+ reg->wr.wr.sg_list = &reg->sge;
+ reg->wr.wr.num_sge = 1;
+ reg->wr.remote_addr = remote_addr;
+ reg->wr.rkey = rkey;
+ if (dir == DMA_TO_DEVICE) {
+ reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
+ } else if (!rdma_cap_read_inv(qp->device, port_num)) {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ;
+ } else {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
+ }
+ count++;
+
+ remote_addr += reg->sge.length;
+ sg_cnt -= nents;
+ for (j = 0; j < nents; j++)
+ sg = sg_next(sg);
+ prev = reg;
+ offset = 0;
+ }
+
+ if (prev)
+ prev->wr.wr.next = NULL;
+
+ ctx->type = RDMA_RW_MR;
+ return count;
+
+out_free:
+ while (--i >= 0)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+ kfree(ctx->reg);
+out:
+ return ret;
+}
+
+static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ struct scatterlist *sg, u32 sg_cnt, u32 offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge :
+ qp->max_read_sge;
+ struct ib_sge *sge;
+ u32 total_len = 0, i, j;
+
+ ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge);
+
+ ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL);
+ if (!ctx->map.sges)
+ goto out;
+
+ ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL);
+ if (!ctx->map.wrs)
+ goto out_free_sges;
+
+ for (i = 0; i < ctx->nr_ops; i++) {
+ struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
+ u32 nr_sge = min(sg_cnt, max_sge);
+
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ rdma_wr->remote_addr = remote_addr + total_len;
+ rdma_wr->rkey = rkey;
+ rdma_wr->wr.num_sge = nr_sge;
+ rdma_wr->wr.sg_list = sge;
+
+ for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) {
+ sge->addr = sg_dma_address(sg) + offset;
+ sge->length = sg_dma_len(sg) - offset;
+ sge->lkey = qp->pd->local_dma_lkey;
+
+ total_len += sge->length;
+ sge++;
+ sg_cnt--;
+ offset = 0;
+ }
+
+ rdma_wr->wr.next = i + 1 < ctx->nr_ops ?
+ &ctx->map.wrs[i + 1].wr : NULL;
+ }
+
+ ctx->type = RDMA_RW_MULTI_WR;
+ return ctx->nr_ops;
+
+out_free_sges:
+ kfree(ctx->map.sges);
+out:
+ return -ENOMEM;
+}
+
+static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
+ enum dma_data_direction dir)
+{
+ struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
+
+ ctx->nr_ops = 1;
+
+ ctx->single.sge.lkey = qp->pd->local_dma_lkey;
+ ctx->single.sge.addr = sg_dma_address(sg) + offset;
+ ctx->single.sge.length = sg_dma_len(sg) - offset;
+
+ memset(rdma_wr, 0, sizeof(*rdma_wr));
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ rdma_wr->wr.sg_list = &ctx->single.sge;
+ rdma_wr->wr.num_sge = 1;
+ rdma_wr->remote_addr = remote_addr;
+ rdma_wr->rkey = rkey;
+
+ ctx->type = RDMA_RW_SINGLE_WR;
+ return 1;
+}
+
+/**
+ * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
+ * @ctx: context to initialize
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist to READ/WRITE from/to
+ * @sg_cnt: number of entries in @sg
+ * @sg_offset: current byte offset into @sg
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey: remote key to operate on
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
+ struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct sg_table sgt = {
+ .sgl = sg,
+ .orig_nents = sg_cnt,
+ };
+ int ret;
+
+ ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
+ if (ret)
+ return ret;
+ sg_cnt = sgt.nents;
+
+ /*
+ * Skip to the S/G entry that sg_offset falls into:
+ */
+ for (;;) {
+ u32 len = sg_dma_len(sg);
+
+ if (sg_offset < len)
+ break;
+
+ sg = sg_next(sg);
+ sg_offset -= len;
+ sg_cnt--;
+ }
+
+ ret = -EIO;
+ if (WARN_ON_ONCE(sg_cnt == 0))
+ goto out_unmap_sg;
+
+ if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) {
+ ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt,
+ sg_offset, remote_addr, rkey, dir);
+ } else if (sg_cnt > 1) {
+ ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset,
+ remote_addr, rkey, dir);
+ } else {
+ ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset,
+ remote_addr, rkey, dir);
+ }
+
+ if (ret < 0)
+ goto out_unmap_sg;
+ return ret;
+
+out_unmap_sg:
+ ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_init);
+
+/**
+ * rdma_rw_ctx_signature_init - initialize a RW context with signature offload
+ * @ctx: context to initialize
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist to READ/WRITE from/to
+ * @sg_cnt: number of entries in @sg
+ * @prot_sg: scatterlist to READ/WRITE protection information from/to
+ * @prot_sg_cnt: number of entries in @prot_sg
+ * @sig_attrs: signature offloading algorithms
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey: remote key to operate on
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, struct scatterlist *sg, u32 sg_cnt,
+ struct scatterlist *prot_sg, u32 prot_sg_cnt,
+ struct ib_sig_attrs *sig_attrs,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device,
+ qp->integrity_en);
+ struct sg_table sgt = {
+ .sgl = sg,
+ .orig_nents = sg_cnt,
+ };
+ struct sg_table prot_sgt = {
+ .sgl = prot_sg,
+ .orig_nents = prot_sg_cnt,
+ };
+ struct ib_rdma_wr *rdma_wr;
+ int count = 0, ret;
+
+ if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) {
+ pr_err("SG count too large: sg_cnt=%u, prot_sg_cnt=%u, pages_per_mr=%u\n",
+ sg_cnt, prot_sg_cnt, pages_per_mr);
+ return -EINVAL;
+ }
+
+ ret = ib_dma_map_sgtable_attrs(dev, &sgt, dir, 0);
+ if (ret)
+ return ret;
+
+ if (prot_sg_cnt) {
+ ret = ib_dma_map_sgtable_attrs(dev, &prot_sgt, dir, 0);
+ if (ret)
+ goto out_unmap_sg;
+ }
+
+ ctx->type = RDMA_RW_SIG_MR;
+ ctx->nr_ops = 1;
+ ctx->reg = kzalloc(sizeof(*ctx->reg), GFP_KERNEL);
+ if (!ctx->reg) {
+ ret = -ENOMEM;
+ goto out_unmap_prot_sg;
+ }
+
+ ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs);
+ if (!ctx->reg->mr) {
+ ret = -EAGAIN;
+ goto out_free_ctx;
+ }
+
+ count += rdma_rw_inv_key(ctx->reg);
+
+ memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs));
+
+ ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sgt.nents, NULL, prot_sg,
+ prot_sgt.nents, NULL, SZ_4K);
+ if (unlikely(ret)) {
+ pr_err("failed to map PI sg (%u)\n",
+ sgt.nents + prot_sgt.nents);
+ goto out_destroy_sig_mr;
+ }
+
+ ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
+ ctx->reg->reg_wr.wr.wr_cqe = NULL;
+ ctx->reg->reg_wr.wr.num_sge = 0;
+ ctx->reg->reg_wr.wr.send_flags = 0;
+ ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+ if (rdma_protocol_iwarp(qp->device, port_num))
+ ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+ ctx->reg->reg_wr.mr = ctx->reg->mr;
+ ctx->reg->reg_wr.key = ctx->reg->mr->lkey;
+ count++;
+
+ ctx->reg->sge.addr = ctx->reg->mr->iova;
+ ctx->reg->sge.length = ctx->reg->mr->length;
+ if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE)
+ ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length;
+
+ rdma_wr = &ctx->reg->wr;
+ rdma_wr->wr.sg_list = &ctx->reg->sge;
+ rdma_wr->wr.num_sge = 1;
+ rdma_wr->remote_addr = remote_addr;
+ rdma_wr->rkey = rkey;
+ if (dir == DMA_TO_DEVICE)
+ rdma_wr->wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ rdma_wr->wr.opcode = IB_WR_RDMA_READ;
+ ctx->reg->reg_wr.wr.next = &rdma_wr->wr;
+ count++;
+
+ return count;
+
+out_destroy_sig_mr:
+ ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
+out_free_ctx:
+ kfree(ctx->reg);
+out_unmap_prot_sg:
+ if (prot_sgt.nents)
+ ib_dma_unmap_sgtable_attrs(dev, &prot_sgt, dir, 0);
+out_unmap_sg:
+ ib_dma_unmap_sgtable_attrs(dev, &sgt, dir, 0);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
+
+/*
+ * Now that we are going to post the WRs we can update the lkey and need_inval
+ * state on the MRs. If we were doing this at init time, we would get double
+ * or missing invalidations if a context was initialized but not actually
+ * posted.
+ */
+static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval)
+{
+ reg->mr->need_inval = need_inval;
+ ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
+ reg->reg_wr.key = reg->mr->lkey;
+ reg->sge.lkey = reg->mr->lkey;
+}
+
+/**
+ * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation
+ * @ctx: context to operate on
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @cqe: completion queue entry for the last WR
+ * @chain_wr: WR to append to the posted chain
+ *
+ * Return the WR chain for the set of RDMA READ/WRITE operations described by
+ * @ctx, as well as any memory registration operations needed. If @chain_wr
+ * is non-NULL the WR it points to will be appended to the chain of WRs posted.
+ * If @chain_wr is not set @cqe must be set so that the caller gets a
+ * completion notification.
+ */
+struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+ struct ib_send_wr *first_wr, *last_wr;
+ int i;
+
+ switch (ctx->type) {
+ case RDMA_RW_SIG_MR:
+ case RDMA_RW_MR:
+ for (i = 0; i < ctx->nr_ops; i++) {
+ rdma_rw_update_lkey(&ctx->reg[i],
+ ctx->reg[i].wr.wr.opcode !=
+ IB_WR_RDMA_READ_WITH_INV);
+ }
+
+ if (ctx->reg[0].inv_wr.next)
+ first_wr = &ctx->reg[0].inv_wr;
+ else
+ first_wr = &ctx->reg[0].reg_wr.wr;
+ last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr;
+ break;
+ case RDMA_RW_MULTI_WR:
+ first_wr = &ctx->map.wrs[0].wr;
+ last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr;
+ break;
+ case RDMA_RW_SINGLE_WR:
+ first_wr = &ctx->single.wr.wr;
+ last_wr = &ctx->single.wr.wr;
+ break;
+ default:
+ BUG();
+ }
+
+ if (chain_wr) {
+ last_wr->next = chain_wr;
+ } else {
+ last_wr->wr_cqe = cqe;
+ last_wr->send_flags |= IB_SEND_SIGNALED;
+ }
+
+ return first_wr;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_wrs);
+
+/**
+ * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation
+ * @ctx: context to operate on
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @cqe: completion queue entry for the last WR
+ * @chain_wr: WR to append to the posted chain
+ *
+ * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
+ * any memory registration operations needed. If @chain_wr is non-NULL the
+ * WR it points to will be appended to the chain of WRs posted. If @chain_wr
+ * is not set @cqe must be set so that the caller gets a completion
+ * notification.
+ */
+int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
+ struct ib_cqe *cqe, struct ib_send_wr *chain_wr)
+{
+ struct ib_send_wr *first_wr;
+
+ first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr);
+ return ib_post_send(qp, first_wr, NULL);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_post);
+
+/**
+ * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
+ * @ctx: context to release
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist that was used for the READ/WRITE
+ * @sg_cnt: number of entries in @sg
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, struct scatterlist *sg, u32 sg_cnt,
+ enum dma_data_direction dir)
+{
+ int i;
+
+ switch (ctx->type) {
+ case RDMA_RW_MR:
+ for (i = 0; i < ctx->nr_ops; i++)
+ ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr);
+ kfree(ctx->reg);
+ break;
+ case RDMA_RW_MULTI_WR:
+ kfree(ctx->map.wrs);
+ kfree(ctx->map.sges);
+ break;
+ case RDMA_RW_SINGLE_WR:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy);
+
+/**
+ * rdma_rw_ctx_destroy_signature - release all resources allocated by
+ * rdma_rw_ctx_signature_init
+ * @ctx: context to release
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist that was used for the READ/WRITE
+ * @sg_cnt: number of entries in @sg
+ * @prot_sg: scatterlist that was used for the READ/WRITE of the PI
+ * @prot_sg_cnt: number of entries in @prot_sg
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, struct scatterlist *sg, u32 sg_cnt,
+ struct scatterlist *prot_sg, u32 prot_sg_cnt,
+ enum dma_data_direction dir)
+{
+ if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR))
+ return;
+
+ ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr);
+ kfree(ctx->reg);
+
+ if (prot_sg_cnt)
+ ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
+ ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
+
+/**
+ * rdma_rw_mr_factor - return number of MRs required for a payload
+ * @device: device handling the connection
+ * @port_num: port num to which the connection is bound
+ * @maxpages: maximum payload pages per rdma_rw_ctx
+ *
+ * Returns the number of MRs the device requires to move @maxpayload
+ * bytes. The returned value is used during transport creation to
+ * compute max_rdma_ctxts and the size of the transport's Send and
+ * Send Completion Queues.
+ */
+unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
+ unsigned int maxpages)
+{
+ unsigned int mr_pages;
+
+ if (rdma_rw_can_use_mr(device, port_num))
+ mr_pages = rdma_rw_fr_page_list_len(device, false);
+ else
+ mr_pages = device->attrs.max_sge_rd;
+ return DIV_ROUND_UP(maxpages, mr_pages);
+}
+EXPORT_SYMBOL(rdma_rw_mr_factor);
+
+void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr)
+{
+ u32 factor;
+
+ WARN_ON_ONCE(attr->port_num == 0);
+
+ /*
+ * Each context needs at least one RDMA READ or WRITE WR.
+ *
+ * For some hardware we might need more, eventually we should ask the
+ * HCA driver for a multiplier here.
+ */
+ factor = 1;
+
+ /*
+ * If the devices needs MRs to perform RDMA READ or WRITE operations,
+ * we'll need two additional MRs for the registrations and the
+ * invalidation.
+ */
+ if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN ||
+ rdma_rw_can_use_mr(dev, attr->port_num))
+ factor += 2; /* inv + reg */
+
+ attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs;
+
+ /*
+ * But maybe we were just too high in the sky and the device doesn't
+ * even support all we need, and we'll have to live with what we get..
+ */
+ attr->cap.max_send_wr =
+ min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
+}
+
+int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0;
+ int ret = 0;
+
+ if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) {
+ nr_sig_mrs = attr->cap.max_rdma_ctxs;
+ nr_mrs = attr->cap.max_rdma_ctxs;
+ max_num_sg = rdma_rw_fr_page_list_len(dev, true);
+ } else if (rdma_rw_can_use_mr(dev, attr->port_num)) {
+ nr_mrs = attr->cap.max_rdma_ctxs;
+ max_num_sg = rdma_rw_fr_page_list_len(dev, false);
+ }
+
+ if (nr_mrs) {
+ ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs,
+ IB_MR_TYPE_MEM_REG,
+ max_num_sg, 0);
+ if (ret) {
+ pr_err("%s: failed to allocated %u MRs\n",
+ __func__, nr_mrs);
+ return ret;
+ }
+ }
+
+ if (nr_sig_mrs) {
+ ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs,
+ IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg);
+ if (ret) {
+ pr_err("%s: failed to allocated %u SIG MRs\n",
+ __func__, nr_sig_mrs);
+ goto out_free_rdma_mrs;
+ }
+ }
+
+ return 0;
+
+out_free_rdma_mrs:
+ ib_mr_pool_destroy(qp, &qp->rdma_mrs);
+ return ret;
+}
+
+void rdma_rw_cleanup_mrs(struct ib_qp *qp)
+{
+ ib_mr_pool_destroy(qp, &qp->sig_mrs);
+ ib_mr_pool_destroy(qp, &qp->rdma_mrs);
+}
diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h
new file mode 100644
index 000000000..143de37ae
--- /dev/null
+++ b/drivers/infiniband/core/sa.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SA_H
+#define SA_H
+
+#include <rdma/ib_sa.h>
+
+static inline void ib_sa_client_get(struct ib_sa_client *client)
+{
+ atomic_inc(&client->users);
+}
+
+static inline void ib_sa_client_put(struct ib_sa_client *client)
+{
+ if (atomic_dec_and_test(&client->users))
+ complete(&client->comp);
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u32 port_num, u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ unsigned long timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context, struct ib_sa_query **sa_query);
+
+int mcast_init(void);
+void mcast_cleanup(void);
+
+#endif /* SA_H */
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
new file mode 100644
index 000000000..8c69bdb5b
--- /dev/null
+++ b/drivers/infiniband/core/sa_query.c
@@ -0,0 +1,2359 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/xarray.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/if_ether.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <uapi/rdma/ib_user_sa.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/ib_addr.h>
+#include <rdma/opa_addr.h>
+#include <rdma/rdma_cm.h>
+#include "sa.h"
+#include "core_priv.h"
+
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000
+#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000
+#define IB_SA_CPI_MAX_RETRY_CNT 3
+#define IB_SA_CPI_RETRY_WAIT 1000 /*msecs */
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
+struct ib_sa_sm_ah {
+ struct ib_ah *ah;
+ struct kref ref;
+ u16 pkey_index;
+ u8 src_path_mask;
+};
+
+enum rdma_class_port_info_type {
+ RDMA_CLASS_PORT_INFO_IB,
+ RDMA_CLASS_PORT_INFO_OPA
+};
+
+struct rdma_class_port_info {
+ enum rdma_class_port_info_type type;
+ union {
+ struct ib_class_port_info ib;
+ struct opa_class_port_info opa;
+ };
+};
+
+struct ib_sa_classport_cache {
+ bool valid;
+ int retry_cnt;
+ struct rdma_class_port_info data;
+};
+
+struct ib_sa_port {
+ struct ib_mad_agent *agent;
+ struct ib_sa_sm_ah *sm_ah;
+ struct work_struct update_task;
+ struct ib_sa_classport_cache classport_info;
+ struct delayed_work ib_cpi_work;
+ spinlock_t classport_lock; /* protects class port info set */
+ spinlock_t ah_lock;
+ u32 port_num;
+};
+
+struct ib_sa_device {
+ int start_port, end_port;
+ struct ib_event_handler event_handler;
+ struct ib_sa_port port[];
+};
+
+struct ib_sa_query {
+ void (*callback)(struct ib_sa_query *sa_query, int status,
+ int num_prs, struct ib_sa_mad *mad);
+ void (*release)(struct ib_sa_query *);
+ struct ib_sa_client *client;
+ struct ib_sa_port *port;
+ struct ib_mad_send_buf *mad_buf;
+ struct ib_sa_sm_ah *sm_ah;
+ int id;
+ u32 flags;
+ struct list_head list; /* Local svc request list */
+ u32 seq; /* Local svc request sequence number */
+ unsigned long timeout; /* Local svc timeout */
+ u8 path_use; /* How will the pathrecord be used */
+
+ /* A separate buffer to save pathrecords of a response, as in cases
+ * like IB/netlink, mulptiple pathrecords are supported, so that
+ * mad->data is not large enough to hold them
+ */
+ void *resp_pr_data;
+};
+
+#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001
+#define IB_SA_CANCEL 0x00000002
+#define IB_SA_QUERY_OPA 0x00000004
+
+struct ib_sa_path_query {
+ void (*callback)(int status, struct sa_path_rec *rec,
+ int num_paths, void *context);
+ void *context;
+ struct ib_sa_query sa_query;
+ struct sa_path_rec *conv_pr;
+};
+
+struct ib_sa_guidinfo_query {
+ void (*callback)(int, struct ib_sa_guidinfo_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_classport_info_query {
+ void (*callback)(void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_mcmember_query {
+ void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = {
+ [LS_NLA_TYPE_PATH_RECORD] = {.type = NLA_BINARY,
+ .len = sizeof(struct ib_path_rec_data)},
+ [LS_NLA_TYPE_TIMEOUT] = {.type = NLA_U32},
+ [LS_NLA_TYPE_SERVICE_ID] = {.type = NLA_U64},
+ [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+ [LS_NLA_TYPE_SGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+ [LS_NLA_TYPE_TCLASS] = {.type = NLA_U8},
+ [LS_NLA_TYPE_PKEY] = {.type = NLA_U16},
+ [LS_NLA_TYPE_QOS_CLASS] = {.type = NLA_U16},
+};
+
+
+static int ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device, void *client_data);
+
+static struct ib_client sa_client = {
+ .name = "sa",
+ .add = ib_sa_add_one,
+ .remove = ib_sa_remove_one
+};
+
+static DEFINE_XARRAY_FLAGS(queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
+
+static DEFINE_SPINLOCK(tid_lock);
+static u32 tid;
+
+#define PATH_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct sa_path_rec, field), \
+ .struct_size_bytes = sizeof_field(struct sa_path_rec, field), \
+ .field_name = "sa_path_rec:" #field
+
+static const struct ib_field path_rec_table[] = {
+ { PATH_REC_FIELD(service_id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { PATH_REC_FIELD(dgid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(sgid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(ib.dlid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(ib.slid),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(ib.raw_traffic),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 11,
+ .offset_bits = 1,
+ .size_bits = 3 },
+ { PATH_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { PATH_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(traffic_class),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(reversible),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { PATH_REC_FIELD(numb_path),
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 7 },
+ { PATH_REC_FIELD(pkey),
+ .offset_words = 12,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(qos_class),
+ .offset_words = 13,
+ .offset_bits = 0,
+ .size_bits = 12 },
+ { PATH_REC_FIELD(sl),
+ .offset_words = 13,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { PATH_REC_FIELD(mtu_selector),
+ .offset_words = 13,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(mtu),
+ .offset_words = 13,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(rate_selector),
+ .offset_words = 13,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(rate),
+ .offset_words = 13,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(packet_life_time_selector),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(packet_life_time),
+ .offset_words = 14,
+ .offset_bits = 2,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(preference),
+ .offset_words = 14,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 16,
+ .size_bits = 48 },
+};
+
+#define OPA_PATH_REC_FIELD(field) \
+ .struct_offset_bytes = \
+ offsetof(struct sa_path_rec, field), \
+ .struct_size_bytes = \
+ sizeof_field(struct sa_path_rec, field), \
+ .field_name = "sa_path_rec:" #field
+
+static const struct ib_field opa_path_rec_table[] = {
+ { OPA_PATH_REC_FIELD(service_id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { OPA_PATH_REC_FIELD(dgid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { OPA_PATH_REC_FIELD(sgid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { OPA_PATH_REC_FIELD(opa.dlid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_PATH_REC_FIELD(opa.slid),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_PATH_REC_FIELD(opa.raw_traffic),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 12,
+ .offset_bits = 1,
+ .size_bits = 3 },
+ { OPA_PATH_REC_FIELD(flow_label),
+ .offset_words = 12,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { OPA_PATH_REC_FIELD(hop_limit),
+ .offset_words = 12,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { OPA_PATH_REC_FIELD(traffic_class),
+ .offset_words = 13,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { OPA_PATH_REC_FIELD(reversible),
+ .offset_words = 13,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { OPA_PATH_REC_FIELD(numb_path),
+ .offset_words = 13,
+ .offset_bits = 9,
+ .size_bits = 7 },
+ { OPA_PATH_REC_FIELD(pkey),
+ .offset_words = 13,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { OPA_PATH_REC_FIELD(opa.l2_8B),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { OPA_PATH_REC_FIELD(opa.l2_10B),
+ .offset_words = 14,
+ .offset_bits = 1,
+ .size_bits = 1 },
+ { OPA_PATH_REC_FIELD(opa.l2_9B),
+ .offset_words = 14,
+ .offset_bits = 2,
+ .size_bits = 1 },
+ { OPA_PATH_REC_FIELD(opa.l2_16B),
+ .offset_words = 14,
+ .offset_bits = 3,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 4,
+ .size_bits = 2 },
+ { OPA_PATH_REC_FIELD(opa.qos_type),
+ .offset_words = 14,
+ .offset_bits = 6,
+ .size_bits = 2 },
+ { OPA_PATH_REC_FIELD(opa.qos_priority),
+ .offset_words = 14,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 16,
+ .size_bits = 3 },
+ { OPA_PATH_REC_FIELD(sl),
+ .offset_words = 14,
+ .offset_bits = 19,
+ .size_bits = 5 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { OPA_PATH_REC_FIELD(mtu_selector),
+ .offset_words = 15,
+ .offset_bits = 0,
+ .size_bits = 2 },
+ { OPA_PATH_REC_FIELD(mtu),
+ .offset_words = 15,
+ .offset_bits = 2,
+ .size_bits = 6 },
+ { OPA_PATH_REC_FIELD(rate_selector),
+ .offset_words = 15,
+ .offset_bits = 8,
+ .size_bits = 2 },
+ { OPA_PATH_REC_FIELD(rate),
+ .offset_words = 15,
+ .offset_bits = 10,
+ .size_bits = 6 },
+ { OPA_PATH_REC_FIELD(packet_life_time_selector),
+ .offset_words = 15,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { OPA_PATH_REC_FIELD(packet_life_time),
+ .offset_words = 15,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { OPA_PATH_REC_FIELD(preference),
+ .offset_words = 15,
+ .offset_bits = 24,
+ .size_bits = 8 },
+};
+
+#define MCMEMBER_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \
+ .struct_size_bytes = sizeof_field(struct ib_sa_mcmember_rec, field), \
+ .field_name = "sa_mcmember_rec:" #field
+
+static const struct ib_field mcmember_rec_table[] = {
+ { MCMEMBER_REC_FIELD(mgid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(port_gid),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(qkey),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { MCMEMBER_REC_FIELD(mlid),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(mtu_selector),
+ .offset_words = 9,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(mtu),
+ .offset_words = 9,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(traffic_class),
+ .offset_words = 9,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(pkey),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(rate_selector),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(rate),
+ .offset_words = 10,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(packet_life_time_selector),
+ .offset_words = 10,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(packet_life_time),
+ .offset_words = 10,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(sl),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { MCMEMBER_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(scope),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(join_state),
+ .offset_words = 12,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(proxy_join),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 23 },
+};
+
+#define CLASSPORTINFO_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \
+ .struct_size_bytes = sizeof_field(struct ib_class_port_info, field), \
+ .field_name = "ib_class_port_info:" #field
+
+static const struct ib_field ib_classport_info_rec_table[] = {
+ { CLASSPORTINFO_REC_FIELD(base_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { CLASSPORTINFO_REC_FIELD(class_version),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { CLASSPORTINFO_REC_FIELD(capability_mask),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { CLASSPORTINFO_REC_FIELD(redirect_tcslfl),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_lid),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(redirect_pkey),
+ .offset_words = 7,
+ .offset_bits = 16,
+ .size_bits = 16 },
+
+ { CLASSPORTINFO_REC_FIELD(redirect_qp),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(redirect_qkey),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 32 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_gid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { CLASSPORTINFO_REC_FIELD(trap_tcslfl),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 32 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_lid),
+ .offset_words = 15,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { CLASSPORTINFO_REC_FIELD(trap_pkey),
+ .offset_words = 15,
+ .offset_bits = 16,
+ .size_bits = 16 },
+
+ { CLASSPORTINFO_REC_FIELD(trap_hlqp),
+ .offset_words = 16,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { CLASSPORTINFO_REC_FIELD(trap_qkey),
+ .offset_words = 17,
+ .offset_bits = 0,
+ .size_bits = 32 },
+};
+
+#define OPA_CLASSPORTINFO_REC_FIELD(field) \
+ .struct_offset_bytes =\
+ offsetof(struct opa_class_port_info, field), \
+ .struct_size_bytes = \
+ sizeof_field(struct opa_class_port_info, field), \
+ .field_name = "opa_class_port_info:" #field
+
+static const struct ib_field opa_classport_info_rec_table[] = {
+ { OPA_CLASSPORTINFO_REC_FIELD(base_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { OPA_CLASSPORTINFO_REC_FIELD(class_version),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { OPA_CLASSPORTINFO_REC_FIELD(cap_mask),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { OPA_CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_tc_fl),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_lid),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_sl_qp),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_qkey),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_gid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_tc_fl),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_lid),
+ .offset_words = 15,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_hl_qp),
+ .offset_words = 16,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_qkey),
+ .offset_words = 17,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_pkey),
+ .offset_words = 18,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { OPA_CLASSPORTINFO_REC_FIELD(redirect_pkey),
+ .offset_words = 18,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { OPA_CLASSPORTINFO_REC_FIELD(trap_sl_rsvd),
+ .offset_words = 19,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { RESERVED,
+ .offset_words = 19,
+ .offset_bits = 8,
+ .size_bits = 24 },
+};
+
+#define GUIDINFO_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \
+ .struct_size_bytes = sizeof_field(struct ib_sa_guidinfo_rec, field), \
+ .field_name = "sa_guidinfo_rec:" #field
+
+static const struct ib_field guidinfo_rec_table[] = {
+ { GUIDINFO_REC_FIELD(lid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { GUIDINFO_REC_FIELD(block_num),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { GUIDINFO_REC_FIELD(res1),
+ .offset_words = 0,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { GUIDINFO_REC_FIELD(res2),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { GUIDINFO_REC_FIELD(guid_info_list),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 512 },
+};
+
+static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
+{
+ query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
+}
+
+static inline int ib_sa_query_cancelled(struct ib_sa_query *query)
+{
+ return (query->flags & IB_SA_CANCEL);
+}
+
+static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
+ struct ib_sa_query *query)
+{
+ struct sa_path_rec *sa_rec = query->mad_buf->context[1];
+ struct ib_sa_mad *mad = query->mad_buf->mad;
+ ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask;
+ u16 val16;
+ u64 val64;
+ struct rdma_ls_resolve_header *header;
+
+ query->mad_buf->context[1] = NULL;
+
+ /* Construct the family header first */
+ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+ strscpy_pad(header->device_name,
+ dev_name(&query->port->agent->device->dev),
+ LS_DEVICE_NAME_MAX);
+ header->port_num = query->port->port_num;
+
+ if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
+ sa_rec->reversible != 0)
+ query->path_use = LS_RESOLVE_PATH_USE_ALL;
+ else
+ query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
+ header->path_use = query->path_use;
+
+ /* Now build the attributes */
+ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) {
+ val64 = be64_to_cpu(sa_rec->service_id);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID,
+ sizeof(val64), &val64);
+ }
+ if (comp_mask & IB_SA_PATH_REC_DGID)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID,
+ sizeof(sa_rec->dgid), &sa_rec->dgid);
+ if (comp_mask & IB_SA_PATH_REC_SGID)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID,
+ sizeof(sa_rec->sgid), &sa_rec->sgid);
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS,
+ sizeof(sa_rec->traffic_class), &sa_rec->traffic_class);
+
+ if (comp_mask & IB_SA_PATH_REC_PKEY) {
+ val16 = be16_to_cpu(sa_rec->pkey);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY,
+ sizeof(val16), &val16);
+ }
+ if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) {
+ val16 = be16_to_cpu(sa_rec->qos_class);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS,
+ sizeof(val16), &val16);
+ }
+}
+
+static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask)
+{
+ int len = 0;
+
+ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID)
+ len += nla_total_size(sizeof(u64));
+ if (comp_mask & IB_SA_PATH_REC_DGID)
+ len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+ if (comp_mask & IB_SA_PATH_REC_SGID)
+ len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+ len += nla_total_size(sizeof(u8));
+ if (comp_mask & IB_SA_PATH_REC_PKEY)
+ len += nla_total_size(sizeof(u16));
+ if (comp_mask & IB_SA_PATH_REC_QOS_CLASS)
+ len += nla_total_size(sizeof(u16));
+
+ /*
+ * Make sure that at least some of the required comp_mask bits are
+ * set.
+ */
+ if (WARN_ON(len == 0))
+ return len;
+
+ /* Add the family header */
+ len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header));
+
+ return len;
+}
+
+static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ void *data;
+ struct ib_sa_mad *mad;
+ int len;
+ unsigned long flags;
+ unsigned long delay;
+ gfp_t gfp_flag;
+ int ret;
+
+ INIT_LIST_HEAD(&query->list);
+ query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+
+ mad = query->mad_buf->mad;
+ len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask);
+ if (len <= 0)
+ return -EMSGSIZE;
+
+ skb = nlmsg_new(len, gfp_mask);
+ if (!skb)
+ return -ENOMEM;
+
+ /* Put nlmsg header only for now */
+ data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
+ RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
+ if (!data) {
+ nlmsg_free(skb);
+ return -EMSGSIZE;
+ }
+
+ /* Add attributes */
+ ib_nl_set_path_rec_attrs(skb, query);
+
+ /* Repair the nlmsg header length */
+ nlmsg_end(skb, nlh);
+
+ gfp_flag = ((gfp_mask & GFP_ATOMIC) == GFP_ATOMIC) ? GFP_ATOMIC :
+ GFP_NOWAIT;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, gfp_flag);
+
+ if (ret)
+ goto out;
+
+ /* Put the request on the list.*/
+ delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+ query->timeout = delay + jiffies;
+ list_add_tail(&query->list, &ib_nl_request_list);
+ /* Start the timeout if this is the only request */
+ if (ib_nl_request_list.next == &query->list)
+ queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+
+out:
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+ return ret;
+}
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_sa_query *wait_query;
+ int found = 0;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_for_each_entry(wait_query, &ib_nl_request_list, list) {
+ /* Let the timeout to take care of the callback */
+ if (query == wait_query) {
+ query->flags |= IB_SA_CANCEL;
+ query->timeout = jiffies;
+ list_move(&query->list, &ib_nl_request_list);
+ found = 1;
+ mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+ return found;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
+ const struct nlmsghdr *nlh)
+{
+ struct ib_path_rec_data *srec, *drec;
+ struct ib_sa_path_query *path_query;
+ struct ib_mad_send_wc mad_send_wc;
+ const struct nlattr *head, *curr;
+ struct ib_sa_mad *mad = NULL;
+ int len, rem, num_prs = 0;
+ u32 mask = 0;
+ int status = -EIO;
+
+ if (!query->callback)
+ goto out;
+
+ path_query = container_of(query, struct ib_sa_path_query, sa_query);
+ mad = query->mad_buf->mad;
+ if (!path_query->conv_pr &&
+ (be16_to_cpu(mad->mad_hdr.attr_id) == IB_SA_ATTR_PATH_REC)) {
+ /* Need a larger buffer for possible multiple PRs */
+ query->resp_pr_data = kvcalloc(RDMA_PRIMARY_PATH_MAX_REC_NUM,
+ sizeof(*drec), GFP_KERNEL);
+ if (!query->resp_pr_data) {
+ query->callback(query, -ENOMEM, 0, NULL);
+ return;
+ }
+ }
+
+ head = (const struct nlattr *) nlmsg_data(nlh);
+ len = nlmsg_len(nlh);
+ switch (query->path_use) {
+ case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
+ mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
+ break;
+
+ case LS_RESOLVE_PATH_USE_ALL:
+ mask = IB_PATH_PRIMARY;
+ break;
+
+ case LS_RESOLVE_PATH_USE_GMP:
+ default:
+ mask = IB_PATH_PRIMARY | IB_PATH_GMP |
+ IB_PATH_BIDIRECTIONAL;
+ break;
+ }
+
+ drec = (struct ib_path_rec_data *)query->resp_pr_data;
+ nla_for_each_attr(curr, head, len, rem) {
+ if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD)
+ continue;
+
+ srec = nla_data(curr);
+ if ((srec->flags & mask) != mask)
+ continue;
+
+ status = 0;
+ if (!drec) {
+ memcpy(mad->data, srec->path_rec,
+ sizeof(srec->path_rec));
+ num_prs = 1;
+ break;
+ }
+
+ memcpy(drec, srec, sizeof(*drec));
+ drec++;
+ num_prs++;
+ if (num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM)
+ break;
+ }
+
+ if (!status)
+ mad->mad_hdr.method |= IB_MGMT_METHOD_RESP;
+
+ query->callback(query, status, num_prs, mad);
+ kvfree(query->resp_pr_data);
+ query->resp_pr_data = NULL;
+
+out:
+ mad_send_wc.send_buf = query->mad_buf;
+ mad_send_wc.status = IB_WC_SUCCESS;
+ send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+ unsigned long flags;
+ struct ib_sa_query *query;
+ unsigned long delay;
+ struct ib_mad_send_wc mad_send_wc;
+ int ret;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ while (!list_empty(&ib_nl_request_list)) {
+ query = list_entry(ib_nl_request_list.next,
+ struct ib_sa_query, list);
+
+ if (time_after(query->timeout, jiffies)) {
+ delay = query->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+ break;
+ }
+
+ list_del(&query->list);
+ ib_sa_disable_local_svc(query);
+ /* Hold the lock to protect against query cancellation */
+ if (ib_sa_query_cancelled(query))
+ ret = -1;
+ else
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ if (ret) {
+ mad_send_wc.send_buf = query->mad_buf;
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ send_handler(query->port->agent, &mad_send_wc);
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
+
+int ib_nl_handle_set_timeout(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ int timeout, delta, abs_delta;
+ const struct nlattr *attr;
+ unsigned long flags;
+ struct ib_sa_query *query;
+ long delay = 0;
+ struct nlattr *tb[LS_NLA_TYPE_MAX];
+ int ret;
+
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ !(NETLINK_CB(skb).sk))
+ return -EPERM;
+
+ ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_policy, NULL);
+ attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];
+ if (ret || !attr)
+ goto settimeout_out;
+
+ timeout = *(int *) nla_data(attr);
+ if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
+ timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
+ if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
+ timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+
+ delta = timeout - sa_local_svc_timeout_ms;
+ if (delta < 0)
+ abs_delta = -delta;
+ else
+ abs_delta = delta;
+
+ if (delta != 0) {
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ sa_local_svc_timeout_ms = timeout;
+ list_for_each_entry(query, &ib_nl_request_list, list) {
+ if (delta < 0 && abs_delta > query->timeout)
+ query->timeout = 0;
+ else
+ query->timeout += delta;
+
+ /* Get the new delay from the first entry */
+ if (!delay) {
+ delay = query->timeout - jiffies;
+ if (delay <= 0)
+ delay = 1;
+ }
+ }
+ if (delay)
+ mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
+ (unsigned long)delay);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ }
+
+settimeout_out:
+ return 0;
+}
+
+static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
+{
+ struct nlattr *tb[LS_NLA_TYPE_MAX];
+ int ret;
+
+ if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+ return 0;
+
+ ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_policy, NULL);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+
+int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ unsigned long flags;
+ struct ib_sa_query *query = NULL, *iter;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_send_wc mad_send_wc;
+ int ret;
+
+ if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ !(NETLINK_CB(skb).sk))
+ return -EPERM;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_for_each_entry(iter, &ib_nl_request_list, list) {
+ /*
+ * If the query is cancelled, let the timeout routine
+ * take care of it.
+ */
+ if (nlh->nlmsg_seq == iter->seq) {
+ if (!ib_sa_query_cancelled(iter)) {
+ list_del(&iter->list);
+ query = iter;
+ }
+ break;
+ }
+ }
+
+ if (!query) {
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ goto resp_out;
+ }
+
+ send_buf = query->mad_buf;
+
+ if (!ib_nl_is_good_resolve_resp(nlh)) {
+ /* if the result is a failure, send out the packet via IB */
+ ib_sa_disable_local_svc(query);
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ if (ret) {
+ mad_send_wc.send_buf = send_buf;
+ mad_send_wc.status = IB_WC_GENERAL_ERR;
+ send_handler(query->port->agent, &mad_send_wc);
+ }
+ } else {
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ ib_nl_process_good_resolve_rsp(query, nlh);
+ }
+
+resp_out:
+ return 0;
+}
+
+static void free_sm_ah(struct kref *kref)
+{
+ struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+ rdma_destroy_ah(sm_ah->ah, 0);
+ kfree(sm_ah);
+}
+
+void ib_sa_register_client(struct ib_sa_client *client)
+{
+ atomic_set(&client->users, 1);
+ init_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_register_client);
+
+void ib_sa_unregister_client(struct ib_sa_client *client)
+{
+ ib_sa_client_put(client);
+ wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_unregister_client);
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Try to cancel an SA query. If the id and query don't match up or
+ * the query has already completed, nothing is done. Otherwise the
+ * query is canceled and will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_mad_send_buf *mad_buf;
+
+ xa_lock_irqsave(&queries, flags);
+ if (xa_load(&queries, id) != query) {
+ xa_unlock_irqrestore(&queries, flags);
+ return;
+ }
+ mad_buf = query->mad_buf;
+ xa_unlock_irqrestore(&queries, flags);
+
+ /*
+ * If the query is still on the netlink request list, schedule
+ * it to be cancelled by the timeout routine. Otherwise, it has been
+ * sent to the MAD layer and has to be cancelled from there.
+ */
+ if (!ib_nl_cancel_request(query))
+ ib_cancel_mad(mad_buf);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
+
+static u8 get_src_path_mask(struct ib_device *device, u32 port_num)
+{
+ struct ib_sa_device *sa_dev;
+ struct ib_sa_port *port;
+ unsigned long flags;
+ u8 src_path_mask;
+
+ sa_dev = ib_get_client_data(device, &sa_client);
+ if (!sa_dev)
+ return 0x7f;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ spin_lock_irqsave(&port->ah_lock, flags);
+ src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ return src_path_mask;
+}
+
+static int init_ah_attr_grh_fields(struct ib_device *device, u32 port_num,
+ struct sa_path_rec *rec,
+ struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr *gid_attr)
+{
+ enum ib_gid_type type = sa_conv_pathrec_to_gid_type(rec);
+
+ if (!gid_attr) {
+ gid_attr = rdma_find_gid_by_port(device, &rec->sgid, type,
+ port_num, NULL);
+ if (IS_ERR(gid_attr))
+ return PTR_ERR(gid_attr);
+ } else
+ rdma_hold_gid_attr(gid_attr);
+
+ rdma_move_grh_sgid_attr(ah_attr, &rec->dgid,
+ be32_to_cpu(rec->flow_label),
+ rec->hop_limit, rec->traffic_class,
+ gid_attr);
+ return 0;
+}
+
+/**
+ * ib_init_ah_attr_from_path - Initialize address handle attributes based on
+ * an SA path record.
+ * @device: Device associated ah attributes initialization.
+ * @port_num: Port on the specified device.
+ * @rec: path record entry to use for ah attributes initialization.
+ * @ah_attr: address handle attributes to initialization from path record.
+ * @gid_attr: SGID attribute to consider during initialization.
+ *
+ * When ib_init_ah_attr_from_path() returns success,
+ * (a) for IB link layer it optionally contains a reference to SGID attribute
+ * when GRH is present for IB link layer.
+ * (b) for RoCE link layer it contains a reference to SGID attribute.
+ * User must invoke rdma_destroy_ah_attr() to release reference to SGID
+ * attributes which are initialized using ib_init_ah_attr_from_path().
+ */
+int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num,
+ struct sa_path_rec *rec,
+ struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr *gid_attr)
+{
+ int ret = 0;
+
+ memset(ah_attr, 0, sizeof(*ah_attr));
+ ah_attr->type = rdma_ah_find_type(device, port_num);
+ rdma_ah_set_sl(ah_attr, rec->sl);
+ rdma_ah_set_port_num(ah_attr, port_num);
+ rdma_ah_set_static_rate(ah_attr, rec->rate);
+
+ if (sa_path_is_roce(rec)) {
+ ret = roce_resolve_route_from_path(rec, gid_attr);
+ if (ret)
+ return ret;
+
+ memcpy(ah_attr->roce.dmac, sa_path_get_dmac(rec), ETH_ALEN);
+ } else {
+ rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec)));
+ if (sa_path_is_opa(rec) &&
+ rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE))
+ rdma_ah_set_make_grd(ah_attr, true);
+
+ rdma_ah_set_path_bits(ah_attr,
+ be32_to_cpu(sa_path_get_slid(rec)) &
+ get_src_path_mask(device, port_num));
+ }
+
+ if (rec->hop_limit > 0 || sa_path_is_roce(rec))
+ ret = init_ah_attr_grh_fields(device, port_num,
+ rec, ah_attr, gid_attr);
+ return ret;
+}
+EXPORT_SYMBOL(ib_init_ah_attr_from_path);
+
+static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+ struct rdma_ah_attr ah_attr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&query->port->ah_lock, flags);
+ if (!query->port->sm_ah) {
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+ return -EAGAIN;
+ }
+ kref_get(&query->port->sm_ah->ref);
+ query->sm_ah = query->port->sm_ah;
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+
+ /*
+ * Always check if sm_ah has valid dlid assigned,
+ * before querying for class port info
+ */
+ if ((rdma_query_ah(query->sm_ah->ah, &ah_attr) < 0) ||
+ !rdma_is_valid_unicast_lid(&ah_attr)) {
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+ return -EAGAIN;
+ }
+ query->mad_buf = ib_create_send_mad(query->port->agent, 1,
+ query->sm_ah->pkey_index,
+ 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+ gfp_mask,
+ ((query->flags & IB_SA_QUERY_OPA) ?
+ OPA_MGMT_BASE_VERSION :
+ IB_MGMT_BASE_VERSION));
+ if (IS_ERR(query->mad_buf)) {
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+ return -ENOMEM;
+ }
+
+ query->mad_buf->ah = query->sm_ah->ah;
+
+ return 0;
+}
+
+static void free_mad(struct ib_sa_query *query)
+{
+ ib_free_send_mad(query->mad_buf);
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+}
+
+static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent)
+{
+ struct ib_sa_mad *mad = query->mad_buf->mad;
+ unsigned long flags;
+
+ memset(mad, 0, sizeof *mad);
+
+ if (query->flags & IB_SA_QUERY_OPA) {
+ mad->mad_hdr.base_version = OPA_MGMT_BASE_VERSION;
+ mad->mad_hdr.class_version = OPA_SA_CLASS_VERSION;
+ } else {
+ mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+ mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+ }
+ mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ spin_lock_irqsave(&tid_lock, flags);
+ mad->mad_hdr.tid =
+ cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++);
+ spin_unlock_irqrestore(&tid_lock, flags);
+}
+
+static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,
+ gfp_t gfp_mask)
+{
+ unsigned long flags;
+ int ret, id;
+ const int nmbr_sa_query_retries = 10;
+
+ xa_lock_irqsave(&queries, flags);
+ ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask);
+ xa_unlock_irqrestore(&queries, flags);
+ if (ret < 0)
+ return ret;
+
+ query->mad_buf->timeout_ms = timeout_ms / nmbr_sa_query_retries;
+ query->mad_buf->retries = nmbr_sa_query_retries;
+ if (!query->mad_buf->timeout_ms) {
+ /* Special case, very small timeout_ms */
+ query->mad_buf->timeout_ms = 1;
+ query->mad_buf->retries = timeout_ms;
+ }
+ query->mad_buf->context[0] = query;
+ query->id = id;
+
+ if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) &&
+ (!(query->flags & IB_SA_QUERY_OPA))) {
+ if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) {
+ if (!ib_nl_make_request(query, gfp_mask))
+ return id;
+ }
+ ib_sa_disable_local_svc(query);
+ }
+
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ if (ret) {
+ xa_lock_irqsave(&queries, flags);
+ __xa_erase(&queries, id);
+ xa_unlock_irqrestore(&queries, flags);
+ }
+
+ /*
+ * It's not safe to dereference query any more, because the
+ * send may already have completed and freed the query in
+ * another context.
+ */
+ return ret ? ret : id;
+}
+
+void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec)
+{
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
+void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute)
+{
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute);
+}
+EXPORT_SYMBOL(ib_sa_pack_path);
+
+static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client,
+ struct ib_sa_device *sa_dev,
+ u32 port_num)
+{
+ struct ib_sa_port *port;
+ unsigned long flags;
+ bool ret = false;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ spin_lock_irqsave(&port->classport_lock, flags);
+ if (!port->classport_info.valid)
+ goto ret;
+
+ if (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_OPA)
+ ret = opa_get_cpi_capmask2(&port->classport_info.data.opa) &
+ OPA_CLASS_PORT_INFO_PR_SUPPORT;
+ret:
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ return ret;
+}
+
+enum opa_pr_supported {
+ PR_NOT_SUPPORTED,
+ PR_OPA_SUPPORTED,
+ PR_IB_SUPPORTED
+};
+
+/*
+ * opa_pr_query_possible - Check if current PR query can be an OPA query.
+ *
+ * Retuns PR_NOT_SUPPORTED if a path record query is not
+ * possible, PR_OPA_SUPPORTED if an OPA path record query
+ * is possible and PR_IB_SUPPORTED if an IB path record
+ * query is possible.
+ */
+static int opa_pr_query_possible(struct ib_sa_client *client,
+ struct ib_sa_device *sa_dev,
+ struct ib_device *device, u32 port_num)
+{
+ struct ib_port_attr port_attr;
+
+ if (ib_query_port(device, port_num, &port_attr))
+ return PR_NOT_SUPPORTED;
+
+ if (ib_sa_opa_pathrecord_support(client, sa_dev, port_num))
+ return PR_OPA_SUPPORTED;
+
+ if (port_attr.lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
+ return PR_NOT_SUPPORTED;
+ else
+ return PR_IB_SUPPORTED;
+}
+
+static void ib_sa_pr_callback_single(struct ib_sa_path_query *query,
+ int status, struct ib_sa_mad *mad)
+{
+ struct sa_path_rec rec = {};
+
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ mad->data, &rec);
+ rec.rec_type = SA_PATH_REC_TYPE_IB;
+ sa_path_set_dmac_zero(&rec);
+
+ if (query->conv_pr) {
+ struct sa_path_rec opa;
+
+ memset(&opa, 0, sizeof(struct sa_path_rec));
+ sa_convert_path_ib_to_opa(&opa, &rec);
+ query->callback(status, &opa, 1, query->context);
+ } else {
+ query->callback(status, &rec, 1, query->context);
+ }
+}
+
+/**
+ * ib_sa_pr_callback_multiple() - Parse path records then do callback.
+ *
+ * In a multiple-PR case the PRs are saved in "query->resp_pr_data"
+ * (instead of"mad->data") and with "ib_path_rec_data" structure format,
+ * so that rec->flags can be set to indicate the type of PR.
+ * This is valid only in IB fabric.
+ */
+static void ib_sa_pr_callback_multiple(struct ib_sa_path_query *query,
+ int status, int num_prs,
+ struct ib_path_rec_data *rec_data)
+{
+ struct sa_path_rec *rec;
+ int i;
+
+ rec = kvcalloc(num_prs, sizeof(*rec), GFP_KERNEL);
+ if (!rec) {
+ query->callback(-ENOMEM, NULL, 0, query->context);
+ return;
+ }
+
+ for (i = 0; i < num_prs; i++) {
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ rec_data[i].path_rec, rec + i);
+ rec[i].rec_type = SA_PATH_REC_TYPE_IB;
+ sa_path_set_dmac_zero(rec + i);
+ rec[i].flags = rec_data[i].flags;
+ }
+
+ query->callback(status, rec, num_prs, query->context);
+ kvfree(rec);
+}
+
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+ int status, int num_prs,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_path_query *query =
+ container_of(sa_query, struct ib_sa_path_query, sa_query);
+ struct sa_path_rec rec;
+
+ if (!mad || !num_prs) {
+ query->callback(status, NULL, 0, query->context);
+ return;
+ }
+
+ if (sa_query->flags & IB_SA_QUERY_OPA) {
+ if (num_prs != 1) {
+ query->callback(-EINVAL, NULL, 0, query->context);
+ return;
+ }
+
+ ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table),
+ mad->data, &rec);
+ rec.rec_type = SA_PATH_REC_TYPE_OPA;
+ query->callback(status, &rec, num_prs, query->context);
+ } else {
+ if (!sa_query->resp_pr_data)
+ ib_sa_pr_callback_single(query, status, mad);
+ else
+ ib_sa_pr_callback_multiple(query, status, num_prs,
+ sa_query->resp_pr_data);
+ }
+}
+
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+ struct ib_sa_path_query *query =
+ container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+ kfree(query->conv_pr);
+ kfree(query);
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path. The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+ struct ib_device *device, u32 port_num,
+ struct sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ unsigned long timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct sa_path_rec *resp,
+ int num_paths, void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_path_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ enum opa_pr_supported status;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ if ((rec->rec_type != SA_PATH_REC_TYPE_IB) &&
+ (rec->rec_type != SA_PATH_REC_TYPE_OPA))
+ return -EINVAL;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ if (rec->rec_type == SA_PATH_REC_TYPE_OPA) {
+ status = opa_pr_query_possible(client, sa_dev, device, port_num);
+ if (status == PR_NOT_SUPPORTED) {
+ ret = -EINVAL;
+ goto err1;
+ } else if (status == PR_OPA_SUPPORTED) {
+ query->sa_query.flags |= IB_SA_QUERY_OPA;
+ } else {
+ query->conv_pr =
+ kmalloc(sizeof(*query->conv_pr), gfp_mask);
+ if (!query->conv_pr) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+ }
+ }
+
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err2;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(&query->sa_query, agent);
+
+ query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL;
+ query->sa_query.release = ib_sa_path_rec_release;
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ if (query->sa_query.flags & IB_SA_QUERY_OPA) {
+ ib_pack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table),
+ rec, mad->data);
+ } else if (query->conv_pr) {
+ sa_convert_path_opa_to_ib(query->conv_pr, rec);
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ query->conv_pr, mad->data);
+ } else {
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ rec, mad->data);
+ }
+
+ *sa_query = &query->sa_query;
+
+ query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE;
+ query->sa_query.mad_buf->context[1] = (query->conv_pr) ?
+ query->conv_pr : rec;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err3;
+
+ return ret;
+
+err3:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+err2:
+ kfree(query->conv_pr);
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+ int status, int num_prs,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_mcmember_query *query =
+ container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_mcmember_rec rec;
+
+ ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query));
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u32 port_num,
+ u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ unsigned long timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_mcmember_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(&query->sa_query, agent);
+
+ query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL;
+ query->sa_query.release = ib_sa_mcmember_rec_release;
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+ rec, mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+
+/* Support GuidInfoRecord */
+static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query,
+ int status, int num_paths,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_guidinfo_query *query =
+ container_of(sa_query, struct ib_sa_guidinfo_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_guidinfo_rec rec;
+
+ ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query));
+}
+
+int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u32 port_num,
+ struct ib_sa_guidinfo_rec *rec,
+ ib_sa_comp_mask comp_mask, u8 method,
+ unsigned long timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_guidinfo_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_guidinfo_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET &&
+ method != IB_SA_METHOD_DELETE) {
+ return -EINVAL;
+ }
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(&query->sa_query, agent);
+
+ query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL;
+ query->sa_query.release = ib_sa_guidinfo_rec_release;
+
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec,
+ mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_guid_info_rec_query);
+
+struct ib_classport_info_context {
+ struct completion done;
+ struct ib_sa_query *sa_query;
+};
+
+static void ib_classportinfo_cb(void *context)
+{
+ struct ib_classport_info_context *cb_ctx = context;
+
+ complete(&cb_ctx->done);
+}
+
+static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query,
+ int status, int num_prs,
+ struct ib_sa_mad *mad)
+{
+ unsigned long flags;
+ struct ib_sa_classport_info_query *query =
+ container_of(sa_query, struct ib_sa_classport_info_query, sa_query);
+ struct ib_sa_classport_cache *info = &sa_query->port->classport_info;
+
+ if (mad) {
+ if (sa_query->flags & IB_SA_QUERY_OPA) {
+ struct opa_class_port_info rec;
+
+ ib_unpack(opa_classport_info_rec_table,
+ ARRAY_SIZE(opa_classport_info_rec_table),
+ mad->data, &rec);
+
+ spin_lock_irqsave(&sa_query->port->classport_lock,
+ flags);
+ if (!status && !info->valid) {
+ memcpy(&info->data.opa, &rec,
+ sizeof(info->data.opa));
+
+ info->valid = true;
+ info->data.type = RDMA_CLASS_PORT_INFO_OPA;
+ }
+ spin_unlock_irqrestore(&sa_query->port->classport_lock,
+ flags);
+
+ } else {
+ struct ib_class_port_info rec;
+
+ ib_unpack(ib_classport_info_rec_table,
+ ARRAY_SIZE(ib_classport_info_rec_table),
+ mad->data, &rec);
+
+ spin_lock_irqsave(&sa_query->port->classport_lock,
+ flags);
+ if (!status && !info->valid) {
+ memcpy(&info->data.ib, &rec,
+ sizeof(info->data.ib));
+
+ info->valid = true;
+ info->data.type = RDMA_CLASS_PORT_INFO_IB;
+ }
+ spin_unlock_irqrestore(&sa_query->port->classport_lock,
+ flags);
+ }
+ }
+ query->callback(query->context);
+}
+
+static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_classport_info_query,
+ sa_query));
+}
+
+static int ib_sa_classport_info_rec_query(struct ib_sa_port *port,
+ unsigned long timeout_ms,
+ void (*callback)(void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_mad_agent *agent;
+ struct ib_sa_classport_info_query *query;
+ struct ib_sa_mad *mad;
+ gfp_t gfp_mask = GFP_KERNEL;
+ int ret;
+
+ agent = port->agent;
+
+ query = kzalloc(sizeof(*query), gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ query->sa_query.flags |= rdma_cap_opa_ah(port->agent->device,
+ port->port_num) ?
+ IB_SA_QUERY_OPA : 0;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err_free;
+
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(&query->sa_query, agent);
+
+ query->sa_query.callback = ib_sa_classport_info_rec_callback;
+ query->sa_query.release = ib_sa_classport_info_rec_release;
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO);
+ mad->sa_hdr.comp_mask = 0;
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err_free_mad;
+
+ return ret;
+
+err_free_mad:
+ *sa_query = NULL;
+ free_mad(&query->sa_query);
+
+err_free:
+ kfree(query);
+ return ret;
+}
+
+static void update_ib_cpi(struct work_struct *work)
+{
+ struct ib_sa_port *port =
+ container_of(work, struct ib_sa_port, ib_cpi_work.work);
+ struct ib_classport_info_context *cb_context;
+ unsigned long flags;
+ int ret;
+
+ /* If the classport info is valid, nothing
+ * to do here.
+ */
+ spin_lock_irqsave(&port->classport_lock, flags);
+ if (port->classport_info.valid) {
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ return;
+ }
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+
+ cb_context = kmalloc(sizeof(*cb_context), GFP_KERNEL);
+ if (!cb_context)
+ goto err_nomem;
+
+ init_completion(&cb_context->done);
+
+ ret = ib_sa_classport_info_rec_query(port, 3000,
+ ib_classportinfo_cb, cb_context,
+ &cb_context->sa_query);
+ if (ret < 0)
+ goto free_cb_err;
+ wait_for_completion(&cb_context->done);
+free_cb_err:
+ kfree(cb_context);
+ spin_lock_irqsave(&port->classport_lock, flags);
+
+ /* If the classport info is still not valid, the query should have
+ * failed for some reason. Retry issuing the query
+ */
+ if (!port->classport_info.valid) {
+ port->classport_info.retry_cnt++;
+ if (port->classport_info.retry_cnt <=
+ IB_SA_CPI_MAX_RETRY_CNT) {
+ unsigned long delay =
+ msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT);
+
+ queue_delayed_work(ib_wq, &port->ib_cpi_work, delay);
+ }
+ }
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+
+err_nomem:
+ return;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
+ unsigned long flags;
+
+ if (query->callback)
+ switch (mad_send_wc->status) {
+ case IB_WC_SUCCESS:
+ /* No callback -- already got recv */
+ break;
+ case IB_WC_RESP_TIMEOUT_ERR:
+ query->callback(query, -ETIMEDOUT, 0, NULL);
+ break;
+ case IB_WC_WR_FLUSH_ERR:
+ query->callback(query, -EINTR, 0, NULL);
+ break;
+ default:
+ query->callback(query, -EIO, 0, NULL);
+ break;
+ }
+
+ xa_lock_irqsave(&queries, flags);
+ __xa_erase(&queries, query->id);
+ xa_unlock_irqrestore(&queries, flags);
+
+ free_mad(query);
+ if (query->client)
+ ib_sa_client_put(query->client);
+ query->release(query);
+}
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_sa_query *query;
+
+ if (!send_buf)
+ return;
+
+ query = send_buf->context[0];
+ if (query->callback) {
+ if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+ query->callback(query,
+ mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+ -EINVAL : 0, 1,
+ (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+ else
+ query->callback(query, -EIO, 0, NULL);
+ }
+
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static void update_sm_ah(struct work_struct *work)
+{
+ struct ib_sa_port *port =
+ container_of(work, struct ib_sa_port, update_task);
+ struct ib_sa_sm_ah *new_ah;
+ struct ib_port_attr port_attr;
+ struct rdma_ah_attr ah_attr;
+ bool grh_required;
+
+ if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
+ pr_warn("Couldn't query port\n");
+ return;
+ }
+
+ new_ah = kmalloc(sizeof(*new_ah), GFP_KERNEL);
+ if (!new_ah)
+ return;
+
+ kref_init(&new_ah->ref);
+ new_ah->src_path_mask = (1 << port_attr.lmc) - 1;
+
+ new_ah->pkey_index = 0;
+ if (ib_find_pkey(port->agent->device, port->port_num,
+ IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index))
+ pr_err("Couldn't find index for default PKey\n");
+
+ memset(&ah_attr, 0, sizeof(ah_attr));
+ ah_attr.type = rdma_ah_find_type(port->agent->device,
+ port->port_num);
+ rdma_ah_set_dlid(&ah_attr, port_attr.sm_lid);
+ rdma_ah_set_sl(&ah_attr, port_attr.sm_sl);
+ rdma_ah_set_port_num(&ah_attr, port->port_num);
+
+ grh_required = rdma_is_grh_required(port->agent->device,
+ port->port_num);
+
+ /*
+ * The OPA sm_lid of 0xFFFF needs special handling so that it can be
+ * differentiated from a permissive LID of 0xFFFF. We set the
+ * grh_required flag here so the SA can program the DGID in the
+ * address handle appropriately
+ */
+ if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA &&
+ (grh_required ||
+ port_attr.sm_lid == be16_to_cpu(IB_LID_PERMISSIVE)))
+ rdma_ah_set_make_grd(&ah_attr, true);
+
+ if (ah_attr.type == RDMA_AH_ATTR_TYPE_IB && grh_required) {
+ rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH);
+ rdma_ah_set_subnet_prefix(&ah_attr,
+ cpu_to_be64(port_attr.subnet_prefix));
+ rdma_ah_set_interface_id(&ah_attr,
+ cpu_to_be64(IB_SA_WELL_KNOWN_GUID));
+ }
+
+ new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr,
+ RDMA_CREATE_AH_SLEEPABLE);
+ if (IS_ERR(new_ah->ah)) {
+ pr_warn("Couldn't create new SM AH\n");
+ kfree(new_ah);
+ return;
+ }
+
+ spin_lock_irq(&port->ah_lock);
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
+ port->sm_ah = new_ah;
+ spin_unlock_irq(&port->ah_lock);
+}
+
+static void ib_sa_event(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER) {
+ unsigned long flags;
+ struct ib_sa_device *sa_dev =
+ container_of(handler, typeof(*sa_dev), event_handler);
+ u32 port_num = event->element.port_num - sa_dev->start_port;
+ struct ib_sa_port *port = &sa_dev->port[port_num];
+
+ if (!rdma_cap_ib_sa(handler->device, port->port_num))
+ return;
+
+ spin_lock_irqsave(&port->ah_lock, flags);
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
+ port->sm_ah = NULL;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ if (event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PORT_ACTIVE) {
+ unsigned long delay =
+ msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT);
+
+ spin_lock_irqsave(&port->classport_lock, flags);
+ port->classport_info.valid = false;
+ port->classport_info.retry_cnt = 0;
+ spin_unlock_irqrestore(&port->classport_lock, flags);
+ queue_delayed_work(ib_wq,
+ &port->ib_cpi_work, delay);
+ }
+ queue_work(ib_wq, &sa_dev->port[port_num].update_task);
+ }
+}
+
+static int ib_sa_add_one(struct ib_device *device)
+{
+ struct ib_sa_device *sa_dev;
+ int s, e, i;
+ int count = 0;
+ int ret;
+
+ s = rdma_start_port(device);
+ e = rdma_end_port(device);
+
+ sa_dev = kzalloc(struct_size(sa_dev, port,
+ size_add(size_sub(e, s), 1)),
+ GFP_KERNEL);
+ if (!sa_dev)
+ return -ENOMEM;
+
+ sa_dev->start_port = s;
+ sa_dev->end_port = e;
+
+ for (i = 0; i <= e - s; ++i) {
+ spin_lock_init(&sa_dev->port[i].ah_lock);
+ if (!rdma_cap_ib_sa(device, i + 1))
+ continue;
+
+ sa_dev->port[i].sm_ah = NULL;
+ sa_dev->port[i].port_num = i + s;
+
+ spin_lock_init(&sa_dev->port[i].classport_lock);
+ sa_dev->port[i].classport_info.valid = false;
+
+ sa_dev->port[i].agent =
+ ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+ NULL, 0, send_handler,
+ recv_handler, sa_dev, 0);
+ if (IS_ERR(sa_dev->port[i].agent)) {
+ ret = PTR_ERR(sa_dev->port[i].agent);
+ goto err;
+ }
+
+ INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
+ INIT_DELAYED_WORK(&sa_dev->port[i].ib_cpi_work,
+ update_ib_cpi);
+
+ count++;
+ }
+
+ if (!count) {
+ ret = -EOPNOTSUPP;
+ goto free;
+ }
+
+ ib_set_client_data(device, &sa_client, sa_dev);
+
+ /*
+ * We register our event handler after everything is set up,
+ * and then update our cached info after the event handler is
+ * registered to avoid any problems if a port changes state
+ * during our initialization.
+ */
+
+ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+ ib_register_event_handler(&sa_dev->event_handler);
+
+ for (i = 0; i <= e - s; ++i) {
+ if (rdma_cap_ib_sa(device, i + 1))
+ update_sm_ah(&sa_dev->port[i].update_task);
+ }
+
+ return 0;
+
+err:
+ while (--i >= 0) {
+ if (rdma_cap_ib_sa(device, i + 1))
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ }
+free:
+ kfree(sa_dev);
+ return ret;
+}
+
+static void ib_sa_remove_one(struct ib_device *device, void *client_data)
+{
+ struct ib_sa_device *sa_dev = client_data;
+ int i;
+
+ ib_unregister_event_handler(&sa_dev->event_handler);
+ flush_workqueue(ib_wq);
+
+ for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
+ if (rdma_cap_ib_sa(device, i + 1)) {
+ cancel_delayed_work_sync(&sa_dev->port[i].ib_cpi_work);
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ if (sa_dev->port[i].sm_ah)
+ kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
+ }
+
+ }
+
+ kfree(sa_dev);
+}
+
+int ib_sa_init(void)
+{
+ int ret;
+
+ get_random_bytes(&tid, sizeof tid);
+
+ atomic_set(&ib_nl_sa_request_seq, 0);
+
+ ret = ib_register_client(&sa_client);
+ if (ret) {
+ pr_err("Couldn't register ib_sa client\n");
+ goto err1;
+ }
+
+ ret = mcast_init();
+ if (ret) {
+ pr_err("Couldn't initialize multicast handling\n");
+ goto err2;
+ }
+
+ ib_nl_wq = alloc_ordered_workqueue("ib_nl_sa_wq", WQ_MEM_RECLAIM);
+ if (!ib_nl_wq) {
+ ret = -ENOMEM;
+ goto err3;
+ }
+
+ INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
+ return 0;
+
+err3:
+ mcast_cleanup();
+err2:
+ ib_unregister_client(&sa_client);
+err1:
+ return ret;
+}
+
+void ib_sa_cleanup(void)
+{
+ cancel_delayed_work(&ib_nl_timed_work);
+ destroy_workqueue(ib_nl_wq);
+ mcast_cleanup();
+ ib_unregister_client(&sa_client);
+ WARN_ON(!xa_empty(&queries));
+}
diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
new file mode 100644
index 000000000..3512c2e54
--- /dev/null
+++ b/drivers/infiniband/core/security.c
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/security.h>
+#include <linux/completion.h>
+#include <linux/list.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include "core_priv.h"
+#include "mad_priv.h"
+
+static LIST_HEAD(mad_agent_list);
+/* Lock to protect mad_agent_list */
+static DEFINE_SPINLOCK(mad_agent_list_lock);
+
+static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp)
+{
+ struct pkey_index_qp_list *pkey = NULL;
+ struct pkey_index_qp_list *tmp_pkey;
+ struct ib_device *dev = pp->sec->dev;
+
+ spin_lock(&dev->port_data[pp->port_num].pkey_list_lock);
+ list_for_each_entry (tmp_pkey, &dev->port_data[pp->port_num].pkey_list,
+ pkey_index_list) {
+ if (tmp_pkey->pkey_index == pp->pkey_index) {
+ pkey = tmp_pkey;
+ break;
+ }
+ }
+ spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock);
+ return pkey;
+}
+
+static int get_pkey_and_subnet_prefix(struct ib_port_pkey *pp,
+ u16 *pkey,
+ u64 *subnet_prefix)
+{
+ struct ib_device *dev = pp->sec->dev;
+ int ret;
+
+ ret = ib_get_cached_pkey(dev, pp->port_num, pp->pkey_index, pkey);
+ if (ret)
+ return ret;
+
+ ib_get_cached_subnet_prefix(dev, pp->port_num, subnet_prefix);
+
+ return ret;
+}
+
+static int enforce_qp_pkey_security(u16 pkey,
+ u64 subnet_prefix,
+ struct ib_qp_security *qp_sec)
+{
+ struct ib_qp_security *shared_qp_sec;
+ int ret;
+
+ ret = security_ib_pkey_access(qp_sec->security, subnet_prefix, pkey);
+ if (ret)
+ return ret;
+
+ list_for_each_entry(shared_qp_sec,
+ &qp_sec->shared_qp_list,
+ shared_qp_list) {
+ ret = security_ib_pkey_access(shared_qp_sec->security,
+ subnet_prefix,
+ pkey);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/* The caller of this function must hold the QP security
+ * mutex of the QP of the security structure in *pps.
+ *
+ * It takes separate ports_pkeys and security structure
+ * because in some cases the pps will be for a new settings
+ * or the pps will be for the real QP and security structure
+ * will be for a shared QP.
+ */
+static int check_qp_port_pkey_settings(struct ib_ports_pkeys *pps,
+ struct ib_qp_security *sec)
+{
+ u64 subnet_prefix;
+ u16 pkey;
+ int ret = 0;
+
+ if (!pps)
+ return 0;
+
+ if (pps->main.state != IB_PORT_PKEY_NOT_VALID) {
+ ret = get_pkey_and_subnet_prefix(&pps->main,
+ &pkey,
+ &subnet_prefix);
+ if (ret)
+ return ret;
+
+ ret = enforce_qp_pkey_security(pkey,
+ subnet_prefix,
+ sec);
+ if (ret)
+ return ret;
+ }
+
+ if (pps->alt.state != IB_PORT_PKEY_NOT_VALID) {
+ ret = get_pkey_and_subnet_prefix(&pps->alt,
+ &pkey,
+ &subnet_prefix);
+ if (ret)
+ return ret;
+
+ ret = enforce_qp_pkey_security(pkey,
+ subnet_prefix,
+ sec);
+ }
+
+ return ret;
+}
+
+/* The caller of this function must hold the QP security
+ * mutex.
+ */
+static void qp_to_error(struct ib_qp_security *sec)
+{
+ struct ib_qp_security *shared_qp_sec;
+ struct ib_qp_attr attr = {
+ .qp_state = IB_QPS_ERR
+ };
+ struct ib_event event = {
+ .event = IB_EVENT_QP_FATAL
+ };
+
+ /* If the QP is in the process of being destroyed
+ * the qp pointer in the security structure is
+ * undefined. It cannot be modified now.
+ */
+ if (sec->destroying)
+ return;
+
+ ib_modify_qp(sec->qp,
+ &attr,
+ IB_QP_STATE);
+
+ if (sec->qp->event_handler && sec->qp->qp_context) {
+ event.element.qp = sec->qp;
+ sec->qp->event_handler(&event,
+ sec->qp->qp_context);
+ }
+
+ list_for_each_entry(shared_qp_sec,
+ &sec->shared_qp_list,
+ shared_qp_list) {
+ struct ib_qp *qp = shared_qp_sec->qp;
+
+ if (qp->event_handler && qp->qp_context) {
+ event.element.qp = qp;
+ event.device = qp->device;
+ qp->event_handler(&event,
+ qp->qp_context);
+ }
+ }
+}
+
+static inline void check_pkey_qps(struct pkey_index_qp_list *pkey,
+ struct ib_device *device,
+ u32 port_num,
+ u64 subnet_prefix)
+{
+ struct ib_port_pkey *pp, *tmp_pp;
+ bool comp;
+ LIST_HEAD(to_error_list);
+ u16 pkey_val;
+
+ if (!ib_get_cached_pkey(device,
+ port_num,
+ pkey->pkey_index,
+ &pkey_val)) {
+ spin_lock(&pkey->qp_list_lock);
+ list_for_each_entry(pp, &pkey->qp_list, qp_list) {
+ if (atomic_read(&pp->sec->error_list_count))
+ continue;
+
+ if (enforce_qp_pkey_security(pkey_val,
+ subnet_prefix,
+ pp->sec)) {
+ atomic_inc(&pp->sec->error_list_count);
+ list_add(&pp->to_error_list,
+ &to_error_list);
+ }
+ }
+ spin_unlock(&pkey->qp_list_lock);
+ }
+
+ list_for_each_entry_safe(pp,
+ tmp_pp,
+ &to_error_list,
+ to_error_list) {
+ mutex_lock(&pp->sec->mutex);
+ qp_to_error(pp->sec);
+ list_del(&pp->to_error_list);
+ atomic_dec(&pp->sec->error_list_count);
+ comp = pp->sec->destroying;
+ mutex_unlock(&pp->sec->mutex);
+
+ if (comp)
+ complete(&pp->sec->error_complete);
+ }
+}
+
+/* The caller of this function must hold the QP security
+ * mutex.
+ */
+static int port_pkey_list_insert(struct ib_port_pkey *pp)
+{
+ struct pkey_index_qp_list *tmp_pkey;
+ struct pkey_index_qp_list *pkey;
+ struct ib_device *dev;
+ u32 port_num = pp->port_num;
+ int ret = 0;
+
+ if (pp->state != IB_PORT_PKEY_VALID)
+ return 0;
+
+ dev = pp->sec->dev;
+
+ pkey = get_pkey_idx_qp_list(pp);
+
+ if (!pkey) {
+ bool found = false;
+
+ pkey = kzalloc(sizeof(*pkey), GFP_KERNEL);
+ if (!pkey)
+ return -ENOMEM;
+
+ spin_lock(&dev->port_data[port_num].pkey_list_lock);
+ /* Check for the PKey again. A racing process may
+ * have created it.
+ */
+ list_for_each_entry(tmp_pkey,
+ &dev->port_data[port_num].pkey_list,
+ pkey_index_list) {
+ if (tmp_pkey->pkey_index == pp->pkey_index) {
+ kfree(pkey);
+ pkey = tmp_pkey;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ pkey->pkey_index = pp->pkey_index;
+ spin_lock_init(&pkey->qp_list_lock);
+ INIT_LIST_HEAD(&pkey->qp_list);
+ list_add(&pkey->pkey_index_list,
+ &dev->port_data[port_num].pkey_list);
+ }
+ spin_unlock(&dev->port_data[port_num].pkey_list_lock);
+ }
+
+ spin_lock(&pkey->qp_list_lock);
+ list_add(&pp->qp_list, &pkey->qp_list);
+ spin_unlock(&pkey->qp_list_lock);
+
+ pp->state = IB_PORT_PKEY_LISTED;
+
+ return ret;
+}
+
+/* The caller of this function must hold the QP security
+ * mutex.
+ */
+static void port_pkey_list_remove(struct ib_port_pkey *pp)
+{
+ struct pkey_index_qp_list *pkey;
+
+ if (pp->state != IB_PORT_PKEY_LISTED)
+ return;
+
+ pkey = get_pkey_idx_qp_list(pp);
+
+ spin_lock(&pkey->qp_list_lock);
+ list_del(&pp->qp_list);
+ spin_unlock(&pkey->qp_list_lock);
+
+ /* The setting may still be valid, i.e. after
+ * a destroy has failed for example.
+ */
+ pp->state = IB_PORT_PKEY_VALID;
+}
+
+static void destroy_qp_security(struct ib_qp_security *sec)
+{
+ security_ib_free_security(sec->security);
+ kfree(sec->ports_pkeys);
+ kfree(sec);
+}
+
+/* The caller of this function must hold the QP security
+ * mutex.
+ */
+static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp,
+ const struct ib_qp_attr *qp_attr,
+ int qp_attr_mask)
+{
+ struct ib_ports_pkeys *new_pps;
+ struct ib_ports_pkeys *qp_pps = qp->qp_sec->ports_pkeys;
+
+ new_pps = kzalloc(sizeof(*new_pps), GFP_KERNEL);
+ if (!new_pps)
+ return NULL;
+
+ if (qp_attr_mask & IB_QP_PORT)
+ new_pps->main.port_num = qp_attr->port_num;
+ else if (qp_pps)
+ new_pps->main.port_num = qp_pps->main.port_num;
+
+ if (qp_attr_mask & IB_QP_PKEY_INDEX)
+ new_pps->main.pkey_index = qp_attr->pkey_index;
+ else if (qp_pps)
+ new_pps->main.pkey_index = qp_pps->main.pkey_index;
+
+ if (((qp_attr_mask & IB_QP_PKEY_INDEX) &&
+ (qp_attr_mask & IB_QP_PORT)) ||
+ (qp_pps && qp_pps->main.state != IB_PORT_PKEY_NOT_VALID))
+ new_pps->main.state = IB_PORT_PKEY_VALID;
+
+ if (qp_attr_mask & IB_QP_ALT_PATH) {
+ new_pps->alt.port_num = qp_attr->alt_port_num;
+ new_pps->alt.pkey_index = qp_attr->alt_pkey_index;
+ new_pps->alt.state = IB_PORT_PKEY_VALID;
+ } else if (qp_pps) {
+ new_pps->alt.port_num = qp_pps->alt.port_num;
+ new_pps->alt.pkey_index = qp_pps->alt.pkey_index;
+ if (qp_pps->alt.state != IB_PORT_PKEY_NOT_VALID)
+ new_pps->alt.state = IB_PORT_PKEY_VALID;
+ }
+
+ new_pps->main.sec = qp->qp_sec;
+ new_pps->alt.sec = qp->qp_sec;
+ return new_pps;
+}
+
+int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev)
+{
+ struct ib_qp *real_qp = qp->real_qp;
+ int ret;
+
+ ret = ib_create_qp_security(qp, dev);
+
+ if (ret)
+ return ret;
+
+ if (!qp->qp_sec)
+ return 0;
+
+ mutex_lock(&real_qp->qp_sec->mutex);
+ ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys,
+ qp->qp_sec);
+
+ if (ret)
+ goto ret;
+
+ if (qp != real_qp)
+ list_add(&qp->qp_sec->shared_qp_list,
+ &real_qp->qp_sec->shared_qp_list);
+ret:
+ mutex_unlock(&real_qp->qp_sec->mutex);
+ if (ret)
+ destroy_qp_security(qp->qp_sec);
+
+ return ret;
+}
+
+void ib_close_shared_qp_security(struct ib_qp_security *sec)
+{
+ struct ib_qp *real_qp = sec->qp->real_qp;
+
+ mutex_lock(&real_qp->qp_sec->mutex);
+ list_del(&sec->shared_qp_list);
+ mutex_unlock(&real_qp->qp_sec->mutex);
+
+ destroy_qp_security(sec);
+}
+
+int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev)
+{
+ unsigned int i;
+ bool is_ib = false;
+ int ret;
+
+ rdma_for_each_port (dev, i) {
+ is_ib = rdma_protocol_ib(dev, i);
+ if (is_ib)
+ break;
+ }
+
+ /* If this isn't an IB device don't create the security context */
+ if (!is_ib)
+ return 0;
+
+ qp->qp_sec = kzalloc(sizeof(*qp->qp_sec), GFP_KERNEL);
+ if (!qp->qp_sec)
+ return -ENOMEM;
+
+ qp->qp_sec->qp = qp;
+ qp->qp_sec->dev = dev;
+ mutex_init(&qp->qp_sec->mutex);
+ INIT_LIST_HEAD(&qp->qp_sec->shared_qp_list);
+ atomic_set(&qp->qp_sec->error_list_count, 0);
+ init_completion(&qp->qp_sec->error_complete);
+ ret = security_ib_alloc_security(&qp->qp_sec->security);
+ if (ret) {
+ kfree(qp->qp_sec);
+ qp->qp_sec = NULL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_create_qp_security);
+
+void ib_destroy_qp_security_begin(struct ib_qp_security *sec)
+{
+ /* Return if not IB */
+ if (!sec)
+ return;
+
+ mutex_lock(&sec->mutex);
+
+ /* Remove the QP from the lists so it won't get added to
+ * a to_error_list during the destroy process.
+ */
+ if (sec->ports_pkeys) {
+ port_pkey_list_remove(&sec->ports_pkeys->main);
+ port_pkey_list_remove(&sec->ports_pkeys->alt);
+ }
+
+ /* If the QP is already in one or more of those lists
+ * the destroying flag will ensure the to error flow
+ * doesn't operate on an undefined QP.
+ */
+ sec->destroying = true;
+
+ /* Record the error list count to know how many completions
+ * to wait for.
+ */
+ sec->error_comps_pending = atomic_read(&sec->error_list_count);
+
+ mutex_unlock(&sec->mutex);
+}
+
+void ib_destroy_qp_security_abort(struct ib_qp_security *sec)
+{
+ int ret;
+ int i;
+
+ /* Return if not IB */
+ if (!sec)
+ return;
+
+ /* If a concurrent cache update is in progress this
+ * QP security could be marked for an error state
+ * transition. Wait for this to complete.
+ */
+ for (i = 0; i < sec->error_comps_pending; i++)
+ wait_for_completion(&sec->error_complete);
+
+ mutex_lock(&sec->mutex);
+ sec->destroying = false;
+
+ /* Restore the position in the lists and verify
+ * access is still allowed in case a cache update
+ * occurred while attempting to destroy.
+ *
+ * Because these setting were listed already
+ * and removed during ib_destroy_qp_security_begin
+ * we know the pkey_index_qp_list for the PKey
+ * already exists so port_pkey_list_insert won't fail.
+ */
+ if (sec->ports_pkeys) {
+ port_pkey_list_insert(&sec->ports_pkeys->main);
+ port_pkey_list_insert(&sec->ports_pkeys->alt);
+ }
+
+ ret = check_qp_port_pkey_settings(sec->ports_pkeys, sec);
+ if (ret)
+ qp_to_error(sec);
+
+ mutex_unlock(&sec->mutex);
+}
+
+void ib_destroy_qp_security_end(struct ib_qp_security *sec)
+{
+ int i;
+
+ /* Return if not IB */
+ if (!sec)
+ return;
+
+ /* If a concurrent cache update is occurring we must
+ * wait until this QP security structure is processed
+ * in the QP to error flow before destroying it because
+ * the to_error_list is in use.
+ */
+ for (i = 0; i < sec->error_comps_pending; i++)
+ wait_for_completion(&sec->error_complete);
+
+ destroy_qp_security(sec);
+}
+
+void ib_security_cache_change(struct ib_device *device,
+ u32 port_num,
+ u64 subnet_prefix)
+{
+ struct pkey_index_qp_list *pkey;
+
+ list_for_each_entry (pkey, &device->port_data[port_num].pkey_list,
+ pkey_index_list) {
+ check_pkey_qps(pkey,
+ device,
+ port_num,
+ subnet_prefix);
+ }
+}
+
+void ib_security_release_port_pkey_list(struct ib_device *device)
+{
+ struct pkey_index_qp_list *pkey, *tmp_pkey;
+ unsigned int i;
+
+ rdma_for_each_port (device, i) {
+ list_for_each_entry_safe(pkey,
+ tmp_pkey,
+ &device->port_data[i].pkey_list,
+ pkey_index_list) {
+ list_del(&pkey->pkey_index_list);
+ kfree(pkey);
+ }
+ }
+}
+
+int ib_security_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_udata *udata)
+{
+ int ret = 0;
+ struct ib_ports_pkeys *tmp_pps;
+ struct ib_ports_pkeys *new_pps = NULL;
+ struct ib_qp *real_qp = qp->real_qp;
+ bool special_qp = (real_qp->qp_type == IB_QPT_SMI ||
+ real_qp->qp_type == IB_QPT_GSI ||
+ real_qp->qp_type >= IB_QPT_RESERVED1);
+ bool pps_change = ((qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) ||
+ (qp_attr_mask & IB_QP_ALT_PATH));
+
+ WARN_ONCE((qp_attr_mask & IB_QP_PORT &&
+ rdma_protocol_ib(real_qp->device, qp_attr->port_num) &&
+ !real_qp->qp_sec),
+ "%s: QP security is not initialized for IB QP: %u\n",
+ __func__, real_qp->qp_num);
+
+ /* The port/pkey settings are maintained only for the real QP. Open
+ * handles on the real QP will be in the shared_qp_list. When
+ * enforcing security on the real QP all the shared QPs will be
+ * checked as well.
+ */
+
+ if (pps_change && !special_qp && real_qp->qp_sec) {
+ mutex_lock(&real_qp->qp_sec->mutex);
+ new_pps = get_new_pps(real_qp,
+ qp_attr,
+ qp_attr_mask);
+ if (!new_pps) {
+ mutex_unlock(&real_qp->qp_sec->mutex);
+ return -ENOMEM;
+ }
+ /* Add this QP to the lists for the new port
+ * and pkey settings before checking for permission
+ * in case there is a concurrent cache update
+ * occurring. Walking the list for a cache change
+ * doesn't acquire the security mutex unless it's
+ * sending the QP to error.
+ */
+ ret = port_pkey_list_insert(&new_pps->main);
+
+ if (!ret)
+ ret = port_pkey_list_insert(&new_pps->alt);
+
+ if (!ret)
+ ret = check_qp_port_pkey_settings(new_pps,
+ real_qp->qp_sec);
+ }
+
+ if (!ret)
+ ret = real_qp->device->ops.modify_qp(real_qp,
+ qp_attr,
+ qp_attr_mask,
+ udata);
+
+ if (new_pps) {
+ /* Clean up the lists and free the appropriate
+ * ports_pkeys structure.
+ */
+ if (ret) {
+ tmp_pps = new_pps;
+ } else {
+ tmp_pps = real_qp->qp_sec->ports_pkeys;
+ real_qp->qp_sec->ports_pkeys = new_pps;
+ }
+
+ if (tmp_pps) {
+ port_pkey_list_remove(&tmp_pps->main);
+ port_pkey_list_remove(&tmp_pps->alt);
+ }
+ kfree(tmp_pps);
+ mutex_unlock(&real_qp->qp_sec->mutex);
+ }
+ return ret;
+}
+
+static int ib_security_pkey_access(struct ib_device *dev,
+ u32 port_num,
+ u16 pkey_index,
+ void *sec)
+{
+ u64 subnet_prefix;
+ u16 pkey;
+ int ret;
+
+ if (!rdma_protocol_ib(dev, port_num))
+ return 0;
+
+ ret = ib_get_cached_pkey(dev, port_num, pkey_index, &pkey);
+ if (ret)
+ return ret;
+
+ ib_get_cached_subnet_prefix(dev, port_num, &subnet_prefix);
+
+ return security_ib_pkey_access(sec, subnet_prefix, pkey);
+}
+
+void ib_mad_agent_security_change(void)
+{
+ struct ib_mad_agent *ag;
+
+ spin_lock(&mad_agent_list_lock);
+ list_for_each_entry(ag,
+ &mad_agent_list,
+ mad_agent_sec_list)
+ WRITE_ONCE(ag->smp_allowed,
+ !security_ib_endport_manage_subnet(ag->security,
+ dev_name(&ag->device->dev), ag->port_num));
+ spin_unlock(&mad_agent_list_lock);
+}
+
+int ib_mad_agent_security_setup(struct ib_mad_agent *agent,
+ enum ib_qp_type qp_type)
+{
+ int ret;
+
+ if (!rdma_protocol_ib(agent->device, agent->port_num))
+ return 0;
+
+ INIT_LIST_HEAD(&agent->mad_agent_sec_list);
+
+ ret = security_ib_alloc_security(&agent->security);
+ if (ret)
+ return ret;
+
+ if (qp_type != IB_QPT_SMI)
+ return 0;
+
+ spin_lock(&mad_agent_list_lock);
+ ret = security_ib_endport_manage_subnet(agent->security,
+ dev_name(&agent->device->dev),
+ agent->port_num);
+ if (ret)
+ goto free_security;
+
+ WRITE_ONCE(agent->smp_allowed, true);
+ list_add(&agent->mad_agent_sec_list, &mad_agent_list);
+ spin_unlock(&mad_agent_list_lock);
+ return 0;
+
+free_security:
+ spin_unlock(&mad_agent_list_lock);
+ security_ib_free_security(agent->security);
+ return ret;
+}
+
+void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent)
+{
+ if (!rdma_protocol_ib(agent->device, agent->port_num))
+ return;
+
+ if (agent->qp->qp_type == IB_QPT_SMI) {
+ spin_lock(&mad_agent_list_lock);
+ list_del(&agent->mad_agent_sec_list);
+ spin_unlock(&mad_agent_list_lock);
+ }
+
+ security_ib_free_security(agent->security);
+}
+
+int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index)
+{
+ if (!rdma_protocol_ib(map->agent.device, map->agent.port_num))
+ return 0;
+
+ if (map->agent.qp->qp_type == IB_QPT_SMI) {
+ if (!READ_ONCE(map->agent.smp_allowed))
+ return -EACCES;
+ return 0;
+ }
+
+ return ib_security_pkey_access(map->agent.device,
+ map->agent.port_num,
+ pkey_index,
+ map->agent.security);
+}
diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c
new file mode 100644
index 000000000..45f09b75c
--- /dev/null
+++ b/drivers/infiniband/core/smi.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <rdma/ib_smi.h>
+#include "smi.h"
+#include "opa_smi.h"
+
+static enum smi_action __smi_handle_dr_smp_send(bool is_switch, u32 port_num,
+ u8 *hop_ptr, u8 hop_cnt,
+ const u8 *initial_path,
+ const u8 *return_path,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
+ if (!direction) {
+ /* C14-9:1 */
+ if (hop_cnt && *hop_ptr == 0) {
+ (*hop_ptr)++;
+ return (initial_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:2 */
+ if (*hop_ptr && *hop_ptr < hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ /* return_path set when received */
+ (*hop_ptr)++;
+ return (initial_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (*hop_ptr == hop_cnt) {
+ /* return_path set when received */
+ (*hop_ptr)++;
+ return (is_switch ||
+ dr_dlid_is_permissive ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- Fail unreasonable hop pointer */
+ return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+ /* C14-13:1 */
+ if (hop_cnt && *hop_ptr == hop_cnt + 1) {
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (*hop_ptr == 1) {
+ (*hop_ptr)--;
+ /* C14-13:3 -- SMPs destined for SM shouldn't be here */
+ return (is_switch ||
+ dr_slid_is_permissive ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+ if (*hop_ptr == 0)
+ return IB_SMI_HANDLE;
+
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return IB_SMI_DISCARD;
+ }
+}
+
+/*
+ * Fixup a directed route SMP for sending
+ * Return IB_SMI_DISCARD if the SMP should be discarded
+ */
+enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ bool is_switch, u32 port_num)
+{
+ return __smi_handle_dr_smp_send(is_switch, port_num,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->initial_path,
+ smp->return_path,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
+
+enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,
+ bool is_switch, u32 port_num)
+{
+ return __smi_handle_dr_smp_send(is_switch, port_num,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->route.dr.initial_path,
+ smp->route.dr.return_path,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
+
+static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, u32 port_num,
+ int phys_port_cnt,
+ u8 *hop_ptr, u8 hop_cnt,
+ const u8 *initial_path,
+ u8 *return_path,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
+ if (!direction) {
+ /* C14-9:1 -- sender should have incremented hop_ptr */
+ if (hop_cnt && *hop_ptr == 0)
+ return IB_SMI_DISCARD;
+
+ /* C14-9:2 -- intermediate hop */
+ if (*hop_ptr && *hop_ptr < hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ return_path[*hop_ptr] = port_num;
+ /* hop_ptr updated when sending */
+ return (initial_path[*hop_ptr+1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (*hop_ptr == hop_cnt) {
+ if (hop_cnt)
+ return_path[*hop_ptr] = port_num;
+ /* hop_ptr updated when sending */
+
+ return (is_switch ||
+ dr_dlid_is_permissive ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- fail unreasonable hop pointer */
+ return (*hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+
+ /* C14-13:1 */
+ if (hop_cnt && *hop_ptr == hop_cnt + 1) {
+ (*hop_ptr)--;
+ return (return_path[*hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) {
+ if (!is_switch)
+ return IB_SMI_DISCARD;
+
+ /* hop_ptr updated when sending */
+ return (return_path[*hop_ptr-1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ if (*hop_ptr == 1) {
+ if (dr_slid_is_permissive) {
+ /* giving SMP to SM - update hop_ptr */
+ (*hop_ptr)--;
+ return IB_SMI_HANDLE;
+ }
+ /* hop_ptr updated when sending */
+ return (is_switch ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> give to SM */
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return (*hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return IB_SMI_DISCARD if the SMP should be dropped
+ */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
+ u32 port_num, int phys_port_cnt)
+{
+ return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->initial_path,
+ smp->return_path,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
+
+/*
+ * Adjust information for a received SMP
+ * Return IB_SMI_DISCARD if the SMP should be dropped
+ */
+enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,
+ u32 port_num, int phys_port_cnt)
+{
+ return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,
+ &smp->hop_ptr, smp->hop_cnt,
+ smp->route.dr.initial_path,
+ smp->route.dr.return_path,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
+
+static enum smi_forward_action __smi_check_forward_dr_smp(u8 hop_ptr, u8 hop_cnt,
+ u8 direction,
+ bool dr_dlid_is_permissive,
+ bool dr_slid_is_permissive)
+{
+ if (!direction) {
+ /* C14-9:2 -- intermediate hop */
+ if (hop_ptr && hop_ptr < hop_cnt)
+ return IB_SMI_FORWARD;
+
+ /* C14-9:3 -- at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt)
+ return (dr_dlid_is_permissive ?
+ IB_SMI_SEND : IB_SMI_LOCAL);
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ if (hop_ptr == hop_cnt + 1)
+ return IB_SMI_SEND;
+ } else {
+ /* C14-13:2 -- intermediate hop */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt)
+ return IB_SMI_FORWARD;
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (hop_ptr == 1)
+ return (!dr_slid_is_permissive ?
+ IB_SMI_SEND : IB_SMI_LOCAL);
+ }
+ return IB_SMI_LOCAL;
+
+}
+
+enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+ return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt,
+ ib_get_smp_direction(smp),
+ smp->dr_dlid == IB_LID_PERMISSIVE,
+ smp->dr_slid == IB_LID_PERMISSIVE);
+}
+
+enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp)
+{
+ return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt,
+ opa_get_smp_direction(smp),
+ smp->route.dr.dr_dlid ==
+ OPA_LID_PERMISSIVE,
+ smp->route.dr.dr_slid ==
+ OPA_LID_PERMISSIVE);
+}
+
+/*
+ * Return the forwarding port number from initial_path for outgoing SMP and
+ * from return_path for returning SMP
+ */
+int smi_get_fwd_port(struct ib_smp *smp)
+{
+ return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] :
+ smp->return_path[smp->hop_ptr-1]);
+}
+
+/*
+ * Return the forwarding port number from initial_path for outgoing SMP and
+ * from return_path for returning SMP
+ */
+int opa_smi_get_fwd_port(struct opa_smp *smp)
+{
+ return !opa_get_smp_direction(smp) ? smp->route.dr.initial_path[smp->hop_ptr+1] :
+ smp->route.dr.return_path[smp->hop_ptr-1];
+}
diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h
new file mode 100644
index 000000000..e350ed623
--- /dev/null
+++ b/drivers/infiniband/core/smi.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __SMI_H_
+#define __SMI_H_
+
+#include <rdma/ib_smi.h>
+
+enum smi_action {
+ IB_SMI_DISCARD,
+ IB_SMI_HANDLE
+};
+
+enum smi_forward_action {
+ IB_SMI_LOCAL, /* SMP should be completed up the stack */
+ IB_SMI_SEND, /* received DR SMP should be forwarded to the send queue */
+ IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */
+};
+
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,
+ u32 port_num, int phys_port_cnt);
+int smi_get_fwd_port(struct ib_smp *smp);
+extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
+extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ bool is_switch, u32 port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+ return ((device->ops.process_mad &&
+ !ib_get_smp_direction(smp) &&
+ (smp->hop_ptr == smp->hop_cnt + 1)) ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ /* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+ return ((device->ops.process_mad &&
+ ib_get_smp_direction(smp) &&
+ !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+#endif /* __SMI_H_ */
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
new file mode 100644
index 000000000..ec5efdc16
--- /dev/null
+++ b/drivers/infiniband/core/sysfs.c
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_pma.h>
+#include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
+#include <rdma/ib_sysfs.h>
+
+struct port_table_attribute {
+ struct ib_port_attribute attr;
+ char name[8];
+ int index;
+ __be16 attr_id;
+};
+
+struct gid_attr_group {
+ struct ib_port *port;
+ struct kobject kobj;
+ struct attribute_group groups[2];
+ const struct attribute_group *groups_list[3];
+ struct port_table_attribute attrs_list[];
+};
+
+struct ib_port {
+ struct kobject kobj;
+ struct ib_device *ibdev;
+ struct gid_attr_group *gid_attr_group;
+ struct hw_stats_port_data *hw_stats_data;
+
+ struct attribute_group groups[3];
+ const struct attribute_group *groups_list[5];
+ u32 port_num;
+ struct port_table_attribute attrs_list[];
+};
+
+struct hw_stats_device_attribute {
+ struct device_attribute attr;
+ ssize_t (*show)(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+ unsigned int index, unsigned int port_num, char *buf);
+ ssize_t (*store)(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+ unsigned int index, unsigned int port_num,
+ const char *buf, size_t count);
+};
+
+struct hw_stats_port_attribute {
+ struct ib_port_attribute attr;
+ ssize_t (*show)(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+ unsigned int index, unsigned int port_num, char *buf);
+ ssize_t (*store)(struct ib_device *ibdev, struct rdma_hw_stats *stats,
+ unsigned int index, unsigned int port_num,
+ const char *buf, size_t count);
+};
+
+struct hw_stats_device_data {
+ struct attribute_group group;
+ struct rdma_hw_stats *stats;
+ struct hw_stats_device_attribute attrs[];
+};
+
+struct hw_stats_port_data {
+ struct rdma_hw_stats *stats;
+ struct hw_stats_port_attribute attrs[];
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct ib_port_attribute *port_attr =
+ container_of(attr, struct ib_port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+ if (!port_attr->show)
+ return -EIO;
+
+ return port_attr->show(p->ibdev, p->port_num, port_attr, buf);
+}
+
+static ssize_t port_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ib_port_attribute *port_attr =
+ container_of(attr, struct ib_port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+ if (!port_attr->store)
+ return -EIO;
+ return port_attr->store(p->ibdev, p->port_num, port_attr, buf, count);
+}
+
+struct ib_device *ib_port_sysfs_get_ibdev_kobj(struct kobject *kobj,
+ u32 *port_num)
+{
+ struct ib_port *port = container_of(kobj, struct ib_port, kobj);
+
+ *port_num = port->port_num;
+ return port->ibdev;
+}
+EXPORT_SYMBOL(ib_port_sysfs_get_ibdev_kobj);
+
+static const struct sysfs_ops port_sysfs_ops = {
+ .show = port_attr_show,
+ .store = port_attr_store
+};
+
+static ssize_t hw_stat_device_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hw_stats_device_attribute *stat_attr =
+ container_of(attr, struct hw_stats_device_attribute, attr);
+ struct ib_device *ibdev = container_of(dev, struct ib_device, dev);
+
+ return stat_attr->show(ibdev, ibdev->hw_stats_data->stats,
+ stat_attr - ibdev->hw_stats_data->attrs, 0, buf);
+}
+
+static ssize_t hw_stat_device_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hw_stats_device_attribute *stat_attr =
+ container_of(attr, struct hw_stats_device_attribute, attr);
+ struct ib_device *ibdev = container_of(dev, struct ib_device, dev);
+
+ return stat_attr->store(ibdev, ibdev->hw_stats_data->stats,
+ stat_attr - ibdev->hw_stats_data->attrs, 0, buf,
+ count);
+}
+
+static ssize_t hw_stat_port_show(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *attr, char *buf)
+{
+ struct hw_stats_port_attribute *stat_attr =
+ container_of(attr, struct hw_stats_port_attribute, attr);
+ struct ib_port *port = ibdev->port_data[port_num].sysfs;
+
+ return stat_attr->show(ibdev, port->hw_stats_data->stats,
+ stat_attr - port->hw_stats_data->attrs,
+ port->port_num, buf);
+}
+
+static ssize_t hw_stat_port_store(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hw_stats_port_attribute *stat_attr =
+ container_of(attr, struct hw_stats_port_attribute, attr);
+ struct ib_port *port = ibdev->port_data[port_num].sysfs;
+
+ return stat_attr->store(ibdev, port->hw_stats_data->stats,
+ stat_attr - port->hw_stats_data->attrs,
+ port->port_num, buf, count);
+}
+
+static ssize_t gid_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct ib_port_attribute *port_attr =
+ container_of(attr, struct ib_port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct gid_attr_group,
+ kobj)->port;
+
+ if (!port_attr->show)
+ return -EIO;
+
+ return port_attr->show(p->ibdev, p->port_num, port_attr, buf);
+}
+
+static const struct sysfs_ops gid_attr_sysfs_ops = {
+ .show = gid_attr_show
+};
+
+static ssize_t state_show(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *unused, char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ static const char *state_name[] = {
+ [IB_PORT_NOP] = "NOP",
+ [IB_PORT_DOWN] = "DOWN",
+ [IB_PORT_INIT] = "INIT",
+ [IB_PORT_ARMED] = "ARMED",
+ [IB_PORT_ACTIVE] = "ACTIVE",
+ [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER"
+ };
+
+ ret = ib_query_port(ibdev, port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "%d: %s\n", attr.state,
+ attr.state >= 0 &&
+ attr.state < ARRAY_SIZE(state_name) ?
+ state_name[attr.state] :
+ "UNKNOWN");
+}
+
+static ssize_t lid_show(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *unused, char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(ibdev, port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "0x%x\n", attr.lid);
+}
+
+static ssize_t lid_mask_count_show(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *unused, char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(ibdev, port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "%u\n", attr.lmc);
+}
+
+static ssize_t sm_lid_show(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *unused, char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(ibdev, port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "0x%x\n", attr.sm_lid);
+}
+
+static ssize_t sm_sl_show(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *unused, char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(ibdev, port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "%u\n", attr.sm_sl);
+}
+
+static ssize_t cap_mask_show(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *unused, char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(ibdev, port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "0x%08x\n", attr.port_cap_flags);
+}
+
+static ssize_t rate_show(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *unused, char *buf)
+{
+ struct ib_port_attr attr;
+ char *speed = "";
+ int rate; /* in deci-Gb/sec */
+ ssize_t ret;
+
+ ret = ib_query_port(ibdev, port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.active_speed) {
+ case IB_SPEED_DDR:
+ speed = " DDR";
+ rate = 50;
+ break;
+ case IB_SPEED_QDR:
+ speed = " QDR";
+ rate = 100;
+ break;
+ case IB_SPEED_FDR10:
+ speed = " FDR10";
+ rate = 100;
+ break;
+ case IB_SPEED_FDR:
+ speed = " FDR";
+ rate = 140;
+ break;
+ case IB_SPEED_EDR:
+ speed = " EDR";
+ rate = 250;
+ break;
+ case IB_SPEED_HDR:
+ speed = " HDR";
+ rate = 500;
+ break;
+ case IB_SPEED_NDR:
+ speed = " NDR";
+ rate = 1000;
+ break;
+ case IB_SPEED_SDR:
+ default: /* default to SDR for invalid rates */
+ speed = " SDR";
+ rate = 25;
+ break;
+ }
+
+ rate *= ib_width_enum_to_int(attr.active_width);
+ if (rate < 0)
+ return -EINVAL;
+
+ return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", rate / 10,
+ rate % 10 ? ".5" : "",
+ ib_width_enum_to_int(attr.active_width), speed);
+}
+
+static const char *phys_state_to_str(enum ib_port_phys_state phys_state)
+{
+ static const char *phys_state_str[] = {
+ "<unknown>",
+ "Sleep",
+ "Polling",
+ "Disabled",
+ "PortConfigurationTraining",
+ "LinkUp",
+ "LinkErrorRecovery",
+ "Phy Test",
+ };
+
+ if (phys_state < ARRAY_SIZE(phys_state_str))
+ return phys_state_str[phys_state];
+ return "<unknown>";
+}
+
+static ssize_t phys_state_show(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *unused, char *buf)
+{
+ struct ib_port_attr attr;
+
+ ssize_t ret;
+
+ ret = ib_query_port(ibdev, port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "%u: %s\n", attr.phys_state,
+ phys_state_to_str(attr.phys_state));
+}
+
+static ssize_t link_layer_show(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *unused, char *buf)
+{
+ const char *output;
+
+ switch (rdma_port_get_link_layer(ibdev, port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ output = "InfiniBand";
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ output = "Ethernet";
+ break;
+ default:
+ output = "Unknown";
+ break;
+ }
+
+ return sysfs_emit(buf, "%s\n", output);
+}
+
+static IB_PORT_ATTR_RO(state);
+static IB_PORT_ATTR_RO(lid);
+static IB_PORT_ATTR_RO(lid_mask_count);
+static IB_PORT_ATTR_RO(sm_lid);
+static IB_PORT_ATTR_RO(sm_sl);
+static IB_PORT_ATTR_RO(cap_mask);
+static IB_PORT_ATTR_RO(rate);
+static IB_PORT_ATTR_RO(phys_state);
+static IB_PORT_ATTR_RO(link_layer);
+
+static struct attribute *port_default_attrs[] = {
+ &ib_port_attr_state.attr,
+ &ib_port_attr_lid.attr,
+ &ib_port_attr_lid_mask_count.attr,
+ &ib_port_attr_sm_lid.attr,
+ &ib_port_attr_sm_sl.attr,
+ &ib_port_attr_cap_mask.attr,
+ &ib_port_attr_rate.attr,
+ &ib_port_attr_phys_state.attr,
+ &ib_port_attr_link_layer.attr,
+ NULL
+};
+ATTRIBUTE_GROUPS(port_default);
+
+static ssize_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf)
+{
+ struct net_device *ndev;
+ int ret = -EINVAL;
+
+ rcu_read_lock();
+ ndev = rcu_dereference(gid_attr->ndev);
+ if (ndev)
+ ret = sysfs_emit(buf, "%s\n", ndev->name);
+ rcu_read_unlock();
+ return ret;
+}
+
+static ssize_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n",
+ ib_cache_gid_type_str(gid_attr->gid_type));
+}
+
+static ssize_t _show_port_gid_attr(
+ struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr,
+ char *buf,
+ ssize_t (*print)(const struct ib_gid_attr *gid_attr, char *buf))
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ const struct ib_gid_attr *gid_attr;
+ ssize_t ret;
+
+ gid_attr = rdma_get_gid_attr(ibdev, port_num, tab_attr->index);
+ if (IS_ERR(gid_attr))
+ /* -EINVAL is returned for user space compatibility reasons. */
+ return -EINVAL;
+
+ ret = print(gid_attr, buf);
+ rdma_put_gid_attr(gid_attr);
+ return ret;
+}
+
+static ssize_t show_port_gid(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *attr, char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ const struct ib_gid_attr *gid_attr;
+ int len;
+
+ gid_attr = rdma_get_gid_attr(ibdev, port_num, tab_attr->index);
+ if (IS_ERR(gid_attr)) {
+ const union ib_gid zgid = {};
+
+ /* If reading GID fails, it is likely due to GID entry being
+ * empty (invalid) or reserved GID in the table. User space
+ * expects to read GID table entries as long as it given index
+ * is within GID table size. Administrative/debugging tool
+ * fails to query rest of the GID entries if it hits error
+ * while querying a GID of the given index. To avoid user
+ * space throwing such error on fail to read gid, return zero
+ * GID as before. This maintains backward compatibility.
+ */
+ return sysfs_emit(buf, "%pI6\n", zgid.raw);
+ }
+
+ len = sysfs_emit(buf, "%pI6\n", gid_attr->gid.raw);
+ rdma_put_gid_attr(gid_attr);
+ return len;
+}
+
+static ssize_t show_port_gid_attr_ndev(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *attr,
+ char *buf)
+{
+ return _show_port_gid_attr(ibdev, port_num, attr, buf, print_ndev);
+}
+
+static ssize_t show_port_gid_attr_gid_type(struct ib_device *ibdev,
+ u32 port_num,
+ struct ib_port_attribute *attr,
+ char *buf)
+{
+ return _show_port_gid_attr(ibdev, port_num, attr, buf, print_gid_type);
+}
+
+static ssize_t show_port_pkey(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *attr, char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ u16 pkey;
+ int ret;
+
+ ret = ib_query_pkey(ibdev, port_num, tab_attr->index, &pkey);
+ if (ret)
+ return ret;
+
+ return sysfs_emit(buf, "0x%04x\n", pkey);
+}
+
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \
+struct port_table_attribute port_pma_attr_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \
+ .attr_id = IB_PMA_PORT_COUNTERS, \
+}
+
+#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \
+struct port_table_attribute port_pma_attr_ext_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16), \
+ .attr_id = IB_PMA_PORT_COUNTERS_EXT, \
+}
+
+/*
+ * Get a Perfmgmt MAD block of data.
+ * Returns error code or the number of bytes retrieved.
+ */
+static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr,
+ void *data, int offset, size_t size)
+{
+ struct ib_mad *in_mad;
+ struct ib_mad *out_mad;
+ size_t mad_size = sizeof(*out_mad);
+ u16 out_mad_pkey_index = 0;
+ ssize_t ret;
+
+ if (!dev->ops.process_mad)
+ return -ENOSYS;
+
+ in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+ out_mad = kzalloc(sizeof(*out_mad), GFP_KERNEL);
+ if (!in_mad || !out_mad) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ in_mad->mad_hdr.base_version = 1;
+ in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
+ in_mad->mad_hdr.class_version = 1;
+ in_mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ in_mad->mad_hdr.attr_id = attr;
+
+ if (attr != IB_PMA_CLASS_PORT_INFO)
+ in_mad->data[41] = port_num; /* PortSelect field */
+
+ if ((dev->ops.process_mad(dev, IB_MAD_IGNORE_MKEY, port_num, NULL, NULL,
+ in_mad, out_mad, &mad_size,
+ &out_mad_pkey_index) &
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ memcpy(data, out_mad->data + offset, size);
+ ret = size;
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return ret;
+}
+
+static ssize_t show_pma_counter(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *attr, char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ int offset = tab_attr->index & 0xffff;
+ int width = (tab_attr->index >> 16) & 0xff;
+ int ret;
+ u8 data[8];
+ int len;
+
+ ret = get_perf_mad(ibdev, port_num, tab_attr->attr_id, &data,
+ 40 + offset / 8, sizeof(data));
+ if (ret < 0)
+ return ret;
+
+ switch (width) {
+ case 4:
+ len = sysfs_emit(buf, "%d\n",
+ (*data >> (4 - (offset % 8))) & 0xf);
+ break;
+ case 8:
+ len = sysfs_emit(buf, "%u\n", *data);
+ break;
+ case 16:
+ len = sysfs_emit(buf, "%u\n", be16_to_cpup((__be16 *)data));
+ break;
+ case 32:
+ len = sysfs_emit(buf, "%u\n", be32_to_cpup((__be32 *)data));
+ break;
+ case 64:
+ len = sysfs_emit(buf, "%llu\n", be64_to_cpup((__be64 *)data));
+ break;
+ default:
+ len = 0;
+ break;
+ }
+
+ return len;
+}
+
+static PORT_PMA_ATTR(symbol_error , 0, 16, 32);
+static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48);
+static PORT_PMA_ATTR(link_downed , 2, 8, 56);
+static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96);
+static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152);
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156);
+static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288);
+static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320);
+
+/*
+ * Counters added by extended set
+ */
+static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64);
+static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128);
+static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192);
+static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256);
+static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320);
+static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384);
+static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448);
+static PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512);
+
+static struct attribute *pma_attrs[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_port_xmit_data.attr.attr,
+ &port_pma_attr_port_rcv_data.attr.attr,
+ &port_pma_attr_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_rcv_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
+ NULL
+};
+
+static struct attribute *pma_attrs_ext[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_ext_port_xmit_data.attr.attr,
+ &port_pma_attr_ext_port_rcv_data.attr.attr,
+ &port_pma_attr_ext_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
+ &port_pma_attr_ext_port_rcv_packets.attr.attr,
+ &port_pma_attr_ext_unicast_rcv_packets.attr.attr,
+ &port_pma_attr_ext_unicast_xmit_packets.attr.attr,
+ &port_pma_attr_ext_multicast_rcv_packets.attr.attr,
+ &port_pma_attr_ext_multicast_xmit_packets.attr.attr,
+ NULL
+};
+
+static struct attribute *pma_attrs_noietf[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_ext_port_xmit_data.attr.attr,
+ &port_pma_attr_ext_port_rcv_data.attr.attr,
+ &port_pma_attr_ext_port_xmit_packets.attr.attr,
+ &port_pma_attr_ext_port_rcv_packets.attr.attr,
+ &port_pma_attr_port_xmit_wait.attr.attr,
+ NULL
+};
+
+static const struct attribute_group pma_group = {
+ .name = "counters",
+ .attrs = pma_attrs
+};
+
+static const struct attribute_group pma_group_ext = {
+ .name = "counters",
+ .attrs = pma_attrs_ext
+};
+
+static const struct attribute_group pma_group_noietf = {
+ .name = "counters",
+ .attrs = pma_attrs_noietf
+};
+
+static void ib_port_release(struct kobject *kobj)
+{
+ struct ib_port *port = container_of(kobj, struct ib_port, kobj);
+ int i;
+
+ for (i = 0; i != ARRAY_SIZE(port->groups); i++)
+ kfree(port->groups[i].attrs);
+ if (port->hw_stats_data)
+ rdma_free_hw_stats_struct(port->hw_stats_data->stats);
+ kfree(port->hw_stats_data);
+ kvfree(port);
+}
+
+static void ib_port_gid_attr_release(struct kobject *kobj)
+{
+ struct gid_attr_group *gid_attr_group =
+ container_of(kobj, struct gid_attr_group, kobj);
+ int i;
+
+ for (i = 0; i != ARRAY_SIZE(gid_attr_group->groups); i++)
+ kfree(gid_attr_group->groups[i].attrs);
+ kfree(gid_attr_group);
+}
+
+static struct kobj_type port_type = {
+ .release = ib_port_release,
+ .sysfs_ops = &port_sysfs_ops,
+ .default_groups = port_default_groups,
+};
+
+static struct kobj_type gid_attr_type = {
+ .sysfs_ops = &gid_attr_sysfs_ops,
+ .release = ib_port_gid_attr_release
+};
+
+/*
+ * Figure out which counter table to use depending on
+ * the device capabilities.
+ */
+static const struct attribute_group *get_counter_table(struct ib_device *dev,
+ int port_num)
+{
+ struct ib_class_port_info cpi;
+
+ if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO,
+ &cpi, 40, sizeof(cpi)) >= 0) {
+ if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH)
+ /* We have extended counters */
+ return &pma_group_ext;
+
+ if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF)
+ /* But not the IETF ones */
+ return &pma_group_noietf;
+ }
+
+ /* Fall back to normal counters */
+ return &pma_group;
+}
+
+static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
+ u32 port_num, int index)
+{
+ int ret;
+
+ if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan))
+ return 0;
+ ret = dev->ops.get_hw_stats(dev, stats, port_num, index);
+ if (ret < 0)
+ return ret;
+ if (ret == stats->num_counters)
+ stats->timestamp = jiffies;
+
+ return 0;
+}
+
+static int print_hw_stat(struct ib_device *dev, int port_num,
+ struct rdma_hw_stats *stats, int index, char *buf)
+{
+ u64 v = rdma_counter_get_hwstat_value(dev, port_num, index);
+
+ return sysfs_emit(buf, "%llu\n", stats->value[index] + v);
+}
+
+static ssize_t show_hw_stats(struct ib_device *ibdev,
+ struct rdma_hw_stats *stats, unsigned int index,
+ unsigned int port_num, char *buf)
+{
+ int ret;
+
+ mutex_lock(&stats->lock);
+ ret = update_hw_stats(ibdev, stats, port_num, index);
+ if (ret)
+ goto unlock;
+ ret = print_hw_stat(ibdev, port_num, stats, index, buf);
+unlock:
+ mutex_unlock(&stats->lock);
+
+ return ret;
+}
+
+static ssize_t show_stats_lifespan(struct ib_device *ibdev,
+ struct rdma_hw_stats *stats,
+ unsigned int index, unsigned int port_num,
+ char *buf)
+{
+ int msecs;
+
+ mutex_lock(&stats->lock);
+ msecs = jiffies_to_msecs(stats->lifespan);
+ mutex_unlock(&stats->lock);
+
+ return sysfs_emit(buf, "%d\n", msecs);
+}
+
+static ssize_t set_stats_lifespan(struct ib_device *ibdev,
+ struct rdma_hw_stats *stats,
+ unsigned int index, unsigned int port_num,
+ const char *buf, size_t count)
+{
+ int msecs;
+ int jiffies;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &msecs);
+ if (ret)
+ return ret;
+ if (msecs < 0 || msecs > 10000)
+ return -EINVAL;
+ jiffies = msecs_to_jiffies(msecs);
+
+ mutex_lock(&stats->lock);
+ stats->lifespan = jiffies;
+ mutex_unlock(&stats->lock);
+
+ return count;
+}
+
+static struct hw_stats_device_data *
+alloc_hw_stats_device(struct ib_device *ibdev)
+{
+ struct hw_stats_device_data *data;
+ struct rdma_hw_stats *stats;
+
+ if (!ibdev->ops.alloc_hw_device_stats)
+ return ERR_PTR(-EOPNOTSUPP);
+ stats = ibdev->ops.alloc_hw_device_stats(ibdev);
+ if (!stats)
+ return ERR_PTR(-ENOMEM);
+ if (!stats->descs || stats->num_counters <= 0)
+ goto err_free_stats;
+
+ /*
+ * Two extra attribue elements here, one for the lifespan entry and
+ * one to NULL terminate the list for the sysfs core code
+ */
+ data = kzalloc(struct_size(data, attrs, size_add(stats->num_counters, 1)),
+ GFP_KERNEL);
+ if (!data)
+ goto err_free_stats;
+ data->group.attrs = kcalloc(stats->num_counters + 2,
+ sizeof(*data->group.attrs), GFP_KERNEL);
+ if (!data->group.attrs)
+ goto err_free_data;
+
+ data->group.name = "hw_counters";
+ data->stats = stats;
+ return data;
+
+err_free_data:
+ kfree(data);
+err_free_stats:
+ rdma_free_hw_stats_struct(stats);
+ return ERR_PTR(-ENOMEM);
+}
+
+void ib_device_release_hw_stats(struct hw_stats_device_data *data)
+{
+ kfree(data->group.attrs);
+ rdma_free_hw_stats_struct(data->stats);
+ kfree(data);
+}
+
+int ib_setup_device_attrs(struct ib_device *ibdev)
+{
+ struct hw_stats_device_attribute *attr;
+ struct hw_stats_device_data *data;
+ bool opstat_skipped = false;
+ int i, ret, pos = 0;
+
+ data = alloc_hw_stats_device(ibdev);
+ if (IS_ERR(data)) {
+ if (PTR_ERR(data) == -EOPNOTSUPP)
+ return 0;
+ return PTR_ERR(data);
+ }
+ ibdev->hw_stats_data = data;
+
+ ret = ibdev->ops.get_hw_stats(ibdev, data->stats, 0,
+ data->stats->num_counters);
+ if (ret != data->stats->num_counters) {
+ if (WARN_ON(ret >= 0))
+ return -EINVAL;
+ return ret;
+ }
+
+ data->stats->timestamp = jiffies;
+
+ for (i = 0; i < data->stats->num_counters; i++) {
+ if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) {
+ opstat_skipped = true;
+ continue;
+ }
+
+ WARN_ON(opstat_skipped);
+ attr = &data->attrs[pos];
+ sysfs_attr_init(&attr->attr.attr);
+ attr->attr.attr.name = data->stats->descs[i].name;
+ attr->attr.attr.mode = 0444;
+ attr->attr.show = hw_stat_device_show;
+ attr->show = show_hw_stats;
+ data->group.attrs[pos] = &attr->attr.attr;
+ pos++;
+ }
+
+ attr = &data->attrs[pos];
+ sysfs_attr_init(&attr->attr.attr);
+ attr->attr.attr.name = "lifespan";
+ attr->attr.attr.mode = 0644;
+ attr->attr.show = hw_stat_device_show;
+ attr->show = show_stats_lifespan;
+ attr->attr.store = hw_stat_device_store;
+ attr->store = set_stats_lifespan;
+ data->group.attrs[pos] = &attr->attr.attr;
+ for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++)
+ if (!ibdev->groups[i]) {
+ ibdev->groups[i] = &data->group;
+ return 0;
+ }
+ WARN(true, "struct ib_device->groups is too small");
+ return -EINVAL;
+}
+
+static struct hw_stats_port_data *
+alloc_hw_stats_port(struct ib_port *port, struct attribute_group *group)
+{
+ struct ib_device *ibdev = port->ibdev;
+ struct hw_stats_port_data *data;
+ struct rdma_hw_stats *stats;
+
+ if (!ibdev->ops.alloc_hw_port_stats)
+ return ERR_PTR(-EOPNOTSUPP);
+ stats = ibdev->ops.alloc_hw_port_stats(port->ibdev, port->port_num);
+ if (!stats)
+ return ERR_PTR(-ENOMEM);
+ if (!stats->descs || stats->num_counters <= 0)
+ goto err_free_stats;
+
+ /*
+ * Two extra attribue elements here, one for the lifespan entry and
+ * one to NULL terminate the list for the sysfs core code
+ */
+ data = kzalloc(struct_size(data, attrs, size_add(stats->num_counters, 1)),
+ GFP_KERNEL);
+ if (!data)
+ goto err_free_stats;
+ group->attrs = kcalloc(stats->num_counters + 2,
+ sizeof(*group->attrs), GFP_KERNEL);
+ if (!group->attrs)
+ goto err_free_data;
+
+ group->name = "hw_counters";
+ data->stats = stats;
+ return data;
+
+err_free_data:
+ kfree(data);
+err_free_stats:
+ rdma_free_hw_stats_struct(stats);
+ return ERR_PTR(-ENOMEM);
+}
+
+static int setup_hw_port_stats(struct ib_port *port,
+ struct attribute_group *group)
+{
+ struct hw_stats_port_attribute *attr;
+ struct hw_stats_port_data *data;
+ bool opstat_skipped = false;
+ int i, ret, pos = 0;
+
+ data = alloc_hw_stats_port(port, group);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ ret = port->ibdev->ops.get_hw_stats(port->ibdev, data->stats,
+ port->port_num,
+ data->stats->num_counters);
+ if (ret != data->stats->num_counters) {
+ if (WARN_ON(ret >= 0))
+ return -EINVAL;
+ return ret;
+ }
+
+ data->stats->timestamp = jiffies;
+
+ for (i = 0; i < data->stats->num_counters; i++) {
+ if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) {
+ opstat_skipped = true;
+ continue;
+ }
+
+ WARN_ON(opstat_skipped);
+ attr = &data->attrs[pos];
+ sysfs_attr_init(&attr->attr.attr);
+ attr->attr.attr.name = data->stats->descs[i].name;
+ attr->attr.attr.mode = 0444;
+ attr->attr.show = hw_stat_port_show;
+ attr->show = show_hw_stats;
+ group->attrs[pos] = &attr->attr.attr;
+ pos++;
+ }
+
+ attr = &data->attrs[pos];
+ sysfs_attr_init(&attr->attr.attr);
+ attr->attr.attr.name = "lifespan";
+ attr->attr.attr.mode = 0644;
+ attr->attr.show = hw_stat_port_show;
+ attr->show = show_stats_lifespan;
+ attr->attr.store = hw_stat_port_store;
+ attr->store = set_stats_lifespan;
+ group->attrs[pos] = &attr->attr.attr;
+
+ port->hw_stats_data = data;
+ return 0;
+}
+
+struct rdma_hw_stats *ib_get_hw_stats_port(struct ib_device *ibdev,
+ u32 port_num)
+{
+ if (!ibdev->port_data || !rdma_is_port_valid(ibdev, port_num) ||
+ !ibdev->port_data[port_num].sysfs->hw_stats_data)
+ return NULL;
+ return ibdev->port_data[port_num].sysfs->hw_stats_data->stats;
+}
+
+static int
+alloc_port_table_group(const char *name, struct attribute_group *group,
+ struct port_table_attribute *attrs, size_t num,
+ ssize_t (*show)(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *, char *buf))
+{
+ struct attribute **attr_list;
+ int i;
+
+ attr_list = kcalloc(num + 1, sizeof(*attr_list), GFP_KERNEL);
+ if (!attr_list)
+ return -ENOMEM;
+
+ for (i = 0; i < num; i++) {
+ struct port_table_attribute *element = &attrs[i];
+
+ if (snprintf(element->name, sizeof(element->name), "%d", i) >=
+ sizeof(element->name))
+ goto err;
+
+ sysfs_attr_init(&element->attr.attr);
+ element->attr.attr.name = element->name;
+ element->attr.attr.mode = 0444;
+ element->attr.show = show;
+ element->index = i;
+
+ attr_list[i] = &element->attr.attr;
+ }
+ group->name = name;
+ group->attrs = attr_list;
+ return 0;
+err:
+ kfree(attr_list);
+ return -EINVAL;
+}
+
+/*
+ * Create the sysfs:
+ * ibp0s9/ports/XX/gid_attrs/{ndevs,types}/YYY
+ * YYY is the gid table index in decimal
+ */
+static int setup_gid_attrs(struct ib_port *port,
+ const struct ib_port_attr *attr)
+{
+ struct gid_attr_group *gid_attr_group;
+ int ret;
+
+ gid_attr_group = kzalloc(struct_size(gid_attr_group, attrs_list,
+ size_mul(attr->gid_tbl_len, 2)),
+ GFP_KERNEL);
+ if (!gid_attr_group)
+ return -ENOMEM;
+ gid_attr_group->port = port;
+ kobject_init(&gid_attr_group->kobj, &gid_attr_type);
+
+ ret = alloc_port_table_group("ndevs", &gid_attr_group->groups[0],
+ gid_attr_group->attrs_list,
+ attr->gid_tbl_len,
+ show_port_gid_attr_ndev);
+ if (ret)
+ goto err_put;
+ gid_attr_group->groups_list[0] = &gid_attr_group->groups[0];
+
+ ret = alloc_port_table_group(
+ "types", &gid_attr_group->groups[1],
+ gid_attr_group->attrs_list + attr->gid_tbl_len,
+ attr->gid_tbl_len, show_port_gid_attr_gid_type);
+ if (ret)
+ goto err_put;
+ gid_attr_group->groups_list[1] = &gid_attr_group->groups[1];
+
+ ret = kobject_add(&gid_attr_group->kobj, &port->kobj, "gid_attrs");
+ if (ret)
+ goto err_put;
+ ret = sysfs_create_groups(&gid_attr_group->kobj,
+ gid_attr_group->groups_list);
+ if (ret)
+ goto err_del;
+ port->gid_attr_group = gid_attr_group;
+ return 0;
+
+err_del:
+ kobject_del(&gid_attr_group->kobj);
+err_put:
+ kobject_put(&gid_attr_group->kobj);
+ return ret;
+}
+
+static void destroy_gid_attrs(struct ib_port *port)
+{
+ struct gid_attr_group *gid_attr_group = port->gid_attr_group;
+
+ if (!gid_attr_group)
+ return;
+ sysfs_remove_groups(&gid_attr_group->kobj, gid_attr_group->groups_list);
+ kobject_del(&gid_attr_group->kobj);
+ kobject_put(&gid_attr_group->kobj);
+}
+
+/*
+ * Create the sysfs:
+ * ibp0s9/ports/XX/{gids,pkeys,counters}/YYY
+ */
+static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num,
+ const struct ib_port_attr *attr)
+{
+ struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+ bool is_full_dev = &device->coredev == coredev;
+ const struct attribute_group **cur_group;
+ struct ib_port *p;
+ int ret;
+
+ p = kvzalloc(struct_size(p, attrs_list,
+ size_add(attr->gid_tbl_len, attr->pkey_tbl_len)),
+ GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+ p->ibdev = device;
+ p->port_num = port_num;
+ kobject_init(&p->kobj, &port_type);
+
+ if (device->port_data && is_full_dev)
+ device->port_data[port_num].sysfs = p;
+
+ cur_group = p->groups_list;
+ ret = alloc_port_table_group("gids", &p->groups[0], p->attrs_list,
+ attr->gid_tbl_len, show_port_gid);
+ if (ret)
+ goto err_put;
+ *cur_group++ = &p->groups[0];
+
+ if (attr->pkey_tbl_len) {
+ ret = alloc_port_table_group("pkeys", &p->groups[1],
+ p->attrs_list + attr->gid_tbl_len,
+ attr->pkey_tbl_len, show_port_pkey);
+ if (ret)
+ goto err_put;
+ *cur_group++ = &p->groups[1];
+ }
+
+ /*
+ * If port == 0, it means hw_counters are per device and not per
+ * port, so holder should be device. Therefore skip per port
+ * counter initialization.
+ */
+ if (port_num && is_full_dev) {
+ ret = setup_hw_port_stats(p, &p->groups[2]);
+ if (ret && ret != -EOPNOTSUPP)
+ goto err_put;
+ if (!ret)
+ *cur_group++ = &p->groups[2];
+ }
+
+ if (device->ops.process_mad && is_full_dev)
+ *cur_group++ = get_counter_table(device, port_num);
+
+ ret = kobject_add(&p->kobj, coredev->ports_kobj, "%d", port_num);
+ if (ret)
+ goto err_put;
+ ret = sysfs_create_groups(&p->kobj, p->groups_list);
+ if (ret)
+ goto err_del;
+ if (is_full_dev) {
+ ret = sysfs_create_groups(&p->kobj, device->ops.port_groups);
+ if (ret)
+ goto err_groups;
+ }
+
+ list_add_tail(&p->kobj.entry, &coredev->port_list);
+ return p;
+
+err_groups:
+ sysfs_remove_groups(&p->kobj, p->groups_list);
+err_del:
+ kobject_del(&p->kobj);
+err_put:
+ if (device->port_data && is_full_dev)
+ device->port_data[port_num].sysfs = NULL;
+ kobject_put(&p->kobj);
+ return ERR_PTR(ret);
+}
+
+static void destroy_port(struct ib_core_device *coredev, struct ib_port *port)
+{
+ bool is_full_dev = &port->ibdev->coredev == coredev;
+
+ list_del(&port->kobj.entry);
+ if (is_full_dev)
+ sysfs_remove_groups(&port->kobj, port->ibdev->ops.port_groups);
+
+ sysfs_remove_groups(&port->kobj, port->groups_list);
+ kobject_del(&port->kobj);
+
+ if (port->ibdev->port_data &&
+ port->ibdev->port_data[port->port_num].sysfs == port)
+ port->ibdev->port_data[port->port_num].sysfs = NULL;
+
+ kobject_put(&port->kobj);
+}
+
+static const char *node_type_string(int node_type)
+{
+ switch (node_type) {
+ case RDMA_NODE_IB_CA:
+ return "CA";
+ case RDMA_NODE_IB_SWITCH:
+ return "switch";
+ case RDMA_NODE_IB_ROUTER:
+ return "router";
+ case RDMA_NODE_RNIC:
+ return "RNIC";
+ case RDMA_NODE_USNIC:
+ return "usNIC";
+ case RDMA_NODE_USNIC_UDP:
+ return "usNIC UDP";
+ case RDMA_NODE_UNSPECIFIED:
+ return "unspecified";
+ }
+ return "<unknown>";
+}
+
+static ssize_t node_type_show(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = rdma_device_to_ibdev(device);
+
+ return sysfs_emit(buf, "%u: %s\n", dev->node_type,
+ node_type_string(dev->node_type));
+}
+static DEVICE_ATTR_RO(node_type);
+
+static ssize_t sys_image_guid_show(struct device *device,
+ struct device_attribute *dev_attr, char *buf)
+{
+ struct ib_device *dev = rdma_device_to_ibdev(device);
+ __be16 *guid = (__be16 *)&dev->attrs.sys_image_guid;
+
+ return sysfs_emit(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(guid[0]),
+ be16_to_cpu(guid[1]),
+ be16_to_cpu(guid[2]),
+ be16_to_cpu(guid[3]));
+}
+static DEVICE_ATTR_RO(sys_image_guid);
+
+static ssize_t node_guid_show(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = rdma_device_to_ibdev(device);
+ __be16 *node_guid = (__be16 *)&dev->node_guid;
+
+ return sysfs_emit(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(node_guid[0]),
+ be16_to_cpu(node_guid[1]),
+ be16_to_cpu(node_guid[2]),
+ be16_to_cpu(node_guid[3]));
+}
+static DEVICE_ATTR_RO(node_guid);
+
+static ssize_t node_desc_show(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = rdma_device_to_ibdev(device);
+
+ return sysfs_emit(buf, "%.64s\n", dev->node_desc);
+}
+
+static ssize_t node_desc_store(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ib_device *dev = rdma_device_to_ibdev(device);
+ struct ib_device_modify desc = {};
+ int ret;
+
+ if (!dev->ops.modify_device)
+ return -EOPNOTSUPP;
+
+ memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX));
+ ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
+ if (ret)
+ return ret;
+
+ return count;
+}
+static DEVICE_ATTR_RW(node_desc);
+
+static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_device *dev = rdma_device_to_ibdev(device);
+ char version[IB_FW_VERSION_NAME_MAX] = {};
+
+ ib_get_device_fw_str(dev, version);
+
+ return sysfs_emit(buf, "%s\n", version);
+}
+static DEVICE_ATTR_RO(fw_ver);
+
+static struct attribute *ib_dev_attrs[] = {
+ &dev_attr_node_type.attr,
+ &dev_attr_node_guid.attr,
+ &dev_attr_sys_image_guid.attr,
+ &dev_attr_fw_ver.attr,
+ &dev_attr_node_desc.attr,
+ NULL,
+};
+
+const struct attribute_group ib_dev_attr_group = {
+ .attrs = ib_dev_attrs,
+};
+
+void ib_free_port_attrs(struct ib_core_device *coredev)
+{
+ struct kobject *p, *t;
+
+ list_for_each_entry_safe(p, t, &coredev->port_list, entry) {
+ struct ib_port *port = container_of(p, struct ib_port, kobj);
+
+ destroy_gid_attrs(port);
+ destroy_port(coredev, port);
+ }
+
+ kobject_put(coredev->ports_kobj);
+}
+
+int ib_setup_port_attrs(struct ib_core_device *coredev)
+{
+ struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+ u32 port_num;
+ int ret;
+
+ coredev->ports_kobj = kobject_create_and_add("ports",
+ &coredev->dev.kobj);
+ if (!coredev->ports_kobj)
+ return -ENOMEM;
+
+ rdma_for_each_port (device, port_num) {
+ struct ib_port_attr attr;
+ struct ib_port *port;
+
+ ret = ib_query_port(device, port_num, &attr);
+ if (ret)
+ goto err_put;
+
+ port = setup_port(coredev, port_num, &attr);
+ if (IS_ERR(port)) {
+ ret = PTR_ERR(port);
+ goto err_put;
+ }
+
+ ret = setup_gid_attrs(port, &attr);
+ if (ret)
+ goto err_put;
+ }
+ return 0;
+
+err_put:
+ ib_free_port_attrs(coredev);
+ return ret;
+}
+
+/**
+ * ib_port_register_client_groups - Add an ib_client's attributes to the port
+ *
+ * @ibdev: IB device to add counters
+ * @port_num: valid port number
+ * @groups: Group list of attributes
+ *
+ * Do not use. Only for legacy sysfs compatibility.
+ */
+int ib_port_register_client_groups(struct ib_device *ibdev, u32 port_num,
+ const struct attribute_group **groups)
+{
+ return sysfs_create_groups(&ibdev->port_data[port_num].sysfs->kobj,
+ groups);
+}
+EXPORT_SYMBOL(ib_port_register_client_groups);
+
+void ib_port_unregister_client_groups(struct ib_device *ibdev, u32 port_num,
+ const struct attribute_group **groups)
+{
+ return sysfs_remove_groups(&ibdev->port_data[port_num].sysfs->kobj,
+ groups);
+}
+EXPORT_SYMBOL(ib_port_unregister_client_groups);
diff --git a/drivers/infiniband/core/trace.c b/drivers/infiniband/core/trace.c
new file mode 100644
index 000000000..31e7860d3
--- /dev/null
+++ b/drivers/infiniband/core/trace.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Trace points for core RDMA functions.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#define CREATE_TRACE_POINTS
+
+#include <trace/events/rdma_core.h>
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
new file mode 100644
index 000000000..bf42650f1
--- /dev/null
+++ b/drivers/infiniband/core/ucma.c
@@ -0,0 +1,1896 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+
+#include <linux/nospec.h>
+
+#include <rdma/rdma_user_cm.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
+#include <rdma/ib_cm.h>
+#include <rdma/rdma_netlink.h>
+#include "core_priv.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static unsigned int max_backlog = 1024;
+
+static struct ctl_table_header *ucma_ctl_table_hdr;
+static struct ctl_table ucma_ctl_table[] = {
+ {
+ .procname = "max_backlog",
+ .data = &max_backlog,
+ .maxlen = sizeof max_backlog,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+struct ucma_file {
+ struct mutex mut;
+ struct file *filp;
+ struct list_head ctx_list;
+ struct list_head event_list;
+ wait_queue_head_t poll_wait;
+};
+
+struct ucma_context {
+ u32 id;
+ struct completion comp;
+ refcount_t ref;
+ int events_reported;
+ atomic_t backlog;
+
+ struct ucma_file *file;
+ struct rdma_cm_id *cm_id;
+ struct mutex mutex;
+ u64 uid;
+
+ struct list_head list;
+ struct list_head mc_list;
+ struct work_struct close_work;
+};
+
+struct ucma_multicast {
+ struct ucma_context *ctx;
+ u32 id;
+ int events_reported;
+
+ u64 uid;
+ u8 join_state;
+ struct list_head list;
+ struct sockaddr_storage addr;
+};
+
+struct ucma_event {
+ struct ucma_context *ctx;
+ struct ucma_context *conn_req_ctx;
+ struct ucma_multicast *mc;
+ struct list_head list;
+ struct rdma_ucm_event_resp resp;
+};
+
+static DEFINE_XARRAY_ALLOC(ctx_table);
+static DEFINE_XARRAY_ALLOC(multicast_table);
+
+static const struct file_operations ucma_fops;
+static int ucma_destroy_private_ctx(struct ucma_context *ctx);
+
+static inline struct ucma_context *_ucma_find_context(int id,
+ struct ucma_file *file)
+{
+ struct ucma_context *ctx;
+
+ ctx = xa_load(&ctx_table, id);
+ if (!ctx)
+ ctx = ERR_PTR(-ENOENT);
+ else if (ctx->file != file)
+ ctx = ERR_PTR(-EINVAL);
+ return ctx;
+}
+
+static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
+{
+ struct ucma_context *ctx;
+
+ xa_lock(&ctx_table);
+ ctx = _ucma_find_context(id, file);
+ if (!IS_ERR(ctx))
+ if (!refcount_inc_not_zero(&ctx->ref))
+ ctx = ERR_PTR(-ENXIO);
+ xa_unlock(&ctx_table);
+ return ctx;
+}
+
+static void ucma_put_ctx(struct ucma_context *ctx)
+{
+ if (refcount_dec_and_test(&ctx->ref))
+ complete(&ctx->comp);
+}
+
+/*
+ * Same as ucm_get_ctx but requires that ->cm_id->device is valid, eg that the
+ * CM_ID is bound.
+ */
+static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id)
+{
+ struct ucma_context *ctx = ucma_get_ctx(file, id);
+
+ if (IS_ERR(ctx))
+ return ctx;
+ if (!ctx->cm_id->device) {
+ ucma_put_ctx(ctx);
+ return ERR_PTR(-EINVAL);
+ }
+ return ctx;
+}
+
+static void ucma_close_id(struct work_struct *work)
+{
+ struct ucma_context *ctx = container_of(work, struct ucma_context, close_work);
+
+ /* once all inflight tasks are finished, we close all underlying
+ * resources. The context is still alive till its explicit destryoing
+ * by its creator. This puts back the xarray's reference.
+ */
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ /* No new events will be generated after destroying the id. */
+ rdma_destroy_id(ctx->cm_id);
+
+ /* Reading the cm_id without holding a positive ref is not allowed */
+ ctx->cm_id = NULL;
+}
+
+static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
+{
+ struct ucma_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ INIT_WORK(&ctx->close_work, ucma_close_id);
+ init_completion(&ctx->comp);
+ INIT_LIST_HEAD(&ctx->mc_list);
+ /* So list_del() will work if we don't do ucma_finish_ctx() */
+ INIT_LIST_HEAD(&ctx->list);
+ ctx->file = file;
+ mutex_init(&ctx->mutex);
+
+ if (xa_alloc(&ctx_table, &ctx->id, NULL, xa_limit_32b, GFP_KERNEL)) {
+ kfree(ctx);
+ return NULL;
+ }
+ return ctx;
+}
+
+static void ucma_set_ctx_cm_id(struct ucma_context *ctx,
+ struct rdma_cm_id *cm_id)
+{
+ refcount_set(&ctx->ref, 1);
+ ctx->cm_id = cm_id;
+}
+
+static void ucma_finish_ctx(struct ucma_context *ctx)
+{
+ lockdep_assert_held(&ctx->file->mut);
+ list_add_tail(&ctx->list, &ctx->file->ctx_list);
+ xa_store(&ctx_table, ctx->id, ctx, GFP_KERNEL);
+}
+
+static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst,
+ struct rdma_conn_param *src)
+{
+ if (src->private_data_len)
+ memcpy(dst->private_data, src->private_data,
+ src->private_data_len);
+ dst->private_data_len = src->private_data_len;
+ dst->responder_resources = src->responder_resources;
+ dst->initiator_depth = src->initiator_depth;
+ dst->flow_control = src->flow_control;
+ dst->retry_count = src->retry_count;
+ dst->rnr_retry_count = src->rnr_retry_count;
+ dst->srq = src->srq;
+ dst->qp_num = src->qp_num;
+}
+
+static void ucma_copy_ud_event(struct ib_device *device,
+ struct rdma_ucm_ud_param *dst,
+ struct rdma_ud_param *src)
+{
+ if (src->private_data_len)
+ memcpy(dst->private_data, src->private_data,
+ src->private_data_len);
+ dst->private_data_len = src->private_data_len;
+ ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr);
+ dst->qp_num = src->qp_num;
+ dst->qkey = src->qkey;
+}
+
+static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx,
+ struct rdma_cm_event *event)
+{
+ struct ucma_event *uevent;
+
+ uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
+ if (!uevent)
+ return NULL;
+
+ uevent->ctx = ctx;
+ switch (event->event) {
+ case RDMA_CM_EVENT_MULTICAST_JOIN:
+ case RDMA_CM_EVENT_MULTICAST_ERROR:
+ uevent->mc = (struct ucma_multicast *)
+ event->param.ud.private_data;
+ uevent->resp.uid = uevent->mc->uid;
+ uevent->resp.id = uevent->mc->id;
+ break;
+ default:
+ uevent->resp.uid = ctx->uid;
+ uevent->resp.id = ctx->id;
+ break;
+ }
+ uevent->resp.event = event->event;
+ uevent->resp.status = event->status;
+ if (ctx->cm_id->qp_type == IB_QPT_UD)
+ ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud,
+ &event->param.ud);
+ else
+ ucma_copy_conn_event(&uevent->resp.param.conn,
+ &event->param.conn);
+
+ uevent->resp.ece.vendor_id = event->ece.vendor_id;
+ uevent->resp.ece.attr_mod = event->ece.attr_mod;
+ return uevent;
+}
+
+static int ucma_connect_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ struct ucma_context *listen_ctx = cm_id->context;
+ struct ucma_context *ctx;
+ struct ucma_event *uevent;
+
+ if (!atomic_add_unless(&listen_ctx->backlog, -1, 0))
+ return -ENOMEM;
+ ctx = ucma_alloc_ctx(listen_ctx->file);
+ if (!ctx)
+ goto err_backlog;
+ ucma_set_ctx_cm_id(ctx, cm_id);
+
+ uevent = ucma_create_uevent(listen_ctx, event);
+ if (!uevent)
+ goto err_alloc;
+ uevent->conn_req_ctx = ctx;
+ uevent->resp.id = ctx->id;
+
+ ctx->cm_id->context = ctx;
+
+ mutex_lock(&ctx->file->mut);
+ ucma_finish_ctx(ctx);
+ list_add_tail(&uevent->list, &ctx->file->event_list);
+ mutex_unlock(&ctx->file->mut);
+ wake_up_interruptible(&ctx->file->poll_wait);
+ return 0;
+
+err_alloc:
+ ucma_destroy_private_ctx(ctx);
+err_backlog:
+ atomic_inc(&listen_ctx->backlog);
+ /* Returning error causes the new ID to be destroyed */
+ return -ENOMEM;
+}
+
+static int ucma_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ struct ucma_event *uevent;
+ struct ucma_context *ctx = cm_id->context;
+
+ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ return ucma_connect_event_handler(cm_id, event);
+
+ /*
+ * We ignore events for new connections until userspace has set their
+ * context. This can only happen if an error occurs on a new connection
+ * before the user accepts it. This is okay, since the accept will just
+ * fail later. However, we do need to release the underlying HW
+ * resources in case of a device removal event.
+ */
+ if (ctx->uid) {
+ uevent = ucma_create_uevent(ctx, event);
+ if (!uevent)
+ return 0;
+
+ mutex_lock(&ctx->file->mut);
+ list_add_tail(&uevent->list, &ctx->file->event_list);
+ mutex_unlock(&ctx->file->mut);
+ wake_up_interruptible(&ctx->file->poll_wait);
+ }
+
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
+ xa_lock(&ctx_table);
+ if (xa_load(&ctx_table, ctx->id) == ctx)
+ queue_work(system_unbound_wq, &ctx->close_work);
+ xa_unlock(&ctx_table);
+ }
+ return 0;
+}
+
+static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_get_event cmd;
+ struct ucma_event *uevent;
+
+ /*
+ * Old 32 bit user space does not send the 4 byte padding in the
+ * reserved field. We don't care, allow it to keep working.
+ */
+ if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved) -
+ sizeof(uevent->resp.ece))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->mut);
+ while (list_empty(&file->event_list)) {
+ mutex_unlock(&file->mut);
+
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->event_list)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->mut);
+ }
+
+ uevent = list_first_entry(&file->event_list, struct ucma_event, list);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &uevent->resp,
+ min_t(size_t, out_len, sizeof(uevent->resp)))) {
+ mutex_unlock(&file->mut);
+ return -EFAULT;
+ }
+
+ list_del(&uevent->list);
+ uevent->ctx->events_reported++;
+ if (uevent->mc)
+ uevent->mc->events_reported++;
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ atomic_inc(&uevent->ctx->backlog);
+ mutex_unlock(&file->mut);
+
+ kfree(uevent);
+ return 0;
+}
+
+static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type)
+{
+ switch (cmd->ps) {
+ case RDMA_PS_TCP:
+ *qp_type = IB_QPT_RC;
+ return 0;
+ case RDMA_PS_UDP:
+ case RDMA_PS_IPOIB:
+ *qp_type = IB_QPT_UD;
+ return 0;
+ case RDMA_PS_IB:
+ *qp_type = cmd->qp_type;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_create_id cmd;
+ struct rdma_ucm_create_id_resp resp;
+ struct ucma_context *ctx;
+ struct rdma_cm_id *cm_id;
+ enum ib_qp_type qp_type;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ret = ucma_get_qp_type(&cmd, &qp_type);
+ if (ret)
+ return ret;
+
+ ctx = ucma_alloc_ctx(file);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->uid = cmd.uid;
+ cm_id = rdma_create_user_id(ucma_event_handler, ctx, cmd.ps, qp_type);
+ if (IS_ERR(cm_id)) {
+ ret = PTR_ERR(cm_id);
+ goto err1;
+ }
+ ucma_set_ctx_cm_id(ctx, cm_id);
+
+ resp.id = ctx->id;
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err1;
+ }
+
+ mutex_lock(&file->mut);
+ ucma_finish_ctx(ctx);
+ mutex_unlock(&file->mut);
+ return 0;
+
+err1:
+ ucma_destroy_private_ctx(ctx);
+ return ret;
+}
+
+static void ucma_cleanup_multicast(struct ucma_context *ctx)
+{
+ struct ucma_multicast *mc, *tmp;
+
+ xa_lock(&multicast_table);
+ list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
+ list_del(&mc->list);
+ /*
+ * At this point mc->ctx->ref is 0 so the mc cannot leave the
+ * lock on the reader and this is enough serialization
+ */
+ __xa_erase(&multicast_table, mc->id);
+ kfree(mc);
+ }
+ xa_unlock(&multicast_table);
+}
+
+static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
+{
+ struct ucma_event *uevent, *tmp;
+
+ rdma_lock_handler(mc->ctx->cm_id);
+ mutex_lock(&mc->ctx->file->mut);
+ list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) {
+ if (uevent->mc != mc)
+ continue;
+
+ list_del(&uevent->list);
+ kfree(uevent);
+ }
+ mutex_unlock(&mc->ctx->file->mut);
+ rdma_unlock_handler(mc->ctx->cm_id);
+}
+
+static int ucma_cleanup_ctx_events(struct ucma_context *ctx)
+{
+ int events_reported;
+ struct ucma_event *uevent, *tmp;
+ LIST_HEAD(list);
+
+ /* Cleanup events not yet reported to the user.*/
+ mutex_lock(&ctx->file->mut);
+ list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
+ if (uevent->ctx != ctx)
+ continue;
+
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST &&
+ xa_cmpxchg(&ctx_table, uevent->conn_req_ctx->id,
+ uevent->conn_req_ctx, XA_ZERO_ENTRY,
+ GFP_KERNEL) == uevent->conn_req_ctx) {
+ list_move_tail(&uevent->list, &list);
+ continue;
+ }
+ list_del(&uevent->list);
+ kfree(uevent);
+ }
+ list_del(&ctx->list);
+ events_reported = ctx->events_reported;
+ mutex_unlock(&ctx->file->mut);
+
+ /*
+ * If this was a listening ID then any connections spawned from it that
+ * have not been delivered to userspace are cleaned up too. Must be done
+ * outside any locks.
+ */
+ list_for_each_entry_safe(uevent, tmp, &list, list) {
+ ucma_destroy_private_ctx(uevent->conn_req_ctx);
+ kfree(uevent);
+ }
+ return events_reported;
+}
+
+/*
+ * When this is called the xarray must have a XA_ZERO_ENTRY in the ctx->id (ie
+ * the ctx is not public to the user). This either because:
+ * - ucma_finish_ctx() hasn't been called
+ * - xa_cmpxchg() succeed to remove the entry (only one thread can succeed)
+ */
+static int ucma_destroy_private_ctx(struct ucma_context *ctx)
+{
+ int events_reported;
+
+ /*
+ * Destroy the underlying cm_id. New work queuing is prevented now by
+ * the removal from the xarray. Once the work is cancled ref will either
+ * be 0 because the work ran to completion and consumed the ref from the
+ * xarray, or it will be positive because we still have the ref from the
+ * xarray. This can also be 0 in cases where cm_id was never set
+ */
+ cancel_work_sync(&ctx->close_work);
+ if (refcount_read(&ctx->ref))
+ ucma_close_id(&ctx->close_work);
+
+ events_reported = ucma_cleanup_ctx_events(ctx);
+ ucma_cleanup_multicast(ctx);
+
+ WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, XA_ZERO_ENTRY, NULL,
+ GFP_KERNEL) != NULL);
+ mutex_destroy(&ctx->mutex);
+ kfree(ctx);
+ return events_reported;
+}
+
+static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_destroy_id cmd;
+ struct rdma_ucm_destroy_id_resp resp;
+ struct ucma_context *ctx;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ xa_lock(&ctx_table);
+ ctx = _ucma_find_context(cmd.id, file);
+ if (!IS_ERR(ctx)) {
+ if (__xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY,
+ GFP_KERNEL) != ctx)
+ ctx = ERR_PTR(-ENOENT);
+ }
+ xa_unlock(&ctx_table);
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.events_reported = ucma_destroy_private_ctx(ctx);
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_bind_ip cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (!rdma_addr_size_in6(&cmd.addr))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
+ mutex_unlock(&ctx->mutex);
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_bind cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (cmd.reserved || !cmd.addr_size ||
+ cmd.addr_size != rdma_addr_size_kss(&cmd.addr))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_ip(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_ip cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if ((cmd.src_addr.sin6_family && !rdma_addr_size_in6(&cmd.src_addr)) ||
+ !rdma_addr_size_in6(&cmd.dst_addr))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
+ (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_addr(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_addr cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (cmd.reserved ||
+ (cmd.src_size && (cmd.src_size != rdma_addr_size_kss(&cmd.src_addr))) ||
+ !cmd.dst_size || (cmd.dst_size != rdma_addr_size_kss(&cmd.dst_addr)))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
+ (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_route(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_route cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+ struct rdma_dev_addr *dev_addr;
+
+ resp->num_paths = route->num_pri_alt_paths;
+ switch (route->num_pri_alt_paths) {
+ case 0:
+ dev_addr = &route->addr.dev_addr;
+ rdma_addr_get_dgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].dgid);
+ rdma_addr_get_sgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].sgid);
+ resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ break;
+ case 2:
+ ib_copy_path_rec_to_user(&resp->ib_route[1],
+ &route->path_rec[1]);
+ fallthrough;
+ case 1:
+ ib_copy_path_rec_to_user(&resp->ib_route[0],
+ &route->path_rec[0]);
+ break;
+ default:
+ break;
+ }
+}
+
+static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+
+ resp->num_paths = route->num_pri_alt_paths;
+ switch (route->num_pri_alt_paths) {
+ case 0:
+ rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr,
+ (union ib_gid *)&resp->ib_route[0].dgid);
+ rdma_ip2gid((struct sockaddr *)&route->addr.src_addr,
+ (union ib_gid *)&resp->ib_route[0].sgid);
+ resp->ib_route[0].pkey = cpu_to_be16(0xffff);
+ break;
+ case 2:
+ ib_copy_path_rec_to_user(&resp->ib_route[1],
+ &route->path_rec[1]);
+ fallthrough;
+ case 1:
+ ib_copy_path_rec_to_user(&resp->ib_route[0],
+ &route->path_rec[0]);
+ break;
+ default:
+ break;
+ }
+}
+
+static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+ struct rdma_dev_addr *dev_addr;
+
+ dev_addr = &route->addr.dev_addr;
+ rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid);
+ rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid);
+}
+
+static ssize_t ucma_query_route(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_query cmd;
+ struct rdma_ucm_query_route_resp resp;
+ struct ucma_context *ctx;
+ struct sockaddr *addr;
+ int ret = 0;
+
+ if (out_len < offsetof(struct rdma_ucm_query_route_resp, ibdev_index))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ memset(&resp, 0, sizeof resp);
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+ memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ?
+ sizeof(struct sockaddr_in) :
+ sizeof(struct sockaddr_in6));
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+ memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ?
+ sizeof(struct sockaddr_in) :
+ sizeof(struct sockaddr_in6));
+ if (!ctx->cm_id->device)
+ goto out;
+
+ resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
+ resp.ibdev_index = ctx->cm_id->device->index;
+ resp.port_num = ctx->cm_id->port_num;
+
+ if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num))
+ ucma_copy_ib_route(&resp, &ctx->cm_id->route);
+ else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num))
+ ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
+ else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num))
+ ucma_copy_iw_route(&resp, &ctx->cm_id->route);
+
+out:
+ mutex_unlock(&ctx->mutex);
+ if (copy_to_user(u64_to_user_ptr(cmd.response), &resp,
+ min_t(size_t, out_len, sizeof(resp))))
+ ret = -EFAULT;
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_query_device_addr(struct rdma_cm_id *cm_id,
+ struct rdma_ucm_query_addr_resp *resp)
+{
+ if (!cm_id->device)
+ return;
+
+ resp->node_guid = (__force __u64) cm_id->device->node_guid;
+ resp->ibdev_index = cm_id->device->index;
+ resp->port_num = cm_id->port_num;
+ resp->pkey = (__force __u16) cpu_to_be16(
+ ib_addr_get_pkey(&cm_id->route.addr.dev_addr));
+}
+
+static ssize_t ucma_query_addr(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_addr_resp resp;
+ struct sockaddr *addr;
+ int ret = 0;
+
+ if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index))
+ return -ENOSPC;
+
+ memset(&resp, 0, sizeof resp);
+
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+ resp.src_size = rdma_addr_size(addr);
+ memcpy(&resp.src_addr, addr, resp.src_size);
+
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+ resp.dst_size = rdma_addr_size(addr);
+ memcpy(&resp.dst_addr, addr, resp.dst_size);
+
+ ucma_query_device_addr(ctx->cm_id, &resp);
+
+ if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp))))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_query_path(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_path_resp *resp;
+ int i, ret = 0;
+
+ if (out_len < sizeof(*resp))
+ return -ENOSPC;
+
+ resp = kzalloc(out_len, GFP_KERNEL);
+ if (!resp)
+ return -ENOMEM;
+
+ resp->num_paths = ctx->cm_id->route.num_pri_alt_paths;
+ for (i = 0, out_len -= sizeof(*resp);
+ i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data);
+ i++, out_len -= sizeof(struct ib_path_rec_data)) {
+ struct sa_path_rec *rec = &ctx->cm_id->route.path_rec[i];
+
+ resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY |
+ IB_PATH_BIDIRECTIONAL;
+ if (rec->rec_type == SA_PATH_REC_TYPE_OPA) {
+ struct sa_path_rec ib;
+
+ sa_convert_path_opa_to_ib(&ib, rec);
+ ib_sa_pack_path(&ib, &resp->path_data[i].path_rec);
+
+ } else {
+ ib_sa_pack_path(rec, &resp->path_data[i].path_rec);
+ }
+ }
+
+ if (copy_to_user(response, resp, struct_size(resp, path_data, i)))
+ ret = -EFAULT;
+
+ kfree(resp);
+ return ret;
+}
+
+static ssize_t ucma_query_gid(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_addr_resp resp;
+ struct sockaddr_ib *addr;
+ int ret = 0;
+
+ if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index))
+ return -ENOSPC;
+
+ memset(&resp, 0, sizeof resp);
+
+ ucma_query_device_addr(ctx->cm_id, &resp);
+
+ addr = (struct sockaddr_ib *) &resp.src_addr;
+ resp.src_size = sizeof(*addr);
+ if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) {
+ memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size);
+ } else {
+ addr->sib_family = AF_IB;
+ addr->sib_pkey = (__force __be16) resp.pkey;
+ rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr,
+ NULL);
+ addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
+ &ctx->cm_id->route.addr.src_addr);
+ }
+
+ addr = (struct sockaddr_ib *) &resp.dst_addr;
+ resp.dst_size = sizeof(*addr);
+ if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) {
+ memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size);
+ } else {
+ addr->sib_family = AF_IB;
+ addr->sib_pkey = (__force __be16) resp.pkey;
+ rdma_read_gids(ctx->cm_id, NULL,
+ (union ib_gid *)&addr->sib_addr);
+ addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
+ &ctx->cm_id->route.addr.dst_addr);
+ }
+
+ if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp))))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_query(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_query cmd;
+ struct ucma_context *ctx;
+ void __user *response;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ response = u64_to_user_ptr(cmd.response);
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ switch (cmd.option) {
+ case RDMA_USER_CM_QUERY_ADDR:
+ ret = ucma_query_addr(ctx, response, out_len);
+ break;
+ case RDMA_USER_CM_QUERY_PATH:
+ ret = ucma_query_path(ctx, response, out_len);
+ break;
+ case RDMA_USER_CM_QUERY_GID:
+ ret = ucma_query_gid(ctx, response, out_len);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+ mutex_unlock(&ctx->mutex);
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_copy_conn_param(struct rdma_cm_id *id,
+ struct rdma_conn_param *dst,
+ struct rdma_ucm_conn_param *src)
+{
+ dst->private_data = src->private_data;
+ dst->private_data_len = src->private_data_len;
+ dst->responder_resources = src->responder_resources;
+ dst->initiator_depth = src->initiator_depth;
+ dst->flow_control = src->flow_control;
+ dst->retry_count = src->retry_count;
+ dst->rnr_retry_count = src->rnr_retry_count;
+ dst->srq = src->srq;
+ dst->qp_num = src->qp_num & 0xFFFFFF;
+ dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0;
+}
+
+static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_conn_param conn_param;
+ struct rdma_ucm_ece ece = {};
+ struct rdma_ucm_connect cmd;
+ struct ucma_context *ctx;
+ size_t in_size;
+ int ret;
+
+ if (in_len < offsetofend(typeof(cmd), reserved))
+ return -EINVAL;
+ in_size = min_t(size_t, in_len, sizeof(cmd));
+ if (copy_from_user(&cmd, inbuf, in_size))
+ return -EFAULT;
+
+ if (!cmd.conn_param.valid)
+ return -EINVAL;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
+ if (offsetofend(typeof(cmd), ece) <= in_size) {
+ ece.vendor_id = cmd.ece.vendor_id;
+ ece.attr_mod = cmd.ece.attr_mod;
+ }
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_connect_ece(ctx->cm_id, &conn_param, &ece);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_listen cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ if (cmd.backlog <= 0 || cmd.backlog > max_backlog)
+ cmd.backlog = max_backlog;
+ atomic_set(&ctx->backlog, cmd.backlog);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_listen(ctx->cm_id, cmd.backlog);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_accept cmd;
+ struct rdma_conn_param conn_param;
+ struct rdma_ucm_ece ece = {};
+ struct ucma_context *ctx;
+ size_t in_size;
+ int ret;
+
+ if (in_len < offsetofend(typeof(cmd), reserved))
+ return -EINVAL;
+ in_size = min_t(size_t, in_len, sizeof(cmd));
+ if (copy_from_user(&cmd, inbuf, in_size))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ if (offsetofend(typeof(cmd), ece) <= in_size) {
+ ece.vendor_id = cmd.ece.vendor_id;
+ ece.attr_mod = cmd.ece.attr_mod;
+ }
+
+ if (cmd.conn_param.valid) {
+ ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
+ mutex_lock(&ctx->mutex);
+ rdma_lock_handler(ctx->cm_id);
+ ret = rdma_accept_ece(ctx->cm_id, &conn_param, &ece);
+ if (!ret) {
+ /* The uid must be set atomically with the handler */
+ ctx->uid = cmd.uid;
+ }
+ rdma_unlock_handler(ctx->cm_id);
+ mutex_unlock(&ctx->mutex);
+ } else {
+ mutex_lock(&ctx->mutex);
+ rdma_lock_handler(ctx->cm_id);
+ ret = rdma_accept_ece(ctx->cm_id, NULL, &ece);
+ rdma_unlock_handler(ctx->cm_id);
+ mutex_unlock(&ctx->mutex);
+ }
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_reject cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (!cmd.reason)
+ cmd.reason = IB_CM_REJ_CONSUMER_DEFINED;
+
+ switch (cmd.reason) {
+ case IB_CM_REJ_CONSUMER_DEFINED:
+ case IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len,
+ cmd.reason);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_disconnect cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_disconnect(ctx->cm_id);
+ mutex_unlock(&ctx->mutex);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_init_qp_attr(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_init_qp_attr cmd;
+ struct ib_uverbs_qp_attr resp;
+ struct ucma_context *ctx;
+ struct ib_qp_attr qp_attr;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (cmd.qp_state > IB_QPS_ERR)
+ return -EINVAL;
+
+ ctx = ucma_get_ctx_dev(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.qp_attr_mask = 0;
+ memset(&qp_attr, 0, sizeof qp_attr);
+ qp_attr.qp_state = cmd.qp_state;
+ mutex_lock(&ctx->mutex);
+ ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+ mutex_unlock(&ctx->mutex);
+ if (ret)
+ goto out;
+
+ ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr);
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+out:
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static int ucma_set_option_id(struct ucma_context *ctx, int optname,
+ void *optval, size_t optlen)
+{
+ int ret = 0;
+
+ switch (optname) {
+ case RDMA_OPTION_ID_TOS:
+ if (optlen != sizeof(u8)) {
+ ret = -EINVAL;
+ break;
+ }
+ rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
+ break;
+ case RDMA_OPTION_ID_REUSEADDR:
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0);
+ break;
+ case RDMA_OPTION_ID_AFONLY:
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
+ break;
+ case RDMA_OPTION_ID_ACK_TIMEOUT:
+ if (optlen != sizeof(u8)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval));
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static int ucma_set_ib_path(struct ucma_context *ctx,
+ struct ib_path_rec_data *path_data, size_t optlen)
+{
+ struct sa_path_rec sa_path;
+ struct rdma_cm_event event;
+ int ret;
+
+ if (optlen % sizeof(*path_data))
+ return -EINVAL;
+
+ for (; optlen; optlen -= sizeof(*path_data), path_data++) {
+ if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
+ IB_PATH_BIDIRECTIONAL))
+ break;
+ }
+
+ if (!optlen)
+ return -EINVAL;
+
+ if (!ctx->cm_id->device)
+ return -EINVAL;
+
+ memset(&sa_path, 0, sizeof(sa_path));
+
+ sa_path.rec_type = SA_PATH_REC_TYPE_IB;
+ ib_sa_unpack_path(path_data->path_rec, &sa_path);
+
+ if (rdma_cap_opa_ah(ctx->cm_id->device, ctx->cm_id->port_num)) {
+ struct sa_path_rec opa;
+
+ sa_convert_path_ib_to_opa(&opa, &sa_path);
+ mutex_lock(&ctx->mutex);
+ ret = rdma_set_ib_path(ctx->cm_id, &opa);
+ mutex_unlock(&ctx->mutex);
+ } else {
+ mutex_lock(&ctx->mutex);
+ ret = rdma_set_ib_path(ctx->cm_id, &sa_path);
+ mutex_unlock(&ctx->mutex);
+ }
+ if (ret)
+ return ret;
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ return ucma_event_handler(ctx->cm_id, &event);
+}
+
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+ void *optval, size_t optlen)
+{
+ int ret;
+
+ switch (optname) {
+ case RDMA_OPTION_IB_PATH:
+ ret = ucma_set_ib_path(ctx, optval, optlen);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static int ucma_set_option_level(struct ucma_context *ctx, int level,
+ int optname, void *optval, size_t optlen)
+{
+ int ret;
+
+ switch (level) {
+ case RDMA_OPTION_ID:
+ mutex_lock(&ctx->mutex);
+ ret = ucma_set_option_id(ctx, optname, optval, optlen);
+ mutex_unlock(&ctx->mutex);
+ break;
+ case RDMA_OPTION_IB:
+ ret = ucma_set_option_ib(ctx, optname, optval, optlen);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_set_option cmd;
+ struct ucma_context *ctx;
+ void *optval;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (unlikely(cmd.optlen > KMALLOC_MAX_SIZE))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ optval = memdup_user(u64_to_user_ptr(cmd.optval),
+ cmd.optlen);
+ if (IS_ERR(optval)) {
+ ret = PTR_ERR(optval);
+ goto out;
+ }
+
+ ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
+ cmd.optlen);
+ kfree(optval);
+
+out:
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_notify cmd;
+ struct ucma_context *ctx;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&ctx->mutex);
+ if (ctx->cm_id->device)
+ ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event);
+ mutex_unlock(&ctx->mutex);
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_process_join(struct ucma_file *file,
+ struct rdma_ucm_join_mcast *cmd, int out_len)
+{
+ struct rdma_ucm_create_id_resp resp;
+ struct ucma_context *ctx;
+ struct ucma_multicast *mc;
+ struct sockaddr *addr;
+ int ret;
+ u8 join_state;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ addr = (struct sockaddr *) &cmd->addr;
+ if (cmd->addr_size != rdma_addr_size(addr))
+ return -EINVAL;
+
+ if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER)
+ join_state = BIT(FULLMEMBER_JOIN);
+ else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER)
+ join_state = BIT(SENDONLY_FULLMEMBER_JOIN);
+ else
+ return -EINVAL;
+
+ ctx = ucma_get_ctx_dev(file, cmd->id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+ if (!mc) {
+ ret = -ENOMEM;
+ goto err_put_ctx;
+ }
+
+ mc->ctx = ctx;
+ mc->join_state = join_state;
+ mc->uid = cmd->uid;
+ memcpy(&mc->addr, addr, cmd->addr_size);
+
+ xa_lock(&multicast_table);
+ if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b,
+ GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err_free_mc;
+ }
+
+ list_add_tail(&mc->list, &ctx->mc_list);
+ xa_unlock(&multicast_table);
+
+ mutex_lock(&ctx->mutex);
+ ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr,
+ join_state, mc);
+ mutex_unlock(&ctx->mutex);
+ if (ret)
+ goto err_xa_erase;
+
+ resp.id = mc->id;
+ if (copy_to_user(u64_to_user_ptr(cmd->response),
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err_leave_multicast;
+ }
+
+ xa_store(&multicast_table, mc->id, mc, 0);
+
+ ucma_put_ctx(ctx);
+ return 0;
+
+err_leave_multicast:
+ mutex_lock(&ctx->mutex);
+ rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
+ mutex_unlock(&ctx->mutex);
+ ucma_cleanup_mc_events(mc);
+err_xa_erase:
+ xa_lock(&multicast_table);
+ list_del(&mc->list);
+ __xa_erase(&multicast_table, mc->id);
+err_free_mc:
+ xa_unlock(&multicast_table);
+ kfree(mc);
+err_put_ctx:
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_join_ip_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_join_ip_mcast cmd;
+ struct rdma_ucm_join_mcast join_cmd;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ join_cmd.response = cmd.response;
+ join_cmd.uid = cmd.uid;
+ join_cmd.id = cmd.id;
+ join_cmd.addr_size = rdma_addr_size_in6(&cmd.addr);
+ if (!join_cmd.addr_size)
+ return -EINVAL;
+
+ join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER;
+ memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size);
+
+ return ucma_process_join(file, &join_cmd, out_len);
+}
+
+static ssize_t ucma_join_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_join_mcast cmd;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (!rdma_addr_size_kss(&cmd.addr))
+ return -EINVAL;
+
+ return ucma_process_join(file, &cmd, out_len);
+}
+
+static ssize_t ucma_leave_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_destroy_id cmd;
+ struct rdma_ucm_destroy_id_resp resp;
+ struct ucma_multicast *mc;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ xa_lock(&multicast_table);
+ mc = xa_load(&multicast_table, cmd.id);
+ if (!mc)
+ mc = ERR_PTR(-ENOENT);
+ else if (READ_ONCE(mc->ctx->file) != file)
+ mc = ERR_PTR(-EINVAL);
+ else if (!refcount_inc_not_zero(&mc->ctx->ref))
+ mc = ERR_PTR(-ENXIO);
+
+ if (IS_ERR(mc)) {
+ xa_unlock(&multicast_table);
+ ret = PTR_ERR(mc);
+ goto out;
+ }
+
+ list_del(&mc->list);
+ __xa_erase(&multicast_table, mc->id);
+ xa_unlock(&multicast_table);
+
+ mutex_lock(&mc->ctx->mutex);
+ rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
+ mutex_unlock(&mc->ctx->mutex);
+
+ ucma_cleanup_mc_events(mc);
+
+ ucma_put_ctx(mc->ctx);
+ resp.events_reported = mc->events_reported;
+ kfree(mc);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+out:
+ return ret;
+}
+
+static ssize_t ucma_migrate_id(struct ucma_file *new_file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_migrate_id cmd;
+ struct rdma_ucm_migrate_resp resp;
+ struct ucma_event *uevent, *tmp;
+ struct ucma_context *ctx;
+ LIST_HEAD(event_list);
+ struct fd f;
+ struct ucma_file *cur_file;
+ int ret = 0;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ /* Get current fd to protect against it being closed */
+ f = fdget(cmd.fd);
+ if (!f.file)
+ return -ENOENT;
+ if (f.file->f_op != &ucma_fops) {
+ ret = -EINVAL;
+ goto file_put;
+ }
+ cur_file = f.file->private_data;
+
+ /* Validate current fd and prevent destruction of id. */
+ ctx = ucma_get_ctx(cur_file, cmd.id);
+ if (IS_ERR(ctx)) {
+ ret = PTR_ERR(ctx);
+ goto file_put;
+ }
+
+ rdma_lock_handler(ctx->cm_id);
+ /*
+ * ctx->file can only be changed under the handler & xa_lock. xa_load()
+ * must be checked again to ensure the ctx hasn't begun destruction
+ * since the ucma_get_ctx().
+ */
+ xa_lock(&ctx_table);
+ if (_ucma_find_context(cmd.id, cur_file) != ctx) {
+ xa_unlock(&ctx_table);
+ ret = -ENOENT;
+ goto err_unlock;
+ }
+ ctx->file = new_file;
+ xa_unlock(&ctx_table);
+
+ mutex_lock(&cur_file->mut);
+ list_del(&ctx->list);
+ /*
+ * At this point lock_handler() prevents addition of new uevents for
+ * this ctx.
+ */
+ list_for_each_entry_safe(uevent, tmp, &cur_file->event_list, list)
+ if (uevent->ctx == ctx)
+ list_move_tail(&uevent->list, &event_list);
+ resp.events_reported = ctx->events_reported;
+ mutex_unlock(&cur_file->mut);
+
+ mutex_lock(&new_file->mut);
+ list_add_tail(&ctx->list, &new_file->ctx_list);
+ list_splice_tail(&event_list, &new_file->event_list);
+ mutex_unlock(&new_file->mut);
+
+ if (copy_to_user(u64_to_user_ptr(cmd.response),
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+err_unlock:
+ rdma_unlock_handler(ctx->cm_id);
+ ucma_put_ctx(ctx);
+file_put:
+ fdput(f);
+ return ret;
+}
+
+static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len) = {
+ [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id,
+ [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id,
+ [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip,
+ [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip,
+ [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route,
+ [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route,
+ [RDMA_USER_CM_CMD_CONNECT] = ucma_connect,
+ [RDMA_USER_CM_CMD_LISTEN] = ucma_listen,
+ [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept,
+ [RDMA_USER_CM_CMD_REJECT] = ucma_reject,
+ [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect,
+ [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr,
+ [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event,
+ [RDMA_USER_CM_CMD_GET_OPTION] = NULL,
+ [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option,
+ [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify,
+ [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast,
+ [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast,
+ [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id,
+ [RDMA_USER_CM_CMD_QUERY] = ucma_query,
+ [RDMA_USER_CM_CMD_BIND] = ucma_bind,
+ [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr,
+ [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast
+};
+
+static ssize_t ucma_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct ucma_file *file = filp->private_data;
+ struct rdma_ucm_cmd_hdr hdr;
+ ssize_t ret;
+
+ if (!ib_safe_file_access(filp)) {
+ pr_err_once("%s: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
+ __func__, task_tgid_vnr(current), current->comm);
+ return -EACCES;
+ }
+
+ if (len < sizeof(hdr))
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof(hdr)))
+ return -EFAULT;
+
+ if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table))
+ return -EINVAL;
+ hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucma_cmd_table));
+
+ if (hdr.in + sizeof(hdr) > len)
+ return -EINVAL;
+
+ if (!ucma_cmd_table[hdr.cmd])
+ return -ENOSYS;
+
+ ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out);
+ if (!ret)
+ ret = len;
+
+ return ret;
+}
+
+static __poll_t ucma_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct ucma_file *file = filp->private_data;
+ __poll_t mask = 0;
+
+ poll_wait(filp, &file->poll_wait, wait);
+
+ if (!list_empty(&file->event_list))
+ mask = EPOLLIN | EPOLLRDNORM;
+
+ return mask;
+}
+
+/*
+ * ucma_open() does not need the BKL:
+ *
+ * - no global state is referred to;
+ * - there is no ioctl method to race against;
+ * - no further module initialization is required for open to work
+ * after the device is registered.
+ */
+static int ucma_open(struct inode *inode, struct file *filp)
+{
+ struct ucma_file *file;
+
+ file = kmalloc(sizeof *file, GFP_KERNEL);
+ if (!file)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&file->event_list);
+ INIT_LIST_HEAD(&file->ctx_list);
+ init_waitqueue_head(&file->poll_wait);
+ mutex_init(&file->mut);
+
+ filp->private_data = file;
+ file->filp = filp;
+
+ return stream_open(inode, filp);
+}
+
+static int ucma_close(struct inode *inode, struct file *filp)
+{
+ struct ucma_file *file = filp->private_data;
+
+ /*
+ * All paths that touch ctx_list or ctx_list starting from write() are
+ * prevented by this being a FD release function. The list_add_tail() in
+ * ucma_connect_event_handler() can run concurrently, however it only
+ * adds to the list *after* a listening ID. By only reading the first of
+ * the list, and relying on ucma_destroy_private_ctx() to block
+ * ucma_connect_event_handler(), no additional locking is needed.
+ */
+ while (!list_empty(&file->ctx_list)) {
+ struct ucma_context *ctx = list_first_entry(
+ &file->ctx_list, struct ucma_context, list);
+
+ WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY,
+ GFP_KERNEL) != ctx);
+ ucma_destroy_private_ctx(ctx);
+ }
+ kfree(file);
+ return 0;
+}
+
+static const struct file_operations ucma_fops = {
+ .owner = THIS_MODULE,
+ .open = ucma_open,
+ .release = ucma_close,
+ .write = ucma_write,
+ .poll = ucma_poll,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice ucma_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "rdma_cm",
+ .nodename = "infiniband/rdma_cm",
+ .mode = 0666,
+ .fops = &ucma_fops,
+};
+
+static int ucma_get_global_nl_info(struct ib_client_nl_info *res)
+{
+ res->abi = RDMA_USER_CM_ABI_VERSION;
+ res->cdev = ucma_misc.this_device;
+ return 0;
+}
+
+static struct ib_client rdma_cma_client = {
+ .name = "rdma_cm",
+ .get_global_nl_info = ucma_get_global_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("rdma_cm");
+
+static ssize_t abi_version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", RDMA_USER_CM_ABI_VERSION);
+}
+static DEVICE_ATTR_RO(abi_version);
+
+static int __init ucma_init(void)
+{
+ int ret;
+
+ ret = misc_register(&ucma_misc);
+ if (ret)
+ return ret;
+
+ ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
+ if (ret) {
+ pr_err("rdma_ucm: couldn't create abi_version attr\n");
+ goto err1;
+ }
+
+ ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table);
+ if (!ucma_ctl_table_hdr) {
+ pr_err("rdma_ucm: couldn't register sysctl paths\n");
+ ret = -ENOMEM;
+ goto err2;
+ }
+
+ ret = ib_register_client(&rdma_cma_client);
+ if (ret)
+ goto err3;
+
+ return 0;
+err3:
+ unregister_net_sysctl_table(ucma_ctl_table_hdr);
+err2:
+ device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+err1:
+ misc_deregister(&ucma_misc);
+ return ret;
+}
+
+static void __exit ucma_cleanup(void)
+{
+ ib_unregister_client(&rdma_cma_client);
+ unregister_net_sysctl_table(ucma_ctl_table_hdr);
+ device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+ misc_deregister(&ucma_misc);
+}
+
+module_init(ucma_init);
+module_exit(ucma_cleanup);
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
new file mode 100644
index 000000000..64d9c492d
--- /dev/null
+++ b/drivers/infiniband/core/ud_header.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+
+#include <rdma/ib_pack.h>
+
+#define STRUCT_FIELD(header, field) \
+ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \
+ .struct_size_bytes = sizeof_field(struct ib_unpacked_ ## header, field), \
+ .field_name = #header ":" #field
+
+static const struct ib_field lrh_table[] = {
+ { STRUCT_FIELD(lrh, virtual_lane),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, link_version),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, service_level),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 4 },
+ { RESERVED,
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, link_next_header),
+ .offset_words = 0,
+ .offset_bits = 14,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, destination_lid),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 5 },
+ { STRUCT_FIELD(lrh, packet_length),
+ .offset_words = 1,
+ .offset_bits = 5,
+ .size_bits = 11 },
+ { STRUCT_FIELD(lrh, source_lid),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field eth_table[] = {
+ { STRUCT_FIELD(eth, dmac_h),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(eth, dmac_l),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(eth, smac_h),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(eth, smac_l),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(eth, type),
+ .offset_words = 3,
+ .offset_bits = 0,
+ .size_bits = 16 }
+};
+
+static const struct ib_field vlan_table[] = {
+ { STRUCT_FIELD(vlan, tag),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(vlan, type),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field ip4_table[] = {
+ { STRUCT_FIELD(ip4, ver),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(ip4, hdr_len),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { STRUCT_FIELD(ip4, tos),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, tot_len),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, id),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, frag_off),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, ttl),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, protocol),
+ .offset_words = 2,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { STRUCT_FIELD(ip4, check),
+ .offset_words = 2,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(ip4, saddr),
+ .offset_words = 3,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(ip4, daddr),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 32 }
+};
+
+static const struct ib_field udp_table[] = {
+ { STRUCT_FIELD(udp, sport),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, dport),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, length),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(udp, csum),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field grh_table[] = {
+ { STRUCT_FIELD(grh, ip_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(grh, traffic_class),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, flow_label),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 20 },
+ { STRUCT_FIELD(grh, payload_length),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(grh, next_header),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, hop_limit),
+ .offset_words = 1,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, source_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { STRUCT_FIELD(grh, destination_gid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 }
+};
+
+static const struct ib_field bth_table[] = {
+ { STRUCT_FIELD(bth, opcode),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, solicited_event),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, mig_req),
+ .offset_words = 0,
+ .offset_bits = 9,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, pad_count),
+ .offset_words = 0,
+ .offset_bits = 10,
+ .size_bits = 2 },
+ { STRUCT_FIELD(bth, transport_header_version),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { STRUCT_FIELD(bth, pkey),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, destination_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 },
+ { STRUCT_FIELD(bth, ack_req),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 2,
+ .offset_bits = 1,
+ .size_bits = 7 },
+ { STRUCT_FIELD(bth, psn),
+ .offset_words = 2,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+static const struct ib_field deth_table[] = {
+ { STRUCT_FIELD(deth, qkey),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(deth, source_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+__sum16 ib_ud_ip4_csum(struct ib_ud_header *header)
+{
+ struct iphdr iph;
+
+ iph.ihl = 5;
+ iph.version = 4;
+ iph.tos = header->ip4.tos;
+ iph.tot_len = header->ip4.tot_len;
+ iph.id = header->ip4.id;
+ iph.frag_off = header->ip4.frag_off;
+ iph.ttl = header->ip4.ttl;
+ iph.protocol = header->ip4.protocol;
+ iph.check = 0;
+ iph.saddr = header->ip4.saddr;
+ iph.daddr = header->ip4.daddr;
+
+ return ip_fast_csum((u8 *)&iph, iph.ihl);
+}
+EXPORT_SYMBOL(ib_ud_ip4_csum);
+
+/**
+ * ib_ud_header_init - Initialize UD header structure
+ * @payload_bytes:Length of packet payload
+ * @lrh_present: specify if LRH is present
+ * @eth_present: specify if Eth header is present
+ * @vlan_present: packet is tagged vlan
+ * @grh_present: GRH flag (if non-zero, GRH will be included)
+ * @ip_version: if non-zero, IP header, V4 or V6, will be included
+ * @udp_present :if non-zero, UDP header will be included
+ * @immediate_present: specify if immediate data is present
+ * @header:Structure to initialize
+ */
+int ib_ud_header_init(int payload_bytes,
+ int lrh_present,
+ int eth_present,
+ int vlan_present,
+ int grh_present,
+ int ip_version,
+ int udp_present,
+ int immediate_present,
+ struct ib_ud_header *header)
+{
+ size_t udp_bytes = udp_present ? IB_UDP_BYTES : 0;
+
+ grh_present = grh_present && !ip_version;
+ memset(header, 0, sizeof *header);
+
+ /*
+ * UDP header without IP header doesn't make sense
+ */
+ if (udp_present && ip_version != 4 && ip_version != 6)
+ return -EINVAL;
+
+ if (lrh_present) {
+ u16 packet_length;
+
+ header->lrh.link_version = 0;
+ header->lrh.link_next_header =
+ grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
+ packet_length = (IB_LRH_BYTES +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ (grh_present ? IB_GRH_BYTES : 0) +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) / 4; /* round up */
+ header->lrh.packet_length = cpu_to_be16(packet_length);
+ }
+
+ if (vlan_present)
+ header->eth.type = cpu_to_be16(ETH_P_8021Q);
+
+ if (ip_version == 6 || grh_present) {
+ header->grh.ip_version = 6;
+ header->grh.payload_length =
+ cpu_to_be16((udp_bytes +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) & ~3); /* round up */
+ header->grh.next_header = udp_present ? IPPROTO_UDP : 0x1b;
+ }
+
+ if (ip_version == 4) {
+ header->ip4.ver = 4; /* version 4 */
+ header->ip4.hdr_len = 5; /* 5 words */
+ header->ip4.tot_len =
+ cpu_to_be16(IB_IP4_BYTES +
+ udp_bytes +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4); /* ICRC */
+ header->ip4.protocol = IPPROTO_UDP;
+ }
+ if (udp_present && ip_version)
+ header->udp.length =
+ cpu_to_be16(IB_UDP_BYTES +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4); /* ICRC */
+
+ if (immediate_present)
+ header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ else
+ header->bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ header->bth.pad_count = (4 - payload_bytes) & 3;
+ header->bth.transport_header_version = 0;
+
+ header->lrh_present = lrh_present;
+ header->eth_present = eth_present;
+ header->vlan_present = vlan_present;
+ header->grh_present = grh_present || (ip_version == 6);
+ header->ipv4_present = ip_version == 4;
+ header->udp_present = udp_present;
+ header->immediate_present = immediate_present;
+ return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_init);
+
+/**
+ * ib_ud_header_pack - Pack UD header struct into wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() packs the UD header structure @header into wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_pack(struct ib_ud_header *header,
+ void *buf)
+{
+ int len = 0;
+
+ if (header->lrh_present) {
+ ib_pack(lrh_table, ARRAY_SIZE(lrh_table),
+ &header->lrh, buf + len);
+ len += IB_LRH_BYTES;
+ }
+ if (header->eth_present) {
+ ib_pack(eth_table, ARRAY_SIZE(eth_table),
+ &header->eth, buf + len);
+ len += IB_ETH_BYTES;
+ }
+ if (header->vlan_present) {
+ ib_pack(vlan_table, ARRAY_SIZE(vlan_table),
+ &header->vlan, buf + len);
+ len += IB_VLAN_BYTES;
+ }
+ if (header->grh_present) {
+ ib_pack(grh_table, ARRAY_SIZE(grh_table),
+ &header->grh, buf + len);
+ len += IB_GRH_BYTES;
+ }
+ if (header->ipv4_present) {
+ ib_pack(ip4_table, ARRAY_SIZE(ip4_table),
+ &header->ip4, buf + len);
+ len += IB_IP4_BYTES;
+ }
+ if (header->udp_present) {
+ ib_pack(udp_table, ARRAY_SIZE(udp_table),
+ &header->udp, buf + len);
+ len += IB_UDP_BYTES;
+ }
+
+ ib_pack(bth_table, ARRAY_SIZE(bth_table),
+ &header->bth, buf + len);
+ len += IB_BTH_BYTES;
+
+ ib_pack(deth_table, ARRAY_SIZE(deth_table),
+ &header->deth, buf + len);
+ len += IB_DETH_BYTES;
+
+ if (header->immediate_present) {
+ memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data);
+ len += sizeof header->immediate_data;
+ }
+
+ return len;
+}
+EXPORT_SYMBOL(ib_ud_header_pack);
+
+/**
+ * ib_ud_header_unpack - Unpack UD header struct from wire format
+ * @header:UD header struct
+ * @buf:Buffer to pack into
+ *
+ * ib_ud_header_pack() unpacks the UD header structure @header from wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_unpack(void *buf,
+ struct ib_ud_header *header)
+{
+ ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
+ buf, &header->lrh);
+ buf += IB_LRH_BYTES;
+
+ if (header->lrh.link_version != 0) {
+ pr_warn("Invalid LRH.link_version %u\n",
+ header->lrh.link_version);
+ return -EINVAL;
+ }
+
+ switch (header->lrh.link_next_header) {
+ case IB_LNH_IBA_LOCAL:
+ header->grh_present = 0;
+ break;
+
+ case IB_LNH_IBA_GLOBAL:
+ header->grh_present = 1;
+ ib_unpack(grh_table, ARRAY_SIZE(grh_table),
+ buf, &header->grh);
+ buf += IB_GRH_BYTES;
+
+ if (header->grh.ip_version != 6) {
+ pr_warn("Invalid GRH.ip_version %u\n",
+ header->grh.ip_version);
+ return -EINVAL;
+ }
+ if (header->grh.next_header != 0x1b) {
+ pr_warn("Invalid GRH.next_header 0x%02x\n",
+ header->grh.next_header);
+ return -EINVAL;
+ }
+ break;
+
+ default:
+ pr_warn("Invalid LRH.link_next_header %u\n",
+ header->lrh.link_next_header);
+ return -EINVAL;
+ }
+
+ ib_unpack(bth_table, ARRAY_SIZE(bth_table),
+ buf, &header->bth);
+ buf += IB_BTH_BYTES;
+
+ switch (header->bth.opcode) {
+ case IB_OPCODE_UD_SEND_ONLY:
+ header->immediate_present = 0;
+ break;
+ case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
+ header->immediate_present = 1;
+ break;
+ default:
+ pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode);
+ return -EINVAL;
+ }
+
+ if (header->bth.transport_header_version != 0) {
+ pr_warn("Invalid BTH.transport_header_version %u\n",
+ header->bth.transport_header_version);
+ return -EINVAL;
+ }
+
+ ib_unpack(deth_table, ARRAY_SIZE(deth_table),
+ buf, &header->deth);
+ buf += IB_DETH_BYTES;
+
+ if (header->immediate_present)
+ memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
new file mode 100644
index 000000000..8ce569bf7
--- /dev/null
+++ b/drivers/infiniband/core/umem.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/mm.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/count_zeros.h>
+#include <rdma/ib_umem_odp.h>
+
+#include "uverbs.h"
+
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+ bool make_dirty = umem->writable && dirty;
+ struct scatterlist *sg;
+ unsigned int i;
+
+ if (dirty)
+ ib_dma_unmap_sgtable_attrs(dev, &umem->sgt_append.sgt,
+ DMA_BIDIRECTIONAL, 0);
+
+ for_each_sgtable_sg(&umem->sgt_append.sgt, sg, i)
+ unpin_user_page_range_dirty_lock(sg_page(sg),
+ DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty);
+
+ sg_free_append_table(&umem->sgt_append);
+}
+
+/**
+ * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
+ *
+ * @umem: umem struct
+ * @pgsz_bitmap: bitmap of HW supported page sizes
+ * @virt: IOVA
+ *
+ * This helper is intended for HW that support multiple page
+ * sizes but can do only a single page size in an MR.
+ *
+ * Returns 0 if the umem requires page sizes not supported by
+ * the driver to be mapped. Drivers always supporting PAGE_SIZE
+ * or smaller will never see a 0 result.
+ */
+unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
+ unsigned long pgsz_bitmap,
+ unsigned long virt)
+{
+ struct scatterlist *sg;
+ unsigned long va, pgoff;
+ dma_addr_t mask;
+ int i;
+
+ umem->iova = va = virt;
+
+ if (umem->is_odp) {
+ unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift);
+
+ /* ODP must always be self consistent. */
+ if (!(pgsz_bitmap & page_size))
+ return 0;
+ return page_size;
+ }
+
+ /* The best result is the smallest page size that results in the minimum
+ * number of required pages. Compute the largest page size that could
+ * work based on VA address bits that don't change.
+ */
+ mask = pgsz_bitmap &
+ GENMASK(BITS_PER_LONG - 1,
+ bits_per((umem->length - 1 + virt) ^ virt));
+ /* offset into first SGL */
+ pgoff = umem->address & ~PAGE_MASK;
+
+ for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) {
+ /* Walk SGL and reduce max page size if VA/PA bits differ
+ * for any address.
+ */
+ mask |= (sg_dma_address(sg) + pgoff) ^ va;
+ va += sg_dma_len(sg) - pgoff;
+ /* Except for the last entry, the ending iova alignment sets
+ * the maximum possible page size as the low bits of the iova
+ * must be zero when starting the next chunk.
+ */
+ if (i != (umem->sgt_append.sgt.nents - 1))
+ mask |= va;
+ pgoff = 0;
+ }
+
+ /* The mask accumulates 1's in each position where the VA and physical
+ * address differ, thus the length of trailing 0 is the largest page
+ * size that can pass the VA through to the physical.
+ */
+ if (mask)
+ pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0);
+ return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0;
+}
+EXPORT_SYMBOL(ib_umem_find_best_pgsz);
+
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ *
+ * @device: IB device to connect UMEM
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ */
+struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
+ size_t size, int access)
+{
+ struct ib_umem *umem;
+ struct page **page_list;
+ unsigned long lock_limit;
+ unsigned long new_pinned;
+ unsigned long cur_base;
+ unsigned long dma_attr = 0;
+ struct mm_struct *mm;
+ unsigned long npages;
+ int pinned, ret;
+ unsigned int gup_flags = FOLL_WRITE;
+
+ /*
+ * If the combination of the addr and size requested for this memory
+ * region causes an integer overflow, return error.
+ */
+ if (((addr + size) < addr) ||
+ PAGE_ALIGN(addr + size) < (addr + size))
+ return ERR_PTR(-EINVAL);
+
+ if (!can_do_mlock())
+ return ERR_PTR(-EPERM);
+
+ if (access & IB_ACCESS_ON_DEMAND)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+ umem->ibdev = device;
+ umem->length = size;
+ umem->address = addr;
+ /*
+ * Drivers should call ib_umem_find_best_pgsz() to set the iova
+ * correctly.
+ */
+ umem->iova = addr;
+ umem->writable = ib_access_writable(access);
+ umem->owning_mm = mm = current->mm;
+ mmgrab(mm);
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list) {
+ ret = -ENOMEM;
+ goto umem_kfree;
+ }
+
+ npages = ib_umem_num_pages(umem);
+ if (npages == 0 || npages > UINT_MAX) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
+ if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
+ atomic64_sub(npages, &mm->pinned_vm);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cur_base = addr & PAGE_MASK;
+
+ if (!umem->writable)
+ gup_flags |= FOLL_FORCE;
+
+ while (npages) {
+ cond_resched();
+ pinned = pin_user_pages_fast(cur_base,
+ min_t(unsigned long, npages,
+ PAGE_SIZE /
+ sizeof(struct page *)),
+ gup_flags | FOLL_LONGTERM, page_list);
+ if (pinned < 0) {
+ ret = pinned;
+ goto umem_release;
+ }
+
+ cur_base += pinned * PAGE_SIZE;
+ npages -= pinned;
+ ret = sg_alloc_append_table_from_pages(
+ &umem->sgt_append, page_list, pinned, 0,
+ pinned << PAGE_SHIFT, ib_dma_max_seg_size(device),
+ npages, GFP_KERNEL);
+ if (ret) {
+ unpin_user_pages_dirty_lock(page_list, pinned, 0);
+ goto umem_release;
+ }
+ }
+
+ if (access & IB_ACCESS_RELAXED_ORDERING)
+ dma_attr |= DMA_ATTR_WEAK_ORDERING;
+
+ ret = ib_dma_map_sgtable_attrs(device, &umem->sgt_append.sgt,
+ DMA_BIDIRECTIONAL, dma_attr);
+ if (ret)
+ goto umem_release;
+ goto out;
+
+umem_release:
+ __ib_umem_release(device, umem, 0);
+ atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
+out:
+ free_page((unsigned long) page_list);
+umem_kfree:
+ if (ret) {
+ mmdrop(umem->owning_mm);
+ kfree(umem);
+ }
+ return ret ? ERR_PTR(ret) : umem;
+}
+EXPORT_SYMBOL(ib_umem_get);
+
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
+{
+ if (!umem)
+ return;
+ if (umem->is_dmabuf)
+ return ib_umem_dmabuf_release(to_ib_umem_dmabuf(umem));
+ if (umem->is_odp)
+ return ib_umem_odp_release(to_ib_umem_odp(umem));
+
+ __ib_umem_release(umem->ibdev, umem, 1);
+
+ atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
+ mmdrop(umem->owning_mm);
+ kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+/*
+ * Copy from the given ib_umem's pages to the given buffer.
+ *
+ * umem - the umem to copy from
+ * offset - offset to start copying from
+ * dst - destination buffer
+ * length - buffer length
+ *
+ * Returns 0 on success, or an error code.
+ */
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+ size_t length)
+{
+ size_t end = offset + length;
+ int ret;
+
+ if (offset > umem->length || length > umem->length - offset) {
+ pr_err("%s not in range. offset: %zd umem length: %zd end: %zd\n",
+ __func__, offset, umem->length, end);
+ return -EINVAL;
+ }
+
+ ret = sg_pcopy_to_buffer(umem->sgt_append.sgt.sgl,
+ umem->sgt_append.sgt.orig_nents, dst, length,
+ offset + ib_umem_offset(umem));
+
+ if (ret < 0)
+ return ret;
+ else if (ret != length)
+ return -EINVAL;
+ else
+ return 0;
+}
+EXPORT_SYMBOL(ib_umem_copy_from);
diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c
new file mode 100644
index 000000000..04c04e6d2
--- /dev/null
+++ b/drivers/infiniband/core/umem_dmabuf.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/dma-resv.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+
+#include "uverbs.h"
+
+MODULE_IMPORT_NS(DMA_BUF);
+
+int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
+{
+ struct sg_table *sgt;
+ struct scatterlist *sg;
+ unsigned long start, end, cur = 0;
+ unsigned int nmap = 0;
+ long ret;
+ int i;
+
+ dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+ if (umem_dmabuf->sgt)
+ goto wait_fence;
+
+ sgt = dma_buf_map_attachment(umem_dmabuf->attach, DMA_BIDIRECTIONAL);
+ if (IS_ERR(sgt))
+ return PTR_ERR(sgt);
+
+ /* modify the sg list in-place to match umem address and length */
+
+ start = ALIGN_DOWN(umem_dmabuf->umem.address, PAGE_SIZE);
+ end = ALIGN(umem_dmabuf->umem.address + umem_dmabuf->umem.length,
+ PAGE_SIZE);
+ for_each_sgtable_dma_sg(sgt, sg, i) {
+ if (start < cur + sg_dma_len(sg) && cur < end)
+ nmap++;
+ if (cur <= start && start < cur + sg_dma_len(sg)) {
+ unsigned long offset = start - cur;
+
+ umem_dmabuf->first_sg = sg;
+ umem_dmabuf->first_sg_offset = offset;
+ sg_dma_address(sg) += offset;
+ sg_dma_len(sg) -= offset;
+ cur += offset;
+ }
+ if (cur < end && end <= cur + sg_dma_len(sg)) {
+ unsigned long trim = cur + sg_dma_len(sg) - end;
+
+ umem_dmabuf->last_sg = sg;
+ umem_dmabuf->last_sg_trim = trim;
+ sg_dma_len(sg) -= trim;
+ break;
+ }
+ cur += sg_dma_len(sg);
+ }
+
+ umem_dmabuf->umem.sgt_append.sgt.sgl = umem_dmabuf->first_sg;
+ umem_dmabuf->umem.sgt_append.sgt.nents = nmap;
+ umem_dmabuf->sgt = sgt;
+
+wait_fence:
+ /*
+ * Although the sg list is valid now, the content of the pages
+ * may be not up-to-date. Wait for the exporter to finish
+ * the migration.
+ */
+ ret = dma_resv_wait_timeout(umem_dmabuf->attach->dmabuf->resv,
+ DMA_RESV_USAGE_KERNEL,
+ false, MAX_SCHEDULE_TIMEOUT);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return -ETIMEDOUT;
+ return 0;
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_map_pages);
+
+void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf)
+{
+ dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
+
+ if (!umem_dmabuf->sgt)
+ return;
+
+ /* retore the original sg list */
+ if (umem_dmabuf->first_sg) {
+ sg_dma_address(umem_dmabuf->first_sg) -=
+ umem_dmabuf->first_sg_offset;
+ sg_dma_len(umem_dmabuf->first_sg) +=
+ umem_dmabuf->first_sg_offset;
+ umem_dmabuf->first_sg = NULL;
+ umem_dmabuf->first_sg_offset = 0;
+ }
+ if (umem_dmabuf->last_sg) {
+ sg_dma_len(umem_dmabuf->last_sg) +=
+ umem_dmabuf->last_sg_trim;
+ umem_dmabuf->last_sg = NULL;
+ umem_dmabuf->last_sg_trim = 0;
+ }
+
+ dma_buf_unmap_attachment(umem_dmabuf->attach, umem_dmabuf->sgt,
+ DMA_BIDIRECTIONAL);
+
+ umem_dmabuf->sgt = NULL;
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages);
+
+struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device,
+ unsigned long offset, size_t size,
+ int fd, int access,
+ const struct dma_buf_attach_ops *ops)
+{
+ struct dma_buf *dmabuf;
+ struct ib_umem_dmabuf *umem_dmabuf;
+ struct ib_umem *umem;
+ unsigned long end;
+ struct ib_umem_dmabuf *ret = ERR_PTR(-EINVAL);
+
+ if (check_add_overflow(offset, (unsigned long)size, &end))
+ return ret;
+
+ if (unlikely(!ops || !ops->move_notify))
+ return ret;
+
+ dmabuf = dma_buf_get(fd);
+ if (IS_ERR(dmabuf))
+ return ERR_CAST(dmabuf);
+
+ if (dmabuf->size < end)
+ goto out_release_dmabuf;
+
+ umem_dmabuf = kzalloc(sizeof(*umem_dmabuf), GFP_KERNEL);
+ if (!umem_dmabuf) {
+ ret = ERR_PTR(-ENOMEM);
+ goto out_release_dmabuf;
+ }
+
+ umem = &umem_dmabuf->umem;
+ umem->ibdev = device;
+ umem->length = size;
+ umem->address = offset;
+ umem->writable = ib_access_writable(access);
+ umem->is_dmabuf = 1;
+
+ if (!ib_umem_num_pages(umem))
+ goto out_free_umem;
+
+ umem_dmabuf->attach = dma_buf_dynamic_attach(
+ dmabuf,
+ device->dma_device,
+ ops,
+ umem_dmabuf);
+ if (IS_ERR(umem_dmabuf->attach)) {
+ ret = ERR_CAST(umem_dmabuf->attach);
+ goto out_free_umem;
+ }
+ return umem_dmabuf;
+
+out_free_umem:
+ kfree(umem_dmabuf);
+
+out_release_dmabuf:
+ dma_buf_put(dmabuf);
+ return ret;
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_get);
+
+static void
+ib_umem_dmabuf_unsupported_move_notify(struct dma_buf_attachment *attach)
+{
+ struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
+
+ ibdev_warn_ratelimited(umem_dmabuf->umem.ibdev,
+ "Invalidate callback should not be called when memory is pinned\n");
+}
+
+static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = {
+ .allow_peer2peer = true,
+ .move_notify = ib_umem_dmabuf_unsupported_move_notify,
+};
+
+struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device,
+ unsigned long offset,
+ size_t size, int fd,
+ int access)
+{
+ struct ib_umem_dmabuf *umem_dmabuf;
+ int err;
+
+ umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access,
+ &ib_umem_dmabuf_attach_pinned_ops);
+ if (IS_ERR(umem_dmabuf))
+ return umem_dmabuf;
+
+ dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
+ err = dma_buf_pin(umem_dmabuf->attach);
+ if (err)
+ goto err_release;
+ umem_dmabuf->pinned = 1;
+
+ err = ib_umem_dmabuf_map_pages(umem_dmabuf);
+ if (err)
+ goto err_unpin;
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+
+ return umem_dmabuf;
+
+err_unpin:
+ dma_buf_unpin(umem_dmabuf->attach);
+err_release:
+ dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
+ ib_umem_release(&umem_dmabuf->umem);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned);
+
+void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf)
+{
+ struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf;
+
+ dma_resv_lock(dmabuf->resv, NULL);
+ ib_umem_dmabuf_unmap_pages(umem_dmabuf);
+ if (umem_dmabuf->pinned)
+ dma_buf_unpin(umem_dmabuf->attach);
+ dma_resv_unlock(dmabuf->resv);
+
+ dma_buf_detach(dmabuf, umem_dmabuf->attach);
+ dma_buf_put(dmabuf);
+ kfree(umem_dmabuf);
+}
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
new file mode 100644
index 000000000..e9fa22d31
--- /dev/null
+++ b/drivers/infiniband/core/umem_odp.c
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/task.h>
+#include <linux/pid.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/hugetlb.h>
+#include <linux/interval_tree.h>
+#include <linux/hmm.h>
+#include <linux/pagemap.h>
+
+#include <rdma/ib_umem_odp.h>
+
+#include "uverbs.h"
+
+static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ int ret;
+
+ umem_odp->umem.is_odp = 1;
+ mutex_init(&umem_odp->umem_mutex);
+
+ if (!umem_odp->is_implicit_odp) {
+ size_t page_size = 1UL << umem_odp->page_shift;
+ unsigned long start;
+ unsigned long end;
+ size_t ndmas, npfns;
+
+ start = ALIGN_DOWN(umem_odp->umem.address, page_size);
+ if (check_add_overflow(umem_odp->umem.address,
+ (unsigned long)umem_odp->umem.length,
+ &end))
+ return -EOVERFLOW;
+ end = ALIGN(end, page_size);
+ if (unlikely(end < page_size))
+ return -EOVERFLOW;
+
+ ndmas = (end - start) >> umem_odp->page_shift;
+ if (!ndmas)
+ return -EINVAL;
+
+ npfns = (end - start) >> PAGE_SHIFT;
+ umem_odp->pfn_list = kvcalloc(
+ npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
+ if (!umem_odp->pfn_list)
+ return -ENOMEM;
+
+ umem_odp->dma_list = kvcalloc(
+ ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
+ if (!umem_odp->dma_list) {
+ ret = -ENOMEM;
+ goto out_pfn_list;
+ }
+
+ ret = mmu_interval_notifier_insert(&umem_odp->notifier,
+ umem_odp->umem.owning_mm,
+ start, end - start, ops);
+ if (ret)
+ goto out_dma_list;
+ }
+
+ return 0;
+
+out_dma_list:
+ kvfree(umem_odp->dma_list);
+out_pfn_list:
+ kvfree(umem_odp->pfn_list);
+ return ret;
+}
+
+/**
+ * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
+ *
+ * Implicit ODP umems do not have a VA range and do not have any page lists.
+ * They exist only to hold the per_mm reference to help the driver create
+ * children umems.
+ *
+ * @device: IB device to create UMEM
+ * @access: ib_reg_mr access flags
+ */
+struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
+ int access)
+{
+ struct ib_umem *umem;
+ struct ib_umem_odp *umem_odp;
+ int ret;
+
+ if (access & IB_ACCESS_HUGETLB)
+ return ERR_PTR(-EINVAL);
+
+ umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
+ if (!umem_odp)
+ return ERR_PTR(-ENOMEM);
+ umem = &umem_odp->umem;
+ umem->ibdev = device;
+ umem->writable = ib_access_writable(access);
+ umem->owning_mm = current->mm;
+ umem_odp->is_implicit_odp = 1;
+ umem_odp->page_shift = PAGE_SHIFT;
+
+ umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ ret = ib_init_umem_odp(umem_odp, NULL);
+ if (ret) {
+ put_pid(umem_odp->tgid);
+ kfree(umem_odp);
+ return ERR_PTR(ret);
+ }
+ return umem_odp;
+}
+EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
+
+/**
+ * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
+ * parent ODP umem
+ *
+ * @root: The parent umem enclosing the child. This must be allocated using
+ * ib_alloc_implicit_odp_umem()
+ * @addr: The starting userspace VA
+ * @size: The length of the userspace VA
+ * @ops: MMU interval ops, currently only @invalidate
+ */
+struct ib_umem_odp *
+ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
+ size_t size,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ /*
+ * Caller must ensure that root cannot be freed during the call to
+ * ib_alloc_odp_umem.
+ */
+ struct ib_umem_odp *odp_data;
+ struct ib_umem *umem;
+ int ret;
+
+ if (WARN_ON(!root->is_implicit_odp))
+ return ERR_PTR(-EINVAL);
+
+ odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
+ if (!odp_data)
+ return ERR_PTR(-ENOMEM);
+ umem = &odp_data->umem;
+ umem->ibdev = root->umem.ibdev;
+ umem->length = size;
+ umem->address = addr;
+ umem->writable = root->umem.writable;
+ umem->owning_mm = root->umem.owning_mm;
+ odp_data->page_shift = PAGE_SHIFT;
+ odp_data->notifier.ops = ops;
+
+ /*
+ * A mmget must be held when registering a notifier, the owming_mm only
+ * has a mm_grab at this point.
+ */
+ if (!mmget_not_zero(umem->owning_mm)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ odp_data->tgid = get_pid(root->tgid);
+ ret = ib_init_umem_odp(odp_data, ops);
+ if (ret)
+ goto out_tgid;
+ mmput(umem->owning_mm);
+ return odp_data;
+
+out_tgid:
+ put_pid(odp_data->tgid);
+ mmput(umem->owning_mm);
+out_free:
+ kfree(odp_data);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_umem_odp_alloc_child);
+
+/**
+ * ib_umem_odp_get - Create a umem_odp for a userspace va
+ *
+ * @device: IB device struct to get UMEM
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @ops: MMU interval ops, currently only @invalidate
+ *
+ * The driver should use when the access flags indicate ODP memory. It avoids
+ * pinning, instead, stores the mm for future page fault handling in
+ * conjunction with MMU notifiers.
+ */
+struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
+ unsigned long addr, size_t size, int access,
+ const struct mmu_interval_notifier_ops *ops)
+{
+ struct ib_umem_odp *umem_odp;
+ int ret;
+
+ if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
+ return ERR_PTR(-EINVAL);
+
+ umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
+ if (!umem_odp)
+ return ERR_PTR(-ENOMEM);
+
+ umem_odp->umem.ibdev = device;
+ umem_odp->umem.length = size;
+ umem_odp->umem.address = addr;
+ umem_odp->umem.writable = ib_access_writable(access);
+ umem_odp->umem.owning_mm = current->mm;
+ umem_odp->notifier.ops = ops;
+
+ umem_odp->page_shift = PAGE_SHIFT;
+#ifdef CONFIG_HUGETLB_PAGE
+ if (access & IB_ACCESS_HUGETLB)
+ umem_odp->page_shift = HPAGE_SHIFT;
+#endif
+
+ umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ ret = ib_init_umem_odp(umem_odp, ops);
+ if (ret)
+ goto err_put_pid;
+ return umem_odp;
+
+err_put_pid:
+ put_pid(umem_odp->tgid);
+ kfree(umem_odp);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_umem_odp_get);
+
+void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
+{
+ /*
+ * Ensure that no more pages are mapped in the umem.
+ *
+ * It is the driver's responsibility to ensure, before calling us,
+ * that the hardware will not attempt to access the MR any more.
+ */
+ if (!umem_odp->is_implicit_odp) {
+ mutex_lock(&umem_odp->umem_mutex);
+ ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
+ ib_umem_end(umem_odp));
+ mutex_unlock(&umem_odp->umem_mutex);
+ mmu_interval_notifier_remove(&umem_odp->notifier);
+ kvfree(umem_odp->dma_list);
+ kvfree(umem_odp->pfn_list);
+ }
+ put_pid(umem_odp->tgid);
+ kfree(umem_odp);
+}
+EXPORT_SYMBOL(ib_umem_odp_release);
+
+/*
+ * Map for DMA and insert a single page into the on-demand paging page tables.
+ *
+ * @umem: the umem to insert the page to.
+ * @dma_index: index in the umem to add the dma to.
+ * @page: the page struct to map and add.
+ * @access_mask: access permissions needed for this page.
+ *
+ * The function returns -EFAULT if the DMA mapping operation fails.
+ *
+ */
+static int ib_umem_odp_map_dma_single_page(
+ struct ib_umem_odp *umem_odp,
+ unsigned int dma_index,
+ struct page *page,
+ u64 access_mask)
+{
+ struct ib_device *dev = umem_odp->umem.ibdev;
+ dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
+
+ if (*dma_addr) {
+ /*
+ * If the page is already dma mapped it means it went through
+ * a non-invalidating trasition, like read-only to writable.
+ * Resync the flags.
+ */
+ *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
+ return 0;
+ }
+
+ *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(dev, *dma_addr)) {
+ *dma_addr = 0;
+ return -EFAULT;
+ }
+ umem_odp->npages++;
+ *dma_addr |= access_mask;
+ return 0;
+}
+
+/**
+ * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
+ *
+ * Maps the range passed in the argument to DMA addresses.
+ * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
+ * Upon success the ODP MR will be locked to let caller complete its device
+ * page table update.
+ *
+ * Returns the number of pages mapped in success, negative error code
+ * for failure.
+ * @umem_odp: the umem to map and pin
+ * @user_virt: the address from which we need to map.
+ * @bcnt: the minimal number of bytes to pin and map. The mapping might be
+ * bigger due to alignment, and may also be smaller in case of an error
+ * pinning or mapping a page. The actual pages mapped is returned in
+ * the return value.
+ * @access_mask: bit mask of the requested access permissions for the given
+ * range.
+ * @fault: is faulting required for the given range
+ */
+int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
+ u64 bcnt, u64 access_mask, bool fault)
+ __acquires(&umem_odp->umem_mutex)
+{
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
+ int pfn_index, dma_index, ret = 0, start_idx;
+ unsigned int page_shift, hmm_order, pfn_start_idx;
+ unsigned long num_pfns, current_seq;
+ struct hmm_range range = {};
+ unsigned long timeout;
+
+ if (access_mask == 0)
+ return -EINVAL;
+
+ if (user_virt < ib_umem_start(umem_odp) ||
+ user_virt + bcnt > ib_umem_end(umem_odp))
+ return -EFAULT;
+
+ page_shift = umem_odp->page_shift;
+
+ /*
+ * owning_process is allowed to be NULL, this means somehow the mm is
+ * existing beyond the lifetime of the originating process.. Presumably
+ * mmget_not_zero will fail in this case.
+ */
+ owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
+ if (!owning_process || !mmget_not_zero(owning_mm)) {
+ ret = -EINVAL;
+ goto out_put_task;
+ }
+
+ range.notifier = &umem_odp->notifier;
+ range.start = ALIGN_DOWN(user_virt, 1UL << page_shift);
+ range.end = ALIGN(user_virt + bcnt, 1UL << page_shift);
+ pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
+ num_pfns = (range.end - range.start) >> PAGE_SHIFT;
+ if (fault) {
+ range.default_flags = HMM_PFN_REQ_FAULT;
+
+ if (access_mask & ODP_WRITE_ALLOWED_BIT)
+ range.default_flags |= HMM_PFN_REQ_WRITE;
+ }
+
+ range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
+ timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
+
+retry:
+ current_seq = range.notifier_seq =
+ mmu_interval_read_begin(&umem_odp->notifier);
+
+ mmap_read_lock(owning_mm);
+ ret = hmm_range_fault(&range);
+ mmap_read_unlock(owning_mm);
+ if (unlikely(ret)) {
+ if (ret == -EBUSY && !time_after(jiffies, timeout))
+ goto retry;
+ goto out_put_mm;
+ }
+
+ start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift;
+ dma_index = start_idx;
+
+ mutex_lock(&umem_odp->umem_mutex);
+ if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) {
+ mutex_unlock(&umem_odp->umem_mutex);
+ goto retry;
+ }
+
+ for (pfn_index = 0; pfn_index < num_pfns;
+ pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
+
+ if (fault) {
+ /*
+ * Since we asked for hmm_range_fault() to populate
+ * pages it shouldn't return an error entry on success.
+ */
+ WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+ WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+ } else {
+ if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
+ WARN_ON(umem_odp->dma_list[dma_index]);
+ continue;
+ }
+ access_mask = ODP_READ_ALLOWED_BIT;
+ if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
+ access_mask |= ODP_WRITE_ALLOWED_BIT;
+ }
+
+ hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
+ /* If a hugepage was detected and ODP wasn't set for, the umem
+ * page_shift will be used, the opposite case is an error.
+ */
+ if (hmm_order + PAGE_SHIFT < page_shift) {
+ ret = -EINVAL;
+ ibdev_dbg(umem_odp->umem.ibdev,
+ "%s: un-expected hmm_order %u, page_shift %u\n",
+ __func__, hmm_order, page_shift);
+ break;
+ }
+
+ ret = ib_umem_odp_map_dma_single_page(
+ umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
+ access_mask);
+ if (ret < 0) {
+ ibdev_dbg(umem_odp->umem.ibdev,
+ "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
+ break;
+ }
+ }
+ /* upon success lock should stay on hold for the callee */
+ if (!ret)
+ ret = dma_index - start_idx;
+ else
+ mutex_unlock(&umem_odp->umem_mutex);
+
+out_put_mm:
+ mmput_async(owning_mm);
+out_put_task:
+ if (owning_process)
+ put_task_struct(owning_process);
+ return ret;
+}
+EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
+
+void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
+ u64 bound)
+{
+ dma_addr_t dma_addr;
+ dma_addr_t dma;
+ int idx;
+ u64 addr;
+ struct ib_device *dev = umem_odp->umem.ibdev;
+
+ lockdep_assert_held(&umem_odp->umem_mutex);
+
+ virt = max_t(u64, virt, ib_umem_start(umem_odp));
+ bound = min_t(u64, bound, ib_umem_end(umem_odp));
+ for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
+ idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
+ dma = umem_odp->dma_list[idx];
+
+ /* The access flags guaranteed a valid DMA address in case was NULL */
+ if (dma) {
+ unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
+ struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
+
+ dma_addr = dma & ODP_DMA_ADDR_MASK;
+ ib_dma_unmap_page(dev, dma_addr,
+ BIT(umem_odp->page_shift),
+ DMA_BIDIRECTIONAL);
+ if (dma & ODP_WRITE_ALLOWED_BIT) {
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
+ }
+ umem_odp->dma_list[idx] = 0;
+ umem_odp->npages--;
+ }
+ }
+}
+EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
new file mode 100644
index 000000000..d96c78e43
--- /dev/null
+++ b/drivers/infiniband/core/user_mad.c
@@ -0,0 +1,1497 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008 Cisco. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "user_mad: " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+#include <linux/nospec.h>
+
+#include <linux/uaccess.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_mad.h>
+#include <rdma/rdma_netlink.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+ IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS,
+ IB_UMAD_MAX_AGENTS = 32,
+
+ IB_UMAD_MAJOR = 231,
+ IB_UMAD_MINOR_BASE = 0,
+ IB_UMAD_NUM_FIXED_MINOR = 64,
+ IB_UMAD_NUM_DYNAMIC_MINOR = IB_UMAD_MAX_PORTS - IB_UMAD_NUM_FIXED_MINOR,
+ IB_ISSM_MINOR_BASE = IB_UMAD_NUM_FIXED_MINOR,
+};
+
+/*
+ * Our lifetime rules for these structs are the following:
+ * device special file is opened, we take a reference on the
+ * ib_umad_port's struct ib_umad_device. We drop these
+ * references in the corresponding close().
+ *
+ * In addition to references coming from open character devices, there
+ * is one more reference to each ib_umad_device representing the
+ * module's reference taken when allocating the ib_umad_device in
+ * ib_umad_add_one().
+ *
+ * When destroying an ib_umad_device, we drop the module's reference.
+ */
+
+struct ib_umad_port {
+ struct cdev cdev;
+ struct device dev;
+ struct cdev sm_cdev;
+ struct device sm_dev;
+ struct semaphore sm_sem;
+
+ struct mutex file_mutex;
+ struct list_head file_list;
+
+ struct ib_device *ib_dev;
+ struct ib_umad_device *umad_dev;
+ int dev_num;
+ u32 port_num;
+};
+
+struct ib_umad_device {
+ struct kref kref;
+ struct ib_umad_port ports[];
+};
+
+struct ib_umad_file {
+ struct mutex mutex;
+ struct ib_umad_port *port;
+ struct list_head recv_list;
+ struct list_head send_list;
+ struct list_head port_list;
+ spinlock_t send_lock;
+ wait_queue_head_t recv_wait;
+ struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
+ int agents_dead;
+ u8 use_pkey_index;
+ u8 already_used;
+};
+
+struct ib_umad_packet {
+ struct ib_mad_send_buf *msg;
+ struct ib_mad_recv_wc *recv_wc;
+ struct list_head list;
+ int length;
+ struct ib_user_mad mad;
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/ib_umad.h>
+
+static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) +
+ IB_UMAD_NUM_FIXED_MINOR;
+static dev_t dynamic_umad_dev;
+static dev_t dynamic_issm_dev;
+
+static DEFINE_IDA(umad_ida);
+
+static int ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device, void *client_data);
+
+static void ib_umad_dev_free(struct kref *kref)
+{
+ struct ib_umad_device *dev =
+ container_of(kref, struct ib_umad_device, kref);
+
+ kfree(dev);
+}
+
+static void ib_umad_dev_get(struct ib_umad_device *dev)
+{
+ kref_get(&dev->kref);
+}
+
+static void ib_umad_dev_put(struct ib_umad_device *dev)
+{
+ kref_put(&dev->kref, ib_umad_dev_free);
+}
+
+static int hdr_size(struct ib_umad_file *file)
+{
+ return file->use_pkey_index ? sizeof(struct ib_user_mad_hdr) :
+ sizeof(struct ib_user_mad_hdr_old);
+}
+
+/* caller must hold file->mutex */
+static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
+{
+ return file->agents_dead ? NULL : file->agent[id];
+}
+
+static int queue_packet(struct ib_umad_file *file,
+ struct ib_mad_agent *agent,
+ struct ib_umad_packet *packet)
+{
+ int ret = 1;
+
+ mutex_lock(&file->mutex);
+
+ for (packet->mad.hdr.id = 0;
+ packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
+ packet->mad.hdr.id++)
+ if (agent == __get_agent(file, packet->mad.hdr.id)) {
+ list_add_tail(&packet->list, &file->recv_list);
+ wake_up_interruptible(&file->recv_wait);
+ ret = 0;
+ break;
+ }
+
+ mutex_unlock(&file->mutex);
+
+ return ret;
+}
+
+static void dequeue_send(struct ib_umad_file *file,
+ struct ib_umad_packet *packet)
+{
+ spin_lock_irq(&file->send_lock);
+ list_del(&packet->list);
+ spin_unlock_irq(&file->send_lock);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *send_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet = send_wc->send_buf->context[0];
+
+ dequeue_send(file, packet);
+ rdma_destroy_ah(packet->msg->ah, RDMA_DESTROY_AH_SLEEPABLE);
+ ib_free_send_mad(packet->msg);
+
+ if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+ packet->length = IB_MGMT_MAD_HDR;
+ packet->mad.hdr.status = ETIMEDOUT;
+ if (!queue_packet(file, agent, packet))
+ return;
+ }
+ kfree(packet);
+}
+
+static void recv_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet;
+
+ if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+ goto err1;
+
+ packet = kzalloc(sizeof *packet, GFP_KERNEL);
+ if (!packet)
+ goto err1;
+
+ packet->length = mad_recv_wc->mad_len;
+ packet->recv_wc = mad_recv_wc;
+
+ packet->mad.hdr.status = 0;
+ packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len;
+ packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp);
+ /*
+ * On OPA devices it is okay to lose the upper 16 bits of LID as this
+ * information is obtained elsewhere. Mask off the upper 16 bits.
+ */
+ if (rdma_cap_opa_mad(agent->device, agent->port_num))
+ packet->mad.hdr.lid = ib_lid_be16(0xFFFF &
+ mad_recv_wc->wc->slid);
+ else
+ packet->mad.hdr.lid = ib_lid_be16(mad_recv_wc->wc->slid);
+ packet->mad.hdr.sl = mad_recv_wc->wc->sl;
+ packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits;
+ packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index;
+ packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+ if (packet->mad.hdr.grh_present) {
+ struct rdma_ah_attr ah_attr;
+ const struct ib_global_route *grh;
+ int ret;
+
+ ret = ib_init_ah_attr_from_wc(agent->device, agent->port_num,
+ mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh,
+ &ah_attr);
+ if (ret)
+ goto err2;
+
+ grh = rdma_ah_read_grh(&ah_attr);
+ packet->mad.hdr.gid_index = grh->sgid_index;
+ packet->mad.hdr.hop_limit = grh->hop_limit;
+ packet->mad.hdr.traffic_class = grh->traffic_class;
+ memcpy(packet->mad.hdr.gid, &grh->dgid, 16);
+ packet->mad.hdr.flow_label = cpu_to_be32(grh->flow_label);
+ rdma_destroy_ah_attr(&ah_attr);
+ }
+
+ if (queue_packet(file, agent, packet))
+ goto err2;
+ return;
+
+err2:
+ kfree(packet);
+err1:
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
+ struct ib_umad_packet *packet, size_t count)
+{
+ struct ib_mad_recv_buf *recv_buf;
+ int left, seg_payload, offset, max_seg_payload;
+ size_t seg_size;
+
+ recv_buf = &packet->recv_wc->recv_buf;
+ seg_size = packet->recv_wc->mad_seg_size;
+
+ /* We need enough room to copy the first (or only) MAD segment. */
+ if ((packet->length <= seg_size &&
+ count < hdr_size(file) + packet->length) ||
+ (packet->length > seg_size &&
+ count < hdr_size(file) + seg_size))
+ return -EINVAL;
+
+ if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+ return -EFAULT;
+
+ buf += hdr_size(file);
+ seg_payload = min_t(int, packet->length, seg_size);
+ if (copy_to_user(buf, recv_buf->mad, seg_payload))
+ return -EFAULT;
+
+ if (seg_payload < packet->length) {
+ /*
+ * Multipacket RMPP MAD message. Copy remainder of message.
+ * Note that last segment may have a shorter payload.
+ */
+ if (count < hdr_size(file) + packet->length) {
+ /*
+ * The buffer is too small, return the first RMPP segment,
+ * which includes the RMPP message length.
+ */
+ return -ENOSPC;
+ }
+ offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
+ max_seg_payload = seg_size - offset;
+
+ for (left = packet->length - seg_payload, buf += seg_payload;
+ left; left -= seg_payload, buf += seg_payload) {
+ recv_buf = container_of(recv_buf->list.next,
+ struct ib_mad_recv_buf, list);
+ seg_payload = min(left, max_seg_payload);
+ if (copy_to_user(buf, ((void *) recv_buf->mad) + offset,
+ seg_payload))
+ return -EFAULT;
+ }
+ }
+
+ trace_ib_umad_read_recv(file, &packet->mad.hdr, &recv_buf->mad->mad_hdr);
+
+ return hdr_size(file) + packet->length;
+}
+
+static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,
+ struct ib_umad_packet *packet, size_t count)
+{
+ ssize_t size = hdr_size(file) + packet->length;
+
+ if (count < size)
+ return -EINVAL;
+
+ if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+ return -EFAULT;
+
+ buf += hdr_size(file);
+
+ if (copy_to_user(buf, packet->mad.data, packet->length))
+ return -EFAULT;
+
+ trace_ib_umad_read_send(file, &packet->mad.hdr,
+ (struct ib_mad_hdr *)&packet->mad.data);
+
+ return size;
+}
+
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ ssize_t ret;
+
+ if (count < hdr_size(file))
+ return -EINVAL;
+
+ mutex_lock(&file->mutex);
+
+ if (file->agents_dead) {
+ mutex_unlock(&file->mutex);
+ return -EIO;
+ }
+
+ while (list_empty(&file->recv_list)) {
+ mutex_unlock(&file->mutex);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->recv_wait,
+ !list_empty(&file->recv_list)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->mutex);
+ }
+
+ if (file->agents_dead) {
+ mutex_unlock(&file->mutex);
+ return -EIO;
+ }
+
+ packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+ list_del(&packet->list);
+
+ mutex_unlock(&file->mutex);
+
+ if (packet->recv_wc)
+ ret = copy_recv_mad(file, buf, packet, count);
+ else
+ ret = copy_send_mad(file, buf, packet, count);
+
+ if (ret < 0) {
+ /* Requeue packet */
+ mutex_lock(&file->mutex);
+ list_add(&packet->list, &file->recv_list);
+ mutex_unlock(&file->mutex);
+ } else {
+ if (packet->recv_wc)
+ ib_free_recv_mad(packet->recv_wc);
+ kfree(packet);
+ }
+ return ret;
+}
+
+static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf)
+{
+ int left, seg;
+
+ /* Copy class specific header */
+ if ((msg->hdr_len > IB_MGMT_RMPP_HDR) &&
+ copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR,
+ msg->hdr_len - IB_MGMT_RMPP_HDR))
+ return -EFAULT;
+
+ /* All headers are in place. Copy data segments. */
+ for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0;
+ seg++, left -= msg->seg_size, buf += msg->seg_size) {
+ if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf,
+ min(left, msg->seg_size)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static int same_destination(struct ib_user_mad_hdr *hdr1,
+ struct ib_user_mad_hdr *hdr2)
+{
+ if (!hdr1->grh_present && !hdr2->grh_present)
+ return (hdr1->lid == hdr2->lid);
+
+ if (hdr1->grh_present && hdr2->grh_present)
+ return !memcmp(hdr1->gid, hdr2->gid, 16);
+
+ return 0;
+}
+
+static int is_duplicate(struct ib_umad_file *file,
+ struct ib_umad_packet *packet)
+{
+ struct ib_umad_packet *sent_packet;
+ struct ib_mad_hdr *sent_hdr, *hdr;
+
+ hdr = (struct ib_mad_hdr *) packet->mad.data;
+ list_for_each_entry(sent_packet, &file->send_list, list) {
+ sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data;
+
+ if ((hdr->tid != sent_hdr->tid) ||
+ (hdr->mgmt_class != sent_hdr->mgmt_class))
+ continue;
+
+ /*
+ * No need to be overly clever here. If two new operations have
+ * the same TID, reject the second as a duplicate. This is more
+ * restrictive than required by the spec.
+ */
+ if (!ib_response_mad(hdr)) {
+ if (!ib_response_mad(sent_hdr))
+ return 1;
+ continue;
+ } else if (!ib_response_mad(sent_hdr))
+ continue;
+
+ if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr))
+ return 1;
+ }
+
+ return 0;
+}
+
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ struct ib_mad_agent *agent;
+ struct rdma_ah_attr ah_attr;
+ struct ib_ah *ah;
+ struct ib_rmpp_mad *rmpp_mad;
+ __be64 *tid;
+ int ret, data_len, hdr_len, copy_offset, rmpp_active;
+ u8 base_version;
+
+ if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
+ return -EINVAL;
+
+ packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL);
+ if (!packet)
+ return -ENOMEM;
+
+ if (copy_from_user(&packet->mad, buf, hdr_size(file))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ buf += hdr_size(file);
+
+ if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ mutex_lock(&file->mutex);
+
+ trace_ib_umad_write(file, &packet->mad.hdr,
+ (struct ib_mad_hdr *)&packet->mad.data);
+
+ agent = __get_agent(file, packet->mad.hdr.id);
+ if (!agent) {
+ ret = -EIO;
+ goto err_up;
+ }
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.type = rdma_ah_find_type(agent->device,
+ file->port->port_num);
+ rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid));
+ rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl);
+ rdma_ah_set_path_bits(&ah_attr, packet->mad.hdr.path_bits);
+ rdma_ah_set_port_num(&ah_attr, file->port->port_num);
+ if (packet->mad.hdr.grh_present) {
+ rdma_ah_set_grh(&ah_attr, NULL,
+ be32_to_cpu(packet->mad.hdr.flow_label),
+ packet->mad.hdr.gid_index,
+ packet->mad.hdr.hop_limit,
+ packet->mad.hdr.traffic_class);
+ rdma_ah_set_dgid_raw(&ah_attr, packet->mad.hdr.gid);
+ }
+
+ ah = rdma_create_user_ah(agent->qp->pd, &ah_attr, NULL);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto err_up;
+ }
+
+ rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
+ hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+
+ if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && ib_mad_kernel_rmpp_agent(agent)) {
+ copy_offset = IB_MGMT_RMPP_HDR;
+ rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE;
+ } else {
+ copy_offset = IB_MGMT_MAD_HDR;
+ rmpp_active = 0;
+ }
+
+ base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version;
+ data_len = count - hdr_size(file) - hdr_len;
+ packet->msg = ib_create_send_mad(agent,
+ be32_to_cpu(packet->mad.hdr.qpn),
+ packet->mad.hdr.pkey_index, rmpp_active,
+ hdr_len, data_len, GFP_KERNEL,
+ base_version);
+ if (IS_ERR(packet->msg)) {
+ ret = PTR_ERR(packet->msg);
+ goto err_ah;
+ }
+
+ packet->msg->ah = ah;
+ packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
+ packet->msg->retries = packet->mad.hdr.retries;
+ packet->msg->context[0] = packet;
+
+ /* Copy MAD header. Any RMPP header is already in place. */
+ memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
+
+ if (!rmpp_active) {
+ if (copy_from_user(packet->msg->mad + copy_offset,
+ buf + copy_offset,
+ hdr_len + data_len - copy_offset)) {
+ ret = -EFAULT;
+ goto err_msg;
+ }
+ } else {
+ ret = copy_rmpp_mad(packet->msg, buf);
+ if (ret)
+ goto err_msg;
+ }
+
+ /*
+ * Set the high-order part of the transaction ID to make MADs from
+ * different agents unique, and allow routing responses back to the
+ * original requestor.
+ */
+ if (!ib_response_mad(packet->msg->mad)) {
+ tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid;
+ *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
+ (be64_to_cpup(tid) & 0xffffffff));
+ rmpp_mad->mad_hdr.tid = *tid;
+ }
+
+ if (!ib_mad_kernel_rmpp_agent(agent)
+ && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ spin_lock_irq(&file->send_lock);
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ } else {
+ spin_lock_irq(&file->send_lock);
+ ret = is_duplicate(file, packet);
+ if (!ret)
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ if (ret) {
+ ret = -EINVAL;
+ goto err_msg;
+ }
+ }
+
+ ret = ib_post_send_mad(packet->msg, NULL);
+ if (ret)
+ goto err_send;
+
+ mutex_unlock(&file->mutex);
+ return count;
+
+err_send:
+ dequeue_send(file, packet);
+err_msg:
+ ib_free_send_mad(packet->msg);
+err_ah:
+ rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
+err_up:
+ mutex_unlock(&file->mutex);
+err:
+ kfree(packet);
+ return ret;
+}
+
+static __poll_t ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct ib_umad_file *file = filp->private_data;
+
+ /* we will always be able to post a MAD send */
+ __poll_t mask = EPOLLOUT | EPOLLWRNORM;
+
+ mutex_lock(&file->mutex);
+ poll_wait(filp, &file->recv_wait, wait);
+
+ if (!list_empty(&file->recv_list))
+ mask |= EPOLLIN | EPOLLRDNORM;
+ if (file->agents_dead)
+ mask = EPOLLERR;
+ mutex_unlock(&file->mutex);
+
+ return mask;
+}
+
+static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
+ int compat_method_mask)
+{
+ struct ib_user_mad_reg_req ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent = NULL;
+ int agent_id;
+ int ret;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (!file->port->ib_dev) {
+ dev_notice(&file->port->dev, "%s: invalid device\n", __func__);
+ ret = -EPIPE;
+ goto out;
+ }
+
+ if (copy_from_user(&ureq, arg, sizeof ureq)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ dev_notice(&file->port->dev,
+ "%s: invalid QPN %u specified\n", __func__,
+ ureq.qpn);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!__get_agent(file, agent_id))
+ goto found;
+
+ dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__,
+ IB_UMAD_MAX_AGENTS);
+
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ if (ureq.mgmt_class) {
+ memset(&req, 0, sizeof(req));
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ memcpy(req.oui, ureq.oui, sizeof req.oui);
+
+ if (compat_method_mask) {
+ u32 *umm = (u32 *) ureq.method_mask;
+ int i;
+
+ for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i)
+ req.method_mask[i] =
+ umm[i * 2] | ((u64) umm[i * 2 + 1] << 32);
+ } else
+ memcpy(req.method_mask, ureq.method_mask,
+ sizeof req.method_mask);
+ }
+
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ ureq.mgmt_class ? &req : NULL,
+ ureq.rmpp_version,
+ send_handler, recv_handler, file, 0);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ agent = NULL;
+ goto out;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!file->already_used) {
+ file->already_used = 1;
+ if (!file->use_pkey_index) {
+ dev_warn(&file->port->dev,
+ "process %s did not enable P_Key index support.\n",
+ current->comm);
+ dev_warn(&file->port->dev,
+ " Documentation/infiniband/user_mad.rst has info on the new ABI.\n");
+ }
+ }
+
+ file->agent[agent_id] = agent;
+ ret = 0;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (ret && agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
+{
+ struct ib_user_mad_reg_req2 ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent = NULL;
+ int agent_id;
+ int ret;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (!file->port->ib_dev) {
+ dev_notice(&file->port->dev, "%s: invalid device\n", __func__);
+ ret = -EPIPE;
+ goto out;
+ }
+
+ if (copy_from_user(&ureq, arg, sizeof(ureq))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ dev_notice(&file->port->dev, "%s: invalid QPN %u specified\n",
+ __func__, ureq.qpn);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) {
+ dev_notice(&file->port->dev,
+ "%s failed: invalid registration flags specified 0x%x; supported 0x%x\n",
+ __func__, ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
+ ret = -EINVAL;
+
+ if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP,
+ (u32 __user *) (arg + offsetof(struct
+ ib_user_mad_reg_req2, flags))))
+ ret = -EFAULT;
+
+ goto out;
+ }
+
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!__get_agent(file, agent_id))
+ goto found;
+
+ dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__,
+ IB_UMAD_MAX_AGENTS);
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ if (ureq.mgmt_class) {
+ memset(&req, 0, sizeof(req));
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ if (ureq.oui & 0xff000000) {
+ dev_notice(&file->port->dev,
+ "%s failed: oui invalid 0x%08x\n", __func__,
+ ureq.oui);
+ ret = -EINVAL;
+ goto out;
+ }
+ req.oui[2] = ureq.oui & 0x0000ff;
+ req.oui[1] = (ureq.oui & 0x00ff00) >> 8;
+ req.oui[0] = (ureq.oui & 0xff0000) >> 16;
+ memcpy(req.method_mask, ureq.method_mask,
+ sizeof(req.method_mask));
+ }
+
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ ureq.mgmt_class ? &req : NULL,
+ ureq.rmpp_version,
+ send_handler, recv_handler, file,
+ ureq.flags);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ agent = NULL;
+ goto out;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *)(arg +
+ offsetof(struct ib_user_mad_reg_req2, id)))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!file->already_used) {
+ file->already_used = 1;
+ file->use_pkey_index = 1;
+ }
+
+ file->agent[agent_id] = agent;
+ ret = 0;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (ret && agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+
+static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
+{
+ struct ib_mad_agent *agent = NULL;
+ u32 id;
+ int ret = 0;
+
+ if (get_user(id, arg))
+ return -EFAULT;
+ if (id >= IB_UMAD_MAX_AGENTS)
+ return -EINVAL;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ id = array_index_nospec(id, IB_UMAD_MAX_AGENTS);
+ if (!__get_agent(file, id)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ agent = file->agent[id];
+ file->agent[id] = NULL;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+static long ib_umad_enable_pkey(struct ib_umad_file *file)
+{
+ int ret = 0;
+
+ mutex_lock(&file->mutex);
+ if (file->already_used)
+ ret = -EINVAL;
+ else
+ file->use_pkey_index = 1;
+ mutex_unlock(&file->mutex);
+
+ return ret;
+}
+
+static long ib_umad_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case IB_USER_MAD_REGISTER_AGENT:
+ return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0);
+ case IB_USER_MAD_UNREGISTER_AGENT:
+ return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg);
+ case IB_USER_MAD_ENABLE_PKEY:
+ return ib_umad_enable_pkey(filp->private_data);
+ case IB_USER_MAD_REGISTER_AGENT2:
+ return ib_umad_reg_agent2(filp->private_data, (void __user *) arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case IB_USER_MAD_REGISTER_AGENT:
+ return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1);
+ case IB_USER_MAD_UNREGISTER_AGENT:
+ return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg));
+ case IB_USER_MAD_ENABLE_PKEY:
+ return ib_umad_enable_pkey(filp->private_data);
+ case IB_USER_MAD_REGISTER_AGENT2:
+ return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg));
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
+
+/*
+ * ib_umad_open() does not need the BKL:
+ *
+ * - the ib_umad_port structures are properly reference counted, and
+ * everything else is purely local to the file being created, so
+ * races against other open calls are not a problem;
+ * - the ioctl method does not affect any global state outside of the
+ * file structure being operated on;
+ */
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port;
+ struct ib_umad_file *file;
+ int ret = 0;
+
+ port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
+
+ mutex_lock(&port->file_mutex);
+
+ if (!port->ib_dev) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+ if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
+ if (!file) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_init(&file->mutex);
+ spin_lock_init(&file->send_lock);
+ INIT_LIST_HEAD(&file->recv_list);
+ INIT_LIST_HEAD(&file->send_list);
+ init_waitqueue_head(&file->recv_wait);
+
+ file->port = port;
+ filp->private_data = file;
+
+ list_add_tail(&file->port_list, &port->file_list);
+
+ stream_open(inode, filp);
+out:
+ mutex_unlock(&port->file_mutex);
+ return ret;
+}
+
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet, *tmp;
+ int already_dead;
+ int i;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ already_dead = file->agents_dead;
+ file->agents_dead = 1;
+
+ list_for_each_entry_safe(packet, tmp, &file->recv_list, list) {
+ if (packet->recv_wc)
+ ib_free_recv_mad(packet->recv_wc);
+ kfree(packet);
+ }
+
+ list_del(&file->port_list);
+
+ mutex_unlock(&file->mutex);
+
+ if (!already_dead)
+ for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+ if (file->agent[i])
+ ib_unregister_mad_agent(file->agent[i]);
+
+ mutex_unlock(&file->port->file_mutex);
+ mutex_destroy(&file->mutex);
+ kfree(file);
+ return 0;
+}
+
+static const struct file_operations umad_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_umad_read,
+ .write = ib_umad_write,
+ .poll = ib_umad_poll,
+ .unlocked_ioctl = ib_umad_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ib_umad_compat_ioctl,
+#endif
+ .open = ib_umad_open,
+ .release = ib_umad_close,
+ .llseek = no_llseek,
+};
+
+static int ib_umad_sm_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port;
+ struct ib_port_modify props = {
+ .set_port_cap_mask = IB_PORT_SM
+ };
+ int ret;
+
+ port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev);
+
+ if (filp->f_flags & O_NONBLOCK) {
+ if (down_trylock(&port->sm_sem)) {
+ ret = -EAGAIN;
+ goto fail;
+ }
+ } else {
+ if (down_interruptible(&port->sm_sem)) {
+ ret = -ERESTARTSYS;
+ goto fail;
+ }
+ }
+
+ if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) {
+ ret = -EPERM;
+ goto err_up_sem;
+ }
+
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ if (ret)
+ goto err_up_sem;
+
+ filp->private_data = port;
+
+ nonseekable_open(inode, filp);
+ return 0;
+
+err_up_sem:
+ up(&port->sm_sem);
+
+fail:
+ return ret;
+}
+
+static int ib_umad_sm_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port = filp->private_data;
+ struct ib_port_modify props = {
+ .clr_port_cap_mask = IB_PORT_SM
+ };
+ int ret = 0;
+
+ mutex_lock(&port->file_mutex);
+ if (port->ib_dev)
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ mutex_unlock(&port->file_mutex);
+
+ up(&port->sm_sem);
+
+ return ret;
+}
+
+static const struct file_operations umad_sm_fops = {
+ .owner = THIS_MODULE,
+ .open = ib_umad_sm_open,
+ .release = ib_umad_sm_close,
+ .llseek = no_llseek,
+};
+
+static struct ib_umad_port *get_port(struct ib_device *ibdev,
+ struct ib_umad_device *umad_dev,
+ u32 port)
+{
+ if (!umad_dev)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (!rdma_is_port_valid(ibdev, port))
+ return ERR_PTR(-EINVAL);
+ if (!rdma_cap_ib_mad(ibdev, port))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return &umad_dev->ports[port - rdma_start_port(ibdev)];
+}
+
+static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data,
+ struct ib_client_nl_info *res)
+{
+ struct ib_umad_port *port = get_port(ibdev, client_data, res->port);
+
+ if (IS_ERR(port))
+ return PTR_ERR(port);
+
+ res->abi = IB_USER_MAD_ABI_VERSION;
+ res->cdev = &port->dev;
+ return 0;
+}
+
+static struct ib_client umad_client = {
+ .name = "umad",
+ .add = ib_umad_add_one,
+ .remove = ib_umad_remove_one,
+ .get_nl_info = ib_umad_get_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("umad");
+
+static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data,
+ struct ib_client_nl_info *res)
+{
+ struct ib_umad_port *port = get_port(ibdev, client_data, res->port);
+
+ if (IS_ERR(port))
+ return PTR_ERR(port);
+
+ res->abi = IB_USER_MAD_ABI_VERSION;
+ res->cdev = &port->sm_dev;
+ return 0;
+}
+
+static struct ib_client issm_client = {
+ .name = "issm",
+ .get_nl_info = ib_issm_get_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("issm");
+
+static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_umad_port *port = dev_get_drvdata(dev);
+
+ if (!port)
+ return -ENODEV;
+
+ return sysfs_emit(buf, "%s\n", dev_name(&port->ib_dev->dev));
+}
+static DEVICE_ATTR_RO(ibdev);
+
+static ssize_t port_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_umad_port *port = dev_get_drvdata(dev);
+
+ if (!port)
+ return -ENODEV;
+
+ return sysfs_emit(buf, "%d\n", port->port_num);
+}
+static DEVICE_ATTR_RO(port);
+
+static struct attribute *umad_class_dev_attrs[] = {
+ &dev_attr_ibdev.attr,
+ &dev_attr_port.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(umad_class_dev);
+
+static char *umad_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static ssize_t abi_version_show(struct class *class,
+ struct class_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", IB_USER_MAD_ABI_VERSION);
+}
+static CLASS_ATTR_RO(abi_version);
+
+static struct attribute *umad_class_attrs[] = {
+ &class_attr_abi_version.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(umad_class);
+
+static struct class umad_class = {
+ .name = "infiniband_mad",
+ .devnode = umad_devnode,
+ .class_groups = umad_class_groups,
+ .dev_groups = umad_class_dev_groups,
+};
+
+static void ib_umad_release_port(struct device *device)
+{
+ struct ib_umad_port *port = dev_get_drvdata(device);
+ struct ib_umad_device *umad_dev = port->umad_dev;
+
+ ib_umad_dev_put(umad_dev);
+}
+
+static void ib_umad_init_port_dev(struct device *dev,
+ struct ib_umad_port *port,
+ const struct ib_device *device)
+{
+ device_initialize(dev);
+ ib_umad_dev_get(port->umad_dev);
+ dev->class = &umad_class;
+ dev->parent = device->dev.parent;
+ dev_set_drvdata(dev, port);
+ dev->release = ib_umad_release_port;
+}
+
+static int ib_umad_init_port(struct ib_device *device, int port_num,
+ struct ib_umad_device *umad_dev,
+ struct ib_umad_port *port)
+{
+ int devnum;
+ dev_t base_umad;
+ dev_t base_issm;
+ int ret;
+
+ devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL);
+ if (devnum < 0)
+ return -1;
+ port->dev_num = devnum;
+ if (devnum >= IB_UMAD_NUM_FIXED_MINOR) {
+ base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
+ base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
+ } else {
+ base_umad = devnum + base_umad_dev;
+ base_issm = devnum + base_issm_dev;
+ }
+
+ port->ib_dev = device;
+ port->umad_dev = umad_dev;
+ port->port_num = port_num;
+ sema_init(&port->sm_sem, 1);
+ mutex_init(&port->file_mutex);
+ INIT_LIST_HEAD(&port->file_list);
+
+ ib_umad_init_port_dev(&port->dev, port, device);
+ port->dev.devt = base_umad;
+ dev_set_name(&port->dev, "umad%d", port->dev_num);
+ cdev_init(&port->cdev, &umad_fops);
+ port->cdev.owner = THIS_MODULE;
+
+ ret = cdev_device_add(&port->cdev, &port->dev);
+ if (ret)
+ goto err_cdev;
+
+ ib_umad_init_port_dev(&port->sm_dev, port, device);
+ port->sm_dev.devt = base_issm;
+ dev_set_name(&port->sm_dev, "issm%d", port->dev_num);
+ cdev_init(&port->sm_cdev, &umad_sm_fops);
+ port->sm_cdev.owner = THIS_MODULE;
+
+ ret = cdev_device_add(&port->sm_cdev, &port->sm_dev);
+ if (ret)
+ goto err_dev;
+
+ return 0;
+
+err_dev:
+ put_device(&port->sm_dev);
+ cdev_device_del(&port->cdev, &port->dev);
+err_cdev:
+ put_device(&port->dev);
+ ida_free(&umad_ida, devnum);
+ return ret;
+}
+
+static void ib_umad_kill_port(struct ib_umad_port *port)
+{
+ struct ib_umad_file *file;
+ int id;
+
+ cdev_device_del(&port->sm_cdev, &port->sm_dev);
+ cdev_device_del(&port->cdev, &port->dev);
+
+ mutex_lock(&port->file_mutex);
+
+ /* Mark ib_dev NULL and block ioctl or other file ops to progress
+ * further.
+ */
+ port->ib_dev = NULL;
+
+ list_for_each_entry(file, &port->file_list, port_list) {
+ mutex_lock(&file->mutex);
+ file->agents_dead = 1;
+ wake_up_interruptible(&file->recv_wait);
+ mutex_unlock(&file->mutex);
+
+ for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
+ if (file->agent[id])
+ ib_unregister_mad_agent(file->agent[id]);
+ }
+
+ mutex_unlock(&port->file_mutex);
+
+ ida_free(&umad_ida, port->dev_num);
+
+ /* balances device_initialize() */
+ put_device(&port->sm_dev);
+ put_device(&port->dev);
+}
+
+static int ib_umad_add_one(struct ib_device *device)
+{
+ struct ib_umad_device *umad_dev;
+ int s, e, i;
+ int count = 0;
+ int ret;
+
+ s = rdma_start_port(device);
+ e = rdma_end_port(device);
+
+ umad_dev = kzalloc(struct_size(umad_dev, ports,
+ size_add(size_sub(e, s), 1)),
+ GFP_KERNEL);
+ if (!umad_dev)
+ return -ENOMEM;
+
+ kref_init(&umad_dev->kref);
+ for (i = s; i <= e; ++i) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
+ ret = ib_umad_init_port(device, i, umad_dev,
+ &umad_dev->ports[i - s]);
+ if (ret)
+ goto err;
+
+ count++;
+ }
+
+ if (!count) {
+ ret = -EOPNOTSUPP;
+ goto free;
+ }
+
+ ib_set_client_data(device, &umad_client, umad_dev);
+
+ return 0;
+
+err:
+ while (--i >= s) {
+ if (!rdma_cap_ib_mad(device, i))
+ continue;
+
+ ib_umad_kill_port(&umad_dev->ports[i - s]);
+ }
+free:
+ /* balances kref_init */
+ ib_umad_dev_put(umad_dev);
+ return ret;
+}
+
+static void ib_umad_remove_one(struct ib_device *device, void *client_data)
+{
+ struct ib_umad_device *umad_dev = client_data;
+ unsigned int i;
+
+ rdma_for_each_port (device, i) {
+ if (rdma_cap_ib_mad(device, i))
+ ib_umad_kill_port(
+ &umad_dev->ports[i - rdma_start_port(device)]);
+ }
+ /* balances kref_init() */
+ ib_umad_dev_put(umad_dev);
+}
+
+static int __init ib_umad_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(base_umad_dev,
+ IB_UMAD_NUM_FIXED_MINOR * 2,
+ umad_class.name);
+ if (ret) {
+ pr_err("couldn't register device number\n");
+ goto out;
+ }
+
+ ret = alloc_chrdev_region(&dynamic_umad_dev, 0,
+ IB_UMAD_NUM_DYNAMIC_MINOR * 2,
+ umad_class.name);
+ if (ret) {
+ pr_err("couldn't register dynamic device number\n");
+ goto out_alloc;
+ }
+ dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR;
+
+ ret = class_register(&umad_class);
+ if (ret) {
+ pr_err("couldn't create class infiniband_mad\n");
+ goto out_chrdev;
+ }
+
+ ret = ib_register_client(&umad_client);
+ if (ret)
+ goto out_class;
+
+ ret = ib_register_client(&issm_client);
+ if (ret)
+ goto out_client;
+
+ return 0;
+
+out_client:
+ ib_unregister_client(&umad_client);
+out_class:
+ class_unregister(&umad_class);
+
+out_chrdev:
+ unregister_chrdev_region(dynamic_umad_dev,
+ IB_UMAD_NUM_DYNAMIC_MINOR * 2);
+
+out_alloc:
+ unregister_chrdev_region(base_umad_dev,
+ IB_UMAD_NUM_FIXED_MINOR * 2);
+
+out:
+ return ret;
+}
+
+static void __exit ib_umad_cleanup(void)
+{
+ ib_unregister_client(&issm_client);
+ ib_unregister_client(&umad_client);
+ class_unregister(&umad_class);
+ unregister_chrdev_region(base_umad_dev,
+ IB_UMAD_NUM_FIXED_MINOR * 2);
+ unregister_chrdev_region(dynamic_umad_dev,
+ IB_UMAD_NUM_DYNAMIC_MINOR * 2);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
new file mode 100644
index 000000000..821d93c8f
--- /dev/null
+++ b/drivers/infiniband/core/uverbs.h
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef UVERBS_H
+#define UVERBS_H
+
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/cdev.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_std_types.h>
+
+#define UVERBS_MODULE_NAME ib_uverbs
+#include <rdma/uverbs_named_ioctl.h>
+
+static inline void
+ib_uverbs_init_udata(struct ib_udata *udata,
+ const void __user *ibuf,
+ void __user *obuf,
+ size_t ilen, size_t olen)
+{
+ udata->inbuf = ibuf;
+ udata->outbuf = obuf;
+ udata->inlen = ilen;
+ udata->outlen = olen;
+}
+
+static inline void
+ib_uverbs_init_udata_buf_or_null(struct ib_udata *udata,
+ const void __user *ibuf,
+ void __user *obuf,
+ size_t ilen, size_t olen)
+{
+ ib_uverbs_init_udata(udata,
+ ilen ? ibuf : NULL, olen ? obuf : NULL,
+ ilen, olen);
+}
+
+/*
+ * Our lifetime rules for these structs are the following:
+ *
+ * struct ib_uverbs_device: One reference is held by the module and
+ * released in ib_uverbs_remove_one(). Another reference is taken by
+ * ib_uverbs_open() each time the character special file is opened,
+ * and released in ib_uverbs_release_file() when the file is released.
+ *
+ * struct ib_uverbs_file: One reference is held by the VFS and
+ * released when the file is closed. Another reference is taken when
+ * an asynchronous event queue file is created and released when the
+ * event file is closed.
+ *
+ * struct ib_uverbs_event_queue: Base structure for
+ * struct ib_uverbs_async_event_file and struct ib_uverbs_completion_event_file.
+ * One reference is held by the VFS and released when the file is closed.
+ * For asynchronous event files, another reference is held by the corresponding
+ * main context file and released when that file is closed. For completion
+ * event files, a reference is taken when a CQ is created that uses the file,
+ * and released when the CQ is destroyed.
+ */
+
+struct ib_uverbs_device {
+ refcount_t refcount;
+ u32 num_comp_vectors;
+ struct completion comp;
+ struct device dev;
+ /* First group for device attributes, NULL terminated array */
+ const struct attribute_group *groups[2];
+ struct ib_device __rcu *ib_dev;
+ int devnum;
+ struct cdev cdev;
+ struct rb_root xrcd_tree;
+ struct mutex xrcd_tree_mutex;
+ struct srcu_struct disassociate_srcu;
+ struct mutex lists_mutex; /* protect lists */
+ struct list_head uverbs_file_list;
+ struct uverbs_api *uapi;
+};
+
+struct ib_uverbs_event_queue {
+ spinlock_t lock;
+ int is_closed;
+ wait_queue_head_t poll_wait;
+ struct fasync_struct *async_queue;
+ struct list_head event_list;
+};
+
+struct ib_uverbs_async_event_file {
+ struct ib_uobject uobj;
+ struct ib_uverbs_event_queue ev_queue;
+ struct ib_event_handler event_handler;
+};
+
+struct ib_uverbs_completion_event_file {
+ struct ib_uobject uobj;
+ struct ib_uverbs_event_queue ev_queue;
+};
+
+struct ib_uverbs_file {
+ struct kref ref;
+ struct ib_uverbs_device *device;
+ struct mutex ucontext_lock;
+ /*
+ * ucontext must be accessed via ib_uverbs_get_ucontext() or with
+ * ucontext_lock held
+ */
+ struct ib_ucontext *ucontext;
+ struct ib_uverbs_async_event_file *default_async_file;
+ struct list_head list;
+
+ /*
+ * To access the uobjects list hw_destroy_rwsem must be held for write
+ * OR hw_destroy_rwsem held for read AND uobjects_lock held.
+ * hw_destroy_rwsem should be called across any destruction of the HW
+ * object of an associated uobject.
+ */
+ struct rw_semaphore hw_destroy_rwsem;
+ spinlock_t uobjects_lock;
+ struct list_head uobjects;
+
+ struct mutex umap_lock;
+ struct list_head umaps;
+ struct page *disassociate_page;
+
+ struct xarray idr;
+};
+
+struct ib_uverbs_event {
+ union {
+ struct ib_uverbs_async_event_desc async;
+ struct ib_uverbs_comp_event_desc comp;
+ } desc;
+ struct list_head list;
+ struct list_head obj_list;
+ u32 *counter;
+};
+
+struct ib_uverbs_mcast_entry {
+ struct list_head list;
+ union ib_gid gid;
+ u16 lid;
+};
+
+struct ib_uevent_object {
+ struct ib_uobject uobject;
+ struct ib_uverbs_async_event_file *event_file;
+ /* List member for ib_uverbs_async_event_file list */
+ struct list_head event_list;
+ u32 events_reported;
+};
+
+struct ib_uxrcd_object {
+ struct ib_uobject uobject;
+ atomic_t refcnt;
+};
+
+struct ib_usrq_object {
+ struct ib_uevent_object uevent;
+ struct ib_uxrcd_object *uxrcd;
+};
+
+struct ib_uqp_object {
+ struct ib_uevent_object uevent;
+ /* lock for mcast list */
+ struct mutex mcast_lock;
+ struct list_head mcast_list;
+ struct ib_uxrcd_object *uxrcd;
+};
+
+struct ib_uwq_object {
+ struct ib_uevent_object uevent;
+};
+
+struct ib_ucq_object {
+ struct ib_uevent_object uevent;
+ struct list_head comp_list;
+ u32 comp_events_reported;
+};
+
+extern const struct file_operations uverbs_event_fops;
+extern const struct file_operations uverbs_async_event_fops;
+void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue);
+void ib_uverbs_init_async_event_file(struct ib_uverbs_async_event_file *ev_file);
+void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue);
+void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res);
+int uverbs_async_event_release(struct inode *inode, struct file *filp);
+
+int ib_alloc_ucontext(struct uverbs_attr_bundle *attrs);
+int ib_init_ucontext(struct uverbs_attr_bundle *attrs);
+
+void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file,
+ struct ib_ucq_object *uobj);
+void ib_uverbs_release_uevent(struct ib_uevent_object *uobj);
+void ib_uverbs_release_file(struct kref *ref);
+void ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file,
+ __u64 element, __u64 event,
+ struct list_head *obj_list, u32 *counter);
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
+int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs);
+
+int uverbs_dealloc_mw(struct ib_mw *mw);
+void ib_uverbs_detach_umcast(struct ib_qp *qp,
+ struct ib_uqp_object *uobj);
+
+long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+
+struct ib_uverbs_flow_spec {
+ union {
+ union {
+ struct ib_uverbs_flow_spec_hdr hdr;
+ struct {
+ __u32 type;
+ __u16 size;
+ __u16 reserved;
+ };
+ };
+ struct ib_uverbs_flow_spec_eth eth;
+ struct ib_uverbs_flow_spec_ipv4 ipv4;
+ struct ib_uverbs_flow_spec_esp esp;
+ struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+ struct ib_uverbs_flow_spec_ipv6 ipv6;
+ struct ib_uverbs_flow_spec_action_tag flow_tag;
+ struct ib_uverbs_flow_spec_action_drop drop;
+ struct ib_uverbs_flow_spec_action_handle action;
+ struct ib_uverbs_flow_spec_action_count flow_count;
+ };
+};
+
+int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
+ const void *kern_spec_mask,
+ const void *kern_spec_val,
+ size_t kern_filter_sz,
+ union ib_flow_spec *ib_spec);
+
+/*
+ * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the
+ * PortInfo CapabilityMask, but was extended with unique bits.
+ */
+static inline u32 make_port_cap_flags(const struct ib_port_attr *attr)
+{
+ u32 res;
+
+ /* All IBA CapabilityMask bits are passed through here, except bit 26,
+ * which is overridden with IP_BASED_GIDS. This is due to a historical
+ * mistake in the implementation of IP_BASED_GIDS. Otherwise all other
+ * bits match the IBA definition across all kernel versions.
+ */
+ res = attr->port_cap_flags & ~(u32)IB_UVERBS_PCF_IP_BASED_GIDS;
+
+ if (attr->ip_gids)
+ res |= IB_UVERBS_PCF_IP_BASED_GIDS;
+
+ return res;
+}
+
+static inline struct ib_uverbs_async_event_file *
+ib_uverbs_get_async_event(struct uverbs_attr_bundle *attrs,
+ u16 id)
+{
+ struct ib_uobject *async_ev_file_uobj;
+ struct ib_uverbs_async_event_file *async_ev_file;
+
+ async_ev_file_uobj = uverbs_attr_get_uobject(attrs, id);
+ if (IS_ERR(async_ev_file_uobj))
+ async_ev_file = READ_ONCE(attrs->ufile->default_async_file);
+ else
+ async_ev_file = container_of(async_ev_file_uobj,
+ struct ib_uverbs_async_event_file,
+ uobj);
+ if (async_ev_file)
+ uverbs_uobject_get(&async_ev_file->uobj);
+ return async_ev_file;
+}
+
+void copy_port_attr_to_resp(struct ib_port_attr *attr,
+ struct ib_uverbs_query_port_resp *resp,
+ struct ib_device *ib_dev, u8 port_num);
+#endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
new file mode 100644
index 000000000..e836c9c47
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -0,0 +1,4052 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include <linux/uaccess.h>
+
+#include <rdma/uverbs_types.h>
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+
+#include "uverbs.h"
+#include "core_priv.h"
+
+/*
+ * Copy a response to userspace. If the provided 'resp' is larger than the
+ * user buffer it is silently truncated. If the user provided a larger buffer
+ * then the trailing portion is zero filled.
+ *
+ * These semantics are intended to support future extension of the output
+ * structures.
+ */
+static int uverbs_response(struct uverbs_attr_bundle *attrs, const void *resp,
+ size_t resp_len)
+{
+ int ret;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT))
+ return uverbs_copy_to_struct_or_zero(
+ attrs, UVERBS_ATTR_CORE_OUT, resp, resp_len);
+
+ if (copy_to_user(attrs->ucore.outbuf, resp,
+ min(attrs->ucore.outlen, resp_len)))
+ return -EFAULT;
+
+ if (resp_len < attrs->ucore.outlen) {
+ /*
+ * Zero fill any extra memory that user
+ * space might have provided.
+ */
+ ret = clear_user(attrs->ucore.outbuf + resp_len,
+ attrs->ucore.outlen - resp_len);
+ if (ret)
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy a request from userspace. If the provided 'req' is larger than the
+ * user buffer then the user buffer is zero extended into the 'req'. If 'req'
+ * is smaller than the user buffer then the uncopied bytes in the user buffer
+ * must be zero.
+ */
+static int uverbs_request(struct uverbs_attr_bundle *attrs, void *req,
+ size_t req_len)
+{
+ if (copy_from_user(req, attrs->ucore.inbuf,
+ min(attrs->ucore.inlen, req_len)))
+ return -EFAULT;
+
+ if (attrs->ucore.inlen < req_len) {
+ memset(req + attrs->ucore.inlen, 0,
+ req_len - attrs->ucore.inlen);
+ } else if (attrs->ucore.inlen > req_len) {
+ if (!ib_is_buffer_cleared(attrs->ucore.inbuf + req_len,
+ attrs->ucore.inlen - req_len))
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+/*
+ * Generate the value for the 'response_length' protocol used by write_ex.
+ * This is the number of bytes the kernel actually wrote. Userspace can use
+ * this to detect what structure members in the response the kernel
+ * understood.
+ */
+static u32 uverbs_response_length(struct uverbs_attr_bundle *attrs,
+ size_t resp_len)
+{
+ return min_t(size_t, attrs->ucore.outlen, resp_len);
+}
+
+/*
+ * The iterator version of the request interface is for handlers that need to
+ * step over a flex array at the end of a command header.
+ */
+struct uverbs_req_iter {
+ const void __user *cur;
+ const void __user *end;
+};
+
+static int uverbs_request_start(struct uverbs_attr_bundle *attrs,
+ struct uverbs_req_iter *iter,
+ void *req,
+ size_t req_len)
+{
+ if (attrs->ucore.inlen < req_len)
+ return -ENOSPC;
+
+ if (copy_from_user(req, attrs->ucore.inbuf, req_len))
+ return -EFAULT;
+
+ iter->cur = attrs->ucore.inbuf + req_len;
+ iter->end = attrs->ucore.inbuf + attrs->ucore.inlen;
+ return 0;
+}
+
+static int uverbs_request_next(struct uverbs_req_iter *iter, void *val,
+ size_t len)
+{
+ if (iter->cur + len > iter->end)
+ return -ENOSPC;
+
+ if (copy_from_user(val, iter->cur, len))
+ return -EFAULT;
+
+ iter->cur += len;
+ return 0;
+}
+
+static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter,
+ size_t len)
+{
+ const void __user *res = iter->cur;
+
+ if (iter->cur + len > iter->end)
+ return (void __force __user *)ERR_PTR(-ENOSPC);
+ iter->cur += len;
+ return res;
+}
+
+static int uverbs_request_finish(struct uverbs_req_iter *iter)
+{
+ if (!ib_is_buffer_cleared(iter->cur, iter->end - iter->cur))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+/*
+ * When calling a destroy function during an error unwind we need to pass in
+ * the udata that is sanitized of all user arguments. Ie from the driver
+ * perspective it looks like no udata was passed.
+ */
+struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs)
+{
+ attrs->driver_udata = (struct ib_udata){};
+ return &attrs->driver_udata;
+}
+
+static struct ib_uverbs_completion_event_file *
+_ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL,
+ fd, attrs);
+
+ if (IS_ERR(uobj))
+ return (void *)uobj;
+
+ uverbs_uobject_get(uobj);
+ uobj_put_read(uobj);
+
+ return container_of(uobj, struct ib_uverbs_completion_event_file,
+ uobj);
+}
+#define ib_uverbs_lookup_comp_file(_fd, _ufile) \
+ _ib_uverbs_lookup_comp_file((_fd)*typecheck(s32, _fd), _ufile)
+
+int ib_alloc_ucontext(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_file *ufile = attrs->ufile;
+ struct ib_ucontext *ucontext;
+ struct ib_device *ib_dev;
+
+ ib_dev = srcu_dereference(ufile->device->ib_dev,
+ &ufile->device->disassociate_srcu);
+ if (!ib_dev)
+ return -EIO;
+
+ ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext);
+ if (!ucontext)
+ return -ENOMEM;
+
+ ucontext->device = ib_dev;
+ ucontext->ufile = ufile;
+ xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC);
+
+ rdma_restrack_new(&ucontext->res, RDMA_RESTRACK_CTX);
+ rdma_restrack_set_name(&ucontext->res, NULL);
+ attrs->context = ucontext;
+ return 0;
+}
+
+int ib_init_ucontext(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_ucontext *ucontext = attrs->context;
+ struct ib_uverbs_file *file = attrs->ufile;
+ int ret;
+
+ if (!down_read_trylock(&file->hw_destroy_rwsem))
+ return -EIO;
+ mutex_lock(&file->ucontext_lock);
+ if (file->ucontext) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = ib_rdmacg_try_charge(&ucontext->cg_obj, ucontext->device,
+ RDMACG_RESOURCE_HCA_HANDLE);
+ if (ret)
+ goto err;
+
+ ret = ucontext->device->ops.alloc_ucontext(ucontext,
+ &attrs->driver_udata);
+ if (ret)
+ goto err_uncharge;
+
+ rdma_restrack_add(&ucontext->res);
+
+ /*
+ * Make sure that ib_uverbs_get_ucontext() sees the pointer update
+ * only after all writes to setup the ucontext have completed
+ */
+ smp_store_release(&file->ucontext, ucontext);
+
+ mutex_unlock(&file->ucontext_lock);
+ up_read(&file->hw_destroy_rwsem);
+ return 0;
+
+err_uncharge:
+ ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device,
+ RDMACG_RESOURCE_HCA_HANDLE);
+err:
+ mutex_unlock(&file->ucontext_lock);
+ up_read(&file->hw_destroy_rwsem);
+ return ret;
+}
+
+static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_get_context_resp resp;
+ struct ib_uverbs_get_context cmd;
+ struct ib_device *ib_dev;
+ struct ib_uobject *uobj;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ ret = ib_alloc_ucontext(attrs);
+ if (ret)
+ return ret;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_ASYNC_EVENT, attrs, &ib_dev);
+ if (IS_ERR(uobj)) {
+ ret = PTR_ERR(uobj);
+ goto err_ucontext;
+ }
+
+ resp = (struct ib_uverbs_get_context_resp){
+ .num_comp_vectors = attrs->ufile->device->num_comp_vectors,
+ .async_fd = uobj->id,
+ };
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret)
+ goto err_uobj;
+
+ ret = ib_init_ucontext(attrs);
+ if (ret)
+ goto err_uobj;
+
+ ib_uverbs_init_async_event_file(
+ container_of(uobj, struct ib_uverbs_async_event_file, uobj));
+ rdma_alloc_commit_uobject(uobj, attrs);
+ return 0;
+
+err_uobj:
+ rdma_alloc_abort_uobject(uobj, attrs, false);
+err_ucontext:
+ rdma_restrack_put(&attrs->context->res);
+ kfree(attrs->context);
+ attrs->context = NULL;
+ return ret;
+}
+
+static void copy_query_dev_fields(struct ib_ucontext *ucontext,
+ struct ib_uverbs_query_device_resp *resp,
+ struct ib_device_attr *attr)
+{
+ struct ib_device *ib_dev = ucontext->device;
+
+ resp->fw_ver = attr->fw_ver;
+ resp->node_guid = ib_dev->node_guid;
+ resp->sys_image_guid = attr->sys_image_guid;
+ resp->max_mr_size = attr->max_mr_size;
+ resp->page_size_cap = attr->page_size_cap;
+ resp->vendor_id = attr->vendor_id;
+ resp->vendor_part_id = attr->vendor_part_id;
+ resp->hw_ver = attr->hw_ver;
+ resp->max_qp = attr->max_qp;
+ resp->max_qp_wr = attr->max_qp_wr;
+ resp->device_cap_flags = lower_32_bits(attr->device_cap_flags);
+ resp->max_sge = min(attr->max_send_sge, attr->max_recv_sge);
+ resp->max_sge_rd = attr->max_sge_rd;
+ resp->max_cq = attr->max_cq;
+ resp->max_cqe = attr->max_cqe;
+ resp->max_mr = attr->max_mr;
+ resp->max_pd = attr->max_pd;
+ resp->max_qp_rd_atom = attr->max_qp_rd_atom;
+ resp->max_ee_rd_atom = attr->max_ee_rd_atom;
+ resp->max_res_rd_atom = attr->max_res_rd_atom;
+ resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
+ resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
+ resp->atomic_cap = attr->atomic_cap;
+ resp->max_ee = attr->max_ee;
+ resp->max_rdd = attr->max_rdd;
+ resp->max_mw = attr->max_mw;
+ resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
+ resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
+ resp->max_mcast_grp = attr->max_mcast_grp;
+ resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
+ resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
+ resp->max_ah = attr->max_ah;
+ resp->max_srq = attr->max_srq;
+ resp->max_srq_wr = attr->max_srq_wr;
+ resp->max_srq_sge = attr->max_srq_sge;
+ resp->max_pkeys = attr->max_pkeys;
+ resp->local_ca_ack_delay = attr->local_ca_ack_delay;
+ resp->phys_port_cnt = min_t(u32, ib_dev->phys_port_cnt, U8_MAX);
+}
+
+static int ib_uverbs_query_device(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_query_device cmd;
+ struct ib_uverbs_query_device_resp resp;
+ struct ib_ucontext *ucontext;
+ int ret;
+
+ ucontext = ib_uverbs_get_ucontext(attrs);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+ copy_query_dev_fields(ucontext, &resp, &ucontext->device->attrs);
+
+ return uverbs_response(attrs, &resp, sizeof(resp));
+}
+
+static int ib_uverbs_query_port(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_query_port cmd;
+ struct ib_uverbs_query_port_resp resp;
+ struct ib_port_attr attr;
+ int ret;
+ struct ib_ucontext *ucontext;
+ struct ib_device *ib_dev;
+
+ ucontext = ib_uverbs_get_ucontext(attrs);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+ ib_dev = ucontext->device;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ ret = ib_query_port(ib_dev, cmd.port_num, &attr);
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+ copy_port_attr_to_resp(&attr, &resp, ib_dev, cmd.port_num);
+
+ return uverbs_response(attrs, &resp, sizeof(resp));
+}
+
+static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_alloc_pd_resp resp = {};
+ struct ib_uverbs_alloc_pd cmd;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ int ret;
+ struct ib_device *ib_dev;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_PD, attrs, &ib_dev);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ pd = rdma_zalloc_drv_obj(ib_dev, ib_pd);
+ if (!pd) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ pd->device = ib_dev;
+ pd->uobject = uobj;
+ atomic_set(&pd->usecnt, 0);
+
+ rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD);
+ rdma_restrack_set_name(&pd->res, NULL);
+
+ ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata);
+ if (ret)
+ goto err_alloc;
+ rdma_restrack_add(&pd->res);
+
+ uobj->object = pd;
+ uobj_finalize_uobj_create(uobj, attrs);
+
+ resp.pd_handle = uobj->id;
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_alloc:
+ rdma_restrack_put(&pd->res);
+ kfree(pd);
+err:
+ uobj_alloc_abort(uobj, attrs);
+ return ret;
+}
+
+static int ib_uverbs_dealloc_pd(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_dealloc_pd cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
+}
+
+struct xrcd_table_entry {
+ struct rb_node node;
+ struct ib_xrcd *xrcd;
+ struct inode *inode;
+};
+
+static int xrcd_table_insert(struct ib_uverbs_device *dev,
+ struct inode *inode,
+ struct ib_xrcd *xrcd)
+{
+ struct xrcd_table_entry *entry, *scan;
+ struct rb_node **p = &dev->xrcd_tree.rb_node;
+ struct rb_node *parent = NULL;
+
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->xrcd = xrcd;
+ entry->inode = inode;
+
+ while (*p) {
+ parent = *p;
+ scan = rb_entry(parent, struct xrcd_table_entry, node);
+
+ if (inode < scan->inode) {
+ p = &(*p)->rb_left;
+ } else if (inode > scan->inode) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(entry);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&entry->node, parent, p);
+ rb_insert_color(&entry->node, &dev->xrcd_tree);
+ igrab(inode);
+ return 0;
+}
+
+static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev,
+ struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+ struct rb_node *p = dev->xrcd_tree.rb_node;
+
+ while (p) {
+ entry = rb_entry(p, struct xrcd_table_entry, node);
+
+ if (inode < entry->inode)
+ p = p->rb_left;
+ else if (inode > entry->inode)
+ p = p->rb_right;
+ else
+ return entry;
+ }
+
+ return NULL;
+}
+
+static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+
+ entry = xrcd_table_search(dev, inode);
+ if (!entry)
+ return NULL;
+
+ return entry->xrcd;
+}
+
+static void xrcd_table_delete(struct ib_uverbs_device *dev,
+ struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+
+ entry = xrcd_table_search(dev, inode);
+ if (entry) {
+ iput(inode);
+ rb_erase(&entry->node, &dev->xrcd_tree);
+ kfree(entry);
+ }
+}
+
+static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_device *ibudev = attrs->ufile->device;
+ struct ib_uverbs_open_xrcd_resp resp = {};
+ struct ib_uverbs_open_xrcd cmd;
+ struct ib_uxrcd_object *obj;
+ struct ib_xrcd *xrcd = NULL;
+ struct inode *inode = NULL;
+ int new_xrcd = 0;
+ struct ib_device *ib_dev;
+ struct fd f = {};
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ mutex_lock(&ibudev->xrcd_tree_mutex);
+
+ if (cmd.fd != -1) {
+ /* search for file descriptor */
+ f = fdget(cmd.fd);
+ if (!f.file) {
+ ret = -EBADF;
+ goto err_tree_mutex_unlock;
+ }
+
+ inode = file_inode(f.file);
+ xrcd = find_xrcd(ibudev, inode);
+ if (!xrcd && !(cmd.oflags & O_CREAT)) {
+ /* no file descriptor. Need CREATE flag */
+ ret = -EAGAIN;
+ goto err_tree_mutex_unlock;
+ }
+
+ if (xrcd && cmd.oflags & O_EXCL) {
+ ret = -EINVAL;
+ goto err_tree_mutex_unlock;
+ }
+ }
+
+ obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, attrs,
+ &ib_dev);
+ if (IS_ERR(obj)) {
+ ret = PTR_ERR(obj);
+ goto err_tree_mutex_unlock;
+ }
+
+ if (!xrcd) {
+ xrcd = ib_alloc_xrcd_user(ib_dev, inode, &attrs->driver_udata);
+ if (IS_ERR(xrcd)) {
+ ret = PTR_ERR(xrcd);
+ goto err;
+ }
+ new_xrcd = 1;
+ }
+
+ atomic_set(&obj->refcnt, 0);
+ obj->uobject.object = xrcd;
+
+ if (inode) {
+ if (new_xrcd) {
+ /* create new inode/xrcd table entry */
+ ret = xrcd_table_insert(ibudev, inode, xrcd);
+ if (ret)
+ goto err_dealloc_xrcd;
+ }
+ atomic_inc(&xrcd->usecnt);
+ }
+
+ if (f.file)
+ fdput(f);
+
+ mutex_unlock(&ibudev->xrcd_tree_mutex);
+ uobj_finalize_uobj_create(&obj->uobject, attrs);
+
+ resp.xrcd_handle = obj->uobject.id;
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_dealloc_xrcd:
+ ib_dealloc_xrcd_user(xrcd, uverbs_get_cleared_udata(attrs));
+
+err:
+ uobj_alloc_abort(&obj->uobject, attrs);
+
+err_tree_mutex_unlock:
+ if (f.file)
+ fdput(f);
+
+ mutex_unlock(&ibudev->xrcd_tree_mutex);
+
+ return ret;
+}
+
+static int ib_uverbs_close_xrcd(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_close_xrcd cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, attrs);
+}
+
+int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct inode *inode;
+ int ret;
+ struct ib_uverbs_device *dev = attrs->ufile->device;
+
+ inode = xrcd->inode;
+ if (inode && !atomic_dec_and_test(&xrcd->usecnt))
+ return 0;
+
+ ret = ib_dealloc_xrcd_user(xrcd, &attrs->driver_udata);
+ if (ret) {
+ atomic_inc(&xrcd->usecnt);
+ return ret;
+ }
+
+ if (inode)
+ xrcd_table_delete(dev, inode);
+
+ return 0;
+}
+
+static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_reg_mr_resp resp = {};
+ struct ib_uverbs_reg_mr cmd;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ int ret;
+ struct ib_device *ib_dev;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+ return -EINVAL;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ ret = ib_check_mr_access(ib_dev, cmd.access_flags);
+ if (ret)
+ goto err_free;
+
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+ cmd.access_flags,
+ &attrs->driver_udata);
+ if (IS_ERR(mr)) {
+ ret = PTR_ERR(mr);
+ goto err_put;
+ }
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->type = IB_MR_TYPE_USER;
+ mr->dm = NULL;
+ mr->sig_attrs = NULL;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+ mr->iova = cmd.hca_va;
+ mr->length = cmd.length;
+
+ rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_set_name(&mr->res, NULL);
+ rdma_restrack_add(&mr->res);
+
+ uobj->object = mr;
+ uobj_put_obj_read(pd);
+ uobj_finalize_uobj_create(uobj, attrs);
+
+ resp.lkey = mr->lkey;
+ resp.rkey = mr->rkey;
+ resp.mr_handle = uobj->id;
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_put:
+ uobj_put_obj_read(pd);
+err_free:
+ uobj_alloc_abort(uobj, attrs);
+ return ret;
+}
+
+static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_rereg_mr cmd;
+ struct ib_uverbs_rereg_mr_resp resp;
+ struct ib_mr *mr;
+ int ret;
+ struct ib_uobject *uobj;
+ struct ib_uobject *new_uobj;
+ struct ib_device *ib_dev;
+ struct ib_pd *orig_pd;
+ struct ib_pd *new_pd;
+ struct ib_mr *new_mr;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (!cmd.flags)
+ return -EINVAL;
+
+ if (cmd.flags & ~IB_MR_REREG_SUPPORTED)
+ return -EOPNOTSUPP;
+
+ if ((cmd.flags & IB_MR_REREG_TRANS) &&
+ (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+ return -EINVAL;
+
+ uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, attrs);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ mr = uobj->object;
+
+ if (mr->dm) {
+ ret = -EINVAL;
+ goto put_uobjs;
+ }
+
+ if (cmd.flags & IB_MR_REREG_ACCESS) {
+ ret = ib_check_mr_access(mr->device, cmd.access_flags);
+ if (ret)
+ goto put_uobjs;
+ }
+
+ orig_pd = mr->pd;
+ if (cmd.flags & IB_MR_REREG_PD) {
+ new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle,
+ attrs);
+ if (!new_pd) {
+ ret = -EINVAL;
+ goto put_uobjs;
+ }
+ } else {
+ new_pd = mr->pd;
+ }
+
+ /*
+ * The driver might create a new HW object as part of the rereg, we need
+ * to have a uobject ready to hold it.
+ */
+ new_uobj = uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev);
+ if (IS_ERR(new_uobj)) {
+ ret = PTR_ERR(new_uobj);
+ goto put_uobj_pd;
+ }
+
+ new_mr = ib_dev->ops.rereg_user_mr(mr, cmd.flags, cmd.start, cmd.length,
+ cmd.hca_va, cmd.access_flags, new_pd,
+ &attrs->driver_udata);
+ if (IS_ERR(new_mr)) {
+ ret = PTR_ERR(new_mr);
+ goto put_new_uobj;
+ }
+ if (new_mr) {
+ new_mr->device = new_pd->device;
+ new_mr->pd = new_pd;
+ new_mr->type = IB_MR_TYPE_USER;
+ new_mr->uobject = uobj;
+ atomic_inc(&new_pd->usecnt);
+ new_uobj->object = new_mr;
+
+ rdma_restrack_new(&new_mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_set_name(&new_mr->res, NULL);
+ rdma_restrack_add(&new_mr->res);
+
+ /*
+ * The new uobj for the new HW object is put into the same spot
+ * in the IDR and the old uobj & HW object is deleted.
+ */
+ rdma_assign_uobject(uobj, new_uobj, attrs);
+ rdma_alloc_commit_uobject(new_uobj, attrs);
+ uobj_put_destroy(uobj);
+ new_uobj = NULL;
+ uobj = NULL;
+ mr = new_mr;
+ } else {
+ if (cmd.flags & IB_MR_REREG_PD) {
+ atomic_dec(&orig_pd->usecnt);
+ mr->pd = new_pd;
+ atomic_inc(&new_pd->usecnt);
+ }
+ if (cmd.flags & IB_MR_REREG_TRANS) {
+ mr->iova = cmd.hca_va;
+ mr->length = cmd.length;
+ }
+ }
+
+ memset(&resp, 0, sizeof(resp));
+ resp.lkey = mr->lkey;
+ resp.rkey = mr->rkey;
+
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+
+put_new_uobj:
+ if (new_uobj)
+ uobj_alloc_abort(new_uobj, attrs);
+put_uobj_pd:
+ if (cmd.flags & IB_MR_REREG_PD)
+ uobj_put_obj_read(new_pd);
+
+put_uobjs:
+ if (uobj)
+ uobj_put_write(uobj);
+
+ return ret;
+}
+
+static int ib_uverbs_dereg_mr(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_dereg_mr cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_MR, cmd.mr_handle, attrs);
+}
+
+static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_alloc_mw cmd;
+ struct ib_uverbs_alloc_mw_resp resp = {};
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_mw *mw;
+ int ret;
+ struct ib_device *ib_dev;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_MW, attrs, &ib_dev);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ if (cmd.mw_type != IB_MW_TYPE_1 && cmd.mw_type != IB_MW_TYPE_2) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ mw = rdma_zalloc_drv_obj(ib_dev, ib_mw);
+ if (!mw) {
+ ret = -ENOMEM;
+ goto err_put;
+ }
+
+ mw->device = ib_dev;
+ mw->pd = pd;
+ mw->uobject = uobj;
+ mw->type = cmd.mw_type;
+
+ ret = pd->device->ops.alloc_mw(mw, &attrs->driver_udata);
+ if (ret)
+ goto err_alloc;
+
+ atomic_inc(&pd->usecnt);
+
+ uobj->object = mw;
+ uobj_put_obj_read(pd);
+ uobj_finalize_uobj_create(uobj, attrs);
+
+ resp.rkey = mw->rkey;
+ resp.mw_handle = uobj->id;
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_alloc:
+ kfree(mw);
+err_put:
+ uobj_put_obj_read(pd);
+err_free:
+ uobj_alloc_abort(uobj, attrs);
+ return ret;
+}
+
+static int ib_uverbs_dealloc_mw(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_dealloc_mw cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_MW, cmd.mw_handle, attrs);
+}
+
+static int ib_uverbs_create_comp_channel(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_create_comp_channel cmd;
+ struct ib_uverbs_create_comp_channel_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_uverbs_completion_event_file *ev_file;
+ struct ib_device *ib_dev;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, attrs, &ib_dev);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ ev_file = container_of(uobj, struct ib_uverbs_completion_event_file,
+ uobj);
+ ib_uverbs_init_event_queue(&ev_file->ev_queue);
+ uobj_finalize_uobj_create(uobj, attrs);
+
+ resp.fd = uobj->id;
+ return uverbs_response(attrs, &resp, sizeof(resp));
+}
+
+static int create_cq(struct uverbs_attr_bundle *attrs,
+ struct ib_uverbs_ex_create_cq *cmd)
+{
+ struct ib_ucq_object *obj;
+ struct ib_uverbs_completion_event_file *ev_file = NULL;
+ struct ib_cq *cq;
+ int ret;
+ struct ib_uverbs_ex_create_cq_resp resp = {};
+ struct ib_cq_init_attr attr = {};
+ struct ib_device *ib_dev;
+
+ if (cmd->comp_vector >= attrs->ufile->device->num_comp_vectors)
+ return -EINVAL;
+
+ obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, attrs,
+ &ib_dev);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ if (cmd->comp_channel >= 0) {
+ ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, attrs);
+ if (IS_ERR(ev_file)) {
+ ret = PTR_ERR(ev_file);
+ goto err;
+ }
+ }
+
+ obj->uevent.uobject.user_handle = cmd->user_handle;
+ INIT_LIST_HEAD(&obj->comp_list);
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+
+ attr.cqe = cmd->cqe;
+ attr.comp_vector = cmd->comp_vector;
+ attr.flags = cmd->flags;
+
+ cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
+ if (!cq) {
+ ret = -ENOMEM;
+ goto err_file;
+ }
+ cq->device = ib_dev;
+ cq->uobject = obj;
+ cq->comp_handler = ib_uverbs_comp_handler;
+ cq->event_handler = ib_uverbs_cq_event_handler;
+ cq->cq_context = ev_file ? &ev_file->ev_queue : NULL;
+ atomic_set(&cq->usecnt, 0);
+
+ rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
+ rdma_restrack_set_name(&cq->res, NULL);
+
+ ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+ if (ret)
+ goto err_free;
+ rdma_restrack_add(&cq->res);
+
+ obj->uevent.uobject.object = cq;
+ obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file);
+ if (obj->uevent.event_file)
+ uverbs_uobject_get(&obj->uevent.event_file->uobj);
+ uobj_finalize_uobj_create(&obj->uevent.uobject, attrs);
+
+ resp.base.cq_handle = obj->uevent.uobject.id;
+ resp.base.cqe = cq->cqe;
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_free:
+ rdma_restrack_put(&cq->res);
+ kfree(cq);
+err_file:
+ if (ev_file)
+ ib_uverbs_release_ucq(ev_file, obj);
+err:
+ uobj_alloc_abort(&obj->uevent.uobject, attrs);
+ return ret;
+}
+
+static int ib_uverbs_create_cq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_create_cq cmd;
+ struct ib_uverbs_ex_create_cq cmd_ex;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ memset(&cmd_ex, 0, sizeof(cmd_ex));
+ cmd_ex.user_handle = cmd.user_handle;
+ cmd_ex.cqe = cmd.cqe;
+ cmd_ex.comp_vector = cmd.comp_vector;
+ cmd_ex.comp_channel = cmd.comp_channel;
+
+ return create_cq(attrs, &cmd_ex);
+}
+
+static int ib_uverbs_ex_create_cq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_create_cq cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ return create_cq(attrs, &cmd);
+}
+
+static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_resize_cq cmd;
+ struct ib_uverbs_resize_cq_resp resp = {};
+ struct ib_cq *cq;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
+ if (!cq)
+ return -EINVAL;
+
+ ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata);
+ if (ret)
+ goto out;
+
+ resp.cqe = cq->cqe;
+
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+out:
+ rdma_lookup_put_uobject(&cq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+
+ return ret;
+}
+
+static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest,
+ struct ib_wc *wc)
+{
+ struct ib_uverbs_wc tmp;
+
+ tmp.wr_id = wc->wr_id;
+ tmp.status = wc->status;
+ tmp.opcode = wc->opcode;
+ tmp.vendor_err = wc->vendor_err;
+ tmp.byte_len = wc->byte_len;
+ tmp.ex.imm_data = wc->ex.imm_data;
+ tmp.qp_num = wc->qp->qp_num;
+ tmp.src_qp = wc->src_qp;
+ tmp.wc_flags = wc->wc_flags;
+ tmp.pkey_index = wc->pkey_index;
+ if (rdma_cap_opa_ah(ib_dev, wc->port_num))
+ tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid);
+ else
+ tmp.slid = ib_lid_cpu16(wc->slid);
+ tmp.sl = wc->sl;
+ tmp.dlid_path_bits = wc->dlid_path_bits;
+ tmp.port_num = wc->port_num;
+ tmp.reserved = 0;
+
+ if (copy_to_user(dest, &tmp, sizeof tmp))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_poll_cq cmd;
+ struct ib_uverbs_poll_cq_resp resp;
+ u8 __user *header_ptr;
+ u8 __user *data_ptr;
+ struct ib_cq *cq;
+ struct ib_wc wc;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
+ if (!cq)
+ return -EINVAL;
+
+ /* we copy a struct ib_uverbs_poll_cq_resp to user space */
+ header_ptr = attrs->ucore.outbuf;
+ data_ptr = header_ptr + sizeof resp;
+
+ memset(&resp, 0, sizeof resp);
+ while (resp.count < cmd.ne) {
+ ret = ib_poll_cq(cq, 1, &wc);
+ if (ret < 0)
+ goto out_put;
+ if (!ret)
+ break;
+
+ ret = copy_wc_to_user(cq->device, data_ptr, &wc);
+ if (ret)
+ goto out_put;
+
+ data_ptr += sizeof(struct ib_uverbs_wc);
+ ++resp.count;
+ }
+
+ if (copy_to_user(header_ptr, &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+ ret = 0;
+
+ if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CORE_OUT))
+ ret = uverbs_output_written(attrs, UVERBS_ATTR_CORE_OUT);
+
+out_put:
+ rdma_lookup_put_uobject(&cq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ return ret;
+}
+
+static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_req_notify_cq cmd;
+ struct ib_cq *cq;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
+ if (!cq)
+ return -EINVAL;
+
+ ib_req_notify_cq(cq, cmd.solicited_only ?
+ IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
+
+ rdma_lookup_put_uobject(&cq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ return 0;
+}
+
+static int ib_uverbs_destroy_cq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_destroy_cq cmd;
+ struct ib_uverbs_destroy_cq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_ucq_object *obj;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ uobj = uobj_get_destroy(UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ obj = container_of(uobj, struct ib_ucq_object, uevent.uobject);
+ memset(&resp, 0, sizeof(resp));
+ resp.comp_events_reported = obj->comp_events_reported;
+ resp.async_events_reported = obj->uevent.events_reported;
+
+ uobj_put_destroy(uobj);
+
+ return uverbs_response(attrs, &resp, sizeof(resp));
+}
+
+static int create_qp(struct uverbs_attr_bundle *attrs,
+ struct ib_uverbs_ex_create_qp *cmd)
+{
+ struct ib_uqp_object *obj;
+ struct ib_device *device;
+ struct ib_pd *pd = NULL;
+ struct ib_xrcd *xrcd = NULL;
+ struct ib_uobject *xrcd_uobj = ERR_PTR(-ENOENT);
+ struct ib_cq *scq = NULL, *rcq = NULL;
+ struct ib_srq *srq = NULL;
+ struct ib_qp *qp;
+ struct ib_qp_init_attr attr = {};
+ struct ib_uverbs_ex_create_qp_resp resp = {};
+ int ret;
+ struct ib_rwq_ind_table *ind_tbl = NULL;
+ bool has_sq = true;
+ struct ib_device *ib_dev;
+
+ switch (cmd->qp_type) {
+ case IB_QPT_RAW_PACKET:
+ if (!capable(CAP_NET_RAW))
+ return -EPERM;
+ break;
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_UD:
+ case IB_QPT_XRC_INI:
+ case IB_QPT_XRC_TGT:
+ case IB_QPT_DRIVER:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs,
+ &ib_dev);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+ obj->uxrcd = NULL;
+ obj->uevent.uobject.user_handle = cmd->user_handle;
+ mutex_init(&obj->mcast_lock);
+
+ if (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE) {
+ ind_tbl = uobj_get_obj_read(rwq_ind_table,
+ UVERBS_OBJECT_RWQ_IND_TBL,
+ cmd->rwq_ind_tbl_handle, attrs);
+ if (!ind_tbl) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ attr.rwq_ind_tbl = ind_tbl;
+ }
+
+ if (ind_tbl && (cmd->max_recv_wr || cmd->max_recv_sge || cmd->is_srq)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ if (ind_tbl && !cmd->max_send_wr)
+ has_sq = false;
+
+ if (cmd->qp_type == IB_QPT_XRC_TGT) {
+ xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->pd_handle,
+ attrs);
+
+ if (IS_ERR(xrcd_uobj)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ xrcd = (struct ib_xrcd *)xrcd_uobj->object;
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ device = xrcd->device;
+ } else {
+ if (cmd->qp_type == IB_QPT_XRC_INI) {
+ cmd->max_recv_wr = 0;
+ cmd->max_recv_sge = 0;
+ } else {
+ if (cmd->is_srq) {
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ,
+ cmd->srq_handle, attrs);
+ if (!srq || srq->srq_type == IB_SRQT_XRC) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+
+ if (!ind_tbl) {
+ if (cmd->recv_cq_handle != cmd->send_cq_handle) {
+ rcq = uobj_get_obj_read(
+ cq, UVERBS_OBJECT_CQ,
+ cmd->recv_cq_handle, attrs);
+ if (!rcq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+ }
+ }
+
+ if (has_sq)
+ scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
+ cmd->send_cq_handle, attrs);
+ if (!ind_tbl && cmd->qp_type != IB_QPT_XRC_INI)
+ rcq = rcq ?: scq;
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle,
+ attrs);
+ if (!pd || (!scq && has_sq)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ device = pd->device;
+ }
+
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.send_cq = scq;
+ attr.recv_cq = rcq;
+ attr.srq = srq;
+ attr.xrcd = xrcd;
+ attr.sq_sig_type = cmd->sq_sig_all ? IB_SIGNAL_ALL_WR :
+ IB_SIGNAL_REQ_WR;
+ attr.qp_type = cmd->qp_type;
+
+ attr.cap.max_send_wr = cmd->max_send_wr;
+ attr.cap.max_recv_wr = cmd->max_recv_wr;
+ attr.cap.max_send_sge = cmd->max_send_sge;
+ attr.cap.max_recv_sge = cmd->max_recv_sge;
+ attr.cap.max_inline_data = cmd->max_inline_data;
+
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
+
+ attr.create_flags = cmd->create_flags;
+ if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
+ IB_QP_CREATE_CROSS_CHANNEL |
+ IB_QP_CREATE_MANAGED_SEND |
+ IB_QP_CREATE_MANAGED_RECV |
+ IB_QP_CREATE_SCATTER_FCS |
+ IB_QP_CREATE_CVLAN_STRIPPING |
+ IB_QP_CREATE_SOURCE_QPN |
+ IB_QP_CREATE_PCI_WRITE_END_PADDING)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) {
+ if (!capable(CAP_NET_RAW)) {
+ ret = -EPERM;
+ goto err_put;
+ }
+
+ attr.source_qpn = cmd->source_qpn;
+ }
+
+ qp = ib_create_qp_user(device, pd, &attr, &attrs->driver_udata, obj,
+ KBUILD_MODNAME);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_put;
+ }
+ ib_qp_usecnt_inc(qp);
+
+ obj->uevent.uobject.object = qp;
+ obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file);
+ if (obj->uevent.event_file)
+ uverbs_uobject_get(&obj->uevent.event_file->uobj);
+
+ if (xrcd) {
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
+ uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ uobj_put_read(xrcd_uobj);
+ }
+
+ if (pd)
+ uobj_put_obj_read(pd);
+ if (scq)
+ rdma_lookup_put_uobject(&scq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ if (rcq && rcq != scq)
+ rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ if (srq)
+ rdma_lookup_put_uobject(&srq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ if (ind_tbl)
+ uobj_put_obj_read(ind_tbl);
+ uobj_finalize_uobj_create(&obj->uevent.uobject, attrs);
+
+ resp.base.qpn = qp->qp_num;
+ resp.base.qp_handle = obj->uevent.uobject.id;
+ resp.base.max_recv_sge = attr.cap.max_recv_sge;
+ resp.base.max_send_sge = attr.cap.max_send_sge;
+ resp.base.max_recv_wr = attr.cap.max_recv_wr;
+ resp.base.max_send_wr = attr.cap.max_send_wr;
+ resp.base.max_inline_data = attr.cap.max_inline_data;
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_put:
+ if (!IS_ERR(xrcd_uobj))
+ uobj_put_read(xrcd_uobj);
+ if (pd)
+ uobj_put_obj_read(pd);
+ if (scq)
+ rdma_lookup_put_uobject(&scq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ if (rcq && rcq != scq)
+ rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ if (srq)
+ rdma_lookup_put_uobject(&srq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ if (ind_tbl)
+ uobj_put_obj_read(ind_tbl);
+
+ uobj_alloc_abort(&obj->uevent.uobject, attrs);
+ return ret;
+}
+
+static int ib_uverbs_create_qp(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_create_qp cmd;
+ struct ib_uverbs_ex_create_qp cmd_ex;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ memset(&cmd_ex, 0, sizeof(cmd_ex));
+ cmd_ex.user_handle = cmd.user_handle;
+ cmd_ex.pd_handle = cmd.pd_handle;
+ cmd_ex.send_cq_handle = cmd.send_cq_handle;
+ cmd_ex.recv_cq_handle = cmd.recv_cq_handle;
+ cmd_ex.srq_handle = cmd.srq_handle;
+ cmd_ex.max_send_wr = cmd.max_send_wr;
+ cmd_ex.max_recv_wr = cmd.max_recv_wr;
+ cmd_ex.max_send_sge = cmd.max_send_sge;
+ cmd_ex.max_recv_sge = cmd.max_recv_sge;
+ cmd_ex.max_inline_data = cmd.max_inline_data;
+ cmd_ex.sq_sig_all = cmd.sq_sig_all;
+ cmd_ex.qp_type = cmd.qp_type;
+ cmd_ex.is_srq = cmd.is_srq;
+
+ return create_qp(attrs, &cmd_ex);
+}
+
+static int ib_uverbs_ex_create_qp(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_create_qp cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask & ~IB_UVERBS_CREATE_QP_SUP_COMP_MASK)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ return create_qp(attrs, &cmd);
+}
+
+static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_create_qp_resp resp = {};
+ struct ib_uverbs_open_qp cmd;
+ struct ib_uqp_object *obj;
+ struct ib_xrcd *xrcd;
+ struct ib_qp *qp;
+ struct ib_qp_open_attr attr = {};
+ int ret;
+ struct ib_uobject *xrcd_uobj;
+ struct ib_device *ib_dev;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, attrs,
+ &ib_dev);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, attrs);
+ if (IS_ERR(xrcd_uobj)) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ xrcd = (struct ib_xrcd *)xrcd_uobj->object;
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_xrcd;
+ }
+
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.qp_num = cmd.qpn;
+ attr.qp_type = cmd.qp_type;
+
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
+
+ qp = ib_open_qp(xrcd, &attr);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_xrcd;
+ }
+
+ obj->uevent.uobject.object = qp;
+ obj->uevent.uobject.user_handle = cmd.user_handle;
+
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ qp->uobject = obj;
+ uobj_put_read(xrcd_uobj);
+ uobj_finalize_uobj_create(&obj->uevent.uobject, attrs);
+
+ resp.qpn = qp->qp_num;
+ resp.qp_handle = obj->uevent.uobject.id;
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_xrcd:
+ uobj_put_read(xrcd_uobj);
+err_put:
+ uobj_alloc_abort(&obj->uevent.uobject, attrs);
+ return ret;
+}
+
+static void copy_ah_attr_to_uverbs(struct ib_uverbs_qp_dest *uverb_attr,
+ struct rdma_ah_attr *rdma_attr)
+{
+ const struct ib_global_route *grh;
+
+ uverb_attr->dlid = rdma_ah_get_dlid(rdma_attr);
+ uverb_attr->sl = rdma_ah_get_sl(rdma_attr);
+ uverb_attr->src_path_bits = rdma_ah_get_path_bits(rdma_attr);
+ uverb_attr->static_rate = rdma_ah_get_static_rate(rdma_attr);
+ uverb_attr->is_global = !!(rdma_ah_get_ah_flags(rdma_attr) &
+ IB_AH_GRH);
+ if (uverb_attr->is_global) {
+ grh = rdma_ah_read_grh(rdma_attr);
+ memcpy(uverb_attr->dgid, grh->dgid.raw, 16);
+ uverb_attr->flow_label = grh->flow_label;
+ uverb_attr->sgid_index = grh->sgid_index;
+ uverb_attr->hop_limit = grh->hop_limit;
+ uverb_attr->traffic_class = grh->traffic_class;
+ }
+ uverb_attr->port_num = rdma_ah_get_port_num(rdma_attr);
+}
+
+static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_query_qp cmd;
+ struct ib_uverbs_query_qp_resp resp;
+ struct ib_qp *qp;
+ struct ib_qp_attr *attr;
+ struct ib_qp_init_attr *init_attr;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+ if (!attr || !init_attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr);
+
+ rdma_lookup_put_uobject(&qp->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+
+ if (ret)
+ goto out;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.qp_state = attr->qp_state;
+ resp.cur_qp_state = attr->cur_qp_state;
+ resp.path_mtu = attr->path_mtu;
+ resp.path_mig_state = attr->path_mig_state;
+ resp.qkey = attr->qkey;
+ resp.rq_psn = attr->rq_psn;
+ resp.sq_psn = attr->sq_psn;
+ resp.dest_qp_num = attr->dest_qp_num;
+ resp.qp_access_flags = attr->qp_access_flags;
+ resp.pkey_index = attr->pkey_index;
+ resp.alt_pkey_index = attr->alt_pkey_index;
+ resp.sq_draining = attr->sq_draining;
+ resp.max_rd_atomic = attr->max_rd_atomic;
+ resp.max_dest_rd_atomic = attr->max_dest_rd_atomic;
+ resp.min_rnr_timer = attr->min_rnr_timer;
+ resp.port_num = attr->port_num;
+ resp.timeout = attr->timeout;
+ resp.retry_cnt = attr->retry_cnt;
+ resp.rnr_retry = attr->rnr_retry;
+ resp.alt_port_num = attr->alt_port_num;
+ resp.alt_timeout = attr->alt_timeout;
+
+ copy_ah_attr_to_uverbs(&resp.dest, &attr->ah_attr);
+ copy_ah_attr_to_uverbs(&resp.alt_dest, &attr->alt_ah_attr);
+
+ resp.max_send_wr = init_attr->cap.max_send_wr;
+ resp.max_recv_wr = init_attr->cap.max_recv_wr;
+ resp.max_send_sge = init_attr->cap.max_send_sge;
+ resp.max_recv_sge = init_attr->cap.max_recv_sge;
+ resp.max_inline_data = init_attr->cap.max_inline_data;
+ resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+ ret = uverbs_response(attrs, &resp, sizeof(resp));
+
+out:
+ kfree(attr);
+ kfree(init_attr);
+
+ return ret;
+}
+
+/* Remove ignored fields set in the attribute mask */
+static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
+{
+ switch (qp_type) {
+ case IB_QPT_XRC_INI:
+ return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER);
+ case IB_QPT_XRC_TGT:
+ return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY);
+ default:
+ return mask;
+ }
+}
+
+static void copy_ah_attr_from_uverbs(struct ib_device *dev,
+ struct rdma_ah_attr *rdma_attr,
+ struct ib_uverbs_qp_dest *uverb_attr)
+{
+ rdma_attr->type = rdma_ah_find_type(dev, uverb_attr->port_num);
+ if (uverb_attr->is_global) {
+ rdma_ah_set_grh(rdma_attr, NULL,
+ uverb_attr->flow_label,
+ uverb_attr->sgid_index,
+ uverb_attr->hop_limit,
+ uverb_attr->traffic_class);
+ rdma_ah_set_dgid_raw(rdma_attr, uverb_attr->dgid);
+ } else {
+ rdma_ah_set_ah_flags(rdma_attr, 0);
+ }
+ rdma_ah_set_dlid(rdma_attr, uverb_attr->dlid);
+ rdma_ah_set_sl(rdma_attr, uverb_attr->sl);
+ rdma_ah_set_path_bits(rdma_attr, uverb_attr->src_path_bits);
+ rdma_ah_set_static_rate(rdma_attr, uverb_attr->static_rate);
+ rdma_ah_set_port_num(rdma_attr, uverb_attr->port_num);
+ rdma_ah_set_make_grd(rdma_attr, false);
+}
+
+static int modify_qp(struct uverbs_attr_bundle *attrs,
+ struct ib_uverbs_ex_modify_qp *cmd)
+{
+ struct ib_qp_attr *attr;
+ struct ib_qp *qp;
+ int ret;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle,
+ attrs);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if ((cmd->base.attr_mask & IB_QP_PORT) &&
+ !rdma_is_port_valid(qp->device, cmd->base.port_num)) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+
+ if ((cmd->base.attr_mask & IB_QP_AV)) {
+ if (!rdma_is_port_valid(qp->device, cmd->base.dest.port_num)) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+
+ if (cmd->base.attr_mask & IB_QP_STATE &&
+ cmd->base.qp_state == IB_QPS_RTR) {
+ /* We are in INIT->RTR TRANSITION (if we are not,
+ * this transition will be rejected in subsequent checks).
+ * In the INIT->RTR transition, we cannot have IB_QP_PORT set,
+ * but the IB_QP_STATE flag is required.
+ *
+ * Since kernel 3.14 (commit dbf727de7440), the uverbs driver,
+ * when IB_QP_AV is set, has required inclusion of a valid
+ * port number in the primary AV. (AVs are created and handled
+ * differently for infiniband and ethernet (RoCE) ports).
+ *
+ * Check the port number included in the primary AV against
+ * the port number in the qp struct, which was set (and saved)
+ * in the RST->INIT transition.
+ */
+ if (cmd->base.dest.port_num != qp->real_qp->port) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+ } else {
+ /* We are in SQD->SQD. (If we are not, this transition will
+ * be rejected later in the verbs layer checks).
+ * Check for both IB_QP_PORT and IB_QP_AV, these can be set
+ * together in the SQD->SQD transition.
+ *
+ * If only IP_QP_AV was set, add in IB_QP_PORT as well (the
+ * verbs layer driver does not track primary port changes
+ * resulting from path migration. Thus, in SQD, if the primary
+ * AV is modified, the primary port should also be modified).
+ *
+ * Note that in this transition, the IB_QP_STATE flag
+ * is not allowed.
+ */
+ if (((cmd->base.attr_mask & (IB_QP_AV | IB_QP_PORT))
+ == (IB_QP_AV | IB_QP_PORT)) &&
+ cmd->base.port_num != cmd->base.dest.port_num) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+ if ((cmd->base.attr_mask & (IB_QP_AV | IB_QP_PORT))
+ == IB_QP_AV) {
+ cmd->base.attr_mask |= IB_QP_PORT;
+ cmd->base.port_num = cmd->base.dest.port_num;
+ }
+ }
+ }
+
+ if ((cmd->base.attr_mask & IB_QP_ALT_PATH) &&
+ (!rdma_is_port_valid(qp->device, cmd->base.alt_port_num) ||
+ !rdma_is_port_valid(qp->device, cmd->base.alt_dest.port_num) ||
+ cmd->base.alt_port_num != cmd->base.alt_dest.port_num)) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+
+ if ((cmd->base.attr_mask & IB_QP_CUR_STATE &&
+ cmd->base.cur_qp_state > IB_QPS_ERR) ||
+ (cmd->base.attr_mask & IB_QP_STATE &&
+ cmd->base.qp_state > IB_QPS_ERR)) {
+ ret = -EINVAL;
+ goto release_qp;
+ }
+
+ if (cmd->base.attr_mask & IB_QP_STATE)
+ attr->qp_state = cmd->base.qp_state;
+ if (cmd->base.attr_mask & IB_QP_CUR_STATE)
+ attr->cur_qp_state = cmd->base.cur_qp_state;
+ if (cmd->base.attr_mask & IB_QP_PATH_MTU)
+ attr->path_mtu = cmd->base.path_mtu;
+ if (cmd->base.attr_mask & IB_QP_PATH_MIG_STATE)
+ attr->path_mig_state = cmd->base.path_mig_state;
+ if (cmd->base.attr_mask & IB_QP_QKEY) {
+ if (cmd->base.qkey & IB_QP_SET_QKEY && !capable(CAP_NET_RAW)) {
+ ret = -EPERM;
+ goto release_qp;
+ }
+ attr->qkey = cmd->base.qkey;
+ }
+ if (cmd->base.attr_mask & IB_QP_RQ_PSN)
+ attr->rq_psn = cmd->base.rq_psn;
+ if (cmd->base.attr_mask & IB_QP_SQ_PSN)
+ attr->sq_psn = cmd->base.sq_psn;
+ if (cmd->base.attr_mask & IB_QP_DEST_QPN)
+ attr->dest_qp_num = cmd->base.dest_qp_num;
+ if (cmd->base.attr_mask & IB_QP_ACCESS_FLAGS)
+ attr->qp_access_flags = cmd->base.qp_access_flags;
+ if (cmd->base.attr_mask & IB_QP_PKEY_INDEX)
+ attr->pkey_index = cmd->base.pkey_index;
+ if (cmd->base.attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY)
+ attr->en_sqd_async_notify = cmd->base.en_sqd_async_notify;
+ if (cmd->base.attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+ attr->max_rd_atomic = cmd->base.max_rd_atomic;
+ if (cmd->base.attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ attr->max_dest_rd_atomic = cmd->base.max_dest_rd_atomic;
+ if (cmd->base.attr_mask & IB_QP_MIN_RNR_TIMER)
+ attr->min_rnr_timer = cmd->base.min_rnr_timer;
+ if (cmd->base.attr_mask & IB_QP_PORT)
+ attr->port_num = cmd->base.port_num;
+ if (cmd->base.attr_mask & IB_QP_TIMEOUT)
+ attr->timeout = cmd->base.timeout;
+ if (cmd->base.attr_mask & IB_QP_RETRY_CNT)
+ attr->retry_cnt = cmd->base.retry_cnt;
+ if (cmd->base.attr_mask & IB_QP_RNR_RETRY)
+ attr->rnr_retry = cmd->base.rnr_retry;
+ if (cmd->base.attr_mask & IB_QP_ALT_PATH) {
+ attr->alt_port_num = cmd->base.alt_port_num;
+ attr->alt_timeout = cmd->base.alt_timeout;
+ attr->alt_pkey_index = cmd->base.alt_pkey_index;
+ }
+ if (cmd->base.attr_mask & IB_QP_RATE_LIMIT)
+ attr->rate_limit = cmd->rate_limit;
+
+ if (cmd->base.attr_mask & IB_QP_AV)
+ copy_ah_attr_from_uverbs(qp->device, &attr->ah_attr,
+ &cmd->base.dest);
+
+ if (cmd->base.attr_mask & IB_QP_ALT_PATH)
+ copy_ah_attr_from_uverbs(qp->device, &attr->alt_ah_attr,
+ &cmd->base.alt_dest);
+
+ ret = ib_modify_qp_with_udata(qp, attr,
+ modify_qp_mask(qp->qp_type,
+ cmd->base.attr_mask),
+ &attrs->driver_udata);
+
+release_qp:
+ rdma_lookup_put_uobject(&qp->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+out:
+ kfree(attr);
+
+ return ret;
+}
+
+static int ib_uverbs_modify_qp(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_modify_qp cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd.base, sizeof(cmd.base));
+ if (ret)
+ return ret;
+
+ if (cmd.base.attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
+ return -EOPNOTSUPP;
+
+ return modify_qp(attrs, &cmd);
+}
+
+static int ib_uverbs_ex_modify_qp(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_modify_qp cmd;
+ struct ib_uverbs_ex_modify_qp_resp resp = {
+ .response_length = uverbs_response_length(attrs, sizeof(resp))
+ };
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ /*
+ * Last bit is reserved for extending the attr_mask by
+ * using another field.
+ */
+ if (cmd.base.attr_mask & ~(IB_QP_ATTR_STANDARD_BITS | IB_QP_RATE_LIMIT))
+ return -EOPNOTSUPP;
+
+ ret = modify_qp(attrs, &cmd);
+ if (ret)
+ return ret;
+
+ return uverbs_response(attrs, &resp, sizeof(resp));
+}
+
+static int ib_uverbs_destroy_qp(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_destroy_qp cmd;
+ struct ib_uverbs_destroy_qp_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_uqp_object *obj;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ uobj = uobj_get_destroy(UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ obj = container_of(uobj, struct ib_uqp_object, uevent.uobject);
+ memset(&resp, 0, sizeof(resp));
+ resp.events_reported = obj->uevent.events_reported;
+
+ uobj_put_destroy(uobj);
+
+ return uverbs_response(attrs, &resp, sizeof(resp));
+}
+
+static void *alloc_wr(size_t wr_size, __u32 num_sge)
+{
+ if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof(struct ib_sge))) /
+ sizeof(struct ib_sge))
+ return NULL;
+
+ return kmalloc(ALIGN(wr_size, sizeof(struct ib_sge)) +
+ num_sge * sizeof(struct ib_sge),
+ GFP_KERNEL);
+}
+
+static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_post_send cmd;
+ struct ib_uverbs_post_send_resp resp;
+ struct ib_uverbs_send_wr *user_wr;
+ struct ib_send_wr *wr = NULL, *last, *next;
+ const struct ib_send_wr *bad_wr;
+ struct ib_qp *qp;
+ int i, sg_ind;
+ int is_ud;
+ int ret, ret2;
+ size_t next_size;
+ const struct ib_sge __user *sgls;
+ const void __user *wqes;
+ struct uverbs_req_iter iter;
+
+ ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+ wqes = uverbs_request_next_ptr(&iter, cmd.wqe_size * cmd.wr_count);
+ if (IS_ERR(wqes))
+ return PTR_ERR(wqes);
+ sgls = uverbs_request_next_ptr(
+ &iter, cmd.sge_count * sizeof(struct ib_uverbs_sge));
+ if (IS_ERR(sgls))
+ return PTR_ERR(sgls);
+ ret = uverbs_request_finish(&iter);
+ if (ret)
+ return ret;
+
+ user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
+ if (!user_wr)
+ return -ENOMEM;
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ is_ud = qp->qp_type == IB_QPT_UD;
+ sg_ind = 0;
+ last = NULL;
+ for (i = 0; i < cmd.wr_count; ++i) {
+ if (copy_from_user(user_wr, wqes + i * cmd.wqe_size,
+ cmd.wqe_size)) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+
+ if (user_wr->num_sge + sg_ind > cmd.sge_count) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ if (is_ud) {
+ struct ib_ud_wr *ud;
+
+ if (user_wr->opcode != IB_WR_SEND &&
+ user_wr->opcode != IB_WR_SEND_WITH_IMM) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ next_size = sizeof(*ud);
+ ud = alloc_wr(next_size, user_wr->num_sge);
+ if (!ud) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH,
+ user_wr->wr.ud.ah, attrs);
+ if (!ud->ah) {
+ kfree(ud);
+ ret = -EINVAL;
+ goto out_put;
+ }
+ ud->remote_qpn = user_wr->wr.ud.remote_qpn;
+ ud->remote_qkey = user_wr->wr.ud.remote_qkey;
+
+ next = &ud->wr;
+ } else if (user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
+ user_wr->opcode == IB_WR_RDMA_WRITE ||
+ user_wr->opcode == IB_WR_RDMA_READ) {
+ struct ib_rdma_wr *rdma;
+
+ next_size = sizeof(*rdma);
+ rdma = alloc_wr(next_size, user_wr->num_sge);
+ if (!rdma) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ rdma->remote_addr = user_wr->wr.rdma.remote_addr;
+ rdma->rkey = user_wr->wr.rdma.rkey;
+
+ next = &rdma->wr;
+ } else if (user_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ user_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ struct ib_atomic_wr *atomic;
+
+ next_size = sizeof(*atomic);
+ atomic = alloc_wr(next_size, user_wr->num_sge);
+ if (!atomic) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ atomic->remote_addr = user_wr->wr.atomic.remote_addr;
+ atomic->compare_add = user_wr->wr.atomic.compare_add;
+ atomic->swap = user_wr->wr.atomic.swap;
+ atomic->rkey = user_wr->wr.atomic.rkey;
+
+ next = &atomic->wr;
+ } else if (user_wr->opcode == IB_WR_SEND ||
+ user_wr->opcode == IB_WR_SEND_WITH_IMM ||
+ user_wr->opcode == IB_WR_SEND_WITH_INV) {
+ next_size = sizeof(*next);
+ next = alloc_wr(next_size, user_wr->num_sge);
+ if (!next) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+ } else {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ if (user_wr->opcode == IB_WR_SEND_WITH_IMM ||
+ user_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+ next->ex.imm_data =
+ (__be32 __force) user_wr->ex.imm_data;
+ } else if (user_wr->opcode == IB_WR_SEND_WITH_INV) {
+ next->ex.invalidate_rkey = user_wr->ex.invalidate_rkey;
+ }
+
+ if (!last)
+ wr = next;
+ else
+ last->next = next;
+ last = next;
+
+ next->next = NULL;
+ next->wr_id = user_wr->wr_id;
+ next->num_sge = user_wr->num_sge;
+ next->opcode = user_wr->opcode;
+ next->send_flags = user_wr->send_flags;
+
+ if (next->num_sge) {
+ next->sg_list = (void *) next +
+ ALIGN(next_size, sizeof(struct ib_sge));
+ if (copy_from_user(next->sg_list, sgls + sg_ind,
+ next->num_sge *
+ sizeof(struct ib_sge))) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+ sg_ind += next->num_sge;
+ } else
+ next->sg_list = NULL;
+ }
+
+ resp.bad_wr = 0;
+ ret = qp->device->ops.post_send(qp->real_qp, wr, &bad_wr);
+ if (ret)
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+
+ ret2 = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret2)
+ ret = ret2;
+
+out_put:
+ rdma_lookup_put_uobject(&qp->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+
+ while (wr) {
+ if (is_ud && ud_wr(wr)->ah)
+ uobj_put_obj_read(ud_wr(wr)->ah);
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+out:
+ kfree(user_wr);
+
+ return ret;
+}
+
+static struct ib_recv_wr *
+ib_uverbs_unmarshall_recv(struct uverbs_req_iter *iter, u32 wr_count,
+ u32 wqe_size, u32 sge_count)
+{
+ struct ib_uverbs_recv_wr *user_wr;
+ struct ib_recv_wr *wr = NULL, *last, *next;
+ int sg_ind;
+ int i;
+ int ret;
+ const struct ib_sge __user *sgls;
+ const void __user *wqes;
+
+ if (wqe_size < sizeof(struct ib_uverbs_recv_wr))
+ return ERR_PTR(-EINVAL);
+
+ wqes = uverbs_request_next_ptr(iter, wqe_size * wr_count);
+ if (IS_ERR(wqes))
+ return ERR_CAST(wqes);
+ sgls = uverbs_request_next_ptr(
+ iter, sge_count * sizeof(struct ib_uverbs_sge));
+ if (IS_ERR(sgls))
+ return ERR_CAST(sgls);
+ ret = uverbs_request_finish(iter);
+ if (ret)
+ return ERR_PTR(ret);
+
+ user_wr = kmalloc(wqe_size, GFP_KERNEL);
+ if (!user_wr)
+ return ERR_PTR(-ENOMEM);
+
+ sg_ind = 0;
+ last = NULL;
+ for (i = 0; i < wr_count; ++i) {
+ if (copy_from_user(user_wr, wqes + i * wqe_size,
+ wqe_size)) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ if (user_wr->num_sge + sg_ind > sge_count) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (user_wr->num_sge >=
+ (U32_MAX - ALIGN(sizeof(*next), sizeof(struct ib_sge))) /
+ sizeof(struct ib_sge)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ next = kmalloc(ALIGN(sizeof(*next), sizeof(struct ib_sge)) +
+ user_wr->num_sge * sizeof(struct ib_sge),
+ GFP_KERNEL);
+ if (!next) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (!last)
+ wr = next;
+ else
+ last->next = next;
+ last = next;
+
+ next->next = NULL;
+ next->wr_id = user_wr->wr_id;
+ next->num_sge = user_wr->num_sge;
+
+ if (next->num_sge) {
+ next->sg_list = (void *)next +
+ ALIGN(sizeof(*next), sizeof(struct ib_sge));
+ if (copy_from_user(next->sg_list, sgls + sg_ind,
+ next->num_sge *
+ sizeof(struct ib_sge))) {
+ ret = -EFAULT;
+ goto err;
+ }
+ sg_ind += next->num_sge;
+ } else
+ next->sg_list = NULL;
+ }
+
+ kfree(user_wr);
+ return wr;
+
+err:
+ kfree(user_wr);
+
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ERR_PTR(ret);
+}
+
+static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_post_recv cmd;
+ struct ib_uverbs_post_recv_resp resp;
+ struct ib_recv_wr *wr, *next;
+ const struct ib_recv_wr *bad_wr;
+ struct ib_qp *qp;
+ int ret, ret2;
+ struct uverbs_req_iter iter;
+
+ ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ wr = ib_uverbs_unmarshall_recv(&iter, cmd.wr_count, cmd.wqe_size,
+ cmd.sge_count);
+ if (IS_ERR(wr))
+ return PTR_ERR(wr);
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ resp.bad_wr = 0;
+ ret = qp->device->ops.post_recv(qp->real_qp, wr, &bad_wr);
+
+ rdma_lookup_put_uobject(&qp->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ if (ret) {
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+ }
+
+ ret2 = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret2)
+ ret = ret2;
+out:
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ret;
+}
+
+static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_post_srq_recv cmd;
+ struct ib_uverbs_post_srq_recv_resp resp;
+ struct ib_recv_wr *wr, *next;
+ const struct ib_recv_wr *bad_wr;
+ struct ib_srq *srq;
+ int ret, ret2;
+ struct uverbs_req_iter iter;
+
+ ret = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ wr = ib_uverbs_unmarshall_recv(&iter, cmd.wr_count, cmd.wqe_size,
+ cmd.sge_count);
+ if (IS_ERR(wr))
+ return PTR_ERR(wr);
+
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
+ if (!srq) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ resp.bad_wr = 0;
+ ret = srq->device->ops.post_srq_recv(srq, wr, &bad_wr);
+
+ rdma_lookup_put_uobject(&srq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+
+ if (ret)
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+
+ ret2 = uverbs_response(attrs, &resp, sizeof(resp));
+ if (ret2)
+ ret = ret2;
+
+out:
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ret;
+}
+
+static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_create_ah cmd;
+ struct ib_uverbs_create_ah_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_ah *ah;
+ struct rdma_ah_attr attr = {};
+ int ret;
+ struct ib_device *ib_dev;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_AH, attrs, &ib_dev);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ if (!rdma_is_port_valid(ib_dev, cmd.attr.port_num)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ attr.type = rdma_ah_find_type(ib_dev, cmd.attr.port_num);
+ rdma_ah_set_make_grd(&attr, false);
+ rdma_ah_set_dlid(&attr, cmd.attr.dlid);
+ rdma_ah_set_sl(&attr, cmd.attr.sl);
+ rdma_ah_set_path_bits(&attr, cmd.attr.src_path_bits);
+ rdma_ah_set_static_rate(&attr, cmd.attr.static_rate);
+ rdma_ah_set_port_num(&attr, cmd.attr.port_num);
+
+ if (cmd.attr.is_global) {
+ rdma_ah_set_grh(&attr, NULL, cmd.attr.grh.flow_label,
+ cmd.attr.grh.sgid_index,
+ cmd.attr.grh.hop_limit,
+ cmd.attr.grh.traffic_class);
+ rdma_ah_set_dgid_raw(&attr, cmd.attr.grh.dgid);
+ } else {
+ rdma_ah_set_ah_flags(&attr, 0);
+ }
+
+ ah = rdma_create_user_ah(pd, &attr, &attrs->driver_udata);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto err_put;
+ }
+
+ ah->uobject = uobj;
+ uobj->user_handle = cmd.user_handle;
+ uobj->object = ah;
+ uobj_put_obj_read(pd);
+ uobj_finalize_uobj_create(uobj, attrs);
+
+ resp.ah_handle = uobj->id;
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_put:
+ uobj_put_obj_read(pd);
+err:
+ uobj_alloc_abort(uobj, attrs);
+ return ret;
+}
+
+static int ib_uverbs_destroy_ah(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_destroy_ah cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_AH, cmd.ah_handle, attrs);
+}
+
+static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_attach_mcast cmd;
+ struct ib_qp *qp;
+ struct ib_uqp_object *obj;
+ struct ib_uverbs_mcast_entry *mcast;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+ if (!qp)
+ return -EINVAL;
+
+ obj = qp->uobject;
+
+ mutex_lock(&obj->mcast_lock);
+ list_for_each_entry(mcast, &obj->mcast_list, list)
+ if (cmd.mlid == mcast->lid &&
+ !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+ ret = 0;
+ goto out_put;
+ }
+
+ mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+ if (!mcast) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ mcast->lid = cmd.mlid;
+ memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw);
+
+ ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid);
+ if (!ret)
+ list_add_tail(&mcast->list, &obj->mcast_list);
+ else
+ kfree(mcast);
+
+out_put:
+ mutex_unlock(&obj->mcast_lock);
+ rdma_lookup_put_uobject(&qp->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+
+ return ret;
+}
+
+static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_detach_mcast cmd;
+ struct ib_uqp_object *obj;
+ struct ib_qp *qp;
+ struct ib_uverbs_mcast_entry *mcast;
+ int ret;
+ bool found = false;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+ if (!qp)
+ return -EINVAL;
+
+ obj = qp->uobject;
+ mutex_lock(&obj->mcast_lock);
+
+ list_for_each_entry(mcast, &obj->mcast_list, list)
+ if (cmd.mlid == mcast->lid &&
+ !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+ list_del(&mcast->list);
+ kfree(mcast);
+ found = true;
+ break;
+ }
+
+ if (!found) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = ib_detach_mcast(qp, (union ib_gid *)cmd.gid, cmd.mlid);
+
+out_put:
+ mutex_unlock(&obj->mcast_lock);
+ rdma_lookup_put_uobject(&qp->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ return ret;
+}
+
+struct ib_uflow_resources *flow_resources_alloc(size_t num_specs)
+{
+ struct ib_uflow_resources *resources;
+
+ resources = kzalloc(sizeof(*resources), GFP_KERNEL);
+
+ if (!resources)
+ return NULL;
+
+ if (!num_specs)
+ goto out;
+
+ resources->counters =
+ kcalloc(num_specs, sizeof(*resources->counters), GFP_KERNEL);
+ resources->collection =
+ kcalloc(num_specs, sizeof(*resources->collection), GFP_KERNEL);
+
+ if (!resources->counters || !resources->collection)
+ goto err;
+
+out:
+ resources->max = num_specs;
+ return resources;
+
+err:
+ kfree(resources->counters);
+ kfree(resources);
+
+ return NULL;
+}
+EXPORT_SYMBOL(flow_resources_alloc);
+
+void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res)
+{
+ unsigned int i;
+
+ if (!uflow_res)
+ return;
+
+ for (i = 0; i < uflow_res->collection_num; i++)
+ atomic_dec(&uflow_res->collection[i]->usecnt);
+
+ for (i = 0; i < uflow_res->counters_num; i++)
+ atomic_dec(&uflow_res->counters[i]->usecnt);
+
+ kfree(uflow_res->collection);
+ kfree(uflow_res->counters);
+ kfree(uflow_res);
+}
+EXPORT_SYMBOL(ib_uverbs_flow_resources_free);
+
+void flow_resources_add(struct ib_uflow_resources *uflow_res,
+ enum ib_flow_spec_type type,
+ void *ibobj)
+{
+ WARN_ON(uflow_res->num >= uflow_res->max);
+
+ switch (type) {
+ case IB_FLOW_SPEC_ACTION_HANDLE:
+ atomic_inc(&((struct ib_flow_action *)ibobj)->usecnt);
+ uflow_res->collection[uflow_res->collection_num++] =
+ (struct ib_flow_action *)ibobj;
+ break;
+ case IB_FLOW_SPEC_ACTION_COUNT:
+ atomic_inc(&((struct ib_counters *)ibobj)->usecnt);
+ uflow_res->counters[uflow_res->counters_num++] =
+ (struct ib_counters *)ibobj;
+ break;
+ default:
+ WARN_ON(1);
+ }
+
+ uflow_res->num++;
+}
+EXPORT_SYMBOL(flow_resources_add);
+
+static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs,
+ struct ib_uverbs_flow_spec *kern_spec,
+ union ib_flow_spec *ib_spec,
+ struct ib_uflow_resources *uflow_res)
+{
+ ib_spec->type = kern_spec->type;
+ switch (ib_spec->type) {
+ case IB_FLOW_SPEC_ACTION_TAG:
+ if (kern_spec->flow_tag.size !=
+ sizeof(struct ib_uverbs_flow_spec_action_tag))
+ return -EINVAL;
+
+ ib_spec->flow_tag.size = sizeof(struct ib_flow_spec_action_tag);
+ ib_spec->flow_tag.tag_id = kern_spec->flow_tag.tag_id;
+ break;
+ case IB_FLOW_SPEC_ACTION_DROP:
+ if (kern_spec->drop.size !=
+ sizeof(struct ib_uverbs_flow_spec_action_drop))
+ return -EINVAL;
+
+ ib_spec->drop.size = sizeof(struct ib_flow_spec_action_drop);
+ break;
+ case IB_FLOW_SPEC_ACTION_HANDLE:
+ if (kern_spec->action.size !=
+ sizeof(struct ib_uverbs_flow_spec_action_handle))
+ return -EOPNOTSUPP;
+ ib_spec->action.act = uobj_get_obj_read(flow_action,
+ UVERBS_OBJECT_FLOW_ACTION,
+ kern_spec->action.handle,
+ attrs);
+ if (!ib_spec->action.act)
+ return -EINVAL;
+ ib_spec->action.size =
+ sizeof(struct ib_flow_spec_action_handle);
+ flow_resources_add(uflow_res,
+ IB_FLOW_SPEC_ACTION_HANDLE,
+ ib_spec->action.act);
+ uobj_put_obj_read(ib_spec->action.act);
+ break;
+ case IB_FLOW_SPEC_ACTION_COUNT:
+ if (kern_spec->flow_count.size !=
+ sizeof(struct ib_uverbs_flow_spec_action_count))
+ return -EINVAL;
+ ib_spec->flow_count.counters =
+ uobj_get_obj_read(counters,
+ UVERBS_OBJECT_COUNTERS,
+ kern_spec->flow_count.handle,
+ attrs);
+ if (!ib_spec->flow_count.counters)
+ return -EINVAL;
+ ib_spec->flow_count.size =
+ sizeof(struct ib_flow_spec_action_count);
+ flow_resources_add(uflow_res,
+ IB_FLOW_SPEC_ACTION_COUNT,
+ ib_spec->flow_count.counters);
+ uobj_put_obj_read(ib_spec->flow_count.counters);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static ssize_t spec_filter_size(const void *kern_spec_filter, u16 kern_filter_size,
+ u16 ib_real_filter_sz)
+{
+ /*
+ * User space filter structures must be 64 bit aligned, otherwise this
+ * may pass, but we won't handle additional new attributes.
+ */
+
+ if (kern_filter_size > ib_real_filter_sz) {
+ if (memchr_inv(kern_spec_filter +
+ ib_real_filter_sz, 0,
+ kern_filter_size - ib_real_filter_sz))
+ return -EINVAL;
+ return ib_real_filter_sz;
+ }
+ return kern_filter_size;
+}
+
+int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type,
+ const void *kern_spec_mask,
+ const void *kern_spec_val,
+ size_t kern_filter_sz,
+ union ib_flow_spec *ib_spec)
+{
+ ssize_t actual_filter_sz;
+ ssize_t ib_filter_sz;
+
+ /* User flow spec size must be aligned to 4 bytes */
+ if (kern_filter_sz != ALIGN(kern_filter_sz, 4))
+ return -EINVAL;
+
+ ib_spec->type = type;
+
+ if (ib_spec->type == (IB_FLOW_SPEC_INNER | IB_FLOW_SPEC_VXLAN_TUNNEL))
+ return -EINVAL;
+
+ switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
+ case IB_FLOW_SPEC_ETH:
+ ib_filter_sz = offsetof(struct ib_flow_eth_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_eth);
+ memcpy(&ib_spec->eth.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->eth.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_IPV4:
+ ib_filter_sz = offsetof(struct ib_flow_ipv4_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_ipv4);
+ memcpy(&ib_spec->ipv4.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->ipv4.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_IPV6:
+ ib_filter_sz = offsetof(struct ib_flow_ipv6_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_ipv6);
+ memcpy(&ib_spec->ipv6.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->ipv6.mask, kern_spec_mask, actual_filter_sz);
+
+ if ((ntohl(ib_spec->ipv6.mask.flow_label)) >= BIT(20) ||
+ (ntohl(ib_spec->ipv6.val.flow_label)) >= BIT(20))
+ return -EINVAL;
+ break;
+ case IB_FLOW_SPEC_TCP:
+ case IB_FLOW_SPEC_UDP:
+ ib_filter_sz = offsetof(struct ib_flow_tcp_udp_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->size = sizeof(struct ib_flow_spec_tcp_udp);
+ memcpy(&ib_spec->tcp_udp.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->tcp_udp.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_VXLAN_TUNNEL:
+ ib_filter_sz = offsetof(struct ib_flow_tunnel_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->tunnel.size = sizeof(struct ib_flow_spec_tunnel);
+ memcpy(&ib_spec->tunnel.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->tunnel.mask, kern_spec_mask, actual_filter_sz);
+
+ if ((ntohl(ib_spec->tunnel.mask.tunnel_id)) >= BIT(24) ||
+ (ntohl(ib_spec->tunnel.val.tunnel_id)) >= BIT(24))
+ return -EINVAL;
+ break;
+ case IB_FLOW_SPEC_ESP:
+ ib_filter_sz = offsetof(struct ib_flow_esp_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->esp.size = sizeof(struct ib_flow_spec_esp);
+ memcpy(&ib_spec->esp.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->esp.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_GRE:
+ ib_filter_sz = offsetof(struct ib_flow_gre_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->gre.size = sizeof(struct ib_flow_spec_gre);
+ memcpy(&ib_spec->gre.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->gre.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ case IB_FLOW_SPEC_MPLS:
+ ib_filter_sz = offsetof(struct ib_flow_mpls_filter, real_sz);
+ actual_filter_sz = spec_filter_size(kern_spec_mask,
+ kern_filter_sz,
+ ib_filter_sz);
+ if (actual_filter_sz <= 0)
+ return -EINVAL;
+ ib_spec->mpls.size = sizeof(struct ib_flow_spec_mpls);
+ memcpy(&ib_spec->mpls.val, kern_spec_val, actual_filter_sz);
+ memcpy(&ib_spec->mpls.mask, kern_spec_mask, actual_filter_sz);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec,
+ union ib_flow_spec *ib_spec)
+{
+ size_t kern_filter_sz;
+ void *kern_spec_mask;
+ void *kern_spec_val;
+
+ if (check_sub_overflow((size_t)kern_spec->hdr.size,
+ sizeof(struct ib_uverbs_flow_spec_hdr),
+ &kern_filter_sz))
+ return -EINVAL;
+
+ kern_filter_sz /= 2;
+
+ kern_spec_val = (void *)kern_spec +
+ sizeof(struct ib_uverbs_flow_spec_hdr);
+ kern_spec_mask = kern_spec_val + kern_filter_sz;
+
+ return ib_uverbs_kern_spec_to_ib_spec_filter(kern_spec->type,
+ kern_spec_mask,
+ kern_spec_val,
+ kern_filter_sz, ib_spec);
+}
+
+static int kern_spec_to_ib_spec(struct uverbs_attr_bundle *attrs,
+ struct ib_uverbs_flow_spec *kern_spec,
+ union ib_flow_spec *ib_spec,
+ struct ib_uflow_resources *uflow_res)
+{
+ if (kern_spec->reserved)
+ return -EINVAL;
+
+ if (kern_spec->type >= IB_FLOW_SPEC_ACTION_TAG)
+ return kern_spec_to_ib_spec_action(attrs, kern_spec, ib_spec,
+ uflow_res);
+ else
+ return kern_spec_to_ib_spec_filter(kern_spec, ib_spec);
+}
+
+static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_create_wq cmd;
+ struct ib_uverbs_ex_create_wq_resp resp = {};
+ struct ib_uwq_object *obj;
+ int err = 0;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct ib_wq *wq;
+ struct ib_wq_init_attr wq_init_attr = {};
+ struct ib_device *ib_dev;
+
+ err = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, attrs,
+ &ib_dev);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs);
+ if (!pd) {
+ err = -EINVAL;
+ goto err_uobj;
+ }
+
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
+ if (!cq) {
+ err = -EINVAL;
+ goto err_put_pd;
+ }
+
+ wq_init_attr.cq = cq;
+ wq_init_attr.max_sge = cmd.max_sge;
+ wq_init_attr.max_wr = cmd.max_wr;
+ wq_init_attr.wq_type = cmd.wq_type;
+ wq_init_attr.event_handler = ib_uverbs_wq_event_handler;
+ wq_init_attr.create_flags = cmd.create_flags;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ obj->uevent.uobject.user_handle = cmd.user_handle;
+
+ wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata);
+ if (IS_ERR(wq)) {
+ err = PTR_ERR(wq);
+ goto err_put_cq;
+ }
+
+ wq->uobject = obj;
+ obj->uevent.uobject.object = wq;
+ wq->wq_type = wq_init_attr.wq_type;
+ wq->cq = cq;
+ wq->pd = pd;
+ wq->device = pd->device;
+ atomic_set(&wq->usecnt, 0);
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&cq->usecnt);
+ obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file);
+ if (obj->uevent.event_file)
+ uverbs_uobject_get(&obj->uevent.event_file->uobj);
+
+ uobj_put_obj_read(pd);
+ rdma_lookup_put_uobject(&cq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ uobj_finalize_uobj_create(&obj->uevent.uobject, attrs);
+
+ resp.wq_handle = obj->uevent.uobject.id;
+ resp.max_sge = wq_init_attr.max_sge;
+ resp.max_wr = wq_init_attr.max_wr;
+ resp.wqn = wq->wq_num;
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_put_cq:
+ rdma_lookup_put_uobject(&cq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+err_put_pd:
+ uobj_put_obj_read(pd);
+err_uobj:
+ uobj_alloc_abort(&obj->uevent.uobject, attrs);
+
+ return err;
+}
+
+static int ib_uverbs_ex_destroy_wq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_destroy_wq cmd;
+ struct ib_uverbs_ex_destroy_wq_resp resp = {};
+ struct ib_uobject *uobj;
+ struct ib_uwq_object *obj;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+ uobj = uobj_get_destroy(UVERBS_OBJECT_WQ, cmd.wq_handle, attrs);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ obj = container_of(uobj, struct ib_uwq_object, uevent.uobject);
+ resp.events_reported = obj->uevent.events_reported;
+
+ uobj_put_destroy(uobj);
+
+ return uverbs_response(attrs, &resp, sizeof(resp));
+}
+
+static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_modify_wq cmd;
+ struct ib_wq *wq;
+ struct ib_wq_attr wq_attr = {};
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (!cmd.attr_mask)
+ return -EINVAL;
+
+ if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE | IB_WQ_FLAGS))
+ return -EINVAL;
+
+ wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs);
+ if (!wq)
+ return -EINVAL;
+
+ if (cmd.attr_mask & IB_WQ_FLAGS) {
+ wq_attr.flags = cmd.flags;
+ wq_attr.flags_mask = cmd.flags_mask;
+ }
+
+ if (cmd.attr_mask & IB_WQ_CUR_STATE) {
+ if (cmd.curr_wq_state > IB_WQS_ERR)
+ return -EINVAL;
+
+ wq_attr.curr_wq_state = cmd.curr_wq_state;
+ } else {
+ wq_attr.curr_wq_state = wq->state;
+ }
+
+ if (cmd.attr_mask & IB_WQ_STATE) {
+ if (cmd.wq_state > IB_WQS_ERR)
+ return -EINVAL;
+
+ wq_attr.wq_state = cmd.wq_state;
+ } else {
+ wq_attr.wq_state = wq_attr.curr_wq_state;
+ }
+
+ ret = wq->device->ops.modify_wq(wq, &wq_attr, cmd.attr_mask,
+ &attrs->driver_udata);
+ rdma_lookup_put_uobject(&wq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ return ret;
+}
+
+static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_create_rwq_ind_table cmd;
+ struct ib_uverbs_ex_create_rwq_ind_table_resp resp = {};
+ struct ib_uobject *uobj;
+ int err;
+ struct ib_rwq_ind_table_init_attr init_attr = {};
+ struct ib_rwq_ind_table *rwq_ind_tbl;
+ struct ib_wq **wqs = NULL;
+ u32 *wqs_handles = NULL;
+ struct ib_wq *wq = NULL;
+ int i, num_read_wqs;
+ u32 num_wq_handles;
+ struct uverbs_req_iter iter;
+ struct ib_device *ib_dev;
+
+ err = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ if (cmd.log_ind_tbl_size > IB_USER_VERBS_MAX_LOG_IND_TBL_SIZE)
+ return -EINVAL;
+
+ num_wq_handles = 1 << cmd.log_ind_tbl_size;
+ wqs_handles = kcalloc(num_wq_handles, sizeof(*wqs_handles),
+ GFP_KERNEL);
+ if (!wqs_handles)
+ return -ENOMEM;
+
+ err = uverbs_request_next(&iter, wqs_handles,
+ num_wq_handles * sizeof(__u32));
+ if (err)
+ goto err_free;
+
+ err = uverbs_request_finish(&iter);
+ if (err)
+ goto err_free;
+
+ wqs = kcalloc(num_wq_handles, sizeof(*wqs), GFP_KERNEL);
+ if (!wqs) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ for (num_read_wqs = 0; num_read_wqs < num_wq_handles;
+ num_read_wqs++) {
+ wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ,
+ wqs_handles[num_read_wqs], attrs);
+ if (!wq) {
+ err = -EINVAL;
+ goto put_wqs;
+ }
+
+ wqs[num_read_wqs] = wq;
+ atomic_inc(&wqs[num_read_wqs]->usecnt);
+ }
+
+ uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, attrs, &ib_dev);
+ if (IS_ERR(uobj)) {
+ err = PTR_ERR(uobj);
+ goto put_wqs;
+ }
+
+ rwq_ind_tbl = rdma_zalloc_drv_obj(ib_dev, ib_rwq_ind_table);
+ if (!rwq_ind_tbl) {
+ err = -ENOMEM;
+ goto err_uobj;
+ }
+
+ init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size;
+ init_attr.ind_tbl = wqs;
+
+ rwq_ind_tbl->ind_tbl = wqs;
+ rwq_ind_tbl->log_ind_tbl_size = init_attr.log_ind_tbl_size;
+ rwq_ind_tbl->uobject = uobj;
+ uobj->object = rwq_ind_tbl;
+ rwq_ind_tbl->device = ib_dev;
+ atomic_set(&rwq_ind_tbl->usecnt, 0);
+
+ err = ib_dev->ops.create_rwq_ind_table(rwq_ind_tbl, &init_attr,
+ &attrs->driver_udata);
+ if (err)
+ goto err_create;
+
+ for (i = 0; i < num_wq_handles; i++)
+ rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ kfree(wqs_handles);
+ uobj_finalize_uobj_create(uobj, attrs);
+
+ resp.ind_tbl_handle = uobj->id;
+ resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num;
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_create:
+ kfree(rwq_ind_tbl);
+err_uobj:
+ uobj_alloc_abort(uobj, attrs);
+put_wqs:
+ for (i = 0; i < num_read_wqs; i++) {
+ rdma_lookup_put_uobject(&wqs[i]->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ atomic_dec(&wqs[i]->usecnt);
+ }
+err_free:
+ kfree(wqs_handles);
+ kfree(wqs);
+ return err;
+}
+
+static int ib_uverbs_ex_destroy_rwq_ind_table(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_destroy_rwq_ind_table cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask)
+ return -EOPNOTSUPP;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_RWQ_IND_TBL,
+ cmd.ind_tbl_handle, attrs);
+}
+
+static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_create_flow cmd;
+ struct ib_uverbs_create_flow_resp resp = {};
+ struct ib_uobject *uobj;
+ struct ib_flow *flow_id;
+ struct ib_uverbs_flow_attr *kern_flow_attr;
+ struct ib_flow_attr *flow_attr;
+ struct ib_qp *qp;
+ struct ib_uflow_resources *uflow_res;
+ struct ib_uverbs_flow_spec_hdr *kern_spec;
+ struct uverbs_req_iter iter;
+ int err;
+ void *ib_spec;
+ int i;
+ struct ib_device *ib_dev;
+
+ err = uverbs_request_start(attrs, &iter, &cmd, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (!capable(CAP_NET_RAW))
+ return -EPERM;
+
+ if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED)
+ return -EINVAL;
+
+ if ((cmd.flow_attr.flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) &&
+ ((cmd.flow_attr.type == IB_FLOW_ATTR_ALL_DEFAULT) ||
+ (cmd.flow_attr.type == IB_FLOW_ATTR_MC_DEFAULT)))
+ return -EINVAL;
+
+ if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
+ return -EINVAL;
+
+ if (cmd.flow_attr.size >
+ (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
+ return -EINVAL;
+
+ if (cmd.flow_attr.reserved[0] ||
+ cmd.flow_attr.reserved[1])
+ return -EINVAL;
+
+ if (cmd.flow_attr.num_of_specs) {
+ kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size,
+ GFP_KERNEL);
+ if (!kern_flow_attr)
+ return -ENOMEM;
+
+ *kern_flow_attr = cmd.flow_attr;
+ err = uverbs_request_next(&iter, &kern_flow_attr->flow_specs,
+ cmd.flow_attr.size);
+ if (err)
+ goto err_free_attr;
+ } else {
+ kern_flow_attr = &cmd.flow_attr;
+ }
+
+ err = uverbs_request_finish(&iter);
+ if (err)
+ goto err_free_attr;
+
+ uobj = uobj_alloc(UVERBS_OBJECT_FLOW, attrs, &ib_dev);
+ if (IS_ERR(uobj)) {
+ err = PTR_ERR(uobj);
+ goto err_free_attr;
+ }
+
+ if (!rdma_is_port_valid(uobj->context->device, cmd.flow_attr.port)) {
+ err = -EINVAL;
+ goto err_uobj;
+ }
+
+ qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs);
+ if (!qp) {
+ err = -EINVAL;
+ goto err_uobj;
+ }
+
+ if (qp->qp_type != IB_QPT_UD && qp->qp_type != IB_QPT_RAW_PACKET) {
+ err = -EINVAL;
+ goto err_put;
+ }
+
+ flow_attr = kzalloc(struct_size(flow_attr, flows,
+ cmd.flow_attr.num_of_specs), GFP_KERNEL);
+ if (!flow_attr) {
+ err = -ENOMEM;
+ goto err_put;
+ }
+ uflow_res = flow_resources_alloc(cmd.flow_attr.num_of_specs);
+ if (!uflow_res) {
+ err = -ENOMEM;
+ goto err_free_flow_attr;
+ }
+
+ flow_attr->type = kern_flow_attr->type;
+ flow_attr->priority = kern_flow_attr->priority;
+ flow_attr->num_of_specs = kern_flow_attr->num_of_specs;
+ flow_attr->port = kern_flow_attr->port;
+ flow_attr->flags = kern_flow_attr->flags;
+ flow_attr->size = sizeof(*flow_attr);
+
+ kern_spec = kern_flow_attr->flow_specs;
+ ib_spec = flow_attr + 1;
+ for (i = 0; i < flow_attr->num_of_specs &&
+ cmd.flow_attr.size >= sizeof(*kern_spec) &&
+ cmd.flow_attr.size >= kern_spec->size;
+ i++) {
+ err = kern_spec_to_ib_spec(
+ attrs, (struct ib_uverbs_flow_spec *)kern_spec,
+ ib_spec, uflow_res);
+ if (err)
+ goto err_free;
+
+ flow_attr->size +=
+ ((union ib_flow_spec *) ib_spec)->size;
+ cmd.flow_attr.size -= kern_spec->size;
+ kern_spec = ((void *)kern_spec) + kern_spec->size;
+ ib_spec += ((union ib_flow_spec *) ib_spec)->size;
+ }
+ if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
+ pr_warn("create flow failed, flow %d: %u bytes left from uverb cmd\n",
+ i, cmd.flow_attr.size);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ flow_id = qp->device->ops.create_flow(qp, flow_attr,
+ &attrs->driver_udata);
+
+ if (IS_ERR(flow_id)) {
+ err = PTR_ERR(flow_id);
+ goto err_free;
+ }
+
+ ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res);
+
+ rdma_lookup_put_uobject(&qp->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ kfree(flow_attr);
+
+ if (cmd.flow_attr.num_of_specs)
+ kfree(kern_flow_attr);
+ uobj_finalize_uobj_create(uobj, attrs);
+
+ resp.flow_handle = uobj->id;
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_free:
+ ib_uverbs_flow_resources_free(uflow_res);
+err_free_flow_attr:
+ kfree(flow_attr);
+err_put:
+ rdma_lookup_put_uobject(&qp->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+err_uobj:
+ uobj_alloc_abort(uobj, attrs);
+err_free_attr:
+ if (cmd.flow_attr.num_of_specs)
+ kfree(kern_flow_attr);
+ return err;
+}
+
+static int ib_uverbs_ex_destroy_flow(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_destroy_flow cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ return uobj_perform_destroy(UVERBS_OBJECT_FLOW, cmd.flow_handle, attrs);
+}
+
+static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
+ struct ib_uverbs_create_xsrq *cmd,
+ struct ib_udata *udata)
+{
+ struct ib_uverbs_create_srq_resp resp = {};
+ struct ib_usrq_object *obj;
+ struct ib_pd *pd;
+ struct ib_srq *srq;
+ struct ib_srq_init_attr attr;
+ int ret;
+ struct ib_uobject *xrcd_uobj;
+ struct ib_device *ib_dev;
+
+ obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, attrs,
+ &ib_dev);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ if (cmd->srq_type == IB_SRQT_TM)
+ attr.ext.tag_matching.max_num_tags = cmd->max_num_tags;
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->xrcd_handle,
+ attrs);
+ if (IS_ERR(xrcd_uobj)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object;
+ if (!attr.ext.xrc.xrcd) {
+ ret = -EINVAL;
+ goto err_put_xrcd;
+ }
+
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ }
+
+ if (ib_srq_has_cq(cmd->srq_type)) {
+ attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ,
+ cmd->cq_handle, attrs);
+ if (!attr.ext.cq) {
+ ret = -EINVAL;
+ goto err_put_xrcd;
+ }
+ }
+
+ pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_put_cq;
+ }
+
+ attr.event_handler = ib_uverbs_srq_event_handler;
+ attr.srq_type = cmd->srq_type;
+ attr.attr.max_wr = cmd->max_wr;
+ attr.attr.max_sge = cmd->max_sge;
+ attr.attr.srq_limit = cmd->srq_limit;
+
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ obj->uevent.uobject.user_handle = cmd->user_handle;
+
+ srq = ib_create_srq_user(pd, &attr, obj, udata);
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
+ goto err_put_pd;
+ }
+
+ obj->uevent.uobject.object = srq;
+ obj->uevent.uobject.user_handle = cmd->user_handle;
+ obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file);
+ if (obj->uevent.event_file)
+ uverbs_uobject_get(&obj->uevent.event_file->uobj);
+
+ if (cmd->srq_type == IB_SRQT_XRC)
+ resp.srqn = srq->ext.xrc.srq_num;
+
+ if (cmd->srq_type == IB_SRQT_XRC)
+ uobj_put_read(xrcd_uobj);
+
+ if (ib_srq_has_cq(cmd->srq_type))
+ rdma_lookup_put_uobject(&attr.ext.cq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+
+ uobj_put_obj_read(pd);
+ uobj_finalize_uobj_create(&obj->uevent.uobject, attrs);
+
+ resp.srq_handle = obj->uevent.uobject.id;
+ resp.max_wr = attr.attr.max_wr;
+ resp.max_sge = attr.attr.max_sge;
+ return uverbs_response(attrs, &resp, sizeof(resp));
+
+err_put_pd:
+ uobj_put_obj_read(pd);
+err_put_cq:
+ if (ib_srq_has_cq(cmd->srq_type))
+ rdma_lookup_put_uobject(&attr.ext.cq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+
+err_put_xrcd:
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ atomic_dec(&obj->uxrcd->refcnt);
+ uobj_put_read(xrcd_uobj);
+ }
+
+err:
+ uobj_alloc_abort(&obj->uevent.uobject, attrs);
+ return ret;
+}
+
+static int ib_uverbs_create_srq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_create_srq cmd;
+ struct ib_uverbs_create_xsrq xcmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ memset(&xcmd, 0, sizeof(xcmd));
+ xcmd.response = cmd.response;
+ xcmd.user_handle = cmd.user_handle;
+ xcmd.srq_type = IB_SRQT_BASIC;
+ xcmd.pd_handle = cmd.pd_handle;
+ xcmd.max_wr = cmd.max_wr;
+ xcmd.max_sge = cmd.max_sge;
+ xcmd.srq_limit = cmd.srq_limit;
+
+ return __uverbs_create_xsrq(attrs, &xcmd, &attrs->driver_udata);
+}
+
+static int ib_uverbs_create_xsrq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_create_xsrq cmd;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ return __uverbs_create_xsrq(attrs, &cmd, &attrs->driver_udata);
+}
+
+static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_modify_srq cmd;
+ struct ib_srq *srq;
+ struct ib_srq_attr attr;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
+ if (!srq)
+ return -EINVAL;
+
+ attr.max_wr = cmd.max_wr;
+ attr.srq_limit = cmd.srq_limit;
+
+ ret = srq->device->ops.modify_srq(srq, &attr, cmd.attr_mask,
+ &attrs->driver_udata);
+
+ rdma_lookup_put_uobject(&srq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+
+ return ret;
+}
+
+static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_query_srq cmd;
+ struct ib_uverbs_query_srq_resp resp;
+ struct ib_srq_attr attr;
+ struct ib_srq *srq;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
+ if (!srq)
+ return -EINVAL;
+
+ ret = ib_query_srq(srq, &attr);
+
+ rdma_lookup_put_uobject(&srq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.max_wr = attr.max_wr;
+ resp.max_sge = attr.max_sge;
+ resp.srq_limit = attr.srq_limit;
+
+ return uverbs_response(attrs, &resp, sizeof(resp));
+}
+
+static int ib_uverbs_destroy_srq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_destroy_srq cmd;
+ struct ib_uverbs_destroy_srq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_uevent_object *obj;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ uobj = uobj_get_destroy(UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs);
+ if (IS_ERR(uobj))
+ return PTR_ERR(uobj);
+
+ obj = container_of(uobj, struct ib_uevent_object, uobject);
+ memset(&resp, 0, sizeof(resp));
+ resp.events_reported = obj->events_reported;
+
+ uobj_put_destroy(uobj);
+
+ return uverbs_response(attrs, &resp, sizeof(resp));
+}
+
+static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_query_device_resp resp = {};
+ struct ib_uverbs_ex_query_device cmd;
+ struct ib_device_attr attr = {0};
+ struct ib_ucontext *ucontext;
+ struct ib_device *ib_dev;
+ int err;
+
+ ucontext = ib_uverbs_get_ucontext(attrs);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+ ib_dev = ucontext->device;
+
+ err = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ err = ib_dev->ops.query_device(ib_dev, &attr, &attrs->driver_udata);
+ if (err)
+ return err;
+
+ copy_query_dev_fields(ucontext, &resp.base, &attr);
+
+ resp.odp_caps.general_caps = attr.odp_caps.general_caps;
+ resp.odp_caps.per_transport_caps.rc_odp_caps =
+ attr.odp_caps.per_transport_caps.rc_odp_caps;
+ resp.odp_caps.per_transport_caps.uc_odp_caps =
+ attr.odp_caps.per_transport_caps.uc_odp_caps;
+ resp.odp_caps.per_transport_caps.ud_odp_caps =
+ attr.odp_caps.per_transport_caps.ud_odp_caps;
+ resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps;
+
+ resp.timestamp_mask = attr.timestamp_mask;
+ resp.hca_core_clock = attr.hca_core_clock;
+ resp.device_cap_flags_ex = attr.device_cap_flags;
+ resp.rss_caps.supported_qpts = attr.rss_caps.supported_qpts;
+ resp.rss_caps.max_rwq_indirection_tables =
+ attr.rss_caps.max_rwq_indirection_tables;
+ resp.rss_caps.max_rwq_indirection_table_size =
+ attr.rss_caps.max_rwq_indirection_table_size;
+ resp.max_wq_type_rq = attr.max_wq_type_rq;
+ resp.raw_packet_caps = attr.raw_packet_caps;
+ resp.tm_caps.max_rndv_hdr_size = attr.tm_caps.max_rndv_hdr_size;
+ resp.tm_caps.max_num_tags = attr.tm_caps.max_num_tags;
+ resp.tm_caps.max_ops = attr.tm_caps.max_ops;
+ resp.tm_caps.max_sge = attr.tm_caps.max_sge;
+ resp.tm_caps.flags = attr.tm_caps.flags;
+ resp.cq_moderation_caps.max_cq_moderation_count =
+ attr.cq_caps.max_cq_moderation_count;
+ resp.cq_moderation_caps.max_cq_moderation_period =
+ attr.cq_caps.max_cq_moderation_period;
+ resp.max_dm_size = attr.max_dm_size;
+ resp.response_length = uverbs_response_length(attrs, sizeof(resp));
+
+ return uverbs_response(attrs, &resp, sizeof(resp));
+}
+
+static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_ex_modify_cq cmd;
+ struct ib_cq *cq;
+ int ret;
+
+ ret = uverbs_request(attrs, &cmd, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (!cmd.attr_mask || cmd.reserved)
+ return -EINVAL;
+
+ if (cmd.attr_mask > IB_CQ_MODERATE)
+ return -EOPNOTSUPP;
+
+ cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs);
+ if (!cq)
+ return -EINVAL;
+
+ ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period);
+
+ rdma_lookup_put_uobject(&cq->uobject->uevent.uobject,
+ UVERBS_LOOKUP_READ);
+ return ret;
+}
+
+/*
+ * Describe the input structs for write(). Some write methods have an input
+ * only struct, most have an input and output. If the struct has an output then
+ * the 'response' u64 must be the first field in the request structure.
+ *
+ * If udata is present then both the request and response structs have a
+ * trailing driver_data flex array. In this case the size of the base struct
+ * cannot be changed.
+ */
+#define UAPI_DEF_WRITE_IO(req, resp) \
+ .write.has_resp = 1 + \
+ BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) + \
+ BUILD_BUG_ON_ZERO(sizeof_field(req, response) != \
+ sizeof(u64)), \
+ .write.req_size = sizeof(req), .write.resp_size = sizeof(resp)
+
+#define UAPI_DEF_WRITE_I(req) .write.req_size = sizeof(req)
+
+#define UAPI_DEF_WRITE_UDATA_IO(req, resp) \
+ UAPI_DEF_WRITE_IO(req, resp), \
+ .write.has_udata = \
+ 1 + \
+ BUILD_BUG_ON_ZERO(offsetof(req, driver_data) != \
+ sizeof(req)) + \
+ BUILD_BUG_ON_ZERO(offsetof(resp, driver_data) != \
+ sizeof(resp))
+
+#define UAPI_DEF_WRITE_UDATA_I(req) \
+ UAPI_DEF_WRITE_I(req), \
+ .write.has_udata = \
+ 1 + BUILD_BUG_ON_ZERO(offsetof(req, driver_data) != \
+ sizeof(req))
+
+/*
+ * The _EX versions are for use with WRITE_EX and allow the last struct member
+ * to be specified. Buffers that do not include that member will be rejected.
+ */
+#define UAPI_DEF_WRITE_IO_EX(req, req_last_member, resp, resp_last_member) \
+ .write.has_resp = 1, \
+ .write.req_size = offsetofend(req, req_last_member), \
+ .write.resp_size = offsetofend(resp, resp_last_member)
+
+#define UAPI_DEF_WRITE_I_EX(req, req_last_member) \
+ .write.req_size = offsetofend(req, req_last_member)
+
+const struct uapi_definition uverbs_def_write_intf[] = {
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_AH,
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_AH,
+ ib_uverbs_create_ah,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_create_ah,
+ struct ib_uverbs_create_ah_resp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DESTROY_AH,
+ ib_uverbs_destroy_ah,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_ah)),
+ UAPI_DEF_OBJ_NEEDS_FN(create_user_ah),
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_COMP_CHANNEL,
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL,
+ ib_uverbs_create_comp_channel,
+ UAPI_DEF_WRITE_IO(
+ struct ib_uverbs_create_comp_channel,
+ struct ib_uverbs_create_comp_channel_resp))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_CQ,
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_CQ,
+ ib_uverbs_create_cq,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_create_cq,
+ struct ib_uverbs_create_cq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(create_cq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DESTROY_CQ,
+ ib_uverbs_destroy_cq,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_cq,
+ struct ib_uverbs_destroy_cq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_cq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_POLL_CQ,
+ ib_uverbs_poll_cq,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_poll_cq,
+ struct ib_uverbs_poll_cq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(poll_cq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_REQ_NOTIFY_CQ,
+ ib_uverbs_req_notify_cq,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_req_notify_cq),
+ UAPI_DEF_METHOD_NEEDS_FN(req_notify_cq)),
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_RESIZE_CQ,
+ ib_uverbs_resize_cq,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_resize_cq,
+ struct ib_uverbs_resize_cq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(resize_cq)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_CREATE_CQ,
+ ib_uverbs_ex_create_cq,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_cq,
+ reserved,
+ struct ib_uverbs_ex_create_cq_resp,
+ response_length),
+ UAPI_DEF_METHOD_NEEDS_FN(create_cq)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_MODIFY_CQ,
+ ib_uverbs_ex_modify_cq,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_ex_modify_cq),
+ UAPI_DEF_METHOD_NEEDS_FN(modify_cq))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_DEVICE,
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_GET_CONTEXT,
+ ib_uverbs_get_context,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_get_context,
+ struct ib_uverbs_get_context_resp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_QUERY_DEVICE,
+ ib_uverbs_query_device,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_query_device,
+ struct ib_uverbs_query_device_resp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_QUERY_PORT,
+ ib_uverbs_query_port,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_query_port,
+ struct ib_uverbs_query_port_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(query_port)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_QUERY_DEVICE,
+ ib_uverbs_ex_query_device,
+ UAPI_DEF_WRITE_IO_EX(
+ struct ib_uverbs_ex_query_device,
+ reserved,
+ struct ib_uverbs_ex_query_device_resp,
+ response_length),
+ UAPI_DEF_METHOD_NEEDS_FN(query_device)),
+ UAPI_DEF_OBJ_NEEDS_FN(alloc_ucontext),
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_ucontext)),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_FLOW,
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_CREATE_FLOW,
+ ib_uverbs_ex_create_flow,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_create_flow,
+ flow_attr,
+ struct ib_uverbs_create_flow_resp,
+ flow_handle),
+ UAPI_DEF_METHOD_NEEDS_FN(create_flow)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_DESTROY_FLOW,
+ ib_uverbs_ex_destroy_flow,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_destroy_flow),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_flow))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_MR,
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_DEREG_MR,
+ ib_uverbs_dereg_mr,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_dereg_mr),
+ UAPI_DEF_METHOD_NEEDS_FN(dereg_mr)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_REG_MR,
+ ib_uverbs_reg_mr,
+ UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_reg_mr,
+ struct ib_uverbs_reg_mr_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(reg_user_mr)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_REREG_MR,
+ ib_uverbs_rereg_mr,
+ UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_rereg_mr,
+ struct ib_uverbs_rereg_mr_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(rereg_user_mr))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_MW,
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_ALLOC_MW,
+ ib_uverbs_alloc_mw,
+ UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_alloc_mw,
+ struct ib_uverbs_alloc_mw_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(alloc_mw)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DEALLOC_MW,
+ ib_uverbs_dealloc_mw,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_dealloc_mw),
+ UAPI_DEF_METHOD_NEEDS_FN(dealloc_mw))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_PD,
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_ALLOC_PD,
+ ib_uverbs_alloc_pd,
+ UAPI_DEF_WRITE_UDATA_IO(struct ib_uverbs_alloc_pd,
+ struct ib_uverbs_alloc_pd_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(alloc_pd)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DEALLOC_PD,
+ ib_uverbs_dealloc_pd,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_dealloc_pd),
+ UAPI_DEF_METHOD_NEEDS_FN(dealloc_pd))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_QP,
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_ATTACH_MCAST,
+ ib_uverbs_attach_mcast,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_attach_mcast),
+ UAPI_DEF_METHOD_NEEDS_FN(attach_mcast),
+ UAPI_DEF_METHOD_NEEDS_FN(detach_mcast)),
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_QP,
+ ib_uverbs_create_qp,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_create_qp,
+ struct ib_uverbs_create_qp_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(create_qp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DESTROY_QP,
+ ib_uverbs_destroy_qp,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_qp,
+ struct ib_uverbs_destroy_qp_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_qp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DETACH_MCAST,
+ ib_uverbs_detach_mcast,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_detach_mcast),
+ UAPI_DEF_METHOD_NEEDS_FN(detach_mcast)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_MODIFY_QP,
+ ib_uverbs_modify_qp,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_modify_qp),
+ UAPI_DEF_METHOD_NEEDS_FN(modify_qp)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_POST_RECV,
+ ib_uverbs_post_recv,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_post_recv,
+ struct ib_uverbs_post_recv_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(post_recv)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_POST_SEND,
+ ib_uverbs_post_send,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_post_send,
+ struct ib_uverbs_post_send_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(post_send)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_QUERY_QP,
+ ib_uverbs_query_qp,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_query_qp,
+ struct ib_uverbs_query_qp_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(query_qp)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_CREATE_QP,
+ ib_uverbs_ex_create_qp,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_qp,
+ comp_mask,
+ struct ib_uverbs_ex_create_qp_resp,
+ response_length),
+ UAPI_DEF_METHOD_NEEDS_FN(create_qp)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_MODIFY_QP,
+ ib_uverbs_ex_modify_qp,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_modify_qp,
+ base,
+ struct ib_uverbs_ex_modify_qp_resp,
+ response_length),
+ UAPI_DEF_METHOD_NEEDS_FN(modify_qp))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_RWQ_IND_TBL,
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL,
+ ib_uverbs_ex_create_rwq_ind_table,
+ UAPI_DEF_WRITE_IO_EX(
+ struct ib_uverbs_ex_create_rwq_ind_table,
+ log_ind_tbl_size,
+ struct ib_uverbs_ex_create_rwq_ind_table_resp,
+ ind_tbl_num),
+ UAPI_DEF_METHOD_NEEDS_FN(create_rwq_ind_table)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL,
+ ib_uverbs_ex_destroy_rwq_ind_table,
+ UAPI_DEF_WRITE_I(
+ struct ib_uverbs_ex_destroy_rwq_ind_table),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_rwq_ind_table))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_WQ,
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_CREATE_WQ,
+ ib_uverbs_ex_create_wq,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_create_wq,
+ max_sge,
+ struct ib_uverbs_ex_create_wq_resp,
+ wqn),
+ UAPI_DEF_METHOD_NEEDS_FN(create_wq)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_DESTROY_WQ,
+ ib_uverbs_ex_destroy_wq,
+ UAPI_DEF_WRITE_IO_EX(struct ib_uverbs_ex_destroy_wq,
+ wq_handle,
+ struct ib_uverbs_ex_destroy_wq_resp,
+ reserved),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_wq)),
+ DECLARE_UVERBS_WRITE_EX(
+ IB_USER_VERBS_EX_CMD_MODIFY_WQ,
+ ib_uverbs_ex_modify_wq,
+ UAPI_DEF_WRITE_I_EX(struct ib_uverbs_ex_modify_wq,
+ curr_wq_state),
+ UAPI_DEF_METHOD_NEEDS_FN(modify_wq))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_SRQ,
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_SRQ,
+ ib_uverbs_create_srq,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_create_srq,
+ struct ib_uverbs_create_srq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(create_srq)),
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_CREATE_XSRQ,
+ ib_uverbs_create_xsrq,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_create_xsrq,
+ struct ib_uverbs_create_srq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(create_srq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_DESTROY_SRQ,
+ ib_uverbs_destroy_srq,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_destroy_srq,
+ struct ib_uverbs_destroy_srq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(destroy_srq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_MODIFY_SRQ,
+ ib_uverbs_modify_srq,
+ UAPI_DEF_WRITE_UDATA_I(struct ib_uverbs_modify_srq),
+ UAPI_DEF_METHOD_NEEDS_FN(modify_srq)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_POST_SRQ_RECV,
+ ib_uverbs_post_srq_recv,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_post_srq_recv,
+ struct ib_uverbs_post_srq_recv_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(post_srq_recv)),
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_QUERY_SRQ,
+ ib_uverbs_query_srq,
+ UAPI_DEF_WRITE_IO(struct ib_uverbs_query_srq,
+ struct ib_uverbs_query_srq_resp),
+ UAPI_DEF_METHOD_NEEDS_FN(query_srq))),
+
+ DECLARE_UVERBS_OBJECT(
+ UVERBS_OBJECT_XRCD,
+ DECLARE_UVERBS_WRITE(
+ IB_USER_VERBS_CMD_CLOSE_XRCD,
+ ib_uverbs_close_xrcd,
+ UAPI_DEF_WRITE_I(struct ib_uverbs_close_xrcd)),
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_OPEN_QP,
+ ib_uverbs_open_qp,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_open_qp,
+ struct ib_uverbs_create_qp_resp)),
+ DECLARE_UVERBS_WRITE(IB_USER_VERBS_CMD_OPEN_XRCD,
+ ib_uverbs_open_xrcd,
+ UAPI_DEF_WRITE_UDATA_IO(
+ struct ib_uverbs_open_xrcd,
+ struct ib_uverbs_open_xrcd_resp)),
+ UAPI_DEF_OBJ_NEEDS_FN(alloc_xrcd),
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_xrcd)),
+
+ {},
+};
diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c
new file mode 100644
index 000000000..d9799706c
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_ioctl.c
@@ -0,0 +1,837 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/rdma_user_ioctl.h>
+#include <rdma/uverbs_ioctl.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+struct bundle_alloc_head {
+ struct bundle_alloc_head *next;
+ u8 data[];
+};
+
+struct bundle_priv {
+ /* Must be first */
+ struct bundle_alloc_head alloc_head;
+ struct bundle_alloc_head *allocated_mem;
+ size_t internal_avail;
+ size_t internal_used;
+
+ struct radix_tree_root *radix;
+ const struct uverbs_api_ioctl_method *method_elm;
+ void __rcu **radix_slots;
+ unsigned long radix_slots_len;
+ u32 method_key;
+
+ struct ib_uverbs_attr __user *user_attrs;
+ struct ib_uverbs_attr *uattrs;
+
+ DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN);
+ DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN);
+ DECLARE_BITMAP(uobj_hw_obj_valid, UVERBS_API_ATTR_BKEY_LEN);
+
+ /*
+ * Must be last. bundle ends in a flex array which overlaps
+ * internal_buffer.
+ */
+ struct uverbs_attr_bundle bundle;
+ u64 internal_buffer[32];
+};
+
+/*
+ * Each method has an absolute minimum amount of memory it needs to allocate,
+ * precompute that amount and determine if the onstack memory can be used or
+ * if allocation is need.
+ */
+void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm,
+ unsigned int num_attrs)
+{
+ struct bundle_priv *pbundle;
+ size_t bundle_size =
+ offsetof(struct bundle_priv, internal_buffer) +
+ sizeof(*pbundle->bundle.attrs) * method_elm->key_bitmap_len +
+ sizeof(*pbundle->uattrs) * num_attrs;
+
+ method_elm->use_stack = bundle_size <= sizeof(*pbundle);
+ method_elm->bundle_size =
+ ALIGN(bundle_size + 256, sizeof(*pbundle->internal_buffer));
+
+ /* Do not want order-2 allocations for this. */
+ WARN_ON_ONCE(method_elm->bundle_size > PAGE_SIZE);
+}
+
+/**
+ * _uverbs_alloc() - Quickly allocate memory for use with a bundle
+ * @bundle: The bundle
+ * @size: Number of bytes to allocate
+ * @flags: Allocator flags
+ *
+ * The bundle allocator is intended for allocations that are connected with
+ * processing the system call related to the bundle. The allocated memory is
+ * always freed once the system call completes, and cannot be freed any other
+ * way.
+ *
+ * This tries to use a small pool of pre-allocated memory for performance.
+ */
+__malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size,
+ gfp_t flags)
+{
+ struct bundle_priv *pbundle =
+ container_of(bundle, struct bundle_priv, bundle);
+ size_t new_used;
+ void *res;
+
+ if (check_add_overflow(size, pbundle->internal_used, &new_used))
+ return ERR_PTR(-EOVERFLOW);
+
+ if (new_used > pbundle->internal_avail) {
+ struct bundle_alloc_head *buf;
+
+ buf = kvmalloc(struct_size(buf, data, size), flags);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+ buf->next = pbundle->allocated_mem;
+ pbundle->allocated_mem = buf;
+ return buf->data;
+ }
+
+ res = (void *)pbundle->internal_buffer + pbundle->internal_used;
+ pbundle->internal_used =
+ ALIGN(new_used, sizeof(*pbundle->internal_buffer));
+ if (want_init_on_alloc(flags))
+ memset(res, 0, size);
+ return res;
+}
+EXPORT_SYMBOL(_uverbs_alloc);
+
+static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr,
+ u16 len)
+{
+ if (uattr->len > sizeof_field(struct ib_uverbs_attr, data))
+ return ib_is_buffer_cleared(u64_to_user_ptr(uattr->data) + len,
+ uattr->len - len);
+
+ return !memchr_inv((const void *)&uattr->data + len,
+ 0, uattr->len - len);
+}
+
+static int uverbs_set_output(const struct uverbs_attr_bundle *bundle,
+ const struct uverbs_attr *attr)
+{
+ struct bundle_priv *pbundle =
+ container_of(bundle, struct bundle_priv, bundle);
+ u16 flags;
+
+ flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags |
+ UVERBS_ATTR_F_VALID_OUTPUT;
+ if (put_user(flags,
+ &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags))
+ return -EFAULT;
+ return 0;
+}
+
+static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
+ const struct uverbs_api_attr *attr_uapi,
+ struct uverbs_objs_arr_attr *attr,
+ struct ib_uverbs_attr *uattr,
+ u32 attr_bkey)
+{
+ const struct uverbs_attr_spec *spec = &attr_uapi->spec;
+ size_t array_len;
+ u32 *idr_vals;
+ int ret = 0;
+ size_t i;
+
+ if (uattr->attr_data.reserved)
+ return -EINVAL;
+
+ if (uattr->len % sizeof(u32))
+ return -EINVAL;
+
+ array_len = uattr->len / sizeof(u32);
+ if (array_len < spec->u2.objs_arr.min_len ||
+ array_len > spec->u2.objs_arr.max_len)
+ return -EINVAL;
+
+ attr->uobjects =
+ uverbs_alloc(&pbundle->bundle,
+ array_size(array_len, sizeof(*attr->uobjects)));
+ if (IS_ERR(attr->uobjects))
+ return PTR_ERR(attr->uobjects);
+
+ /*
+ * Since idr is 4B and *uobjects is >= 4B, we can use attr->uobjects
+ * to store idrs array and avoid additional memory allocation. The
+ * idrs array is offset to the end of the uobjects array so we will be
+ * able to read idr and replace with a pointer.
+ */
+ idr_vals = (u32 *)(attr->uobjects + array_len) - array_len;
+
+ if (uattr->len > sizeof(uattr->data)) {
+ ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data),
+ uattr->len);
+ if (ret)
+ return -EFAULT;
+ } else {
+ memcpy(idr_vals, &uattr->data, uattr->len);
+ }
+
+ for (i = 0; i != array_len; i++) {
+ attr->uobjects[i] = uverbs_get_uobject_from_file(
+ spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access,
+ idr_vals[i], &pbundle->bundle);
+ if (IS_ERR(attr->uobjects[i])) {
+ ret = PTR_ERR(attr->uobjects[i]);
+ break;
+ }
+ }
+
+ attr->len = i;
+ __set_bit(attr_bkey, pbundle->spec_finalize);
+ return ret;
+}
+
+static void uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi,
+ struct uverbs_objs_arr_attr *attr,
+ bool commit,
+ struct uverbs_attr_bundle *attrs)
+{
+ const struct uverbs_attr_spec *spec = &attr_uapi->spec;
+ size_t i;
+
+ for (i = 0; i != attr->len; i++)
+ uverbs_finalize_object(attr->uobjects[i],
+ spec->u2.objs_arr.access, false, commit,
+ attrs);
+}
+
+static int uverbs_process_attr(struct bundle_priv *pbundle,
+ const struct uverbs_api_attr *attr_uapi,
+ struct ib_uverbs_attr *uattr, u32 attr_bkey)
+{
+ const struct uverbs_attr_spec *spec = &attr_uapi->spec;
+ struct uverbs_attr *e = &pbundle->bundle.attrs[attr_bkey];
+ const struct uverbs_attr_spec *val_spec = spec;
+ struct uverbs_obj_attr *o_attr;
+
+ switch (spec->type) {
+ case UVERBS_ATTR_TYPE_ENUM_IN:
+ if (uattr->attr_data.enum_data.elem_id >= spec->u.enum_def.num_elems)
+ return -EOPNOTSUPP;
+
+ if (uattr->attr_data.enum_data.reserved)
+ return -EINVAL;
+
+ val_spec = &spec->u2.enum_def.ids[uattr->attr_data.enum_data.elem_id];
+
+ /* Currently we only support PTR_IN based enums */
+ if (val_spec->type != UVERBS_ATTR_TYPE_PTR_IN)
+ return -EOPNOTSUPP;
+
+ e->ptr_attr.enum_id = uattr->attr_data.enum_data.elem_id;
+ fallthrough;
+ case UVERBS_ATTR_TYPE_PTR_IN:
+ /* Ensure that any data provided by userspace beyond the known
+ * struct is zero. Userspace that knows how to use some future
+ * longer struct will fail here if used with an old kernel and
+ * non-zero content, making ABI compat/discovery simpler.
+ */
+ if (uattr->len > val_spec->u.ptr.len &&
+ val_spec->zero_trailing &&
+ !uverbs_is_attr_cleared(uattr, val_spec->u.ptr.len))
+ return -EOPNOTSUPP;
+
+ fallthrough;
+ case UVERBS_ATTR_TYPE_PTR_OUT:
+ if (uattr->len < val_spec->u.ptr.min_len ||
+ (!val_spec->zero_trailing &&
+ uattr->len > val_spec->u.ptr.len))
+ return -EINVAL;
+
+ if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN &&
+ uattr->attr_data.reserved)
+ return -EINVAL;
+
+ e->ptr_attr.uattr_idx = uattr - pbundle->uattrs;
+ e->ptr_attr.len = uattr->len;
+
+ if (val_spec->alloc_and_copy && !uverbs_attr_ptr_is_inline(e)) {
+ void *p;
+
+ p = uverbs_alloc(&pbundle->bundle, uattr->len);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ e->ptr_attr.ptr = p;
+
+ if (copy_from_user(p, u64_to_user_ptr(uattr->data),
+ uattr->len))
+ return -EFAULT;
+ } else {
+ e->ptr_attr.data = uattr->data;
+ }
+ break;
+
+ case UVERBS_ATTR_TYPE_IDR:
+ case UVERBS_ATTR_TYPE_FD:
+ if (uattr->attr_data.reserved)
+ return -EINVAL;
+
+ if (uattr->len != 0)
+ return -EINVAL;
+
+ o_attr = &e->obj_attr;
+ o_attr->attr_elm = attr_uapi;
+
+ /*
+ * The type of uattr->data is u64 for UVERBS_ATTR_TYPE_IDR and
+ * s64 for UVERBS_ATTR_TYPE_FD. We can cast the u64 to s64
+ * here without caring about truncation as we know that the
+ * IDR implementation today rejects negative IDs
+ */
+ o_attr->uobject = uverbs_get_uobject_from_file(
+ spec->u.obj.obj_type, spec->u.obj.access,
+ uattr->data_s64, &pbundle->bundle);
+ if (IS_ERR(o_attr->uobject))
+ return PTR_ERR(o_attr->uobject);
+ __set_bit(attr_bkey, pbundle->uobj_finalize);
+
+ if (spec->u.obj.access == UVERBS_ACCESS_NEW) {
+ unsigned int uattr_idx = uattr - pbundle->uattrs;
+ s64 id = o_attr->uobject->id;
+
+ /* Copy the allocated id to the user-space */
+ if (put_user(id, &pbundle->user_attrs[uattr_idx].data))
+ return -EFAULT;
+ }
+
+ break;
+
+ case UVERBS_ATTR_TYPE_RAW_FD:
+ if (uattr->attr_data.reserved || uattr->len != 0 ||
+ uattr->data_s64 < INT_MIN || uattr->data_s64 > INT_MAX)
+ return -EINVAL;
+ /* _uverbs_get_const_signed() is the accessor */
+ e->ptr_attr.data = uattr->data_s64;
+ break;
+
+ case UVERBS_ATTR_TYPE_IDRS_ARRAY:
+ return uverbs_process_idrs_array(pbundle, attr_uapi,
+ &e->objs_arr_attr, uattr,
+ attr_bkey);
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+/*
+ * We search the radix tree with the method prefix and now we want to fast
+ * search the suffix bits to get a particular attribute pointer. It is not
+ * totally clear to me if this breaks the radix tree encasulation or not, but
+ * it uses the iter data to determine if the method iter points at the same
+ * chunk that will store the attribute, if so it just derefs it directly. By
+ * construction in most kernel configs the method and attrs will all fit in a
+ * single radix chunk, so in most cases this will have no search. Other cases
+ * this falls back to a full search.
+ */
+static void __rcu **uapi_get_attr_for_method(struct bundle_priv *pbundle,
+ u32 attr_key)
+{
+ void __rcu **slot;
+
+ if (likely(attr_key < pbundle->radix_slots_len)) {
+ void *entry;
+
+ slot = pbundle->radix_slots + attr_key;
+ entry = rcu_dereference_raw(*slot);
+ if (likely(!radix_tree_is_internal_node(entry) && entry))
+ return slot;
+ }
+
+ return radix_tree_lookup_slot(pbundle->radix,
+ pbundle->method_key | attr_key);
+}
+
+static int uverbs_set_attr(struct bundle_priv *pbundle,
+ struct ib_uverbs_attr *uattr)
+{
+ u32 attr_key = uapi_key_attr(uattr->attr_id);
+ u32 attr_bkey = uapi_bkey_attr(attr_key);
+ const struct uverbs_api_attr *attr;
+ void __rcu **slot;
+ int ret;
+
+ slot = uapi_get_attr_for_method(pbundle, attr_key);
+ if (!slot) {
+ /*
+ * Kernel does not support the attribute but user-space says it
+ * is mandatory
+ */
+ if (uattr->flags & UVERBS_ATTR_F_MANDATORY)
+ return -EPROTONOSUPPORT;
+ return 0;
+ }
+ attr = rcu_dereference_protected(*slot, true);
+
+ /* Reject duplicate attributes from user-space */
+ if (test_bit(attr_bkey, pbundle->bundle.attr_present))
+ return -EINVAL;
+
+ ret = uverbs_process_attr(pbundle, attr, uattr, attr_bkey);
+ if (ret)
+ return ret;
+
+ __set_bit(attr_bkey, pbundle->bundle.attr_present);
+
+ return 0;
+}
+
+static int ib_uverbs_run_method(struct bundle_priv *pbundle,
+ unsigned int num_attrs)
+{
+ int (*handler)(struct uverbs_attr_bundle *attrs);
+ size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs);
+ unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey;
+ unsigned int i;
+ int ret;
+
+ /* See uverbs_disassociate_api() */
+ handler = srcu_dereference(
+ pbundle->method_elm->handler,
+ &pbundle->bundle.ufile->device->disassociate_srcu);
+ if (!handler)
+ return -EIO;
+
+ pbundle->uattrs = uverbs_alloc(&pbundle->bundle, uattrs_size);
+ if (IS_ERR(pbundle->uattrs))
+ return PTR_ERR(pbundle->uattrs);
+ if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size))
+ return -EFAULT;
+
+ for (i = 0; i != num_attrs; i++) {
+ ret = uverbs_set_attr(pbundle, &pbundle->uattrs[i]);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ /* User space did not provide all the mandatory attributes */
+ if (unlikely(!bitmap_subset(pbundle->method_elm->attr_mandatory,
+ pbundle->bundle.attr_present,
+ pbundle->method_elm->key_bitmap_len)))
+ return -EINVAL;
+
+ if (pbundle->method_elm->has_udata)
+ uverbs_fill_udata(&pbundle->bundle,
+ &pbundle->bundle.driver_udata,
+ UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);
+ else
+ pbundle->bundle.driver_udata = (struct ib_udata){};
+
+ if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
+ struct uverbs_obj_attr *destroy_attr =
+ &pbundle->bundle.attrs[destroy_bkey].obj_attr;
+
+ ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle);
+ if (ret)
+ return ret;
+ __clear_bit(destroy_bkey, pbundle->uobj_finalize);
+
+ ret = handler(&pbundle->bundle);
+ uobj_put_destroy(destroy_attr->uobject);
+ } else {
+ ret = handler(&pbundle->bundle);
+ }
+
+ /*
+ * Until the drivers are revised to use the bundle directly we have to
+ * assume that the driver wrote to its UHW_OUT and flag userspace
+ * appropriately.
+ */
+ if (!ret && pbundle->method_elm->has_udata) {
+ const struct uverbs_attr *attr =
+ uverbs_attr_get(&pbundle->bundle, UVERBS_ATTR_UHW_OUT);
+
+ if (!IS_ERR(attr))
+ ret = uverbs_set_output(&pbundle->bundle, attr);
+ }
+
+ /*
+ * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can
+ * not invoke the method because the request is not supported. No
+ * other cases should return this code.
+ */
+ if (WARN_ON_ONCE(ret == -EPROTONOSUPPORT))
+ return -EINVAL;
+
+ return ret;
+}
+
+static void bundle_destroy(struct bundle_priv *pbundle, bool commit)
+{
+ unsigned int key_bitmap_len = pbundle->method_elm->key_bitmap_len;
+ struct bundle_alloc_head *memblock;
+ unsigned int i;
+
+ /* fast path for simple uobjects */
+ i = -1;
+ while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len,
+ i + 1)) < key_bitmap_len) {
+ struct uverbs_attr *attr = &pbundle->bundle.attrs[i];
+
+ uverbs_finalize_object(
+ attr->obj_attr.uobject,
+ attr->obj_attr.attr_elm->spec.u.obj.access,
+ test_bit(i, pbundle->uobj_hw_obj_valid),
+ commit,
+ &pbundle->bundle);
+ }
+
+ i = -1;
+ while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len,
+ i + 1)) < key_bitmap_len) {
+ struct uverbs_attr *attr = &pbundle->bundle.attrs[i];
+ const struct uverbs_api_attr *attr_uapi;
+ void __rcu **slot;
+
+ slot = uapi_get_attr_for_method(
+ pbundle,
+ pbundle->method_key | uapi_bkey_to_key_attr(i));
+ if (WARN_ON(!slot))
+ continue;
+
+ attr_uapi = rcu_dereference_protected(*slot, true);
+
+ if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
+ uverbs_free_idrs_array(attr_uapi, &attr->objs_arr_attr,
+ commit, &pbundle->bundle);
+ }
+ }
+
+ for (memblock = pbundle->allocated_mem; memblock;) {
+ struct bundle_alloc_head *tmp = memblock;
+
+ memblock = memblock->next;
+ kvfree(tmp);
+ }
+}
+
+static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
+ struct ib_uverbs_ioctl_hdr *hdr,
+ struct ib_uverbs_attr __user *user_attrs)
+{
+ const struct uverbs_api_ioctl_method *method_elm;
+ struct uverbs_api *uapi = ufile->device->uapi;
+ struct radix_tree_iter attrs_iter;
+ struct bundle_priv *pbundle;
+ struct bundle_priv onstack;
+ void __rcu **slot;
+ int ret;
+
+ if (unlikely(hdr->driver_id != uapi->driver_id))
+ return -EINVAL;
+
+ slot = radix_tree_iter_lookup(
+ &uapi->radix, &attrs_iter,
+ uapi_key_obj(hdr->object_id) |
+ uapi_key_ioctl_method(hdr->method_id));
+ if (unlikely(!slot))
+ return -EPROTONOSUPPORT;
+ method_elm = rcu_dereference_protected(*slot, true);
+
+ if (!method_elm->use_stack) {
+ pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL);
+ if (!pbundle)
+ return -ENOMEM;
+ pbundle->internal_avail =
+ method_elm->bundle_size -
+ offsetof(struct bundle_priv, internal_buffer);
+ pbundle->alloc_head.next = NULL;
+ pbundle->allocated_mem = &pbundle->alloc_head;
+ } else {
+ pbundle = &onstack;
+ pbundle->internal_avail = sizeof(pbundle->internal_buffer);
+ pbundle->allocated_mem = NULL;
+ }
+
+ /* Space for the pbundle->bundle.attrs flex array */
+ pbundle->method_elm = method_elm;
+ pbundle->method_key = attrs_iter.index;
+ pbundle->bundle.ufile = ufile;
+ pbundle->bundle.context = NULL; /* only valid if bundle has uobject */
+ pbundle->radix = &uapi->radix;
+ pbundle->radix_slots = slot;
+ pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter);
+ pbundle->user_attrs = user_attrs;
+
+ pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len *
+ sizeof(*pbundle->bundle.attrs),
+ sizeof(*pbundle->internal_buffer));
+ memset(pbundle->bundle.attr_present, 0,
+ sizeof(pbundle->bundle.attr_present));
+ memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize));
+ memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize));
+ memset(pbundle->uobj_hw_obj_valid, 0,
+ sizeof(pbundle->uobj_hw_obj_valid));
+
+ ret = ib_uverbs_run_method(pbundle, hdr->num_attrs);
+ bundle_destroy(pbundle, ret == 0);
+ return ret;
+}
+
+long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+ struct ib_uverbs_ioctl_hdr __user *user_hdr =
+ (struct ib_uverbs_ioctl_hdr __user *)arg;
+ struct ib_uverbs_ioctl_hdr hdr;
+ int srcu_key;
+ int err;
+
+ if (unlikely(cmd != RDMA_VERBS_IOCTL))
+ return -ENOIOCTLCMD;
+
+ err = copy_from_user(&hdr, user_hdr, sizeof(hdr));
+ if (err)
+ return -EFAULT;
+
+ if (hdr.length > PAGE_SIZE ||
+ hdr.length != struct_size(&hdr, attrs, hdr.num_attrs))
+ return -EINVAL;
+
+ if (hdr.reserved1 || hdr.reserved2)
+ return -EPROTONOSUPPORT;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ err = ib_uverbs_cmd_verbs(file, &hdr, user_hdr->attrs);
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return err;
+}
+
+int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 allowed_bits)
+{
+ const struct uverbs_attr *attr;
+ u64 flags;
+
+ attr = uverbs_attr_get(attrs_bundle, idx);
+ /* Missing attribute means 0 flags */
+ if (IS_ERR(attr)) {
+ *to = 0;
+ return 0;
+ }
+
+ /*
+ * New userspace code should use 8 bytes to pass flags, but we
+ * transparently support old userspaces that were using 4 bytes as
+ * well.
+ */
+ if (attr->ptr_attr.len == 8)
+ flags = attr->ptr_attr.data;
+ else if (attr->ptr_attr.len == 4)
+ flags = *(u32 *)&attr->ptr_attr.data;
+ else
+ return -EINVAL;
+
+ if (flags & ~allowed_bits)
+ return -EINVAL;
+
+ *to = flags;
+ return 0;
+}
+EXPORT_SYMBOL(uverbs_get_flags64);
+
+int uverbs_get_flags32(u32 *to, const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 allowed_bits)
+{
+ u64 flags;
+ int ret;
+
+ ret = uverbs_get_flags64(&flags, attrs_bundle, idx, allowed_bits);
+ if (ret)
+ return ret;
+
+ if (flags > U32_MAX)
+ return -EINVAL;
+ *to = flags;
+
+ return 0;
+}
+EXPORT_SYMBOL(uverbs_get_flags32);
+
+/*
+ * Fill a ib_udata struct (core or uhw) using the given attribute IDs.
+ * This is primarily used to convert the UVERBS_ATTR_UHW() into the
+ * ib_udata format used by the drivers.
+ */
+void uverbs_fill_udata(struct uverbs_attr_bundle *bundle,
+ struct ib_udata *udata, unsigned int attr_in,
+ unsigned int attr_out)
+{
+ struct bundle_priv *pbundle =
+ container_of(bundle, struct bundle_priv, bundle);
+ const struct uverbs_attr *in =
+ uverbs_attr_get(&pbundle->bundle, attr_in);
+ const struct uverbs_attr *out =
+ uverbs_attr_get(&pbundle->bundle, attr_out);
+
+ if (!IS_ERR(in)) {
+ udata->inlen = in->ptr_attr.len;
+ if (uverbs_attr_ptr_is_inline(in))
+ udata->inbuf =
+ &pbundle->user_attrs[in->ptr_attr.uattr_idx]
+ .data;
+ else
+ udata->inbuf = u64_to_user_ptr(in->ptr_attr.data);
+ } else {
+ udata->inbuf = NULL;
+ udata->inlen = 0;
+ }
+
+ if (!IS_ERR(out)) {
+ udata->outbuf = u64_to_user_ptr(out->ptr_attr.data);
+ udata->outlen = out->ptr_attr.len;
+ } else {
+ udata->outbuf = NULL;
+ udata->outlen = 0;
+ }
+}
+
+int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx,
+ const void *from, size_t size)
+{
+ const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx);
+ size_t min_size;
+
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ min_size = min_t(size_t, attr->ptr_attr.len, size);
+ if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), from, min_size))
+ return -EFAULT;
+
+ return uverbs_set_output(bundle, attr);
+}
+EXPORT_SYMBOL(uverbs_copy_to);
+
+
+/*
+ * This is only used if the caller has directly used copy_to_use to write the
+ * data. It signals to user space that the buffer is filled in.
+ */
+int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx)
+{
+ const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx);
+
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ return uverbs_set_output(bundle, attr);
+}
+
+int _uverbs_get_const_signed(s64 *to,
+ const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, s64 lower_bound, u64 upper_bound,
+ s64 *def_val)
+{
+ const struct uverbs_attr *attr;
+
+ attr = uverbs_attr_get(attrs_bundle, idx);
+ if (IS_ERR(attr)) {
+ if ((PTR_ERR(attr) != -ENOENT) || !def_val)
+ return PTR_ERR(attr);
+
+ *to = *def_val;
+ } else {
+ *to = attr->ptr_attr.data;
+ }
+
+ if (*to < lower_bound || (*to > 0 && (u64)*to > upper_bound))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL(_uverbs_get_const_signed);
+
+int _uverbs_get_const_unsigned(u64 *to,
+ const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 upper_bound, u64 *def_val)
+{
+ const struct uverbs_attr *attr;
+
+ attr = uverbs_attr_get(attrs_bundle, idx);
+ if (IS_ERR(attr)) {
+ if ((PTR_ERR(attr) != -ENOENT) || !def_val)
+ return PTR_ERR(attr);
+
+ *to = *def_val;
+ } else {
+ *to = attr->ptr_attr.data;
+ }
+
+ if (*to > upper_bound)
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL(_uverbs_get_const_unsigned);
+
+int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle,
+ size_t idx, const void *from, size_t size)
+{
+ const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx);
+
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ if (size < attr->ptr_attr.len) {
+ if (clear_user(u64_to_user_ptr(attr->ptr_attr.data) + size,
+ attr->ptr_attr.len - size))
+ return -EFAULT;
+ }
+ return uverbs_copy_to(bundle, idx, from, size);
+}
+EXPORT_SYMBOL(uverbs_copy_to_struct_or_zero);
+
+/* Once called an abort will call through to the type's destroy_hw() */
+void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle,
+ u16 idx)
+{
+ struct bundle_priv *pbundle =
+ container_of(bundle, struct bundle_priv, bundle);
+
+ __set_bit(uapi_bkey_attr(uapi_key_attr(idx)),
+ pbundle->uobj_hw_obj_valid);
+}
+EXPORT_SYMBOL(uverbs_finalize_uobj_create);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
new file mode 100644
index 000000000..6fe825800
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -0,0 +1,1315 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/cdev.h>
+#include <linux/anon_inodes.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+
+#include <linux/uaccess.h>
+
+#include <rdma/ib.h>
+#include <rdma/uverbs_std_types.h>
+#include <rdma/rdma_netlink.h>
+
+#include "uverbs.h"
+#include "core_priv.h"
+#include "rdma_core.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace verbs access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+ IB_UVERBS_MAJOR = 231,
+ IB_UVERBS_BASE_MINOR = 192,
+ IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS,
+ IB_UVERBS_NUM_FIXED_MINOR = 32,
+ IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR,
+};
+
+#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+
+static dev_t dynamic_uverbs_dev;
+static struct class *uverbs_class;
+
+static DEFINE_IDA(uverbs_ida);
+static int ib_uverbs_add_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
+
+/*
+ * Must be called with the ufile->device->disassociate_srcu held, and the lock
+ * must be held until use of the ucontext is finished.
+ */
+struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile)
+{
+ /*
+ * We do not hold the hw_destroy_rwsem lock for this flow, instead
+ * srcu is used. It does not matter if someone races this with
+ * get_context, we get NULL or valid ucontext.
+ */
+ struct ib_ucontext *ucontext = smp_load_acquire(&ufile->ucontext);
+
+ if (!srcu_dereference(ufile->device->ib_dev,
+ &ufile->device->disassociate_srcu))
+ return ERR_PTR(-EIO);
+
+ if (!ucontext)
+ return ERR_PTR(-EINVAL);
+
+ return ucontext;
+}
+EXPORT_SYMBOL(ib_uverbs_get_ucontext_file);
+
+int uverbs_dealloc_mw(struct ib_mw *mw)
+{
+ struct ib_pd *pd = mw->pd;
+ int ret;
+
+ ret = mw->device->ops.dealloc_mw(mw);
+ if (ret)
+ return ret;
+
+ atomic_dec(&pd->usecnt);
+ kfree(mw);
+ return ret;
+}
+
+static void ib_uverbs_release_dev(struct device *device)
+{
+ struct ib_uverbs_device *dev =
+ container_of(device, struct ib_uverbs_device, dev);
+
+ uverbs_destroy_api(dev->uapi);
+ cleanup_srcu_struct(&dev->disassociate_srcu);
+ mutex_destroy(&dev->lists_mutex);
+ mutex_destroy(&dev->xrcd_tree_mutex);
+ kfree(dev);
+}
+
+void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file,
+ struct ib_ucq_object *uobj)
+{
+ struct ib_uverbs_event *evt, *tmp;
+
+ if (ev_file) {
+ spin_lock_irq(&ev_file->ev_queue.lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&ev_file->ev_queue.lock);
+
+ uverbs_uobject_put(&ev_file->uobj);
+ }
+
+ ib_uverbs_release_uevent(&uobj->uevent);
+}
+
+void ib_uverbs_release_uevent(struct ib_uevent_object *uobj)
+{
+ struct ib_uverbs_async_event_file *async_file = uobj->event_file;
+ struct ib_uverbs_event *evt, *tmp;
+
+ if (!async_file)
+ return;
+
+ spin_lock_irq(&async_file->ev_queue.lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&async_file->ev_queue.lock);
+ uverbs_uobject_put(&async_file->uobj);
+}
+
+void ib_uverbs_detach_umcast(struct ib_qp *qp,
+ struct ib_uqp_object *uobj)
+{
+ struct ib_uverbs_mcast_entry *mcast, *tmp;
+
+ list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) {
+ ib_detach_mcast(qp, &mcast->gid, mcast->lid);
+ list_del(&mcast->list);
+ kfree(mcast);
+ }
+}
+
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+ complete(&dev->comp);
+}
+
+void ib_uverbs_release_file(struct kref *ref)
+{
+ struct ib_uverbs_file *file =
+ container_of(ref, struct ib_uverbs_file, ref);
+ struct ib_device *ib_dev;
+ int srcu_key;
+
+ release_ufile_idr_uobject(file);
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (ib_dev && !ib_dev->ops.disassociate_ucontext)
+ module_put(ib_dev->ops.owner);
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+
+ if (refcount_dec_and_test(&file->device->refcount))
+ ib_uverbs_comp_dev(file->device);
+
+ if (file->default_async_file)
+ uverbs_uobject_put(&file->default_async_file->uobj);
+ put_device(&file->device->dev);
+
+ if (file->disassociate_page)
+ __free_pages(file->disassociate_page, 0);
+ mutex_destroy(&file->umap_lock);
+ mutex_destroy(&file->ucontext_lock);
+ kfree(file);
+}
+
+static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue,
+ struct file *filp, char __user *buf,
+ size_t count, loff_t *pos,
+ size_t eventsz)
+{
+ struct ib_uverbs_event *event;
+ int ret = 0;
+
+ spin_lock_irq(&ev_queue->lock);
+
+ while (list_empty(&ev_queue->event_list)) {
+ if (ev_queue->is_closed) {
+ spin_unlock_irq(&ev_queue->lock);
+ return -EIO;
+ }
+
+ spin_unlock_irq(&ev_queue->lock);
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(ev_queue->poll_wait,
+ (!list_empty(&ev_queue->event_list) ||
+ ev_queue->is_closed)))
+ return -ERESTARTSYS;
+
+ spin_lock_irq(&ev_queue->lock);
+ }
+
+ event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list);
+
+ if (eventsz > count) {
+ ret = -EINVAL;
+ event = NULL;
+ } else {
+ list_del(ev_queue->event_list.next);
+ if (event->counter) {
+ ++(*event->counter);
+ list_del(&event->obj_list);
+ }
+ }
+
+ spin_unlock_irq(&ev_queue->lock);
+
+ if (event) {
+ if (copy_to_user(buf, event, eventsz))
+ ret = -EFAULT;
+ else
+ ret = eventsz;
+ }
+
+ kfree(event);
+
+ return ret;
+}
+
+static ssize_t ib_uverbs_async_event_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_async_event_file *file = filp->private_data;
+
+ return ib_uverbs_event_read(&file->ev_queue, filp, buf, count, pos,
+ sizeof(struct ib_uverbs_async_event_desc));
+}
+
+static ssize_t ib_uverbs_comp_event_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_completion_event_file *comp_ev_file =
+ filp->private_data;
+
+ return ib_uverbs_event_read(&comp_ev_file->ev_queue, filp, buf, count,
+ pos,
+ sizeof(struct ib_uverbs_comp_event_desc));
+}
+
+static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue,
+ struct file *filp,
+ struct poll_table_struct *wait)
+{
+ __poll_t pollflags = 0;
+
+ poll_wait(filp, &ev_queue->poll_wait, wait);
+
+ spin_lock_irq(&ev_queue->lock);
+ if (!list_empty(&ev_queue->event_list))
+ pollflags = EPOLLIN | EPOLLRDNORM;
+ else if (ev_queue->is_closed)
+ pollflags = EPOLLERR;
+ spin_unlock_irq(&ev_queue->lock);
+
+ return pollflags;
+}
+
+static __poll_t ib_uverbs_async_event_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct ib_uverbs_async_event_file *file = filp->private_data;
+
+ return ib_uverbs_event_poll(&file->ev_queue, filp, wait);
+}
+
+static __poll_t ib_uverbs_comp_event_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct ib_uverbs_completion_event_file *comp_ev_file =
+ filp->private_data;
+
+ return ib_uverbs_event_poll(&comp_ev_file->ev_queue, filp, wait);
+}
+
+static int ib_uverbs_async_event_fasync(int fd, struct file *filp, int on)
+{
+ struct ib_uverbs_async_event_file *file = filp->private_data;
+
+ return fasync_helper(fd, filp, on, &file->ev_queue.async_queue);
+}
+
+static int ib_uverbs_comp_event_fasync(int fd, struct file *filp, int on)
+{
+ struct ib_uverbs_completion_event_file *comp_ev_file =
+ filp->private_data;
+
+ return fasync_helper(fd, filp, on, &comp_ev_file->ev_queue.async_queue);
+}
+
+const struct file_operations uverbs_event_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_uverbs_comp_event_read,
+ .poll = ib_uverbs_comp_event_poll,
+ .release = uverbs_uobject_fd_release,
+ .fasync = ib_uverbs_comp_event_fasync,
+ .llseek = no_llseek,
+};
+
+const struct file_operations uverbs_async_event_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_uverbs_async_event_read,
+ .poll = ib_uverbs_async_event_poll,
+ .release = uverbs_async_event_release,
+ .fasync = ib_uverbs_async_event_fasync,
+ .llseek = no_llseek,
+};
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct ib_uverbs_event_queue *ev_queue = cq_context;
+ struct ib_ucq_object *uobj;
+ struct ib_uverbs_event *entry;
+ unsigned long flags;
+
+ if (!ev_queue)
+ return;
+
+ spin_lock_irqsave(&ev_queue->lock, flags);
+ if (ev_queue->is_closed) {
+ spin_unlock_irqrestore(&ev_queue->lock, flags);
+ return;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry) {
+ spin_unlock_irqrestore(&ev_queue->lock, flags);
+ return;
+ }
+
+ uobj = cq->uobject;
+
+ entry->desc.comp.cq_handle = cq->uobject->uevent.uobject.user_handle;
+ entry->counter = &uobj->comp_events_reported;
+
+ list_add_tail(&entry->list, &ev_queue->event_list);
+ list_add_tail(&entry->obj_list, &uobj->comp_list);
+ spin_unlock_irqrestore(&ev_queue->lock, flags);
+
+ wake_up_interruptible(&ev_queue->poll_wait);
+ kill_fasync(&ev_queue->async_queue, SIGIO, POLL_IN);
+}
+
+void ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file,
+ __u64 element, __u64 event,
+ struct list_head *obj_list, u32 *counter)
+{
+ struct ib_uverbs_event *entry;
+ unsigned long flags;
+
+ if (!async_file)
+ return;
+
+ spin_lock_irqsave(&async_file->ev_queue.lock, flags);
+ if (async_file->ev_queue.is_closed) {
+ spin_unlock_irqrestore(&async_file->ev_queue.lock, flags);
+ return;
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry) {
+ spin_unlock_irqrestore(&async_file->ev_queue.lock, flags);
+ return;
+ }
+
+ entry->desc.async.element = element;
+ entry->desc.async.event_type = event;
+ entry->desc.async.reserved = 0;
+ entry->counter = counter;
+
+ list_add_tail(&entry->list, &async_file->ev_queue.event_list);
+ if (obj_list)
+ list_add_tail(&entry->obj_list, obj_list);
+ spin_unlock_irqrestore(&async_file->ev_queue.lock, flags);
+
+ wake_up_interruptible(&async_file->ev_queue.poll_wait);
+ kill_fasync(&async_file->ev_queue.async_queue, SIGIO, POLL_IN);
+}
+
+static void uverbs_uobj_event(struct ib_uevent_object *eobj,
+ struct ib_event *event)
+{
+ ib_uverbs_async_handler(eobj->event_file,
+ eobj->uobject.user_handle, event->event,
+ &eobj->event_list, &eobj->events_reported);
+}
+
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ uverbs_uobj_event(&event->element.cq->uobject->uevent, event);
+}
+
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
+{
+ /* for XRC target qp's, check that qp is live */
+ if (!event->element.qp->uobject)
+ return;
+
+ uverbs_uobj_event(&event->element.qp->uobject->uevent, event);
+}
+
+void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ uverbs_uobj_event(&event->element.wq->uobject->uevent, event);
+}
+
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ uverbs_uobj_event(&event->element.srq->uobject->uevent, event);
+}
+
+static void ib_uverbs_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ ib_uverbs_async_handler(
+ container_of(handler, struct ib_uverbs_async_event_file,
+ event_handler),
+ event->element.port_num, event->event, NULL, NULL);
+}
+
+void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue)
+{
+ spin_lock_init(&ev_queue->lock);
+ INIT_LIST_HEAD(&ev_queue->event_list);
+ init_waitqueue_head(&ev_queue->poll_wait);
+ ev_queue->is_closed = 0;
+ ev_queue->async_queue = NULL;
+}
+
+void ib_uverbs_init_async_event_file(
+ struct ib_uverbs_async_event_file *async_file)
+{
+ struct ib_uverbs_file *uverbs_file = async_file->uobj.ufile;
+ struct ib_device *ib_dev = async_file->uobj.context->device;
+
+ ib_uverbs_init_event_queue(&async_file->ev_queue);
+
+ /* The first async_event_file becomes the default one for the file. */
+ mutex_lock(&uverbs_file->ucontext_lock);
+ if (!uverbs_file->default_async_file) {
+ /* Pairs with the put in ib_uverbs_release_file */
+ uverbs_uobject_get(&async_file->uobj);
+ smp_store_release(&uverbs_file->default_async_file, async_file);
+ }
+ mutex_unlock(&uverbs_file->ucontext_lock);
+
+ INIT_IB_EVENT_HANDLER(&async_file->event_handler, ib_dev,
+ ib_uverbs_event_handler);
+ ib_register_event_handler(&async_file->event_handler);
+}
+
+static ssize_t verify_hdr(struct ib_uverbs_cmd_hdr *hdr,
+ struct ib_uverbs_ex_cmd_hdr *ex_hdr, size_t count,
+ const struct uverbs_api_write_method *method_elm)
+{
+ if (method_elm->is_ex) {
+ count -= sizeof(*hdr) + sizeof(*ex_hdr);
+
+ if ((hdr->in_words + ex_hdr->provider_in_words) * 8 != count)
+ return -EINVAL;
+
+ if (hdr->in_words * 8 < method_elm->req_size)
+ return -ENOSPC;
+
+ if (ex_hdr->cmd_hdr_reserved)
+ return -EINVAL;
+
+ if (ex_hdr->response) {
+ if (!hdr->out_words && !ex_hdr->provider_out_words)
+ return -EINVAL;
+
+ if (hdr->out_words * 8 < method_elm->resp_size)
+ return -ENOSPC;
+
+ if (!access_ok(u64_to_user_ptr(ex_hdr->response),
+ (hdr->out_words + ex_hdr->provider_out_words) * 8))
+ return -EFAULT;
+ } else {
+ if (hdr->out_words || ex_hdr->provider_out_words)
+ return -EINVAL;
+ }
+
+ return 0;
+ }
+
+ /* not extended command */
+ if (hdr->in_words * 4 != count)
+ return -EINVAL;
+
+ if (count < method_elm->req_size + sizeof(*hdr)) {
+ /*
+ * rdma-core v18 and v19 have a bug where they send DESTROY_CQ
+ * with a 16 byte write instead of 24. Old kernels didn't
+ * check the size so they allowed this. Now that the size is
+ * checked provide a compatibility work around to not break
+ * those userspaces.
+ */
+ if (hdr->command == IB_USER_VERBS_CMD_DESTROY_CQ &&
+ count == 16) {
+ hdr->in_words = 6;
+ return 0;
+ }
+ return -ENOSPC;
+ }
+ if (hdr->out_words * 4 < method_elm->resp_size)
+ return -ENOSPC;
+
+ return 0;
+}
+
+static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+ const struct uverbs_api_write_method *method_elm;
+ struct uverbs_api *uapi = file->device->uapi;
+ struct ib_uverbs_ex_cmd_hdr ex_hdr;
+ struct ib_uverbs_cmd_hdr hdr;
+ struct uverbs_attr_bundle bundle;
+ int srcu_key;
+ ssize_t ret;
+
+ if (!ib_safe_file_access(filp)) {
+ pr_err_once("uverbs_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
+ task_tgid_vnr(current), current->comm);
+ return -EACCES;
+ }
+
+ if (count < sizeof(hdr))
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof(hdr)))
+ return -EFAULT;
+
+ method_elm = uapi_get_method(uapi, hdr.command);
+ if (IS_ERR(method_elm))
+ return PTR_ERR(method_elm);
+
+ if (method_elm->is_ex) {
+ if (count < (sizeof(hdr) + sizeof(ex_hdr)))
+ return -EINVAL;
+ if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+ return -EFAULT;
+ }
+
+ ret = verify_hdr(&hdr, &ex_hdr, count, method_elm);
+ if (ret)
+ return ret;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+
+ buf += sizeof(hdr);
+
+ memset(bundle.attr_present, 0, sizeof(bundle.attr_present));
+ bundle.ufile = file;
+ bundle.context = NULL; /* only valid if bundle has uobject */
+ bundle.uobject = NULL;
+ if (!method_elm->is_ex) {
+ size_t in_len = hdr.in_words * 4 - sizeof(hdr);
+ size_t out_len = hdr.out_words * 4;
+ u64 response = 0;
+
+ if (method_elm->has_udata) {
+ bundle.driver_udata.inlen =
+ in_len - method_elm->req_size;
+ in_len = method_elm->req_size;
+ if (bundle.driver_udata.inlen)
+ bundle.driver_udata.inbuf = buf + in_len;
+ else
+ bundle.driver_udata.inbuf = NULL;
+ } else {
+ memset(&bundle.driver_udata, 0,
+ sizeof(bundle.driver_udata));
+ }
+
+ if (method_elm->has_resp) {
+ /*
+ * The macros check that if has_resp is set
+ * then the command request structure starts
+ * with a '__aligned u64 response' member.
+ */
+ ret = get_user(response, (const u64 __user *)buf);
+ if (ret)
+ goto out_unlock;
+
+ if (method_elm->has_udata) {
+ bundle.driver_udata.outlen =
+ out_len - method_elm->resp_size;
+ out_len = method_elm->resp_size;
+ if (bundle.driver_udata.outlen)
+ bundle.driver_udata.outbuf =
+ u64_to_user_ptr(response +
+ out_len);
+ else
+ bundle.driver_udata.outbuf = NULL;
+ }
+ } else {
+ bundle.driver_udata.outlen = 0;
+ bundle.driver_udata.outbuf = NULL;
+ }
+
+ ib_uverbs_init_udata_buf_or_null(
+ &bundle.ucore, buf, u64_to_user_ptr(response),
+ in_len, out_len);
+ } else {
+ buf += sizeof(ex_hdr);
+
+ ib_uverbs_init_udata_buf_or_null(&bundle.ucore, buf,
+ u64_to_user_ptr(ex_hdr.response),
+ hdr.in_words * 8, hdr.out_words * 8);
+
+ ib_uverbs_init_udata_buf_or_null(
+ &bundle.driver_udata, buf + bundle.ucore.inlen,
+ u64_to_user_ptr(ex_hdr.response) + bundle.ucore.outlen,
+ ex_hdr.provider_in_words * 8,
+ ex_hdr.provider_out_words * 8);
+
+ }
+
+ ret = method_elm->handler(&bundle);
+ if (bundle.uobject)
+ uverbs_finalize_object(bundle.uobject, UVERBS_ACCESS_NEW, true,
+ !ret, &bundle);
+out_unlock:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return (ret) ? : count;
+}
+
+static const struct vm_operations_struct rdma_umap_ops;
+
+static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+ struct ib_ucontext *ucontext;
+ int ret = 0;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ucontext = ib_uverbs_get_ucontext_file(file);
+ if (IS_ERR(ucontext)) {
+ ret = PTR_ERR(ucontext);
+ goto out;
+ }
+ vma->vm_ops = &rdma_umap_ops;
+ ret = ucontext->device->ops.mmap(ucontext, vma);
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return ret;
+}
+
+/*
+ * The VMA has been dup'd, initialize the vm_private_data with a new tracking
+ * struct
+ */
+static void rdma_umap_open(struct vm_area_struct *vma)
+{
+ struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+ struct rdma_umap_priv *opriv = vma->vm_private_data;
+ struct rdma_umap_priv *priv;
+
+ if (!opriv)
+ return;
+
+ /* We are racing with disassociation */
+ if (!down_read_trylock(&ufile->hw_destroy_rwsem))
+ goto out_zap;
+ /*
+ * Disassociation already completed, the VMA should already be zapped.
+ */
+ if (!ufile->ucontext)
+ goto out_unlock;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ goto out_unlock;
+ rdma_umap_priv_init(priv, vma, opriv->entry);
+
+ up_read(&ufile->hw_destroy_rwsem);
+ return;
+
+out_unlock:
+ up_read(&ufile->hw_destroy_rwsem);
+out_zap:
+ /*
+ * We can't allow the VMA to be created with the actual IO pages, that
+ * would break our API contract, and it can't be stopped at this
+ * point, so zap it.
+ */
+ vma->vm_private_data = NULL;
+ zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+}
+
+static void rdma_umap_close(struct vm_area_struct *vma)
+{
+ struct ib_uverbs_file *ufile = vma->vm_file->private_data;
+ struct rdma_umap_priv *priv = vma->vm_private_data;
+
+ if (!priv)
+ return;
+
+ /*
+ * The vma holds a reference on the struct file that created it, which
+ * in turn means that the ib_uverbs_file is guaranteed to exist at
+ * this point.
+ */
+ mutex_lock(&ufile->umap_lock);
+ if (priv->entry)
+ rdma_user_mmap_entry_put(priv->entry);
+
+ list_del(&priv->list);
+ mutex_unlock(&ufile->umap_lock);
+ kfree(priv);
+}
+
+/*
+ * Once the zap_vma_ptes has been called touches to the VMA will come here and
+ * we return a dummy writable zero page for all the pfns.
+ */
+static vm_fault_t rdma_umap_fault(struct vm_fault *vmf)
+{
+ struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data;
+ struct rdma_umap_priv *priv = vmf->vma->vm_private_data;
+ vm_fault_t ret = 0;
+
+ if (!priv)
+ return VM_FAULT_SIGBUS;
+
+ /* Read only pages can just use the system zero page. */
+ if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) {
+ vmf->page = ZERO_PAGE(vmf->address);
+ get_page(vmf->page);
+ return 0;
+ }
+
+ mutex_lock(&ufile->umap_lock);
+ if (!ufile->disassociate_page)
+ ufile->disassociate_page =
+ alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0);
+
+ if (ufile->disassociate_page) {
+ /*
+ * This VMA is forced to always be shared so this doesn't have
+ * to worry about COW.
+ */
+ vmf->page = ufile->disassociate_page;
+ get_page(vmf->page);
+ } else {
+ ret = VM_FAULT_SIGBUS;
+ }
+ mutex_unlock(&ufile->umap_lock);
+
+ return ret;
+}
+
+static const struct vm_operations_struct rdma_umap_ops = {
+ .open = rdma_umap_open,
+ .close = rdma_umap_close,
+ .fault = rdma_umap_fault,
+};
+
+void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
+{
+ struct rdma_umap_priv *priv, *next_priv;
+
+ lockdep_assert_held(&ufile->hw_destroy_rwsem);
+
+ while (1) {
+ struct mm_struct *mm = NULL;
+
+ /* Get an arbitrary mm pointer that hasn't been cleaned yet */
+ mutex_lock(&ufile->umap_lock);
+ while (!list_empty(&ufile->umaps)) {
+ int ret;
+
+ priv = list_first_entry(&ufile->umaps,
+ struct rdma_umap_priv, list);
+ mm = priv->vma->vm_mm;
+ ret = mmget_not_zero(mm);
+ if (!ret) {
+ list_del_init(&priv->list);
+ if (priv->entry) {
+ rdma_user_mmap_entry_put(priv->entry);
+ priv->entry = NULL;
+ }
+ mm = NULL;
+ continue;
+ }
+ break;
+ }
+ mutex_unlock(&ufile->umap_lock);
+ if (!mm)
+ return;
+
+ /*
+ * The umap_lock is nested under mmap_lock since it used within
+ * the vma_ops callbacks, so we have to clean the list one mm
+ * at a time to get the lock ordering right. Typically there
+ * will only be one mm, so no big deal.
+ */
+ mmap_read_lock(mm);
+ mutex_lock(&ufile->umap_lock);
+ list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
+ list) {
+ struct vm_area_struct *vma = priv->vma;
+
+ if (vma->vm_mm != mm)
+ continue;
+ list_del_init(&priv->list);
+
+ zap_vma_ptes(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start);
+
+ if (priv->entry) {
+ rdma_user_mmap_entry_put(priv->entry);
+ priv->entry = NULL;
+ }
+ }
+ mutex_unlock(&ufile->umap_lock);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
+}
+
+/*
+ * ib_uverbs_open() does not need the BKL:
+ *
+ * - the ib_uverbs_device structures are properly reference counted and
+ * everything else is purely local to the file being created, so
+ * races against other open calls are not a problem;
+ * - there is no ioctl method to race against;
+ * - the open method will either immediately run -ENXIO, or all
+ * required initialization will be done.
+ */
+static int ib_uverbs_open(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_device *dev;
+ struct ib_uverbs_file *file;
+ struct ib_device *ib_dev;
+ int ret;
+ int module_dependent;
+ int srcu_key;
+
+ dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
+ if (!refcount_inc_not_zero(&dev->refcount))
+ return -ENXIO;
+
+ get_device(&dev->dev);
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ mutex_lock(&dev->lists_mutex);
+ ib_dev = srcu_dereference(dev->ib_dev,
+ &dev->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (!rdma_dev_access_netns(ib_dev, current->nsproxy->net_ns)) {
+ ret = -EPERM;
+ goto err;
+ }
+
+ /* In case IB device supports disassociate ucontext, there is no hard
+ * dependency between uverbs device and its low level device.
+ */
+ module_dependent = !(ib_dev->ops.disassociate_ucontext);
+
+ if (module_dependent) {
+ if (!try_module_get(ib_dev->ops.owner)) {
+ ret = -ENODEV;
+ goto err;
+ }
+ }
+
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
+ if (!file) {
+ ret = -ENOMEM;
+ if (module_dependent)
+ goto err_module;
+
+ goto err;
+ }
+
+ file->device = dev;
+ kref_init(&file->ref);
+ mutex_init(&file->ucontext_lock);
+
+ spin_lock_init(&file->uobjects_lock);
+ INIT_LIST_HEAD(&file->uobjects);
+ init_rwsem(&file->hw_destroy_rwsem);
+ mutex_init(&file->umap_lock);
+ INIT_LIST_HEAD(&file->umaps);
+
+ filp->private_data = file;
+ list_add_tail(&file->list, &dev->uverbs_file_list);
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+ setup_ufile_idr_uobject(file);
+
+ return stream_open(inode, filp);
+
+err_module:
+ module_put(ib_dev->ops.owner);
+
+err:
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+ if (refcount_dec_and_test(&dev->refcount))
+ ib_uverbs_comp_dev(dev);
+
+ put_device(&dev->dev);
+ return ret;
+}
+
+static int ib_uverbs_close(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+
+ uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE);
+
+ mutex_lock(&file->device->lists_mutex);
+ list_del_init(&file->list);
+ mutex_unlock(&file->device->lists_mutex);
+
+ kref_put(&file->ref, ib_uverbs_release_file);
+
+ return 0;
+}
+
+static const struct file_operations uverbs_fops = {
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .open = ib_uverbs_open,
+ .release = ib_uverbs_close,
+ .llseek = no_llseek,
+ .unlocked_ioctl = ib_uverbs_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+};
+
+static const struct file_operations uverbs_mmap_fops = {
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .mmap = ib_uverbs_mmap,
+ .open = ib_uverbs_open,
+ .release = ib_uverbs_close,
+ .llseek = no_llseek,
+ .unlocked_ioctl = ib_uverbs_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
+};
+
+static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data,
+ struct ib_client_nl_info *res)
+{
+ struct ib_uverbs_device *uverbs_dev = client_data;
+ int ret;
+
+ if (res->port != -1)
+ return -EINVAL;
+
+ res->abi = ibdev->ops.uverbs_abi_ver;
+ res->cdev = &uverbs_dev->dev;
+
+ /*
+ * To support DRIVER_ID binding in userspace some of the driver need
+ * upgrading to expose their PCI dependent revision information
+ * through get_context instead of relying on modalias matching. When
+ * the drivers are fixed they can drop this flag.
+ */
+ if (!ibdev->ops.uverbs_no_driver_id_binding) {
+ ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID,
+ ibdev->ops.driver_id);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static struct ib_client uverbs_client = {
+ .name = "uverbs",
+ .no_kverbs_req = true,
+ .add = ib_uverbs_add_one,
+ .remove = ib_uverbs_remove_one,
+ .get_nl_info = ib_uverbs_get_nl_info,
+};
+MODULE_ALIAS_RDMA_CLIENT("uverbs");
+
+static ssize_t ibdev_show(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_uverbs_device *dev =
+ container_of(device, struct ib_uverbs_device, dev);
+ int ret = -ENODEV;
+ int srcu_key;
+ struct ib_device *ib_dev;
+
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sysfs_emit(buf, "%s\n", dev_name(&ib_dev->dev));
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+ return ret;
+}
+static DEVICE_ATTR_RO(ibdev);
+
+static ssize_t abi_version_show(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_uverbs_device *dev =
+ container_of(device, struct ib_uverbs_device, dev);
+ int ret = -ENODEV;
+ int srcu_key;
+ struct ib_device *ib_dev;
+
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sysfs_emit(buf, "%u\n", ib_dev->ops.uverbs_abi_ver);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+ return ret;
+}
+static DEVICE_ATTR_RO(abi_version);
+
+static struct attribute *ib_dev_attrs[] = {
+ &dev_attr_abi_version.attr,
+ &dev_attr_ibdev.attr,
+ NULL,
+};
+
+static const struct attribute_group dev_attr_group = {
+ .attrs = ib_dev_attrs,
+};
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_VERBS_ABI_VERSION));
+
+static int ib_uverbs_create_uapi(struct ib_device *device,
+ struct ib_uverbs_device *uverbs_dev)
+{
+ struct uverbs_api *uapi;
+
+ uapi = uverbs_alloc_api(device);
+ if (IS_ERR(uapi))
+ return PTR_ERR(uapi);
+
+ uverbs_dev->uapi = uapi;
+ return 0;
+}
+
+static int ib_uverbs_add_one(struct ib_device *device)
+{
+ int devnum;
+ dev_t base;
+ struct ib_uverbs_device *uverbs_dev;
+ int ret;
+
+ if (!device->ops.alloc_ucontext)
+ return -EOPNOTSUPP;
+
+ uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL);
+ if (!uverbs_dev)
+ return -ENOMEM;
+
+ ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
+ if (ret) {
+ kfree(uverbs_dev);
+ return -ENOMEM;
+ }
+
+ device_initialize(&uverbs_dev->dev);
+ uverbs_dev->dev.class = uverbs_class;
+ uverbs_dev->dev.parent = device->dev.parent;
+ uverbs_dev->dev.release = ib_uverbs_release_dev;
+ uverbs_dev->groups[0] = &dev_attr_group;
+ uverbs_dev->dev.groups = uverbs_dev->groups;
+ refcount_set(&uverbs_dev->refcount, 1);
+ init_completion(&uverbs_dev->comp);
+ uverbs_dev->xrcd_tree = RB_ROOT;
+ mutex_init(&uverbs_dev->xrcd_tree_mutex);
+ mutex_init(&uverbs_dev->lists_mutex);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
+ rcu_assign_pointer(uverbs_dev->ib_dev, device);
+ uverbs_dev->num_comp_vectors = device->num_comp_vectors;
+
+ devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1,
+ GFP_KERNEL);
+ if (devnum < 0) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ uverbs_dev->devnum = devnum;
+ if (devnum >= IB_UVERBS_NUM_FIXED_MINOR)
+ base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR;
+ else
+ base = IB_UVERBS_BASE_DEV + devnum;
+
+ ret = ib_uverbs_create_uapi(device, uverbs_dev);
+ if (ret)
+ goto err_uapi;
+
+ uverbs_dev->dev.devt = base;
+ dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum);
+
+ cdev_init(&uverbs_dev->cdev,
+ device->ops.mmap ? &uverbs_mmap_fops : &uverbs_fops);
+ uverbs_dev->cdev.owner = THIS_MODULE;
+
+ ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev);
+ if (ret)
+ goto err_uapi;
+
+ ib_set_client_data(device, &uverbs_client, uverbs_dev);
+ return 0;
+
+err_uapi:
+ ida_free(&uverbs_ida, devnum);
+err:
+ if (refcount_dec_and_test(&uverbs_dev->refcount))
+ ib_uverbs_comp_dev(uverbs_dev);
+ wait_for_completion(&uverbs_dev->comp);
+ put_device(&uverbs_dev->dev);
+ return ret;
+}
+
+static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
+ struct ib_device *ib_dev)
+{
+ struct ib_uverbs_file *file;
+
+ /* Pending running commands to terminate */
+ uverbs_disassociate_api_pre(uverbs_dev);
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ while (!list_empty(&uverbs_dev->uverbs_file_list)) {
+ file = list_first_entry(&uverbs_dev->uverbs_file_list,
+ struct ib_uverbs_file, list);
+ list_del_init(&file->list);
+ kref_get(&file->ref);
+
+ /* We must release the mutex before going ahead and calling
+ * uverbs_cleanup_ufile, as it might end up indirectly calling
+ * uverbs_close, for example due to freeing the resources (e.g
+ * mmput).
+ */
+ mutex_unlock(&uverbs_dev->lists_mutex);
+
+ uverbs_destroy_ufile_hw(file, RDMA_REMOVE_DRIVER_REMOVE);
+ kref_put(&file->ref, ib_uverbs_release_file);
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ }
+ mutex_unlock(&uverbs_dev->lists_mutex);
+
+ uverbs_disassociate_api(uverbs_dev->uapi);
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
+{
+ struct ib_uverbs_device *uverbs_dev = client_data;
+ int wait_clients = 1;
+
+ cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev);
+ ida_free(&uverbs_ida, uverbs_dev->devnum);
+
+ if (device->ops.disassociate_ucontext) {
+ /* We disassociate HW resources and immediately return.
+ * Userspace will see a EIO errno for all future access.
+ * Upon returning, ib_device may be freed internally and is not
+ * valid any more.
+ * uverbs_device is still available until all clients close
+ * their files, then the uverbs device ref count will be zero
+ * and its resources will be freed.
+ * Note: At this point no more files can be opened since the
+ * cdev was deleted, however active clients can still issue
+ * commands and close their open files.
+ */
+ ib_uverbs_free_hw_resources(uverbs_dev, device);
+ wait_clients = 0;
+ }
+
+ if (refcount_dec_and_test(&uverbs_dev->refcount))
+ ib_uverbs_comp_dev(uverbs_dev);
+ if (wait_clients)
+ wait_for_completion(&uverbs_dev->comp);
+
+ put_device(&uverbs_dev->dev);
+}
+
+static char *uverbs_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0666;
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static int __init ib_uverbs_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(IB_UVERBS_BASE_DEV,
+ IB_UVERBS_NUM_FIXED_MINOR,
+ "infiniband_verbs");
+ if (ret) {
+ pr_err("user_verbs: couldn't register device number\n");
+ goto out;
+ }
+
+ ret = alloc_chrdev_region(&dynamic_uverbs_dev, 0,
+ IB_UVERBS_NUM_DYNAMIC_MINOR,
+ "infiniband_verbs");
+ if (ret) {
+ pr_err("couldn't register dynamic device number\n");
+ goto out_alloc;
+ }
+
+ uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
+ if (IS_ERR(uverbs_class)) {
+ ret = PTR_ERR(uverbs_class);
+ pr_err("user_verbs: couldn't create class infiniband_verbs\n");
+ goto out_chrdev;
+ }
+
+ uverbs_class->devnode = uverbs_devnode;
+
+ ret = class_create_file(uverbs_class, &class_attr_abi_version.attr);
+ if (ret) {
+ pr_err("user_verbs: couldn't create abi_version attribute\n");
+ goto out_class;
+ }
+
+ ret = ib_register_client(&uverbs_client);
+ if (ret) {
+ pr_err("user_verbs: couldn't register client\n");
+ goto out_class;
+ }
+
+ return 0;
+
+out_class:
+ class_destroy(uverbs_class);
+
+out_chrdev:
+ unregister_chrdev_region(dynamic_uverbs_dev,
+ IB_UVERBS_NUM_DYNAMIC_MINOR);
+
+out_alloc:
+ unregister_chrdev_region(IB_UVERBS_BASE_DEV,
+ IB_UVERBS_NUM_FIXED_MINOR);
+
+out:
+ return ret;
+}
+
+static void __exit ib_uverbs_cleanup(void)
+{
+ ib_unregister_client(&uverbs_client);
+ class_destroy(uverbs_class);
+ unregister_chrdev_region(IB_UVERBS_BASE_DEV,
+ IB_UVERBS_NUM_FIXED_MINOR);
+ unregister_chrdev_region(dynamic_uverbs_dev,
+ IB_UVERBS_NUM_DYNAMIC_MINOR);
+ mmu_notifier_synchronize();
+}
+
+module_init(ib_uverbs_init);
+module_exit(ib_uverbs_cleanup);
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
new file mode 100644
index 000000000..11a080646
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <rdma/ib_marshall.h>
+
+#define OPA_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL)
+static int rdma_ah_conv_opa_to_ib(struct ib_device *dev,
+ struct rdma_ah_attr *ib,
+ struct rdma_ah_attr *opa)
+{
+ struct ib_port_attr port_attr;
+ int ret = 0;
+
+ /* Do structure copy and the over-write fields */
+ *ib = *opa;
+
+ ib->type = RDMA_AH_ATTR_TYPE_IB;
+ rdma_ah_set_grh(ib, NULL, 0, 0, 1, 0);
+
+ if (ib_query_port(dev, opa->port_num, &port_attr)) {
+ /* Set to default subnet to indicate error */
+ rdma_ah_set_subnet_prefix(ib, OPA_DEFAULT_GID_PREFIX);
+ ret = -EINVAL;
+ } else {
+ rdma_ah_set_subnet_prefix(ib,
+ cpu_to_be64(port_attr.subnet_prefix));
+ }
+ rdma_ah_set_interface_id(ib, OPA_MAKE_ID(rdma_ah_get_dlid(opa)));
+ return ret;
+}
+
+void ib_copy_ah_attr_to_user(struct ib_device *device,
+ struct ib_uverbs_ah_attr *dst,
+ struct rdma_ah_attr *ah_attr)
+{
+ struct rdma_ah_attr *src = ah_attr;
+ struct rdma_ah_attr conv_ah;
+
+ memset(&dst->grh, 0, sizeof(dst->grh));
+
+ if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) &&
+ (rdma_ah_get_dlid(ah_attr) > be16_to_cpu(IB_LID_PERMISSIVE)) &&
+ (!rdma_ah_conv_opa_to_ib(device, &conv_ah, ah_attr)))
+ src = &conv_ah;
+
+ dst->dlid = rdma_ah_get_dlid(src);
+ dst->sl = rdma_ah_get_sl(src);
+ dst->src_path_bits = rdma_ah_get_path_bits(src);
+ dst->static_rate = rdma_ah_get_static_rate(src);
+ dst->is_global = rdma_ah_get_ah_flags(src) &
+ IB_AH_GRH ? 1 : 0;
+ if (dst->is_global) {
+ const struct ib_global_route *grh = rdma_ah_read_grh(src);
+
+ memcpy(dst->grh.dgid, grh->dgid.raw, sizeof(grh->dgid));
+ dst->grh.flow_label = grh->flow_label;
+ dst->grh.sgid_index = grh->sgid_index;
+ dst->grh.hop_limit = grh->hop_limit;
+ dst->grh.traffic_class = grh->traffic_class;
+ }
+ dst->port_num = rdma_ah_get_port_num(src);
+ dst->reserved = 0;
+}
+EXPORT_SYMBOL(ib_copy_ah_attr_to_user);
+
+void ib_copy_qp_attr_to_user(struct ib_device *device,
+ struct ib_uverbs_qp_attr *dst,
+ struct ib_qp_attr *src)
+{
+ dst->qp_state = src->qp_state;
+ dst->cur_qp_state = src->cur_qp_state;
+ dst->path_mtu = src->path_mtu;
+ dst->path_mig_state = src->path_mig_state;
+ dst->qkey = src->qkey;
+ dst->rq_psn = src->rq_psn;
+ dst->sq_psn = src->sq_psn;
+ dst->dest_qp_num = src->dest_qp_num;
+ dst->qp_access_flags = src->qp_access_flags;
+
+ dst->max_send_wr = src->cap.max_send_wr;
+ dst->max_recv_wr = src->cap.max_recv_wr;
+ dst->max_send_sge = src->cap.max_send_sge;
+ dst->max_recv_sge = src->cap.max_recv_sge;
+ dst->max_inline_data = src->cap.max_inline_data;
+
+ ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr);
+ ib_copy_ah_attr_to_user(device, &dst->alt_ah_attr, &src->alt_ah_attr);
+
+ dst->pkey_index = src->pkey_index;
+ dst->alt_pkey_index = src->alt_pkey_index;
+ dst->en_sqd_async_notify = src->en_sqd_async_notify;
+ dst->sq_draining = src->sq_draining;
+ dst->max_rd_atomic = src->max_rd_atomic;
+ dst->max_dest_rd_atomic = src->max_dest_rd_atomic;
+ dst->min_rnr_timer = src->min_rnr_timer;
+ dst->port_num = src->port_num;
+ dst->timeout = src->timeout;
+ dst->retry_cnt = src->retry_cnt;
+ dst->rnr_retry = src->rnr_retry;
+ dst->alt_port_num = src->alt_port_num;
+ dst->alt_timeout = src->alt_timeout;
+ memset(dst->reserved, 0, sizeof(dst->reserved));
+}
+EXPORT_SYMBOL(ib_copy_qp_attr_to_user);
+
+static void __ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+ struct sa_path_rec *src)
+{
+ memcpy(dst->dgid, src->dgid.raw, sizeof(src->dgid));
+ memcpy(dst->sgid, src->sgid.raw, sizeof(src->sgid));
+
+ dst->dlid = htons(ntohl(sa_path_get_dlid(src)));
+ dst->slid = htons(ntohl(sa_path_get_slid(src)));
+ dst->raw_traffic = sa_path_get_raw_traffic(src);
+ dst->flow_label = src->flow_label;
+ dst->hop_limit = src->hop_limit;
+ dst->traffic_class = src->traffic_class;
+ dst->reversible = src->reversible;
+ dst->numb_path = src->numb_path;
+ dst->pkey = src->pkey;
+ dst->sl = src->sl;
+ dst->mtu_selector = src->mtu_selector;
+ dst->mtu = src->mtu;
+ dst->rate_selector = src->rate_selector;
+ dst->rate = src->rate;
+ dst->packet_life_time = src->packet_life_time;
+ dst->preference = src->preference;
+ dst->packet_life_time_selector = src->packet_life_time_selector;
+}
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+ struct sa_path_rec *src)
+{
+ struct sa_path_rec rec;
+
+ if (src->rec_type == SA_PATH_REC_TYPE_OPA) {
+ sa_convert_path_opa_to_ib(&rec, src);
+ __ib_copy_path_rec_to_user(dst, &rec);
+ return;
+ }
+ __ib_copy_path_rec_to_user(dst, src);
+}
+EXPORT_SYMBOL(ib_copy_path_rec_to_user);
+
+void ib_copy_path_rec_from_user(struct sa_path_rec *dst,
+ struct ib_user_path_rec *src)
+{
+ u32 slid, dlid;
+
+ memset(dst, 0, sizeof(*dst));
+ if ((ib_is_opa_gid((union ib_gid *)src->sgid)) ||
+ (ib_is_opa_gid((union ib_gid *)src->dgid))) {
+ dst->rec_type = SA_PATH_REC_TYPE_OPA;
+ slid = opa_get_lid_from_gid((union ib_gid *)src->sgid);
+ dlid = opa_get_lid_from_gid((union ib_gid *)src->dgid);
+ } else {
+ dst->rec_type = SA_PATH_REC_TYPE_IB;
+ slid = ntohs(src->slid);
+ dlid = ntohs(src->dlid);
+ }
+ memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
+ memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
+
+ sa_path_set_dlid(dst, dlid);
+ sa_path_set_slid(dst, slid);
+ sa_path_set_raw_traffic(dst, src->raw_traffic);
+ dst->flow_label = src->flow_label;
+ dst->hop_limit = src->hop_limit;
+ dst->traffic_class = src->traffic_class;
+ dst->reversible = src->reversible;
+ dst->numb_path = src->numb_path;
+ dst->pkey = src->pkey;
+ dst->sl = src->sl;
+ dst->mtu_selector = src->mtu_selector;
+ dst->mtu = src->mtu;
+ dst->rate_selector = src->rate_selector;
+ dst->rate = src->rate;
+ dst->packet_life_time = src->packet_life_time;
+ dst->preference = src->preference;
+ dst->packet_life_time_selector = src->packet_life_time_selector;
+
+ /* TODO: No need to set this */
+ sa_path_set_dmac_zero(dst);
+}
+EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c
new file mode 100644
index 000000000..13776a66e
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <linux/bug.h>
+#include <linux/file.h>
+#include <rdma/restrack.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+static int uverbs_free_ah(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ return rdma_destroy_ah_user((struct ib_ah *)uobject->object,
+ RDMA_DESTROY_AH_SLEEPABLE,
+ &attrs->driver_udata);
+}
+
+static int uverbs_free_flow(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_flow *flow = (struct ib_flow *)uobject->object;
+ struct ib_uflow_object *uflow =
+ container_of(uobject, struct ib_uflow_object, uobject);
+ struct ib_qp *qp = flow->qp;
+ int ret;
+
+ ret = flow->device->ops.destroy_flow(flow);
+ if (!ret) {
+ if (qp)
+ atomic_dec(&qp->usecnt);
+ ib_uverbs_flow_resources_free(uflow->resources);
+ }
+
+ return ret;
+}
+
+static int uverbs_free_mw(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ return uverbs_dealloc_mw((struct ib_mw *)uobject->object);
+}
+
+static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object;
+ struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl;
+ u32 table_size = (1 << rwq_ind_tbl->log_ind_tbl_size);
+ int ret, i;
+
+ if (atomic_read(&rwq_ind_tbl->usecnt))
+ return -EBUSY;
+
+ ret = rwq_ind_tbl->device->ops.destroy_rwq_ind_table(rwq_ind_tbl);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < table_size; i++)
+ atomic_dec(&ind_tbl[i]->usecnt);
+
+ kfree(rwq_ind_tbl);
+ kfree(ind_tbl);
+ return 0;
+}
+
+static int uverbs_free_xrcd(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_xrcd *xrcd = uobject->object;
+ struct ib_uxrcd_object *uxrcd =
+ container_of(uobject, struct ib_uxrcd_object, uobject);
+ int ret;
+
+ if (atomic_read(&uxrcd->refcnt))
+ return -EBUSY;
+
+ mutex_lock(&attrs->ufile->device->xrcd_tree_mutex);
+ ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why, attrs);
+ mutex_unlock(&attrs->ufile->device->xrcd_tree_mutex);
+
+ return ret;
+}
+
+static int uverbs_free_pd(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_pd *pd = uobject->object;
+
+ if (atomic_read(&pd->usecnt))
+ return -EBUSY;
+
+ return ib_dealloc_pd_user(pd, &attrs->driver_udata);
+}
+
+void ib_uverbs_free_event_queue(struct ib_uverbs_event_queue *event_queue)
+{
+ struct ib_uverbs_event *entry, *tmp;
+
+ spin_lock_irq(&event_queue->lock);
+ /*
+ * The user must ensure that no new items are added to the event_list
+ * once is_closed is set.
+ */
+ event_queue->is_closed = 1;
+ spin_unlock_irq(&event_queue->lock);
+ wake_up_interruptible(&event_queue->poll_wait);
+ kill_fasync(&event_queue->async_queue, SIGIO, POLL_IN);
+
+ spin_lock_irq(&event_queue->lock);
+ list_for_each_entry_safe(entry, tmp, &event_queue->event_list, list) {
+ if (entry->counter)
+ list_del(&entry->obj_list);
+ list_del(&entry->list);
+ kfree(entry);
+ }
+ spin_unlock_irq(&event_queue->lock);
+}
+
+static void
+uverbs_completion_event_file_destroy_uobj(struct ib_uobject *uobj,
+ enum rdma_remove_reason why)
+{
+ struct ib_uverbs_completion_event_file *file =
+ container_of(uobj, struct ib_uverbs_completion_event_file,
+ uobj);
+
+ ib_uverbs_free_event_queue(&file->ev_queue);
+}
+
+int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs)
+{
+ return 0;
+}
+EXPORT_SYMBOL(uverbs_destroy_def_handler);
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_COMP_CHANNEL,
+ UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file),
+ uverbs_completion_event_file_destroy_uobj,
+ &uverbs_event_fops,
+ "[infinibandevent]",
+ O_RDONLY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_MW_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MW_HANDLE,
+ UVERBS_OBJECT_MW,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw),
+ &UVERBS_METHOD(UVERBS_METHOD_MW_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_AH_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_AH_HANDLE,
+ UVERBS_OBJECT_AH,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah),
+ &UVERBS_METHOD(UVERBS_METHOD_AH_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_FLOW_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_HANDLE,
+ UVERBS_OBJECT_FLOW,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_FLOW,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object),
+ uverbs_free_flow),
+ &UVERBS_METHOD(UVERBS_METHOD_FLOW_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_RWQ_IND_TBL_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE,
+ UVERBS_OBJECT_RWQ_IND_TBL,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl),
+ &UVERBS_METHOD(UVERBS_METHOD_RWQ_IND_TBL_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_XRCD_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_XRCD_HANDLE,
+ UVERBS_OBJECT_XRCD,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_XRCD,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object),
+ uverbs_free_xrcd),
+ &UVERBS_METHOD(UVERBS_METHOD_XRCD_DESTROY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_PD_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd),
+ &UVERBS_METHOD(UVERBS_METHOD_PD_DESTROY));
+
+const struct uapi_definition uverbs_def_obj_intf[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_PD,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COMP_CHANNEL,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_AH,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MW,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_mw)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FLOW,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_flow)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+ UVERBS_OBJECT_RWQ_IND_TBL,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_rwq_ind_table)),
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_XRCD,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_xrcd)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_async_fd.c b/drivers/infiniband/core/uverbs_std_types_async_fd.c
new file mode 100644
index 000000000..cc24cfdf7
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_async_fd.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include <rdma/uverbs_ioctl.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+static int UVERBS_HANDLER(UVERBS_METHOD_ASYNC_EVENT_ALLOC)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_METHOD_ASYNC_EVENT_ALLOC);
+
+ ib_uverbs_init_async_event_file(
+ container_of(uobj, struct ib_uverbs_async_event_file, uobj));
+ return 0;
+}
+
+static void uverbs_async_event_destroy_uobj(struct ib_uobject *uobj,
+ enum rdma_remove_reason why)
+{
+ struct ib_uverbs_async_event_file *event_file =
+ container_of(uobj, struct ib_uverbs_async_event_file, uobj);
+
+ ib_unregister_event_handler(&event_file->event_handler);
+
+ if (why == RDMA_REMOVE_DRIVER_REMOVE)
+ ib_uverbs_async_handler(event_file, 0, IB_EVENT_DEVICE_FATAL,
+ NULL, NULL);
+}
+
+int uverbs_async_event_release(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_async_event_file *event_file;
+ struct ib_uobject *uobj = filp->private_data;
+ int ret;
+
+ if (!uobj)
+ return uverbs_uobject_fd_release(inode, filp);
+
+ event_file =
+ container_of(uobj, struct ib_uverbs_async_event_file, uobj);
+
+ /*
+ * The async event FD has to deliver IB_EVENT_DEVICE_FATAL even after
+ * disassociation, so cleaning the event list must only happen after
+ * release. The user knows it has reached the end of the event stream
+ * when it sees IB_EVENT_DEVICE_FATAL.
+ */
+ uverbs_uobject_get(uobj);
+ ret = uverbs_uobject_fd_release(inode, filp);
+ ib_uverbs_free_event_queue(&event_file->ev_queue);
+ uverbs_uobject_put(uobj);
+ return ret;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_ASYNC_EVENT_ALLOC,
+ UVERBS_ATTR_FD(UVERBS_ATTR_ASYNC_EVENT_ALLOC_FD_HANDLE,
+ UVERBS_OBJECT_ASYNC_EVENT,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_ASYNC_EVENT,
+ UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_async_event_file),
+ uverbs_async_event_destroy_uobj,
+ &uverbs_async_event_fops,
+ "[infinibandevent]",
+ O_RDONLY),
+ &UVERBS_METHOD(UVERBS_METHOD_ASYNC_EVENT_ALLOC));
+
+const struct uapi_definition uverbs_def_obj_async_fd[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_ASYNC_EVENT),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c
new file mode 100644
index 000000000..381aa5797
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_counters.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+
+static int uverbs_free_counters(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_counters *counters = uobject->object;
+ int ret;
+
+ if (atomic_read(&counters->usecnt))
+ return -EBUSY;
+
+ ret = counters->device->ops.destroy_counters(counters);
+ if (ret)
+ return ret;
+ kfree(counters);
+ return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj = uverbs_attr_get_uobject(
+ attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE);
+ struct ib_device *ib_dev = attrs->context->device;
+ struct ib_counters *counters;
+ int ret;
+
+ /*
+ * This check should be removed once the infrastructure
+ * have the ability to remove methods from parse tree once
+ * such condition is met.
+ */
+ if (!ib_dev->ops.create_counters)
+ return -EOPNOTSUPP;
+
+ counters = rdma_zalloc_drv_obj(ib_dev, ib_counters);
+ if (!counters)
+ return -ENOMEM;
+
+ counters->device = ib_dev;
+ counters->uobject = uobj;
+ uobj->object = counters;
+ atomic_set(&counters->usecnt, 0);
+
+ ret = ib_dev->ops.create_counters(counters, attrs);
+ if (ret)
+ kfree(counters);
+
+ return ret;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_counters_read_attr read_attr = {};
+ const struct uverbs_attr *uattr;
+ struct ib_counters *counters =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_READ_COUNTERS_HANDLE);
+ int ret;
+
+ if (!counters->device->ops.read_counters)
+ return -EOPNOTSUPP;
+
+ if (!atomic_read(&counters->usecnt))
+ return -EINVAL;
+
+ ret = uverbs_get_flags32(&read_attr.flags, attrs,
+ UVERBS_ATTR_READ_COUNTERS_FLAGS,
+ IB_UVERBS_READ_COUNTERS_PREFER_CACHED);
+ if (ret)
+ return ret;
+
+ uattr = uverbs_attr_get(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF);
+ if (IS_ERR(uattr))
+ return PTR_ERR(uattr);
+ read_attr.ncounters = uattr->ptr_attr.len / sizeof(u64);
+ read_attr.counters_buff = uverbs_zalloc(
+ attrs, array_size(read_attr.ncounters, sizeof(u64)));
+ if (IS_ERR(read_attr.counters_buff))
+ return PTR_ERR(read_attr.counters_buff);
+
+ ret = counters->device->ops.read_counters(counters, &read_attr, attrs);
+ if (ret)
+ return ret;
+
+ return uverbs_copy_to(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF,
+ read_attr.counters_buff,
+ read_attr.ncounters * sizeof(u64));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_COUNTERS_CREATE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_COUNTERS_HANDLE,
+ UVERBS_OBJECT_COUNTERS,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_COUNTERS_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_COUNTERS_HANDLE,
+ UVERBS_OBJECT_COUNTERS,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_COUNTERS_READ,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_READ_COUNTERS_HANDLE,
+ UVERBS_OBJECT_COUNTERS,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_READ_COUNTERS_BUFF,
+ UVERBS_ATTR_MIN_SIZE(0),
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_READ_COUNTERS_FLAGS,
+ enum ib_uverbs_read_counters_flags));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COUNTERS,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_counters),
+ &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_CREATE),
+ &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_DESTROY),
+ &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_READ));
+
+const struct uapi_definition uverbs_def_obj_counters[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COUNTERS,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_counters)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c
new file mode 100644
index 000000000..370ad7c83
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_cq.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+#include "restrack.h"
+
+static int uverbs_free_cq(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_cq *cq = uobject->object;
+ struct ib_uverbs_event_queue *ev_queue = cq->cq_context;
+ struct ib_ucq_object *ucq =
+ container_of(uobject, struct ib_ucq_object, uevent.uobject);
+ int ret;
+
+ ret = ib_destroy_cq_user(cq, &attrs->driver_udata);
+ if (ret)
+ return ret;
+
+ ib_uverbs_release_ucq(
+ ev_queue ? container_of(ev_queue,
+ struct ib_uverbs_completion_event_file,
+ ev_queue) :
+ NULL,
+ ucq);
+ return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_ucq_object *obj = container_of(
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),
+ typeof(*obj), uevent.uobject);
+ struct ib_device *ib_dev = attrs->context->device;
+ int ret;
+ u64 user_handle;
+ struct ib_cq_init_attr attr = {};
+ struct ib_cq *cq;
+ struct ib_uverbs_completion_event_file *ev_file = NULL;
+ struct ib_uobject *ev_file_uobj;
+
+ if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_copy_from(&attr.comp_vector, attrs,
+ UVERBS_ATTR_CREATE_CQ_COMP_VECTOR);
+ if (!ret)
+ ret = uverbs_copy_from(&attr.cqe, attrs,
+ UVERBS_ATTR_CREATE_CQ_CQE);
+ if (!ret)
+ ret = uverbs_copy_from(&user_handle, attrs,
+ UVERBS_ATTR_CREATE_CQ_USER_HANDLE);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_flags32(&attr.flags, attrs,
+ UVERBS_ATTR_CREATE_CQ_FLAGS,
+ IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION |
+ IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN);
+ if (ret)
+ return ret;
+
+ ev_file_uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL);
+ if (!IS_ERR(ev_file_uobj)) {
+ ev_file = container_of(ev_file_uobj,
+ struct ib_uverbs_completion_event_file,
+ uobj);
+ uverbs_uobject_get(ev_file_uobj);
+ }
+
+ obj->uevent.event_file = ib_uverbs_get_async_event(
+ attrs, UVERBS_ATTR_CREATE_CQ_EVENT_FD);
+
+ if (attr.comp_vector >= attrs->ufile->device->num_comp_vectors) {
+ ret = -EINVAL;
+ goto err_event_file;
+ }
+
+ INIT_LIST_HEAD(&obj->comp_list);
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+
+ cq = rdma_zalloc_drv_obj(ib_dev, ib_cq);
+ if (!cq) {
+ ret = -ENOMEM;
+ goto err_event_file;
+ }
+
+ cq->device = ib_dev;
+ cq->uobject = obj;
+ cq->comp_handler = ib_uverbs_comp_handler;
+ cq->event_handler = ib_uverbs_cq_event_handler;
+ cq->cq_context = ev_file ? &ev_file->ev_queue : NULL;
+ atomic_set(&cq->usecnt, 0);
+
+ rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
+ rdma_restrack_set_name(&cq->res, NULL);
+
+ ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata);
+ if (ret)
+ goto err_free;
+
+ obj->uevent.uobject.object = cq;
+ obj->uevent.uobject.user_handle = user_handle;
+ rdma_restrack_add(&cq->res);
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe,
+ sizeof(cq->cqe));
+ return ret;
+
+err_free:
+ rdma_restrack_put(&cq->res);
+ kfree(cq);
+err_event_file:
+ if (obj->uevent.event_file)
+ uverbs_uobject_put(&obj->uevent.event_file->uobj);
+ if (ev_file)
+ uverbs_uobject_put(ev_file_uobj);
+ return ret;
+};
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_CQ_CREATE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE,
+ UVERBS_OBJECT_CQ,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL,
+ UVERBS_OBJECT_COMP_CHANNEL,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_CQ_FLAGS,
+ enum ib_uverbs_ex_create_cq_flags),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_EVENT_FD,
+ UVERBS_OBJECT_ASYNC_EVENT,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_UHW());
+
+static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE);
+ struct ib_ucq_object *obj =
+ container_of(uobj, struct ib_ucq_object, uevent.uobject);
+ struct ib_uverbs_destroy_cq_resp resp = {
+ .comp_events_reported = obj->comp_events_reported,
+ .async_events_reported = obj->uevent.events_reported
+ };
+
+ return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_CQ_RESP, &resp,
+ sizeof(resp));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_CQ_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_CQ_HANDLE,
+ UVERBS_OBJECT_CQ,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_CQ_RESP,
+ UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_cq_resp),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_CQ,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), uverbs_free_cq),
+ &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE),
+ &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY)
+);
+
+const struct uapi_definition uverbs_def_obj_cq[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_CQ,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_cq)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_device.c b/drivers/infiniband/core/uverbs_std_types_device.c
new file mode 100644
index 000000000..049684880
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_device.c
@@ -0,0 +1,503 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <linux/overflow.h>
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/opa_addr.h>
+#include <rdma/ib_cache.h>
+
+/*
+ * This ioctl method allows calling any defined write or write_ex
+ * handler. This essentially replaces the hdr/ex_hdr system with the ioctl
+ * marshalling, and brings the non-ex path into the same marshalling as the ex
+ * path.
+ */
+static int UVERBS_HANDLER(UVERBS_METHOD_INVOKE_WRITE)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct uverbs_api *uapi = attrs->ufile->device->uapi;
+ const struct uverbs_api_write_method *method_elm;
+ u32 cmd;
+ int rc;
+
+ rc = uverbs_get_const(&cmd, attrs, UVERBS_ATTR_WRITE_CMD);
+ if (rc)
+ return rc;
+
+ method_elm = uapi_get_method(uapi, cmd);
+ if (IS_ERR(method_elm))
+ return PTR_ERR(method_elm);
+
+ uverbs_fill_udata(attrs, &attrs->ucore, UVERBS_ATTR_CORE_IN,
+ UVERBS_ATTR_CORE_OUT);
+
+ if (attrs->ucore.inlen < method_elm->req_size ||
+ attrs->ucore.outlen < method_elm->resp_size)
+ return -ENOSPC;
+
+ attrs->uobject = NULL;
+ rc = method_elm->handler(attrs);
+ if (attrs->uobject)
+ uverbs_finalize_object(attrs->uobject, UVERBS_ACCESS_NEW, true,
+ !rc, attrs);
+ return rc;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_INVOKE_WRITE,
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_WRITE_CMD,
+ enum ib_uverbs_write_cmds,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CORE_IN,
+ UVERBS_ATTR_MIN_SIZE(sizeof(u32)),
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CORE_OUT,
+ UVERBS_ATTR_MIN_SIZE(0),
+ UA_OPTIONAL),
+ UVERBS_ATTR_UHW());
+
+static uint32_t *
+gather_objects_handle(struct ib_uverbs_file *ufile,
+ const struct uverbs_api_object *uapi_object,
+ struct uverbs_attr_bundle *attrs,
+ ssize_t out_len,
+ u64 *total)
+{
+ u64 max_count = out_len / sizeof(u32);
+ struct ib_uobject *obj;
+ u64 count = 0;
+ u32 *handles;
+
+ /* Allocated memory that cannot page out where we gather
+ * all object ids under a spin_lock.
+ */
+ handles = uverbs_zalloc(attrs, out_len);
+ if (IS_ERR(handles))
+ return handles;
+
+ spin_lock_irq(&ufile->uobjects_lock);
+ list_for_each_entry(obj, &ufile->uobjects, list) {
+ u32 obj_id = obj->id;
+
+ if (obj->uapi_object != uapi_object)
+ continue;
+
+ if (count >= max_count)
+ break;
+
+ handles[count] = obj_id;
+ count++;
+ }
+ spin_unlock_irq(&ufile->uobjects_lock);
+
+ *total = count;
+ return handles;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_INFO_HANDLES)(
+ struct uverbs_attr_bundle *attrs)
+{
+ const struct uverbs_api_object *uapi_object;
+ ssize_t out_len;
+ u64 total = 0;
+ u16 object_id;
+ u32 *handles;
+ int ret;
+
+ out_len = uverbs_attr_get_len(attrs, UVERBS_ATTR_INFO_HANDLES_LIST);
+ if (out_len <= 0 || (out_len % sizeof(u32) != 0))
+ return -EINVAL;
+
+ ret = uverbs_get_const(&object_id, attrs, UVERBS_ATTR_INFO_OBJECT_ID);
+ if (ret)
+ return ret;
+
+ uapi_object = uapi_get_object(attrs->ufile->device->uapi, object_id);
+ if (IS_ERR(uapi_object))
+ return PTR_ERR(uapi_object);
+
+ handles = gather_objects_handle(attrs->ufile, uapi_object, attrs,
+ out_len, &total);
+ if (IS_ERR(handles))
+ return PTR_ERR(handles);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_INFO_HANDLES_LIST, handles,
+ sizeof(u32) * total);
+ if (ret)
+ goto err;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_INFO_TOTAL_HANDLES, &total,
+ sizeof(total));
+err:
+ return ret;
+}
+
+void copy_port_attr_to_resp(struct ib_port_attr *attr,
+ struct ib_uverbs_query_port_resp *resp,
+ struct ib_device *ib_dev, u8 port_num)
+{
+ resp->state = attr->state;
+ resp->max_mtu = attr->max_mtu;
+ resp->active_mtu = attr->active_mtu;
+ resp->gid_tbl_len = attr->gid_tbl_len;
+ resp->port_cap_flags = make_port_cap_flags(attr);
+ resp->max_msg_sz = attr->max_msg_sz;
+ resp->bad_pkey_cntr = attr->bad_pkey_cntr;
+ resp->qkey_viol_cntr = attr->qkey_viol_cntr;
+ resp->pkey_tbl_len = attr->pkey_tbl_len;
+
+ if (rdma_is_grh_required(ib_dev, port_num))
+ resp->flags |= IB_UVERBS_QPF_GRH_REQUIRED;
+
+ if (rdma_cap_opa_ah(ib_dev, port_num)) {
+ resp->lid = OPA_TO_IB_UCAST_LID(attr->lid);
+ resp->sm_lid = OPA_TO_IB_UCAST_LID(attr->sm_lid);
+ } else {
+ resp->lid = ib_lid_cpu16(attr->lid);
+ resp->sm_lid = ib_lid_cpu16(attr->sm_lid);
+ }
+
+ resp->lmc = attr->lmc;
+ resp->max_vl_num = attr->max_vl_num;
+ resp->sm_sl = attr->sm_sl;
+ resp->subnet_timeout = attr->subnet_timeout;
+ resp->init_type_reply = attr->init_type_reply;
+ resp->active_width = attr->active_width;
+ /* This ABI needs to be extended to provide any speed more than IB_SPEED_NDR */
+ resp->active_speed = min_t(u16, attr->active_speed, IB_SPEED_NDR);
+ resp->phys_state = attr->phys_state;
+ resp->link_layer = rdma_port_get_link_layer(ib_dev, port_num);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_device *ib_dev;
+ struct ib_port_attr attr = {};
+ struct ib_uverbs_query_port_resp_ex resp = {};
+ struct ib_ucontext *ucontext;
+ int ret;
+ u8 port_num;
+
+ ucontext = ib_uverbs_get_ucontext(attrs);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+ ib_dev = ucontext->device;
+
+ /* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. */
+ if (!ib_dev->ops.query_port)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_get_const(&port_num, attrs,
+ UVERBS_ATTR_QUERY_PORT_PORT_NUM);
+ if (ret)
+ return ret;
+
+ ret = ib_query_port(ib_dev, port_num, &attr);
+ if (ret)
+ return ret;
+
+ copy_port_attr_to_resp(&attr, &resp.legacy_resp, ib_dev, port_num);
+ resp.port_cap_flags2 = attr.port_cap_flags2;
+
+ return uverbs_copy_to_struct_or_zero(attrs, UVERBS_ATTR_QUERY_PORT_RESP,
+ &resp, sizeof(resp));
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)(
+ struct uverbs_attr_bundle *attrs)
+{
+ u32 num_comp = attrs->ufile->device->num_comp_vectors;
+ u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS;
+ int ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS,
+ &num_comp, sizeof(num_comp));
+ if (IS_UVERBS_COPY_ERR(ret))
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT,
+ &core_support, sizeof(core_support));
+ if (IS_UVERBS_COPY_ERR(ret))
+ return ret;
+
+ ret = ib_alloc_ucontext(attrs);
+ if (ret)
+ return ret;
+ ret = ib_init_ucontext(attrs);
+ if (ret) {
+ kfree(attrs->context);
+ attrs->context = NULL;
+ return ret;
+ }
+ return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_CONTEXT)(
+ struct uverbs_attr_bundle *attrs)
+{
+ u64 core_support = IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS;
+ struct ib_ucontext *ucontext;
+ struct ib_device *ib_dev;
+ u32 num_comp;
+ int ret;
+
+ ucontext = ib_uverbs_get_ucontext(attrs);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+ ib_dev = ucontext->device;
+
+ if (!ib_dev->ops.query_ucontext)
+ return -EOPNOTSUPP;
+
+ num_comp = attrs->ufile->device->num_comp_vectors;
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS,
+ &num_comp, sizeof(num_comp));
+ if (IS_UVERBS_COPY_ERR(ret))
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT,
+ &core_support, sizeof(core_support));
+ if (IS_UVERBS_COPY_ERR(ret))
+ return ret;
+
+ return ucontext->device->ops.query_ucontext(ucontext, attrs);
+}
+
+static int copy_gid_entries_to_user(struct uverbs_attr_bundle *attrs,
+ struct ib_uverbs_gid_entry *entries,
+ size_t num_entries, size_t user_entry_size)
+{
+ const struct uverbs_attr *attr;
+ void __user *user_entries;
+ size_t copy_len;
+ int ret;
+ int i;
+
+ if (user_entry_size == sizeof(*entries)) {
+ ret = uverbs_copy_to(attrs,
+ UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES,
+ entries, sizeof(*entries) * num_entries);
+ return ret;
+ }
+
+ copy_len = min_t(size_t, user_entry_size, sizeof(*entries));
+ attr = uverbs_attr_get(attrs, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES);
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ user_entries = u64_to_user_ptr(attr->ptr_attr.data);
+ for (i = 0; i < num_entries; i++) {
+ if (copy_to_user(user_entries, entries, copy_len))
+ return -EFAULT;
+
+ if (user_entry_size > sizeof(*entries)) {
+ if (clear_user(user_entries + sizeof(*entries),
+ user_entry_size - sizeof(*entries)))
+ return -EFAULT;
+ }
+
+ entries++;
+ user_entries += user_entry_size;
+ }
+
+ return uverbs_output_written(attrs,
+ UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_GID_TABLE)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_gid_entry *entries;
+ struct ib_ucontext *ucontext;
+ struct ib_device *ib_dev;
+ size_t user_entry_size;
+ ssize_t num_entries;
+ int max_entries;
+ u32 flags;
+ int ret;
+
+ ret = uverbs_get_flags32(&flags, attrs,
+ UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, 0);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_const(&user_entry_size, attrs,
+ UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE);
+ if (ret)
+ return ret;
+
+ if (!user_entry_size)
+ return -EINVAL;
+
+ max_entries = uverbs_attr_ptr_get_array_size(
+ attrs, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES,
+ user_entry_size);
+ if (max_entries <= 0)
+ return max_entries ?: -EINVAL;
+
+ ucontext = ib_uverbs_get_ucontext(attrs);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+ ib_dev = ucontext->device;
+
+ entries = uverbs_kcalloc(attrs, max_entries, sizeof(*entries));
+ if (IS_ERR(entries))
+ return PTR_ERR(entries);
+
+ num_entries = rdma_query_gid_table(ib_dev, entries, max_entries);
+ if (num_entries < 0)
+ return -EINVAL;
+
+ ret = copy_gid_entries_to_user(attrs, entries, num_entries,
+ user_entry_size);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs,
+ UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES,
+ &num_entries, sizeof(num_entries));
+ return ret;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_GID_ENTRY)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uverbs_gid_entry entry = {};
+ const struct ib_gid_attr *gid_attr;
+ struct ib_ucontext *ucontext;
+ struct ib_device *ib_dev;
+ struct net_device *ndev;
+ u32 gid_index;
+ u32 port_num;
+ u32 flags;
+ int ret;
+
+ ret = uverbs_get_flags32(&flags, attrs,
+ UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, 0);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_const(&port_num, attrs,
+ UVERBS_ATTR_QUERY_GID_ENTRY_PORT);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_const(&gid_index, attrs,
+ UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX);
+ if (ret)
+ return ret;
+
+ ucontext = ib_uverbs_get_ucontext(attrs);
+ if (IS_ERR(ucontext))
+ return PTR_ERR(ucontext);
+ ib_dev = ucontext->device;
+
+ if (!rdma_is_port_valid(ib_dev, port_num))
+ return -EINVAL;
+
+ gid_attr = rdma_get_gid_attr(ib_dev, port_num, gid_index);
+ if (IS_ERR(gid_attr))
+ return PTR_ERR(gid_attr);
+
+ memcpy(&entry.gid, &gid_attr->gid, sizeof(gid_attr->gid));
+ entry.gid_index = gid_attr->index;
+ entry.port_num = gid_attr->port_num;
+ entry.gid_type = gid_attr->gid_type;
+
+ rcu_read_lock();
+ ndev = rdma_read_gid_attr_ndev_rcu(gid_attr);
+ if (IS_ERR(ndev)) {
+ if (PTR_ERR(ndev) != -ENODEV) {
+ ret = PTR_ERR(ndev);
+ rcu_read_unlock();
+ goto out;
+ }
+ } else {
+ entry.netdev_ifindex = ndev->ifindex;
+ }
+ rcu_read_unlock();
+
+ ret = uverbs_copy_to_struct_or_zero(
+ attrs, UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, &entry,
+ sizeof(entry));
+out:
+ rdma_put_gid_attr(gid_attr);
+ return ret;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_GET_CONTEXT,
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_NUM_COMP_VECTORS,
+ UVERBS_ATTR_TYPE(u32), UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT,
+ UVERBS_ATTR_TYPE(u64), UA_OPTIONAL),
+ UVERBS_ATTR_UHW());
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_QUERY_CONTEXT,
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS,
+ UVERBS_ATTR_TYPE(u32), UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT,
+ UVERBS_ATTR_TYPE(u64), UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_INFO_HANDLES,
+ /* Also includes any device specific object ids */
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_INFO_OBJECT_ID,
+ enum uverbs_default_objects, UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_TOTAL_HANDLES,
+ UVERBS_ATTR_TYPE(u32), UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_INFO_HANDLES_LIST,
+ UVERBS_ATTR_MIN_SIZE(sizeof(u32)), UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_QUERY_PORT,
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_PORT_PORT_NUM, u8, UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(
+ UVERBS_ATTR_QUERY_PORT_RESP,
+ UVERBS_ATTR_STRUCT(struct ib_uverbs_query_port_resp_ex,
+ reserved),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_QUERY_GID_TABLE,
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, u64,
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, u32,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES,
+ UVERBS_ATTR_MIN_SIZE(0), UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES,
+ UVERBS_ATTR_TYPE(u64), UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_QUERY_GID_ENTRY,
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_ENTRY_PORT, u32,
+ UA_MANDATORY),
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX, u32,
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, u32,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY,
+ UVERBS_ATTR_STRUCT(struct ib_uverbs_gid_entry,
+ netdev_ifindex),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE,
+ &UVERBS_METHOD(UVERBS_METHOD_GET_CONTEXT),
+ &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE),
+ &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES),
+ &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT),
+ &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT),
+ &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_TABLE),
+ &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_ENTRY));
+
+const struct uapi_definition uverbs_def_obj_device[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DEVICE),
+ {},
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c
new file mode 100644
index 000000000..98c522cf8
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_dm.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+
+static int uverbs_free_dm(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_dm *dm = uobject->object;
+
+ if (atomic_read(&dm->usecnt))
+ return -EBUSY;
+
+ return dm->device->ops.dealloc_dm(dm, attrs);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_dm_alloc_attr attr = {};
+ struct ib_uobject *uobj =
+ uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE)
+ ->obj_attr.uobject;
+ struct ib_device *ib_dev = attrs->context->device;
+ struct ib_dm *dm;
+ int ret;
+
+ if (!ib_dev->ops.alloc_dm)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_copy_from(&attr.length, attrs,
+ UVERBS_ATTR_ALLOC_DM_LENGTH);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&attr.alignment, attrs,
+ UVERBS_ATTR_ALLOC_DM_ALIGNMENT);
+ if (ret)
+ return ret;
+
+ dm = ib_dev->ops.alloc_dm(ib_dev, attrs->context, &attr, attrs);
+ if (IS_ERR(dm))
+ return PTR_ERR(dm);
+
+ dm->device = ib_dev;
+ dm->length = attr.length;
+ dm->uobject = uobj;
+ atomic_set(&dm->usecnt, 0);
+
+ uobj->object = dm;
+
+ return 0;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_DM_ALLOC,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DM_HANDLE,
+ UVERBS_OBJECT_DM,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_ALIGNMENT,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_DM_FREE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DM_HANDLE,
+ UVERBS_OBJECT_DM,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DM,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_dm),
+ &UVERBS_METHOD(UVERBS_METHOD_DM_ALLOC),
+ &UVERBS_METHOD(UVERBS_METHOD_DM_FREE));
+
+const struct uapi_definition uverbs_def_obj_dm[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DM,
+ UAPI_DEF_OBJ_NEEDS_FN(dealloc_dm)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c
new file mode 100644
index 000000000..0ddcf6da6
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+
+static int uverbs_free_flow_action(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_flow_action *action = uobject->object;
+
+ if (atomic_read(&action->usecnt))
+ return -EBUSY;
+
+ return action->device->ops.destroy_flow_action(action);
+}
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_FLOW_ACTION_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE,
+ UVERBS_OBJECT_FLOW_ACTION,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_FLOW_ACTION,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_flow_action),
+ &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY));
+
+const struct uapi_definition uverbs_def_obj_flow_action[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+ UVERBS_OBJECT_FLOW_ACTION,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_flow_action)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c
new file mode 100644
index 000000000..03e1db5d1
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_mr.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ * Copyright (c) 2020, Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "rdma_core.h"
+#include "uverbs.h"
+#include <rdma/uverbs_std_types.h>
+#include "restrack.h"
+
+static int uverbs_free_mr(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ return ib_dereg_mr_user((struct ib_mr *)uobject->object,
+ &attrs->driver_udata);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_ADVISE_MR_PD_HANDLE);
+ enum ib_uverbs_advise_mr_advice advice;
+ struct ib_device *ib_dev = pd->device;
+ struct ib_sge *sg_list;
+ int num_sge;
+ u32 flags;
+ int ret;
+
+ /* FIXME: Extend the UAPI_DEF_OBJ_NEEDS_FN stuff.. */
+ if (!ib_dev->ops.advise_mr)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_get_const(&advice, attrs, UVERBS_ATTR_ADVISE_MR_ADVICE);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_flags32(&flags, attrs, UVERBS_ATTR_ADVISE_MR_FLAGS,
+ IB_UVERBS_ADVISE_MR_FLAG_FLUSH);
+ if (ret)
+ return ret;
+
+ num_sge = uverbs_attr_ptr_get_array_size(
+ attrs, UVERBS_ATTR_ADVISE_MR_SGE_LIST, sizeof(struct ib_sge));
+ if (num_sge <= 0)
+ return num_sge;
+
+ sg_list = uverbs_attr_get_alloced_ptr(attrs,
+ UVERBS_ATTR_ADVISE_MR_SGE_LIST);
+ return ib_dev->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
+ attrs);
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_dm_mr_attr attr = {};
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE);
+ struct ib_dm *dm =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_DM_HANDLE);
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_PD_HANDLE);
+ struct ib_device *ib_dev = pd->device;
+
+ struct ib_mr *mr;
+ int ret;
+
+ if (!ib_dev->ops.reg_dm_mr)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_copy_from(&attr.offset, attrs, UVERBS_ATTR_REG_DM_MR_OFFSET);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&attr.length, attrs,
+ UVERBS_ATTR_REG_DM_MR_LENGTH);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_flags32(&attr.access_flags, attrs,
+ UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS,
+ IB_ACCESS_SUPPORTED);
+ if (ret)
+ return ret;
+
+ if (!(attr.access_flags & IB_ZERO_BASED))
+ return -EINVAL;
+
+ ret = ib_check_mr_access(ib_dev, attr.access_flags);
+ if (ret)
+ return ret;
+
+ if (attr.offset > dm->length || attr.length > dm->length ||
+ attr.length > dm->length - attr.offset)
+ return -EINVAL;
+
+ mr = pd->device->ops.reg_dm_mr(pd, dm, &attr, attrs);
+ if (IS_ERR(mr))
+ return PTR_ERR(mr);
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->type = IB_MR_TYPE_DM;
+ mr->dm = dm;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&dm->usecnt);
+
+ rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_set_name(&mr->res, NULL);
+ rdma_restrack_add(&mr->res);
+ uobj->object = mr;
+
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_LKEY, &mr->lkey,
+ sizeof(mr->lkey));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_RKEY,
+ &mr->rkey, sizeof(mr->rkey));
+ return ret;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_MR)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_mr *mr =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_QUERY_MR_HANDLE);
+ int ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LKEY, &mr->lkey,
+ sizeof(mr->lkey));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_RKEY,
+ &mr->rkey, sizeof(mr->rkey));
+
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_LENGTH,
+ &mr->length, sizeof(mr->length));
+
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_MR_RESP_IOVA,
+ &mr->iova, sizeof(mr->iova));
+
+ return IS_UVERBS_COPY_ERR(ret) ? ret : 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE);
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE);
+ struct ib_device *ib_dev = pd->device;
+
+ u64 offset, length, iova;
+ u32 fd, access_flags;
+ struct ib_mr *mr;
+ int ret;
+
+ if (!ib_dev->ops.reg_user_mr_dmabuf)
+ return -EOPNOTSUPP;
+
+ ret = uverbs_copy_from(&offset, attrs,
+ UVERBS_ATTR_REG_DMABUF_MR_OFFSET);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&length, attrs,
+ UVERBS_ATTR_REG_DMABUF_MR_LENGTH);
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_from(&iova, attrs,
+ UVERBS_ATTR_REG_DMABUF_MR_IOVA);
+ if (ret)
+ return ret;
+
+ if ((offset & ~PAGE_MASK) != (iova & ~PAGE_MASK))
+ return -EINVAL;
+
+ ret = uverbs_copy_from(&fd, attrs,
+ UVERBS_ATTR_REG_DMABUF_MR_FD);
+ if (ret)
+ return ret;
+
+ ret = uverbs_get_flags32(&access_flags, attrs,
+ UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC |
+ IB_ACCESS_RELAXED_ORDERING);
+ if (ret)
+ return ret;
+
+ ret = ib_check_mr_access(ib_dev, access_flags);
+ if (ret)
+ return ret;
+
+ mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd,
+ access_flags,
+ &attrs->driver_udata);
+ if (IS_ERR(mr))
+ return PTR_ERR(mr);
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->type = IB_MR_TYPE_USER;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+
+ rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_set_name(&mr->res, NULL);
+ rdma_restrack_add(&mr->res);
+ uobj->object = mr;
+
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DMABUF_MR_HANDLE);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY,
+ &mr->lkey, sizeof(mr->lkey));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY,
+ &mr->rkey, sizeof(mr->rkey));
+ return ret;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_ADVISE_MR,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_ADVISE_MR_ADVICE,
+ enum ib_uverbs_advise_mr_advice,
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_ADVISE_MR_FLAGS,
+ enum ib_uverbs_advise_mr_flag,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ADVISE_MR_SGE_LIST,
+ UVERBS_ATTR_MIN_SIZE(sizeof(struct ib_uverbs_sge)),
+ UA_MANDATORY,
+ UA_ALLOC_AND_COPY));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_QUERY_MR,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_QUERY_MR_HANDLE,
+ UVERBS_OBJECT_MR,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_RKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_MR_RESP_IOVA,
+ UVERBS_ATTR_TYPE(u64),
+ UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_DM_MR_REG,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE,
+ UVERBS_OBJECT_MR,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS,
+ enum ib_access_flags),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_DM_HANDLE,
+ UVERBS_OBJECT_DM,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_LKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_RKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_REG_DMABUF_MR,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DMABUF_MR_HANDLE,
+ UVERBS_OBJECT_MR,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DMABUF_MR_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_OFFSET,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_LENGTH,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_IOVA,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DMABUF_MR_FD,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+ enum ib_access_flags),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DMABUF_MR_RESP_LKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+ UVERBS_METHOD_MR_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE,
+ UVERBS_OBJECT_MR,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_MR,
+ UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr),
+ &UVERBS_METHOD(UVERBS_METHOD_ADVISE_MR),
+ &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG),
+ &UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY),
+ &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR),
+ &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR));
+
+const struct uapi_definition uverbs_def_obj_mr[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR,
+ UAPI_DEF_OBJ_NEEDS_FN(dereg_mr)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c
new file mode 100644
index 000000000..dd1075466
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_qp.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+#include "core_priv.h"
+
+static int uverbs_free_qp(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_qp *qp = uobject->object;
+ struct ib_uqp_object *uqp =
+ container_of(uobject, struct ib_uqp_object, uevent.uobject);
+ int ret;
+
+ /*
+ * If this is a user triggered destroy then do not allow destruction
+ * until the user cleans up all the mcast bindings. Unlike in other
+ * places we forcibly clean up the mcast attachments for !DESTROY
+ * because the mcast attaches are not ubojects and will not be
+ * destroyed by anything else during cleanup processing.
+ */
+ if (why == RDMA_REMOVE_DESTROY) {
+ if (!list_empty(&uqp->mcast_list))
+ return -EBUSY;
+ } else if (qp == qp->real_qp) {
+ ib_uverbs_detach_umcast(qp, uqp);
+ }
+
+ ret = ib_destroy_qp_user(qp, &attrs->driver_udata);
+ if (ret)
+ return ret;
+
+ if (uqp->uxrcd)
+ atomic_dec(&uqp->uxrcd->refcnt);
+
+ ib_uverbs_release_uevent(&uqp->uevent);
+ return 0;
+}
+
+static int check_creation_flags(enum ib_qp_type qp_type,
+ u32 create_flags)
+{
+ create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL;
+
+ if (!create_flags || qp_type == IB_QPT_DRIVER)
+ return 0;
+
+ if (qp_type != IB_QPT_RAW_PACKET && qp_type != IB_QPT_UD)
+ return -EINVAL;
+
+ if ((create_flags & IB_UVERBS_QP_CREATE_SCATTER_FCS ||
+ create_flags & IB_UVERBS_QP_CREATE_CVLAN_STRIPPING) &&
+ qp_type != IB_QPT_RAW_PACKET)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void set_caps(struct ib_qp_init_attr *attr,
+ struct ib_uverbs_qp_cap *cap, bool req)
+{
+ if (req) {
+ attr->cap.max_send_wr = cap->max_send_wr;
+ attr->cap.max_recv_wr = cap->max_recv_wr;
+ attr->cap.max_send_sge = cap->max_send_sge;
+ attr->cap.max_recv_sge = cap->max_recv_sge;
+ attr->cap.max_inline_data = cap->max_inline_data;
+ } else {
+ cap->max_send_wr = attr->cap.max_send_wr;
+ cap->max_recv_wr = attr->cap.max_recv_wr;
+ cap->max_send_sge = attr->cap.max_send_sge;
+ cap->max_recv_sge = attr->cap.max_recv_sge;
+ cap->max_inline_data = attr->cap.max_inline_data;
+ }
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uqp_object *obj = container_of(
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_QP_HANDLE),
+ typeof(*obj), uevent.uobject);
+ struct ib_qp_init_attr attr = {};
+ struct ib_uverbs_qp_cap cap = {};
+ struct ib_rwq_ind_table *rwq_ind_tbl = NULL;
+ struct ib_qp *qp;
+ struct ib_pd *pd = NULL;
+ struct ib_srq *srq = NULL;
+ struct ib_cq *recv_cq = NULL;
+ struct ib_cq *send_cq = NULL;
+ struct ib_xrcd *xrcd = NULL;
+ struct ib_uobject *xrcd_uobj = NULL;
+ struct ib_device *device;
+ u64 user_handle;
+ int ret;
+
+ ret = uverbs_copy_from_or_zero(&cap, attrs,
+ UVERBS_ATTR_CREATE_QP_CAP);
+ if (!ret)
+ ret = uverbs_copy_from(&user_handle, attrs,
+ UVERBS_ATTR_CREATE_QP_USER_HANDLE);
+ if (!ret)
+ ret = uverbs_get_const(&attr.qp_type, attrs,
+ UVERBS_ATTR_CREATE_QP_TYPE);
+ if (ret)
+ return ret;
+
+ switch (attr.qp_type) {
+ case IB_QPT_XRC_TGT:
+ if (uverbs_attr_is_valid(attrs,
+ UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) ||
+ uverbs_attr_is_valid(attrs,
+ UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE) ||
+ uverbs_attr_is_valid(attrs,
+ UVERBS_ATTR_CREATE_QP_PD_HANDLE) ||
+ uverbs_attr_is_valid(attrs,
+ UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE))
+ return -EINVAL;
+
+ xrcd_uobj = uverbs_attr_get_uobject(attrs,
+ UVERBS_ATTR_CREATE_QP_XRCD_HANDLE);
+ if (IS_ERR(xrcd_uobj))
+ return PTR_ERR(xrcd_uobj);
+
+ xrcd = (struct ib_xrcd *)xrcd_uobj->object;
+ if (!xrcd)
+ return -EINVAL;
+ device = xrcd->device;
+ break;
+ case IB_UVERBS_QPT_RAW_PACKET:
+ if (!capable(CAP_NET_RAW))
+ return -EPERM;
+ fallthrough;
+ case IB_UVERBS_QPT_RC:
+ case IB_UVERBS_QPT_UC:
+ case IB_UVERBS_QPT_UD:
+ case IB_UVERBS_QPT_XRC_INI:
+ case IB_UVERBS_QPT_DRIVER:
+ if (uverbs_attr_is_valid(attrs,
+ UVERBS_ATTR_CREATE_QP_XRCD_HANDLE) ||
+ (uverbs_attr_is_valid(attrs,
+ UVERBS_ATTR_CREATE_QP_SRQ_HANDLE) &&
+ attr.qp_type == IB_QPT_XRC_INI))
+ return -EINVAL;
+
+ pd = uverbs_attr_get_obj(attrs,
+ UVERBS_ATTR_CREATE_QP_PD_HANDLE);
+ if (IS_ERR(pd))
+ return PTR_ERR(pd);
+
+ rwq_ind_tbl = uverbs_attr_get_obj(attrs,
+ UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE);
+ if (!IS_ERR(rwq_ind_tbl)) {
+ if (cap.max_recv_wr || cap.max_recv_sge ||
+ uverbs_attr_is_valid(attrs,
+ UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) ||
+ uverbs_attr_is_valid(attrs,
+ UVERBS_ATTR_CREATE_QP_SRQ_HANDLE))
+ return -EINVAL;
+
+ /* send_cq is optinal */
+ if (cap.max_send_wr) {
+ send_cq = uverbs_attr_get_obj(attrs,
+ UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE);
+ if (IS_ERR(send_cq))
+ return PTR_ERR(send_cq);
+ }
+ attr.rwq_ind_tbl = rwq_ind_tbl;
+ } else {
+ send_cq = uverbs_attr_get_obj(attrs,
+ UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE);
+ if (IS_ERR(send_cq))
+ return PTR_ERR(send_cq);
+
+ if (attr.qp_type != IB_QPT_XRC_INI) {
+ recv_cq = uverbs_attr_get_obj(attrs,
+ UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE);
+ if (IS_ERR(recv_cq))
+ return PTR_ERR(recv_cq);
+ }
+ }
+
+ device = pd->device;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ ret = uverbs_get_flags32(&attr.create_flags, attrs,
+ UVERBS_ATTR_CREATE_QP_FLAGS,
+ IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
+ IB_UVERBS_QP_CREATE_SCATTER_FCS |
+ IB_UVERBS_QP_CREATE_CVLAN_STRIPPING |
+ IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING |
+ IB_UVERBS_QP_CREATE_SQ_SIG_ALL);
+ if (ret)
+ return ret;
+
+ ret = check_creation_flags(attr.qp_type, attr.create_flags);
+ if (ret)
+ return ret;
+
+ if (uverbs_attr_is_valid(attrs,
+ UVERBS_ATTR_CREATE_QP_SOURCE_QPN)) {
+ ret = uverbs_copy_from(&attr.source_qpn, attrs,
+ UVERBS_ATTR_CREATE_QP_SOURCE_QPN);
+ if (ret)
+ return ret;
+ attr.create_flags |= IB_QP_CREATE_SOURCE_QPN;
+ }
+
+ srq = uverbs_attr_get_obj(attrs,
+ UVERBS_ATTR_CREATE_QP_SRQ_HANDLE);
+ if (!IS_ERR(srq)) {
+ if ((srq->srq_type == IB_SRQT_XRC &&
+ attr.qp_type != IB_QPT_XRC_TGT) ||
+ (srq->srq_type != IB_SRQT_XRC &&
+ attr.qp_type == IB_QPT_XRC_TGT))
+ return -EINVAL;
+ attr.srq = srq;
+ }
+
+ obj->uevent.event_file = ib_uverbs_get_async_event(attrs,
+ UVERBS_ATTR_CREATE_QP_EVENT_FD);
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
+ obj->uevent.uobject.user_handle = user_handle;
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.send_cq = send_cq;
+ attr.recv_cq = recv_cq;
+ attr.xrcd = xrcd;
+ if (attr.create_flags & IB_UVERBS_QP_CREATE_SQ_SIG_ALL) {
+ /* This creation bit is uverbs one, need to mask before
+ * calling drivers. It was added to prevent an extra user attr
+ * only for that when using ioctl.
+ */
+ attr.create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL;
+ attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ } else {
+ attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ }
+
+ set_caps(&attr, &cap, true);
+ mutex_init(&obj->mcast_lock);
+
+ qp = ib_create_qp_user(device, pd, &attr, &attrs->driver_udata, obj,
+ KBUILD_MODNAME);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_put;
+ }
+ ib_qp_usecnt_inc(qp);
+
+ if (attr.qp_type == IB_QPT_XRC_TGT) {
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
+ uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ }
+
+ obj->uevent.uobject.object = qp;
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_QP_HANDLE);
+
+ set_caps(&attr, &cap, false);
+ ret = uverbs_copy_to_struct_or_zero(attrs,
+ UVERBS_ATTR_CREATE_QP_RESP_CAP, &cap,
+ sizeof(cap));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_QP_RESP_QP_NUM,
+ &qp->qp_num,
+ sizeof(qp->qp_num));
+
+ return ret;
+err_put:
+ if (obj->uevent.event_file)
+ uverbs_uobject_put(&obj->uevent.event_file->uobj);
+ return ret;
+};
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_QP_CREATE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_HANDLE,
+ UVERBS_OBJECT_QP,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_XRCD_HANDLE,
+ UVERBS_OBJECT_XRCD,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SRQ_HANDLE,
+ UVERBS_OBJECT_SRQ,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE,
+ UVERBS_OBJECT_CQ,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE,
+ UVERBS_OBJECT_CQ,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE,
+ UVERBS_OBJECT_RWQ_IND_TBL,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_USER_HANDLE,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_CAP,
+ UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap,
+ max_inline_data),
+ UA_MANDATORY),
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_QP_TYPE,
+ enum ib_uverbs_qp_type,
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_QP_FLAGS,
+ enum ib_uverbs_qp_create_flags,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_SOURCE_QPN,
+ UVERBS_ATTR_TYPE(u32),
+ UA_OPTIONAL),
+ UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_QP_EVENT_FD,
+ UVERBS_OBJECT_ASYNC_EVENT,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_CAP,
+ UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap,
+ max_inline_data),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_QP_NUM,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_UHW());
+
+static int UVERBS_HANDLER(UVERBS_METHOD_QP_DESTROY)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_QP_HANDLE);
+ struct ib_uqp_object *obj =
+ container_of(uobj, struct ib_uqp_object, uevent.uobject);
+ struct ib_uverbs_destroy_qp_resp resp = {
+ .events_reported = obj->uevent.events_reported
+ };
+
+ return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_QP_RESP, &resp,
+ sizeof(resp));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_QP_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_QP_HANDLE,
+ UVERBS_OBJECT_QP,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_QP_RESP,
+ UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_qp_resp),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_QP,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp),
+ &UVERBS_METHOD(UVERBS_METHOD_QP_CREATE),
+ &UVERBS_METHOD(UVERBS_METHOD_QP_DESTROY));
+
+const struct uapi_definition uverbs_def_obj_qp[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_srq.c b/drivers/infiniband/core/uverbs_std_types_srq.c
new file mode 100644
index 000000000..e5513f828
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_srq.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+static int uverbs_free_srq(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_srq *srq = uobject->object;
+ struct ib_uevent_object *uevent =
+ container_of(uobject, struct ib_uevent_object, uobject);
+ enum ib_srq_type srq_type = srq->srq_type;
+ int ret;
+
+ ret = ib_destroy_srq_user(srq, &attrs->driver_udata);
+ if (ret)
+ return ret;
+
+ if (srq_type == IB_SRQT_XRC) {
+ struct ib_usrq_object *us =
+ container_of(uobject, struct ib_usrq_object,
+ uevent.uobject);
+
+ atomic_dec(&us->uxrcd->refcnt);
+ }
+
+ ib_uverbs_release_uevent(uevent);
+ return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_CREATE)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_usrq_object *obj = container_of(
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE),
+ typeof(*obj), uevent.uobject);
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_SRQ_PD_HANDLE);
+ struct ib_srq_init_attr attr = {};
+ struct ib_uobject *xrcd_uobj;
+ struct ib_srq *srq;
+ u64 user_handle;
+ int ret;
+
+ ret = uverbs_copy_from(&attr.attr.max_sge, attrs,
+ UVERBS_ATTR_CREATE_SRQ_MAX_SGE);
+ if (!ret)
+ ret = uverbs_copy_from(&attr.attr.max_wr, attrs,
+ UVERBS_ATTR_CREATE_SRQ_MAX_WR);
+ if (!ret)
+ ret = uverbs_copy_from(&attr.attr.srq_limit, attrs,
+ UVERBS_ATTR_CREATE_SRQ_LIMIT);
+ if (!ret)
+ ret = uverbs_copy_from(&user_handle, attrs,
+ UVERBS_ATTR_CREATE_SRQ_USER_HANDLE);
+ if (!ret)
+ ret = uverbs_get_const(&attr.srq_type, attrs,
+ UVERBS_ATTR_CREATE_SRQ_TYPE);
+ if (ret)
+ return ret;
+
+ if (ib_srq_has_cq(attr.srq_type)) {
+ attr.ext.cq = uverbs_attr_get_obj(attrs,
+ UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE);
+ if (IS_ERR(attr.ext.cq))
+ return PTR_ERR(attr.ext.cq);
+ }
+
+ switch (attr.srq_type) {
+ case IB_UVERBS_SRQT_XRC:
+ xrcd_uobj = uverbs_attr_get_uobject(attrs,
+ UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE);
+ if (IS_ERR(xrcd_uobj))
+ return PTR_ERR(xrcd_uobj);
+
+ attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object;
+ if (!attr.ext.xrc.xrcd)
+ return -EINVAL;
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
+ uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ break;
+ case IB_UVERBS_SRQT_TM:
+ ret = uverbs_copy_from(&attr.ext.tag_matching.max_num_tags,
+ attrs,
+ UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS);
+ if (ret)
+ return ret;
+ break;
+ case IB_UVERBS_SRQT_BASIC:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ obj->uevent.event_file = ib_uverbs_get_async_event(attrs,
+ UVERBS_ATTR_CREATE_SRQ_EVENT_FD);
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ attr.event_handler = ib_uverbs_srq_event_handler;
+ obj->uevent.uobject.user_handle = user_handle;
+
+ srq = ib_create_srq_user(pd, &attr, obj, &attrs->driver_udata);
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
+ goto err;
+ }
+
+ obj->uevent.uobject.object = srq;
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR,
+ &attr.attr.max_wr,
+ sizeof(attr.attr.max_wr));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE,
+ &attr.attr.max_sge,
+ sizeof(attr.attr.max_sge));
+ if (ret)
+ return ret;
+
+ if (attr.srq_type == IB_SRQT_XRC) {
+ ret = uverbs_copy_to(attrs,
+ UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM,
+ &srq->ext.xrc.srq_num,
+ sizeof(srq->ext.xrc.srq_num));
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+err:
+ if (obj->uevent.event_file)
+ uverbs_uobject_put(&obj->uevent.event_file->uobj);
+ if (attr.srq_type == IB_SRQT_XRC)
+ atomic_dec(&obj->uxrcd->refcnt);
+ return ret;
+};
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_SRQ_CREATE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_HANDLE,
+ UVERBS_OBJECT_SRQ,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_SRQ_TYPE,
+ enum ib_uverbs_srq_type,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_USER_HANDLE,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_WR,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_SGE,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_LIMIT,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE,
+ UVERBS_OBJECT_XRCD,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE,
+ UVERBS_OBJECT_CQ,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS,
+ UVERBS_ATTR_TYPE(u32),
+ UA_OPTIONAL),
+ UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_SRQ_EVENT_FD,
+ UVERBS_OBJECT_ASYNC_EVENT,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM,
+ UVERBS_ATTR_TYPE(u32),
+ UA_OPTIONAL),
+ UVERBS_ATTR_UHW());
+
+static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_DESTROY)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_SRQ_HANDLE);
+ struct ib_usrq_object *obj =
+ container_of(uobj, struct ib_usrq_object, uevent.uobject);
+ struct ib_uverbs_destroy_srq_resp resp = {
+ .events_reported = obj->uevent.events_reported
+ };
+
+ return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_SRQ_RESP, &resp,
+ sizeof(resp));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_SRQ_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_SRQ_HANDLE,
+ UVERBS_OBJECT_SRQ,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_SRQ_RESP,
+ UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_srq_resp),
+ UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_SRQ,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object),
+ uverbs_free_srq),
+ &UVERBS_METHOD(UVERBS_METHOD_SRQ_CREATE),
+ &UVERBS_METHOD(UVERBS_METHOD_SRQ_DESTROY)
+);
+
+const struct uapi_definition uverbs_def_obj_srq[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_std_types_wq.c b/drivers/infiniband/core/uverbs_std_types_wq.c
new file mode 100644
index 000000000..7ded83393
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_std_types_wq.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <rdma/uverbs_std_types.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+static int uverbs_free_wq(struct ib_uobject *uobject,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_wq *wq = uobject->object;
+ struct ib_uwq_object *uwq =
+ container_of(uobject, struct ib_uwq_object, uevent.uobject);
+ int ret;
+
+ ret = ib_destroy_wq_user(wq, &attrs->driver_udata);
+ if (ret)
+ return ret;
+
+ ib_uverbs_release_uevent(&uwq->uevent);
+ return 0;
+}
+
+static int UVERBS_HANDLER(UVERBS_METHOD_WQ_CREATE)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uwq_object *obj = container_of(
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE),
+ typeof(*obj), uevent.uobject);
+ struct ib_pd *pd =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_PD_HANDLE);
+ struct ib_cq *cq =
+ uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_CQ_HANDLE);
+ struct ib_wq_init_attr wq_init_attr = {};
+ struct ib_wq *wq;
+ u64 user_handle;
+ int ret;
+
+ ret = uverbs_get_flags32(&wq_init_attr.create_flags, attrs,
+ UVERBS_ATTR_CREATE_WQ_FLAGS,
+ IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING |
+ IB_UVERBS_WQ_FLAGS_SCATTER_FCS |
+ IB_UVERBS_WQ_FLAGS_DELAY_DROP |
+ IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING);
+ if (!ret)
+ ret = uverbs_copy_from(&wq_init_attr.max_sge, attrs,
+ UVERBS_ATTR_CREATE_WQ_MAX_SGE);
+ if (!ret)
+ ret = uverbs_copy_from(&wq_init_attr.max_wr, attrs,
+ UVERBS_ATTR_CREATE_WQ_MAX_WR);
+ if (!ret)
+ ret = uverbs_copy_from(&user_handle, attrs,
+ UVERBS_ATTR_CREATE_WQ_USER_HANDLE);
+ if (!ret)
+ ret = uverbs_get_const(&wq_init_attr.wq_type, attrs,
+ UVERBS_ATTR_CREATE_WQ_TYPE);
+ if (ret)
+ return ret;
+
+ if (wq_init_attr.wq_type != IB_WQT_RQ)
+ return -EINVAL;
+
+ obj->uevent.event_file = ib_uverbs_get_async_event(attrs,
+ UVERBS_ATTR_CREATE_WQ_EVENT_FD);
+ obj->uevent.uobject.user_handle = user_handle;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ wq_init_attr.event_handler = ib_uverbs_wq_event_handler;
+ wq_init_attr.wq_context = attrs->ufile;
+ wq_init_attr.cq = cq;
+
+ wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata);
+ if (IS_ERR(wq)) {
+ ret = PTR_ERR(wq);
+ goto err;
+ }
+
+ obj->uevent.uobject.object = wq;
+ wq->wq_type = wq_init_attr.wq_type;
+ wq->cq = cq;
+ wq->pd = pd;
+ wq->device = pd->device;
+ wq->wq_context = wq_init_attr.wq_context;
+ atomic_set(&wq->usecnt, 0);
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&cq->usecnt);
+ wq->uobject = obj;
+ uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE);
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR,
+ &wq_init_attr.max_wr,
+ sizeof(wq_init_attr.max_wr));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE,
+ &wq_init_attr.max_sge,
+ sizeof(wq_init_attr.max_sge));
+ if (ret)
+ return ret;
+
+ ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM,
+ &wq->wq_num,
+ sizeof(wq->wq_num));
+ return ret;
+
+err:
+ if (obj->uevent.event_file)
+ uverbs_uobject_put(&obj->uevent.event_file->uobj);
+ return ret;
+};
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_WQ_CREATE,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_HANDLE,
+ UVERBS_OBJECT_WQ,
+ UVERBS_ACCESS_NEW,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_PD_HANDLE,
+ UVERBS_OBJECT_PD,
+ UVERBS_ACCESS_READ,
+ UA_MANDATORY),
+ UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_WQ_TYPE,
+ enum ib_wq_type,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_USER_HANDLE,
+ UVERBS_ATTR_TYPE(u64),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_WR,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_SGE,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_WQ_FLAGS,
+ enum ib_uverbs_wq_flags,
+ UA_MANDATORY),
+ UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_CQ_HANDLE,
+ UVERBS_OBJECT_CQ,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_WQ_EVENT_FD,
+ UVERBS_OBJECT_ASYNC_EVENT,
+ UVERBS_ACCESS_READ,
+ UA_OPTIONAL),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM,
+ UVERBS_ATTR_TYPE(u32),
+ UA_OPTIONAL),
+ UVERBS_ATTR_UHW());
+
+static int UVERBS_HANDLER(UVERBS_METHOD_WQ_DESTROY)(
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj =
+ uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_WQ_HANDLE);
+ struct ib_uwq_object *obj =
+ container_of(uobj, struct ib_uwq_object, uevent.uobject);
+
+ return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_WQ_RESP,
+ &obj->uevent.events_reported,
+ sizeof(obj->uevent.events_reported));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+ UVERBS_METHOD_WQ_DESTROY,
+ UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_WQ_HANDLE,
+ UVERBS_OBJECT_WQ,
+ UVERBS_ACCESS_DESTROY,
+ UA_MANDATORY),
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_WQ_RESP,
+ UVERBS_ATTR_TYPE(u32),
+ UA_MANDATORY));
+
+
+DECLARE_UVERBS_NAMED_OBJECT(
+ UVERBS_OBJECT_WQ,
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq),
+ &UVERBS_METHOD(UVERBS_METHOD_WQ_CREATE),
+ &UVERBS_METHOD(UVERBS_METHOD_WQ_DESTROY)
+);
+
+const struct uapi_definition uverbs_def_obj_wq[] = {
+ UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ,
+ UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)),
+ {}
+};
diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c
new file mode 100644
index 000000000..a02916a3a
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_uapi.c
@@ -0,0 +1,734 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ */
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/rdma_user_ioctl.h>
+#include <linux/bitops.h>
+#include "rdma_core.h"
+#include "uverbs.h"
+
+static int ib_uverbs_notsupp(struct uverbs_attr_bundle *attrs)
+{
+ return -EOPNOTSUPP;
+}
+
+static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size)
+{
+ void *elm;
+ int rc;
+
+ if (key == UVERBS_API_KEY_ERR)
+ return ERR_PTR(-EOVERFLOW);
+
+ elm = kzalloc(alloc_size, GFP_KERNEL);
+ if (!elm)
+ return ERR_PTR(-ENOMEM);
+ rc = radix_tree_insert(&uapi->radix, key, elm);
+ if (rc) {
+ kfree(elm);
+ return ERR_PTR(rc);
+ }
+
+ return elm;
+}
+
+static void *uapi_add_get_elm(struct uverbs_api *uapi, u32 key,
+ size_t alloc_size, bool *exists)
+{
+ void *elm;
+
+ elm = uapi_add_elm(uapi, key, alloc_size);
+ if (!IS_ERR(elm)) {
+ *exists = false;
+ return elm;
+ }
+
+ if (elm != ERR_PTR(-EEXIST))
+ return elm;
+
+ elm = radix_tree_lookup(&uapi->radix, key);
+ if (WARN_ON(!elm))
+ return ERR_PTR(-EINVAL);
+ *exists = true;
+ return elm;
+}
+
+static int uapi_create_write(struct uverbs_api *uapi,
+ struct ib_device *ibdev,
+ const struct uapi_definition *def,
+ u32 obj_key,
+ u32 *cur_method_key)
+{
+ struct uverbs_api_write_method *method_elm;
+ u32 method_key = obj_key;
+ bool exists;
+
+ if (def->write.is_ex)
+ method_key |= uapi_key_write_ex_method(def->write.command_num);
+ else
+ method_key |= uapi_key_write_method(def->write.command_num);
+
+ method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm),
+ &exists);
+ if (IS_ERR(method_elm))
+ return PTR_ERR(method_elm);
+
+ if (WARN_ON(exists && (def->write.is_ex != method_elm->is_ex)))
+ return -EINVAL;
+
+ method_elm->is_ex = def->write.is_ex;
+ method_elm->handler = def->func_write;
+ if (!def->write.is_ex)
+ method_elm->disabled = !(ibdev->uverbs_cmd_mask &
+ BIT_ULL(def->write.command_num));
+
+ if (!def->write.is_ex && def->func_write) {
+ method_elm->has_udata = def->write.has_udata;
+ method_elm->has_resp = def->write.has_resp;
+ method_elm->req_size = def->write.req_size;
+ method_elm->resp_size = def->write.resp_size;
+ }
+
+ *cur_method_key = method_key;
+ return 0;
+}
+
+static int uapi_merge_method(struct uverbs_api *uapi,
+ struct uverbs_api_object *obj_elm, u32 obj_key,
+ const struct uverbs_method_def *method,
+ bool is_driver)
+{
+ u32 method_key = obj_key | uapi_key_ioctl_method(method->id);
+ struct uverbs_api_ioctl_method *method_elm;
+ unsigned int i;
+ bool exists;
+
+ if (!method->attrs)
+ return 0;
+
+ method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm),
+ &exists);
+ if (IS_ERR(method_elm))
+ return PTR_ERR(method_elm);
+ if (exists) {
+ /*
+ * This occurs when a driver uses ADD_UVERBS_ATTRIBUTES_SIMPLE
+ */
+ if (WARN_ON(method->handler))
+ return -EINVAL;
+ } else {
+ WARN_ON(!method->handler);
+ rcu_assign_pointer(method_elm->handler, method->handler);
+ if (method->handler != uverbs_destroy_def_handler)
+ method_elm->driver_method = is_driver;
+ }
+
+ for (i = 0; i != method->num_attrs; i++) {
+ const struct uverbs_attr_def *attr = (*method->attrs)[i];
+ struct uverbs_api_attr *attr_slot;
+
+ if (!attr)
+ continue;
+
+ /*
+ * ENUM_IN contains the 'ids' pointer to the driver's .rodata,
+ * so if it is specified by a driver then it always makes this
+ * into a driver method.
+ */
+ if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN)
+ method_elm->driver_method |= is_driver;
+
+ /*
+ * Like other uobject based things we only support a single
+ * uobject being NEW'd or DESTROY'd
+ */
+ if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
+ u8 access = attr->attr.u2.objs_arr.access;
+
+ if (WARN_ON(access == UVERBS_ACCESS_NEW ||
+ access == UVERBS_ACCESS_DESTROY))
+ return -EINVAL;
+ }
+
+ attr_slot =
+ uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id),
+ sizeof(*attr_slot));
+ /* Attributes are not allowed to be modified by drivers */
+ if (IS_ERR(attr_slot))
+ return PTR_ERR(attr_slot);
+
+ attr_slot->spec = attr->attr;
+ }
+
+ return 0;
+}
+
+static int uapi_merge_obj_tree(struct uverbs_api *uapi,
+ const struct uverbs_object_def *obj,
+ bool is_driver)
+{
+ struct uverbs_api_object *obj_elm;
+ unsigned int i;
+ u32 obj_key;
+ bool exists;
+ int rc;
+
+ obj_key = uapi_key_obj(obj->id);
+ obj_elm = uapi_add_get_elm(uapi, obj_key, sizeof(*obj_elm), &exists);
+ if (IS_ERR(obj_elm))
+ return PTR_ERR(obj_elm);
+
+ if (obj->type_attrs) {
+ if (WARN_ON(obj_elm->type_attrs))
+ return -EINVAL;
+
+ obj_elm->id = obj->id;
+ obj_elm->type_attrs = obj->type_attrs;
+ obj_elm->type_class = obj->type_attrs->type_class;
+ /*
+ * Today drivers are only permitted to use idr_class and
+ * fd_class types. We can revoke the IDR types during
+ * disassociation, and the FD types require the driver to use
+ * struct file_operations.owner to prevent the driver module
+ * code from unloading while the file is open. This provides
+ * enough safety that uverbs_uobject_fd_release() will
+ * continue to work. Drivers using FD are responsible to
+ * handle disassociation of the device on their own.
+ */
+ if (WARN_ON(is_driver &&
+ obj->type_attrs->type_class != &uverbs_idr_class &&
+ obj->type_attrs->type_class != &uverbs_fd_class))
+ return -EINVAL;
+ }
+
+ if (!obj->methods)
+ return 0;
+
+ for (i = 0; i != obj->num_methods; i++) {
+ const struct uverbs_method_def *method = (*obj->methods)[i];
+
+ if (!method)
+ continue;
+
+ rc = uapi_merge_method(uapi, obj_elm, obj_key, method,
+ is_driver);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+static int uapi_disable_elm(struct uverbs_api *uapi,
+ const struct uapi_definition *def,
+ u32 obj_key,
+ u32 method_key)
+{
+ bool exists;
+
+ if (def->scope == UAPI_SCOPE_OBJECT) {
+ struct uverbs_api_object *obj_elm;
+
+ obj_elm = uapi_add_get_elm(
+ uapi, obj_key, sizeof(*obj_elm), &exists);
+ if (IS_ERR(obj_elm))
+ return PTR_ERR(obj_elm);
+ obj_elm->disabled = 1;
+ return 0;
+ }
+
+ if (def->scope == UAPI_SCOPE_METHOD &&
+ uapi_key_is_ioctl_method(method_key)) {
+ struct uverbs_api_ioctl_method *method_elm;
+
+ method_elm = uapi_add_get_elm(uapi, method_key,
+ sizeof(*method_elm), &exists);
+ if (IS_ERR(method_elm))
+ return PTR_ERR(method_elm);
+ method_elm->disabled = 1;
+ return 0;
+ }
+
+ if (def->scope == UAPI_SCOPE_METHOD &&
+ (uapi_key_is_write_method(method_key) ||
+ uapi_key_is_write_ex_method(method_key))) {
+ struct uverbs_api_write_method *write_elm;
+
+ write_elm = uapi_add_get_elm(uapi, method_key,
+ sizeof(*write_elm), &exists);
+ if (IS_ERR(write_elm))
+ return PTR_ERR(write_elm);
+ write_elm->disabled = 1;
+ return 0;
+ }
+
+ WARN_ON(true);
+ return -EINVAL;
+}
+
+static int uapi_merge_def(struct uverbs_api *uapi, struct ib_device *ibdev,
+ const struct uapi_definition *def_list,
+ bool is_driver)
+{
+ const struct uapi_definition *def = def_list;
+ u32 cur_obj_key = UVERBS_API_KEY_ERR;
+ u32 cur_method_key = UVERBS_API_KEY_ERR;
+ bool exists;
+ int rc;
+
+ if (!def_list)
+ return 0;
+
+ for (;; def++) {
+ switch ((enum uapi_definition_kind)def->kind) {
+ case UAPI_DEF_CHAIN:
+ rc = uapi_merge_def(uapi, ibdev, def->chain, is_driver);
+ if (rc)
+ return rc;
+ continue;
+
+ case UAPI_DEF_CHAIN_OBJ_TREE:
+ if (WARN_ON(def->object_start.object_id !=
+ def->chain_obj_tree->id))
+ return -EINVAL;
+
+ cur_obj_key = uapi_key_obj(def->object_start.object_id);
+ rc = uapi_merge_obj_tree(uapi, def->chain_obj_tree,
+ is_driver);
+ if (rc)
+ return rc;
+ continue;
+
+ case UAPI_DEF_END:
+ return 0;
+
+ case UAPI_DEF_IS_SUPPORTED_DEV_FN: {
+ void **ibdev_fn =
+ (void *)(&ibdev->ops) + def->needs_fn_offset;
+
+ if (*ibdev_fn)
+ continue;
+ rc = uapi_disable_elm(
+ uapi, def, cur_obj_key, cur_method_key);
+ if (rc)
+ return rc;
+ continue;
+ }
+
+ case UAPI_DEF_IS_SUPPORTED_FUNC:
+ if (def->func_is_supported(ibdev))
+ continue;
+ rc = uapi_disable_elm(
+ uapi, def, cur_obj_key, cur_method_key);
+ if (rc)
+ return rc;
+ continue;
+
+ case UAPI_DEF_OBJECT_START: {
+ struct uverbs_api_object *obj_elm;
+
+ cur_obj_key = uapi_key_obj(def->object_start.object_id);
+ obj_elm = uapi_add_get_elm(uapi, cur_obj_key,
+ sizeof(*obj_elm), &exists);
+ if (IS_ERR(obj_elm))
+ return PTR_ERR(obj_elm);
+ continue;
+ }
+
+ case UAPI_DEF_WRITE:
+ rc = uapi_create_write(
+ uapi, ibdev, def, cur_obj_key, &cur_method_key);
+ if (rc)
+ return rc;
+ continue;
+ }
+ WARN_ON(true);
+ return -EINVAL;
+ }
+}
+
+static int
+uapi_finalize_ioctl_method(struct uverbs_api *uapi,
+ struct uverbs_api_ioctl_method *method_elm,
+ u32 method_key)
+{
+ struct radix_tree_iter iter;
+ unsigned int num_attrs = 0;
+ unsigned int max_bkey = 0;
+ bool single_uobj = false;
+ void __rcu **slot;
+
+ method_elm->destroy_bkey = UVERBS_API_ATTR_BKEY_LEN;
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter,
+ uapi_key_attrs_start(method_key)) {
+ struct uverbs_api_attr *elm =
+ rcu_dereference_protected(*slot, true);
+ u32 attr_key = iter.index & UVERBS_API_ATTR_KEY_MASK;
+ u32 attr_bkey = uapi_bkey_attr(attr_key);
+ u8 type = elm->spec.type;
+
+ if (uapi_key_attr_to_ioctl_method(iter.index) !=
+ uapi_key_attr_to_ioctl_method(method_key))
+ break;
+
+ if (elm->spec.mandatory)
+ __set_bit(attr_bkey, method_elm->attr_mandatory);
+
+ if (elm->spec.is_udata)
+ method_elm->has_udata = true;
+
+ if (type == UVERBS_ATTR_TYPE_IDR ||
+ type == UVERBS_ATTR_TYPE_FD) {
+ u8 access = elm->spec.u.obj.access;
+
+ /*
+ * Verbs specs may only have one NEW/DESTROY, we don't
+ * have the infrastructure to abort multiple NEW's or
+ * cope with multiple DESTROY failure.
+ */
+ if (access == UVERBS_ACCESS_NEW ||
+ access == UVERBS_ACCESS_DESTROY) {
+ if (WARN_ON(single_uobj))
+ return -EINVAL;
+
+ single_uobj = true;
+ if (WARN_ON(!elm->spec.mandatory))
+ return -EINVAL;
+ }
+
+ if (access == UVERBS_ACCESS_DESTROY)
+ method_elm->destroy_bkey = attr_bkey;
+ }
+
+ max_bkey = max(max_bkey, attr_bkey);
+ num_attrs++;
+ }
+
+ method_elm->key_bitmap_len = max_bkey + 1;
+ WARN_ON(method_elm->key_bitmap_len > UVERBS_API_ATTR_BKEY_LEN);
+
+ uapi_compute_bundle_size(method_elm, num_attrs);
+ return 0;
+}
+
+static int uapi_finalize(struct uverbs_api *uapi)
+{
+ const struct uverbs_api_write_method **data;
+ unsigned long max_write_ex = 0;
+ unsigned long max_write = 0;
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ int rc;
+ int i;
+
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+ struct uverbs_api_ioctl_method *method_elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (uapi_key_is_ioctl_method(iter.index)) {
+ rc = uapi_finalize_ioctl_method(uapi, method_elm,
+ iter.index);
+ if (rc)
+ return rc;
+ }
+
+ if (uapi_key_is_write_method(iter.index))
+ max_write = max(max_write,
+ iter.index & UVERBS_API_ATTR_KEY_MASK);
+ if (uapi_key_is_write_ex_method(iter.index))
+ max_write_ex =
+ max(max_write_ex,
+ iter.index & UVERBS_API_ATTR_KEY_MASK);
+ }
+
+ uapi->notsupp_method.handler = ib_uverbs_notsupp;
+ uapi->num_write = max_write + 1;
+ uapi->num_write_ex = max_write_ex + 1;
+ data = kmalloc_array(uapi->num_write + uapi->num_write_ex,
+ sizeof(*uapi->write_methods), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ for (i = 0; i != uapi->num_write + uapi->num_write_ex; i++)
+ data[i] = &uapi->notsupp_method;
+ uapi->write_methods = data;
+ uapi->write_ex_methods = data + uapi->num_write;
+
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+ if (uapi_key_is_write_method(iter.index))
+ uapi->write_methods[iter.index &
+ UVERBS_API_ATTR_KEY_MASK] =
+ rcu_dereference_protected(*slot, true);
+ if (uapi_key_is_write_ex_method(iter.index))
+ uapi->write_ex_methods[iter.index &
+ UVERBS_API_ATTR_KEY_MASK] =
+ rcu_dereference_protected(*slot, true);
+ }
+
+ return 0;
+}
+
+static void uapi_remove_range(struct uverbs_api *uapi, u32 start, u32 last)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, start) {
+ if (iter.index > last)
+ return;
+ kfree(rcu_dereference_protected(*slot, true));
+ radix_tree_iter_delete(&uapi->radix, &iter, slot);
+ }
+}
+
+static void uapi_remove_object(struct uverbs_api *uapi, u32 obj_key)
+{
+ uapi_remove_range(uapi, obj_key,
+ obj_key | UVERBS_API_METHOD_KEY_MASK |
+ UVERBS_API_ATTR_KEY_MASK);
+}
+
+static void uapi_remove_method(struct uverbs_api *uapi, u32 method_key)
+{
+ uapi_remove_range(uapi, method_key,
+ method_key | UVERBS_API_ATTR_KEY_MASK);
+}
+
+
+static u32 uapi_get_obj_id(struct uverbs_attr_spec *spec)
+{
+ if (spec->type == UVERBS_ATTR_TYPE_IDR ||
+ spec->type == UVERBS_ATTR_TYPE_FD)
+ return spec->u.obj.obj_type;
+ if (spec->type == UVERBS_ATTR_TYPE_IDRS_ARRAY)
+ return spec->u2.objs_arr.obj_type;
+ return UVERBS_API_KEY_ERR;
+}
+
+static void uapi_key_okay(u32 key)
+{
+ unsigned int count = 0;
+
+ if (uapi_key_is_object(key))
+ count++;
+ if (uapi_key_is_ioctl_method(key))
+ count++;
+ if (uapi_key_is_write_method(key))
+ count++;
+ if (uapi_key_is_write_ex_method(key))
+ count++;
+ if (uapi_key_is_attr(key))
+ count++;
+ WARN(count != 1, "Bad count %u key=%x", count, key);
+}
+
+static void uapi_finalize_disable(struct uverbs_api *uapi)
+{
+ struct radix_tree_iter iter;
+ u32 starting_key = 0;
+ bool scan_again = false;
+ void __rcu **slot;
+
+again:
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, starting_key) {
+ uapi_key_okay(iter.index);
+
+ if (uapi_key_is_object(iter.index)) {
+ struct uverbs_api_object *obj_elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (obj_elm->disabled) {
+ /* Have to check all the attrs again */
+ scan_again = true;
+ starting_key = iter.index;
+ uapi_remove_object(uapi, iter.index);
+ goto again;
+ }
+ continue;
+ }
+
+ if (uapi_key_is_ioctl_method(iter.index)) {
+ struct uverbs_api_ioctl_method *method_elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (method_elm->disabled) {
+ starting_key = iter.index;
+ uapi_remove_method(uapi, iter.index);
+ goto again;
+ }
+ continue;
+ }
+
+ if (uapi_key_is_write_method(iter.index) ||
+ uapi_key_is_write_ex_method(iter.index)) {
+ struct uverbs_api_write_method *method_elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (method_elm->disabled) {
+ kfree(method_elm);
+ radix_tree_iter_delete(&uapi->radix, &iter, slot);
+ }
+ continue;
+ }
+
+ if (uapi_key_is_attr(iter.index)) {
+ struct uverbs_api_attr *attr_elm =
+ rcu_dereference_protected(*slot, true);
+ const struct uverbs_api_object *tmp_obj;
+ u32 obj_key;
+
+ /*
+ * If the method has a mandatory object handle
+ * attribute which relies on an object which is not
+ * present then the entire method is uncallable.
+ */
+ if (!attr_elm->spec.mandatory)
+ continue;
+ obj_key = uapi_get_obj_id(&attr_elm->spec);
+ if (obj_key == UVERBS_API_KEY_ERR)
+ continue;
+ tmp_obj = uapi_get_object(uapi, obj_key);
+ if (IS_ERR(tmp_obj)) {
+ if (PTR_ERR(tmp_obj) == -ENOMSG)
+ continue;
+ } else {
+ if (!tmp_obj->disabled)
+ continue;
+ }
+
+ starting_key = iter.index;
+ uapi_remove_method(
+ uapi,
+ iter.index & (UVERBS_API_OBJ_KEY_MASK |
+ UVERBS_API_METHOD_KEY_MASK));
+ goto again;
+ }
+
+ WARN_ON(false);
+ }
+
+ if (!scan_again)
+ return;
+ scan_again = false;
+ starting_key = 0;
+ goto again;
+}
+
+void uverbs_destroy_api(struct uverbs_api *uapi)
+{
+ if (!uapi)
+ return;
+
+ uapi_remove_range(uapi, 0, U32_MAX);
+ kfree(uapi->write_methods);
+ kfree(uapi);
+}
+
+static const struct uapi_definition uverbs_core_api[] = {
+ UAPI_DEF_CHAIN(uverbs_def_obj_async_fd),
+ UAPI_DEF_CHAIN(uverbs_def_obj_counters),
+ UAPI_DEF_CHAIN(uverbs_def_obj_cq),
+ UAPI_DEF_CHAIN(uverbs_def_obj_device),
+ UAPI_DEF_CHAIN(uverbs_def_obj_dm),
+ UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
+ UAPI_DEF_CHAIN(uverbs_def_obj_intf),
+ UAPI_DEF_CHAIN(uverbs_def_obj_mr),
+ UAPI_DEF_CHAIN(uverbs_def_obj_qp),
+ UAPI_DEF_CHAIN(uverbs_def_obj_srq),
+ UAPI_DEF_CHAIN(uverbs_def_obj_wq),
+ UAPI_DEF_CHAIN(uverbs_def_write_intf),
+ {},
+};
+
+struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev)
+{
+ struct uverbs_api *uapi;
+ int rc;
+
+ uapi = kzalloc(sizeof(*uapi), GFP_KERNEL);
+ if (!uapi)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL);
+ uapi->driver_id = ibdev->ops.driver_id;
+
+ rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false);
+ if (rc)
+ goto err;
+ rc = uapi_merge_def(uapi, ibdev, ibdev->driver_def, true);
+ if (rc)
+ goto err;
+
+ uapi_finalize_disable(uapi);
+ rc = uapi_finalize(uapi);
+ if (rc)
+ goto err;
+
+ return uapi;
+err:
+ if (rc != -ENOMEM)
+ dev_err(&ibdev->dev,
+ "Setup of uverbs_api failed, kernel parsing tree description is not valid (%d)??\n",
+ rc);
+
+ uverbs_destroy_api(uapi);
+ return ERR_PTR(rc);
+}
+
+/*
+ * The pre version is done before destroying the HW objects, it only blocks
+ * off method access. All methods that require the ib_dev or the module data
+ * must test one of these assignments prior to continuing.
+ */
+void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev)
+{
+ struct uverbs_api *uapi = uverbs_dev->uapi;
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+
+ rcu_assign_pointer(uverbs_dev->ib_dev, NULL);
+
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+ if (uapi_key_is_ioctl_method(iter.index)) {
+ struct uverbs_api_ioctl_method *method_elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (method_elm->driver_method)
+ rcu_assign_pointer(method_elm->handler, NULL);
+ }
+ }
+
+ synchronize_srcu(&uverbs_dev->disassociate_srcu);
+}
+
+/*
+ * Called when a driver disassociates from the ib_uverbs_device. The
+ * assumption is that the driver module will unload after. Replace everything
+ * related to the driver with NULL as a safety measure.
+ */
+void uverbs_disassociate_api(struct uverbs_api *uapi)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+
+ radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) {
+ if (uapi_key_is_object(iter.index)) {
+ struct uverbs_api_object *object_elm =
+ rcu_dereference_protected(*slot, true);
+
+ /*
+ * Some type_attrs are in the driver module. We don't
+ * bother to keep track of which since there should be
+ * no use of this after disassociate.
+ */
+ object_elm->type_attrs = NULL;
+ } else if (uapi_key_is_attr(iter.index)) {
+ struct uverbs_api_attr *elm =
+ rcu_dereference_protected(*slot, true);
+
+ if (elm->spec.type == UVERBS_ATTR_TYPE_ENUM_IN)
+ elm->spec.u2.enum_def.ids = NULL;
+ }
+ }
+}
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
new file mode 100644
index 000000000..b99b3cc28
--- /dev/null
+++ b/drivers/infiniband/core/verbs.c
@@ -0,0 +1,3030 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/addrconf.h>
+#include <linux/security.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+#include <rdma/rw.h>
+#include <rdma/lag.h>
+
+#include "core_priv.h"
+#include <trace/events/rdma_core.h>
+
+static int ib_resolve_eth_dmac(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr);
+
+static const char * const ib_events[] = {
+ [IB_EVENT_CQ_ERR] = "CQ error",
+ [IB_EVENT_QP_FATAL] = "QP fatal error",
+ [IB_EVENT_QP_REQ_ERR] = "QP request error",
+ [IB_EVENT_QP_ACCESS_ERR] = "QP access error",
+ [IB_EVENT_COMM_EST] = "communication established",
+ [IB_EVENT_SQ_DRAINED] = "send queue drained",
+ [IB_EVENT_PATH_MIG] = "path migration successful",
+ [IB_EVENT_PATH_MIG_ERR] = "path migration error",
+ [IB_EVENT_DEVICE_FATAL] = "device fatal error",
+ [IB_EVENT_PORT_ACTIVE] = "port active",
+ [IB_EVENT_PORT_ERR] = "port error",
+ [IB_EVENT_LID_CHANGE] = "LID change",
+ [IB_EVENT_PKEY_CHANGE] = "P_key change",
+ [IB_EVENT_SM_CHANGE] = "SM change",
+ [IB_EVENT_SRQ_ERR] = "SRQ error",
+ [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached",
+ [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached",
+ [IB_EVENT_CLIENT_REREGISTER] = "client reregister",
+ [IB_EVENT_GID_CHANGE] = "GID changed",
+};
+
+const char *__attribute_const__ ib_event_msg(enum ib_event_type event)
+{
+ size_t index = event;
+
+ return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ?
+ ib_events[index] : "unrecognized event";
+}
+EXPORT_SYMBOL(ib_event_msg);
+
+static const char * const wc_statuses[] = {
+ [IB_WC_SUCCESS] = "success",
+ [IB_WC_LOC_LEN_ERR] = "local length error",
+ [IB_WC_LOC_QP_OP_ERR] = "local QP operation error",
+ [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error",
+ [IB_WC_LOC_PROT_ERR] = "local protection error",
+ [IB_WC_WR_FLUSH_ERR] = "WR flushed",
+ [IB_WC_MW_BIND_ERR] = "memory bind operation error",
+ [IB_WC_BAD_RESP_ERR] = "bad response error",
+ [IB_WC_LOC_ACCESS_ERR] = "local access error",
+ [IB_WC_REM_INV_REQ_ERR] = "remote invalid request error",
+ [IB_WC_REM_ACCESS_ERR] = "remote access error",
+ [IB_WC_REM_OP_ERR] = "remote operation error",
+ [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded",
+ [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded",
+ [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error",
+ [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request",
+ [IB_WC_REM_ABORT_ERR] = "operation aborted",
+ [IB_WC_INV_EECN_ERR] = "invalid EE context number",
+ [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state",
+ [IB_WC_FATAL_ERR] = "fatal error",
+ [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error",
+ [IB_WC_GENERAL_ERR] = "general error",
+};
+
+const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status)
+{
+ size_t index = status;
+
+ return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ?
+ wc_statuses[index] : "unrecognized status";
+}
+EXPORT_SYMBOL(ib_wc_status_msg);
+
+__attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 1;
+ case IB_RATE_5_GBPS: return 2;
+ case IB_RATE_10_GBPS: return 4;
+ case IB_RATE_20_GBPS: return 8;
+ case IB_RATE_30_GBPS: return 12;
+ case IB_RATE_40_GBPS: return 16;
+ case IB_RATE_60_GBPS: return 24;
+ case IB_RATE_80_GBPS: return 32;
+ case IB_RATE_120_GBPS: return 48;
+ case IB_RATE_14_GBPS: return 6;
+ case IB_RATE_56_GBPS: return 22;
+ case IB_RATE_112_GBPS: return 45;
+ case IB_RATE_168_GBPS: return 67;
+ case IB_RATE_25_GBPS: return 10;
+ case IB_RATE_100_GBPS: return 40;
+ case IB_RATE_200_GBPS: return 80;
+ case IB_RATE_300_GBPS: return 120;
+ case IB_RATE_28_GBPS: return 11;
+ case IB_RATE_50_GBPS: return 20;
+ case IB_RATE_400_GBPS: return 160;
+ case IB_RATE_600_GBPS: return 240;
+ default: return -1;
+ }
+}
+EXPORT_SYMBOL(ib_rate_to_mult);
+
+__attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
+{
+ switch (mult) {
+ case 1: return IB_RATE_2_5_GBPS;
+ case 2: return IB_RATE_5_GBPS;
+ case 4: return IB_RATE_10_GBPS;
+ case 8: return IB_RATE_20_GBPS;
+ case 12: return IB_RATE_30_GBPS;
+ case 16: return IB_RATE_40_GBPS;
+ case 24: return IB_RATE_60_GBPS;
+ case 32: return IB_RATE_80_GBPS;
+ case 48: return IB_RATE_120_GBPS;
+ case 6: return IB_RATE_14_GBPS;
+ case 22: return IB_RATE_56_GBPS;
+ case 45: return IB_RATE_112_GBPS;
+ case 67: return IB_RATE_168_GBPS;
+ case 10: return IB_RATE_25_GBPS;
+ case 40: return IB_RATE_100_GBPS;
+ case 80: return IB_RATE_200_GBPS;
+ case 120: return IB_RATE_300_GBPS;
+ case 11: return IB_RATE_28_GBPS;
+ case 20: return IB_RATE_50_GBPS;
+ case 160: return IB_RATE_400_GBPS;
+ case 240: return IB_RATE_600_GBPS;
+ default: return IB_RATE_PORT_CURRENT;
+ }
+}
+EXPORT_SYMBOL(mult_to_ib_rate);
+
+__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 2500;
+ case IB_RATE_5_GBPS: return 5000;
+ case IB_RATE_10_GBPS: return 10000;
+ case IB_RATE_20_GBPS: return 20000;
+ case IB_RATE_30_GBPS: return 30000;
+ case IB_RATE_40_GBPS: return 40000;
+ case IB_RATE_60_GBPS: return 60000;
+ case IB_RATE_80_GBPS: return 80000;
+ case IB_RATE_120_GBPS: return 120000;
+ case IB_RATE_14_GBPS: return 14062;
+ case IB_RATE_56_GBPS: return 56250;
+ case IB_RATE_112_GBPS: return 112500;
+ case IB_RATE_168_GBPS: return 168750;
+ case IB_RATE_25_GBPS: return 25781;
+ case IB_RATE_100_GBPS: return 103125;
+ case IB_RATE_200_GBPS: return 206250;
+ case IB_RATE_300_GBPS: return 309375;
+ case IB_RATE_28_GBPS: return 28125;
+ case IB_RATE_50_GBPS: return 53125;
+ case IB_RATE_400_GBPS: return 425000;
+ case IB_RATE_600_GBPS: return 637500;
+ default: return -1;
+ }
+}
+EXPORT_SYMBOL(ib_rate_to_mbps);
+
+__attribute_const__ enum rdma_transport_type
+rdma_node_get_transport(unsigned int node_type)
+{
+
+ if (node_type == RDMA_NODE_USNIC)
+ return RDMA_TRANSPORT_USNIC;
+ if (node_type == RDMA_NODE_USNIC_UDP)
+ return RDMA_TRANSPORT_USNIC_UDP;
+ if (node_type == RDMA_NODE_RNIC)
+ return RDMA_TRANSPORT_IWARP;
+ if (node_type == RDMA_NODE_UNSPECIFIED)
+ return RDMA_TRANSPORT_UNSPECIFIED;
+
+ return RDMA_TRANSPORT_IB;
+}
+EXPORT_SYMBOL(rdma_node_get_transport);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
+ u32 port_num)
+{
+ enum rdma_transport_type lt;
+ if (device->ops.get_link_layer)
+ return device->ops.get_link_layer(device, port_num);
+
+ lt = rdma_node_get_transport(device->node_type);
+ if (lt == RDMA_TRANSPORT_IB)
+ return IB_LINK_LAYER_INFINIBAND;
+
+ return IB_LINK_LAYER_ETHERNET;
+}
+EXPORT_SYMBOL(rdma_port_get_link_layer);
+
+/* Protection domains */
+
+/**
+ * __ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ * @flags: protection domain flags
+ * @caller: caller's build-time module name
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ *
+ * Every PD has a local_dma_lkey which can be used as the lkey value for local
+ * memory operations.
+ */
+struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
+ const char *caller)
+{
+ struct ib_pd *pd;
+ int mr_access_flags = 0;
+ int ret;
+
+ pd = rdma_zalloc_drv_obj(device, ib_pd);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ pd->device = device;
+ pd->flags = flags;
+
+ rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD);
+ rdma_restrack_set_name(&pd->res, caller);
+
+ ret = device->ops.alloc_pd(pd, NULL);
+ if (ret) {
+ rdma_restrack_put(&pd->res);
+ kfree(pd);
+ return ERR_PTR(ret);
+ }
+ rdma_restrack_add(&pd->res);
+
+ if (device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY)
+ pd->local_dma_lkey = device->local_dma_lkey;
+ else
+ mr_access_flags |= IB_ACCESS_LOCAL_WRITE;
+
+ if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
+ pr_warn("%s: enabling unsafe global rkey\n", caller);
+ mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE;
+ }
+
+ if (mr_access_flags) {
+ struct ib_mr *mr;
+
+ mr = pd->device->ops.get_dma_mr(pd, mr_access_flags);
+ if (IS_ERR(mr)) {
+ ib_dealloc_pd(pd);
+ return ERR_CAST(mr);
+ }
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->type = IB_MR_TYPE_DMA;
+ mr->uobject = NULL;
+ mr->need_inval = false;
+
+ pd->__internal_mr = mr;
+
+ if (!(device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY))
+ pd->local_dma_lkey = pd->__internal_mr->lkey;
+
+ if (flags & IB_PD_UNSAFE_GLOBAL_RKEY)
+ pd->unsafe_global_rkey = pd->__internal_mr->rkey;
+ }
+
+ return pd;
+}
+EXPORT_SYMBOL(__ib_alloc_pd);
+
+/**
+ * ib_dealloc_pd_user - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ * @udata: Valid user data or NULL for kernel object
+ *
+ * It is an error to call this function while any resources in the pd still
+ * exist. The caller is responsible to synchronously destroy them and
+ * guarantee no new allocations will happen.
+ */
+int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
+{
+ int ret;
+
+ if (pd->__internal_mr) {
+ ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL);
+ WARN_ON(ret);
+ pd->__internal_mr = NULL;
+ }
+
+ ret = pd->device->ops.dealloc_pd(pd, udata);
+ if (ret)
+ return ret;
+
+ rdma_restrack_del(&pd->res);
+ kfree(pd);
+ return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_pd_user);
+
+/* Address handles */
+
+/**
+ * rdma_copy_ah_attr - Copy rdma ah attribute from source to destination.
+ * @dest: Pointer to destination ah_attr. Contents of the destination
+ * pointer is assumed to be invalid and attribute are overwritten.
+ * @src: Pointer to source ah_attr.
+ */
+void rdma_copy_ah_attr(struct rdma_ah_attr *dest,
+ const struct rdma_ah_attr *src)
+{
+ *dest = *src;
+ if (dest->grh.sgid_attr)
+ rdma_hold_gid_attr(dest->grh.sgid_attr);
+}
+EXPORT_SYMBOL(rdma_copy_ah_attr);
+
+/**
+ * rdma_replace_ah_attr - Replace valid ah_attr with new new one.
+ * @old: Pointer to existing ah_attr which needs to be replaced.
+ * old is assumed to be valid or zero'd
+ * @new: Pointer to the new ah_attr.
+ *
+ * rdma_replace_ah_attr() first releases any reference in the old ah_attr if
+ * old the ah_attr is valid; after that it copies the new attribute and holds
+ * the reference to the replaced ah_attr.
+ */
+void rdma_replace_ah_attr(struct rdma_ah_attr *old,
+ const struct rdma_ah_attr *new)
+{
+ rdma_destroy_ah_attr(old);
+ *old = *new;
+ if (old->grh.sgid_attr)
+ rdma_hold_gid_attr(old->grh.sgid_attr);
+}
+EXPORT_SYMBOL(rdma_replace_ah_attr);
+
+/**
+ * rdma_move_ah_attr - Move ah_attr pointed by source to destination.
+ * @dest: Pointer to destination ah_attr to copy to.
+ * dest is assumed to be valid or zero'd
+ * @src: Pointer to the new ah_attr.
+ *
+ * rdma_move_ah_attr() first releases any reference in the destination ah_attr
+ * if it is valid. This also transfers ownership of internal references from
+ * src to dest, making src invalid in the process. No new reference of the src
+ * ah_attr is taken.
+ */
+void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src)
+{
+ rdma_destroy_ah_attr(dest);
+ *dest = *src;
+ src->grh.sgid_attr = NULL;
+}
+EXPORT_SYMBOL(rdma_move_ah_attr);
+
+/*
+ * Validate that the rdma_ah_attr is valid for the device before passing it
+ * off to the driver.
+ */
+static int rdma_check_ah_attr(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr)
+{
+ if (!rdma_is_port_valid(device, ah_attr->port_num))
+ return -EINVAL;
+
+ if ((rdma_is_grh_required(device, ah_attr->port_num) ||
+ ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) &&
+ !(ah_attr->ah_flags & IB_AH_GRH))
+ return -EINVAL;
+
+ if (ah_attr->grh.sgid_attr) {
+ /*
+ * Make sure the passed sgid_attr is consistent with the
+ * parameters
+ */
+ if (ah_attr->grh.sgid_attr->index != ah_attr->grh.sgid_index ||
+ ah_attr->grh.sgid_attr->port_num != ah_attr->port_num)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * If the ah requires a GRH then ensure that sgid_attr pointer is filled in.
+ * On success the caller is responsible to call rdma_unfill_sgid_attr().
+ */
+static int rdma_fill_sgid_attr(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr **old_sgid_attr)
+{
+ const struct ib_gid_attr *sgid_attr;
+ struct ib_global_route *grh;
+ int ret;
+
+ *old_sgid_attr = ah_attr->grh.sgid_attr;
+
+ ret = rdma_check_ah_attr(device, ah_attr);
+ if (ret)
+ return ret;
+
+ if (!(ah_attr->ah_flags & IB_AH_GRH))
+ return 0;
+
+ grh = rdma_ah_retrieve_grh(ah_attr);
+ if (grh->sgid_attr)
+ return 0;
+
+ sgid_attr =
+ rdma_get_gid_attr(device, ah_attr->port_num, grh->sgid_index);
+ if (IS_ERR(sgid_attr))
+ return PTR_ERR(sgid_attr);
+
+ /* Move ownerhip of the kref into the ah_attr */
+ grh->sgid_attr = sgid_attr;
+ return 0;
+}
+
+static void rdma_unfill_sgid_attr(struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr *old_sgid_attr)
+{
+ /*
+ * Fill didn't change anything, the caller retains ownership of
+ * whatever it passed
+ */
+ if (ah_attr->grh.sgid_attr == old_sgid_attr)
+ return;
+
+ /*
+ * Otherwise, we need to undo what rdma_fill_sgid_attr so the caller
+ * doesn't see any change in the rdma_ah_attr. If we get here
+ * old_sgid_attr is NULL.
+ */
+ rdma_destroy_ah_attr(ah_attr);
+}
+
+static const struct ib_gid_attr *
+rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr *old_attr)
+{
+ if (old_attr)
+ rdma_put_gid_attr(old_attr);
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ rdma_hold_gid_attr(ah_attr->grh.sgid_attr);
+ return ah_attr->grh.sgid_attr;
+ }
+ return NULL;
+}
+
+static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,
+ struct rdma_ah_attr *ah_attr,
+ u32 flags,
+ struct ib_udata *udata,
+ struct net_device *xmit_slave)
+{
+ struct rdma_ah_init_attr init_attr = {};
+ struct ib_device *device = pd->device;
+ struct ib_ah *ah;
+ int ret;
+
+ might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE);
+
+ if (!udata && !device->ops.create_ah)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ ah = rdma_zalloc_drv_obj_gfp(
+ device, ib_ah,
+ (flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ ah->device = device;
+ ah->pd = pd;
+ ah->type = ah_attr->type;
+ ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL);
+ init_attr.ah_attr = ah_attr;
+ init_attr.flags = flags;
+ init_attr.xmit_slave = xmit_slave;
+
+ if (udata)
+ ret = device->ops.create_user_ah(ah, &init_attr, udata);
+ else
+ ret = device->ops.create_ah(ah, &init_attr, NULL);
+ if (ret) {
+ if (ah->sgid_attr)
+ rdma_put_gid_attr(ah->sgid_attr);
+ kfree(ah);
+ return ERR_PTR(ret);
+ }
+
+ atomic_inc(&pd->usecnt);
+ return ah;
+}
+
+/**
+ * rdma_create_ah - Creates an address handle for the
+ * given address vector.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ * @flags: Create address handle flags (see enum rdma_create_ah_flags).
+ *
+ * It returns 0 on success and returns appropriate error code on error.
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
+ u32 flags)
+{
+ const struct ib_gid_attr *old_sgid_attr;
+ struct net_device *slave;
+ struct ib_ah *ah;
+ int ret;
+
+ ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr);
+ if (ret)
+ return ERR_PTR(ret);
+ slave = rdma_lag_get_ah_roce_slave(pd->device, ah_attr,
+ (flags & RDMA_CREATE_AH_SLEEPABLE) ?
+ GFP_KERNEL : GFP_ATOMIC);
+ if (IS_ERR(slave)) {
+ rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
+ return (void *)slave;
+ }
+ ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave);
+ rdma_lag_put_ah_roce_slave(slave);
+ rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
+ return ah;
+}
+EXPORT_SYMBOL(rdma_create_ah);
+
+/**
+ * rdma_create_user_ah - Creates an address handle for the
+ * given address vector.
+ * It resolves destination mac address for ah attribute of RoCE type.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ * @udata: pointer to user's input output buffer information need by
+ * provider driver.
+ *
+ * It returns 0 on success and returns appropriate error code on error.
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *rdma_create_user_ah(struct ib_pd *pd,
+ struct rdma_ah_attr *ah_attr,
+ struct ib_udata *udata)
+{
+ const struct ib_gid_attr *old_sgid_attr;
+ struct ib_ah *ah;
+ int err;
+
+ err = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr);
+ if (err)
+ return ERR_PTR(err);
+
+ if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
+ err = ib_resolve_eth_dmac(pd->device, ah_attr);
+ if (err) {
+ ah = ERR_PTR(err);
+ goto out;
+ }
+ }
+
+ ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE,
+ udata, NULL);
+
+out:
+ rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
+ return ah;
+}
+EXPORT_SYMBOL(rdma_create_user_ah);
+
+int ib_get_rdma_header_version(const union rdma_network_hdr *hdr)
+{
+ const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh;
+ struct iphdr ip4h_checked;
+ const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh;
+
+ /* If it's IPv6, the version must be 6, otherwise, the first
+ * 20 bytes (before the IPv4 header) are garbled.
+ */
+ if (ip6h->version != 6)
+ return (ip4h->version == 4) ? 4 : 0;
+ /* version may be 6 or 4 because the first 20 bytes could be garbled */
+
+ /* RoCE v2 requires no options, thus header length
+ * must be 5 words
+ */
+ if (ip4h->ihl != 5)
+ return 6;
+
+ /* Verify checksum.
+ * We can't write on scattered buffers so we need to copy to
+ * temp buffer.
+ */
+ memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
+ ip4h_checked.check = 0;
+ ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5);
+ /* if IPv4 header checksum is OK, believe it */
+ if (ip4h->check == ip4h_checked.check)
+ return 4;
+ return 6;
+}
+EXPORT_SYMBOL(ib_get_rdma_header_version);
+
+static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
+ u32 port_num,
+ const struct ib_grh *grh)
+{
+ int grh_version;
+
+ if (rdma_protocol_ib(device, port_num))
+ return RDMA_NETWORK_IB;
+
+ grh_version = ib_get_rdma_header_version((union rdma_network_hdr *)grh);
+
+ if (grh_version == 4)
+ return RDMA_NETWORK_IPV4;
+
+ if (grh->next_hdr == IPPROTO_UDP)
+ return RDMA_NETWORK_IPV6;
+
+ return RDMA_NETWORK_ROCE_V1;
+}
+
+struct find_gid_index_context {
+ u16 vlan_id;
+ enum ib_gid_type gid_type;
+};
+
+static bool find_gid_index(const union ib_gid *gid,
+ const struct ib_gid_attr *gid_attr,
+ void *context)
+{
+ struct find_gid_index_context *ctx = context;
+ u16 vlan_id = 0xffff;
+ int ret;
+
+ if (ctx->gid_type != gid_attr->gid_type)
+ return false;
+
+ ret = rdma_read_gid_l2_fields(gid_attr, &vlan_id, NULL);
+ if (ret)
+ return false;
+
+ return ctx->vlan_id == vlan_id;
+}
+
+static const struct ib_gid_attr *
+get_sgid_attr_from_eth(struct ib_device *device, u32 port_num,
+ u16 vlan_id, const union ib_gid *sgid,
+ enum ib_gid_type gid_type)
+{
+ struct find_gid_index_context context = {.vlan_id = vlan_id,
+ .gid_type = gid_type};
+
+ return rdma_find_gid_by_filter(device, sgid, port_num, find_gid_index,
+ &context);
+}
+
+int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr,
+ enum rdma_network_type net_type,
+ union ib_gid *sgid, union ib_gid *dgid)
+{
+ struct sockaddr_in src_in;
+ struct sockaddr_in dst_in;
+ __be32 src_saddr, dst_saddr;
+
+ if (!sgid || !dgid)
+ return -EINVAL;
+
+ if (net_type == RDMA_NETWORK_IPV4) {
+ memcpy(&src_in.sin_addr.s_addr,
+ &hdr->roce4grh.saddr, 4);
+ memcpy(&dst_in.sin_addr.s_addr,
+ &hdr->roce4grh.daddr, 4);
+ src_saddr = src_in.sin_addr.s_addr;
+ dst_saddr = dst_in.sin_addr.s_addr;
+ ipv6_addr_set_v4mapped(src_saddr,
+ (struct in6_addr *)sgid);
+ ipv6_addr_set_v4mapped(dst_saddr,
+ (struct in6_addr *)dgid);
+ return 0;
+ } else if (net_type == RDMA_NETWORK_IPV6 ||
+ net_type == RDMA_NETWORK_IB || RDMA_NETWORK_ROCE_V1) {
+ *dgid = hdr->ibgrh.dgid;
+ *sgid = hdr->ibgrh.sgid;
+ return 0;
+ } else {
+ return -EINVAL;
+ }
+}
+EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr);
+
+/* Resolve destination mac address and hop limit for unicast destination
+ * GID entry, considering the source GID entry as well.
+ * ah_attribute must have have valid port_num, sgid_index.
+ */
+static int ib_resolve_unicast_gid_dmac(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr)
+{
+ struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr);
+ const struct ib_gid_attr *sgid_attr = grh->sgid_attr;
+ int hop_limit = 0xff;
+ int ret = 0;
+
+ /* If destination is link local and source GID is RoCEv1,
+ * IP stack is not used.
+ */
+ if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) &&
+ sgid_attr->gid_type == IB_GID_TYPE_ROCE) {
+ rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw,
+ ah_attr->roce.dmac);
+ return ret;
+ }
+
+ ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid,
+ ah_attr->roce.dmac,
+ sgid_attr, &hop_limit);
+
+ grh->hop_limit = hop_limit;
+ return ret;
+}
+
+/*
+ * This function initializes address handle attributes from the incoming packet.
+ * Incoming packet has dgid of the receiver node on which this code is
+ * getting executed and, sgid contains the GID of the sender.
+ *
+ * When resolving mac address of destination, the arrived dgid is used
+ * as sgid and, sgid is used as dgid because sgid contains destinations
+ * GID whom to respond to.
+ *
+ * On success the caller is responsible to call rdma_destroy_ah_attr on the
+ * attr.
+ */
+int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num,
+ const struct ib_wc *wc, const struct ib_grh *grh,
+ struct rdma_ah_attr *ah_attr)
+{
+ u32 flow_class;
+ int ret;
+ enum rdma_network_type net_type = RDMA_NETWORK_IB;
+ enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+ const struct ib_gid_attr *sgid_attr;
+ int hoplimit = 0xff;
+ union ib_gid dgid;
+ union ib_gid sgid;
+
+ might_sleep();
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->type = rdma_ah_find_type(device, port_num);
+ if (rdma_cap_eth_ah(device, port_num)) {
+ if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
+ net_type = wc->network_hdr_type;
+ else
+ net_type = ib_get_net_type_by_grh(device, port_num, grh);
+ gid_type = ib_network_to_gid_type(net_type);
+ }
+ ret = ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
+ &sgid, &dgid);
+ if (ret)
+ return ret;
+
+ rdma_ah_set_sl(ah_attr, wc->sl);
+ rdma_ah_set_port_num(ah_attr, port_num);
+
+ if (rdma_protocol_roce(device, port_num)) {
+ u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
+ wc->vlan_id : 0xffff;
+
+ if (!(wc->wc_flags & IB_WC_GRH))
+ return -EPROTOTYPE;
+
+ sgid_attr = get_sgid_attr_from_eth(device, port_num,
+ vlan_id, &dgid,
+ gid_type);
+ if (IS_ERR(sgid_attr))
+ return PTR_ERR(sgid_attr);
+
+ flow_class = be32_to_cpu(grh->version_tclass_flow);
+ rdma_move_grh_sgid_attr(ah_attr,
+ &sgid,
+ flow_class & 0xFFFFF,
+ hoplimit,
+ (flow_class >> 20) & 0xFF,
+ sgid_attr);
+
+ ret = ib_resolve_unicast_gid_dmac(device, ah_attr);
+ if (ret)
+ rdma_destroy_ah_attr(ah_attr);
+
+ return ret;
+ } else {
+ rdma_ah_set_dlid(ah_attr, wc->slid);
+ rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits);
+
+ if ((wc->wc_flags & IB_WC_GRH) == 0)
+ return 0;
+
+ if (dgid.global.interface_id !=
+ cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) {
+ sgid_attr = rdma_find_gid_by_port(
+ device, &dgid, IB_GID_TYPE_IB, port_num, NULL);
+ } else
+ sgid_attr = rdma_get_gid_attr(device, port_num, 0);
+
+ if (IS_ERR(sgid_attr))
+ return PTR_ERR(sgid_attr);
+ flow_class = be32_to_cpu(grh->version_tclass_flow);
+ rdma_move_grh_sgid_attr(ah_attr,
+ &sgid,
+ flow_class & 0xFFFFF,
+ hoplimit,
+ (flow_class >> 20) & 0xFF,
+ sgid_attr);
+
+ return 0;
+ }
+}
+EXPORT_SYMBOL(ib_init_ah_attr_from_wc);
+
+/**
+ * rdma_move_grh_sgid_attr - Sets the sgid attribute of GRH, taking ownership
+ * of the reference
+ *
+ * @attr: Pointer to AH attribute structure
+ * @dgid: Destination GID
+ * @flow_label: Flow label
+ * @hop_limit: Hop limit
+ * @traffic_class: traffic class
+ * @sgid_attr: Pointer to SGID attribute
+ *
+ * This takes ownership of the sgid_attr reference. The caller must ensure
+ * rdma_destroy_ah_attr() is called before destroying the rdma_ah_attr after
+ * calling this function.
+ */
+void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid,
+ u32 flow_label, u8 hop_limit, u8 traffic_class,
+ const struct ib_gid_attr *sgid_attr)
+{
+ rdma_ah_set_grh(attr, dgid, flow_label, sgid_attr->index, hop_limit,
+ traffic_class);
+ attr->grh.sgid_attr = sgid_attr;
+}
+EXPORT_SYMBOL(rdma_move_grh_sgid_attr);
+
+/**
+ * rdma_destroy_ah_attr - Release reference to SGID attribute of
+ * ah attribute.
+ * @ah_attr: Pointer to ah attribute
+ *
+ * Release reference to the SGID attribute of the ah attribute if it is
+ * non NULL. It is safe to call this multiple times, and safe to call it on
+ * a zero initialized ah_attr.
+ */
+void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr)
+{
+ if (ah_attr->grh.sgid_attr) {
+ rdma_put_gid_attr(ah_attr->grh.sgid_attr);
+ ah_attr->grh.sgid_attr = NULL;
+ }
+}
+EXPORT_SYMBOL(rdma_destroy_ah_attr);
+
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
+ const struct ib_grh *grh, u32 port_num)
+{
+ struct rdma_ah_attr ah_attr;
+ struct ib_ah *ah;
+ int ret;
+
+ ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE);
+
+ rdma_destroy_ah_attr(&ah_attr);
+ return ah;
+}
+EXPORT_SYMBOL(ib_create_ah_from_wc);
+
+int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
+{
+ const struct ib_gid_attr *old_sgid_attr;
+ int ret;
+
+ if (ah->type != ah_attr->type)
+ return -EINVAL;
+
+ ret = rdma_fill_sgid_attr(ah->device, ah_attr, &old_sgid_attr);
+ if (ret)
+ return ret;
+
+ ret = ah->device->ops.modify_ah ?
+ ah->device->ops.modify_ah(ah, ah_attr) :
+ -EOPNOTSUPP;
+
+ ah->sgid_attr = rdma_update_sgid_attr(ah_attr, ah->sgid_attr);
+ rdma_unfill_sgid_attr(ah_attr, old_sgid_attr);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_modify_ah);
+
+int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
+{
+ ah_attr->grh.sgid_attr = NULL;
+
+ return ah->device->ops.query_ah ?
+ ah->device->ops.query_ah(ah, ah_attr) :
+ -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(rdma_query_ah);
+
+int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata)
+{
+ const struct ib_gid_attr *sgid_attr = ah->sgid_attr;
+ struct ib_pd *pd;
+ int ret;
+
+ might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE);
+
+ pd = ah->pd;
+
+ ret = ah->device->ops.destroy_ah(ah, flags);
+ if (ret)
+ return ret;
+
+ atomic_dec(&pd->usecnt);
+ if (sgid_attr)
+ rdma_put_gid_attr(sgid_attr);
+
+ kfree(ah);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_destroy_ah_user);
+
+/* Shared receive queues */
+
+/**
+ * ib_create_srq_user - Creates a SRQ associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the SRQ.
+ * @srq_init_attr: A list of initial attributes required to create the
+ * SRQ. If SRQ creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created SRQ.
+ * @uobject: uobject pointer if this is not a kernel SRQ
+ * @udata: udata pointer if this is not a kernel SRQ
+ *
+ * srq_attr->max_wr and srq_attr->max_sge are read the determine the
+ * requested size of the SRQ, and set to the actual values allocated
+ * on return. If ib_create_srq() succeeds, then max_wr and max_sge
+ * will always be at least as large as the requested values.
+ */
+struct ib_srq *ib_create_srq_user(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_usrq_object *uobject,
+ struct ib_udata *udata)
+{
+ struct ib_srq *srq;
+ int ret;
+
+ srq = rdma_zalloc_drv_obj(pd->device, ib_srq);
+ if (!srq)
+ return ERR_PTR(-ENOMEM);
+
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->event_handler = srq_init_attr->event_handler;
+ srq->srq_context = srq_init_attr->srq_context;
+ srq->srq_type = srq_init_attr->srq_type;
+ srq->uobject = uobject;
+
+ if (ib_srq_has_cq(srq->srq_type)) {
+ srq->ext.cq = srq_init_attr->ext.cq;
+ atomic_inc(&srq->ext.cq->usecnt);
+ }
+ if (srq->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+ if (srq->ext.xrc.xrcd)
+ atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+ }
+ atomic_inc(&pd->usecnt);
+
+ rdma_restrack_new(&srq->res, RDMA_RESTRACK_SRQ);
+ rdma_restrack_parent_name(&srq->res, &pd->res);
+
+ ret = pd->device->ops.create_srq(srq, srq_init_attr, udata);
+ if (ret) {
+ rdma_restrack_put(&srq->res);
+ atomic_dec(&pd->usecnt);
+ if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd)
+ atomic_dec(&srq->ext.xrc.xrcd->usecnt);
+ if (ib_srq_has_cq(srq->srq_type))
+ atomic_dec(&srq->ext.cq->usecnt);
+ kfree(srq);
+ return ERR_PTR(ret);
+ }
+
+ rdma_restrack_add(&srq->res);
+
+ return srq;
+}
+EXPORT_SYMBOL(ib_create_srq_user);
+
+int ib_modify_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask)
+{
+ return srq->device->ops.modify_srq ?
+ srq->device->ops.modify_srq(srq, srq_attr, srq_attr_mask,
+ NULL) : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(ib_modify_srq);
+
+int ib_query_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr)
+{
+ return srq->device->ops.query_srq ?
+ srq->device->ops.query_srq(srq, srq_attr) : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(ib_query_srq);
+
+int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata)
+{
+ int ret;
+
+ if (atomic_read(&srq->usecnt))
+ return -EBUSY;
+
+ ret = srq->device->ops.destroy_srq(srq, udata);
+ if (ret)
+ return ret;
+
+ atomic_dec(&srq->pd->usecnt);
+ if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd)
+ atomic_dec(&srq->ext.xrc.xrcd->usecnt);
+ if (ib_srq_has_cq(srq->srq_type))
+ atomic_dec(&srq->ext.cq->usecnt);
+ rdma_restrack_del(&srq->res);
+ kfree(srq);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_srq_user);
+
+/* Queue pairs */
+
+static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
+{
+ struct ib_qp *qp = context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->device->qp_open_list_lock, flags);
+ list_for_each_entry(event->element.qp, &qp->open_list, open_list)
+ if (event->element.qp->event_handler)
+ event->element.qp->event_handler(event, event->element.qp->qp_context);
+ spin_unlock_irqrestore(&qp->device->qp_open_list_lock, flags);
+}
+
+static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
+ void (*event_handler)(struct ib_event *, void *),
+ void *qp_context)
+{
+ struct ib_qp *qp;
+ unsigned long flags;
+ int err;
+
+ qp = kzalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->real_qp = real_qp;
+ err = ib_open_shared_qp_security(qp, real_qp->device);
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ qp->real_qp = real_qp;
+ atomic_inc(&real_qp->usecnt);
+ qp->device = real_qp->device;
+ qp->event_handler = event_handler;
+ qp->qp_context = qp_context;
+ qp->qp_num = real_qp->qp_num;
+ qp->qp_type = real_qp->qp_type;
+
+ spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags);
+ list_add(&qp->open_list, &real_qp->open_list);
+ spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags);
+
+ return qp;
+}
+
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+ struct ib_qp_open_attr *qp_open_attr)
+{
+ struct ib_qp *qp, *real_qp;
+
+ if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
+ return ERR_PTR(-EINVAL);
+
+ down_read(&xrcd->tgt_qps_rwsem);
+ real_qp = xa_load(&xrcd->tgt_qps, qp_open_attr->qp_num);
+ if (!real_qp) {
+ up_read(&xrcd->tgt_qps_rwsem);
+ return ERR_PTR(-EINVAL);
+ }
+ qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
+ qp_open_attr->qp_context);
+ up_read(&xrcd->tgt_qps_rwsem);
+ return qp;
+}
+EXPORT_SYMBOL(ib_open_qp);
+
+static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ib_qp *real_qp = qp;
+ int err;
+
+ qp->event_handler = __ib_shared_qp_event_handler;
+ qp->qp_context = qp;
+ qp->pd = NULL;
+ qp->send_cq = qp->recv_cq = NULL;
+ qp->srq = NULL;
+ qp->xrcd = qp_init_attr->xrcd;
+ atomic_inc(&qp_init_attr->xrcd->usecnt);
+ INIT_LIST_HEAD(&qp->open_list);
+
+ qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+ qp_init_attr->qp_context);
+ if (IS_ERR(qp))
+ return qp;
+
+ err = xa_err(xa_store(&qp_init_attr->xrcd->tgt_qps, real_qp->qp_num,
+ real_qp, GFP_KERNEL));
+ if (err) {
+ ib_close_qp(qp);
+ return ERR_PTR(err);
+ }
+ return qp;
+}
+
+static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *attr,
+ struct ib_udata *udata,
+ struct ib_uqp_object *uobj, const char *caller)
+{
+ struct ib_udata dummy = {};
+ struct ib_qp *qp;
+ int ret;
+
+ if (!dev->ops.create_qp)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ qp = rdma_zalloc_drv_obj_numa(dev, ib_qp);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->device = dev;
+ qp->pd = pd;
+ qp->uobject = uobj;
+ qp->real_qp = qp;
+
+ qp->qp_type = attr->qp_type;
+ qp->rwq_ind_tbl = attr->rwq_ind_tbl;
+ qp->srq = attr->srq;
+ qp->event_handler = attr->event_handler;
+ qp->port = attr->port_num;
+ qp->qp_context = attr->qp_context;
+
+ spin_lock_init(&qp->mr_lock);
+ INIT_LIST_HEAD(&qp->rdma_mrs);
+ INIT_LIST_HEAD(&qp->sig_mrs);
+
+ qp->send_cq = attr->send_cq;
+ qp->recv_cq = attr->recv_cq;
+
+ rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP);
+ WARN_ONCE(!udata && !caller, "Missing kernel QP owner");
+ rdma_restrack_set_name(&qp->res, udata ? NULL : caller);
+ ret = dev->ops.create_qp(qp, attr, udata);
+ if (ret)
+ goto err_create;
+
+ /*
+ * TODO: The mlx4 internally overwrites send_cq and recv_cq.
+ * Unfortunately, it is not an easy task to fix that driver.
+ */
+ qp->send_cq = attr->send_cq;
+ qp->recv_cq = attr->recv_cq;
+
+ ret = ib_create_qp_security(qp, dev);
+ if (ret)
+ goto err_security;
+
+ rdma_restrack_add(&qp->res);
+ return qp;
+
+err_security:
+ qp->device->ops.destroy_qp(qp, udata ? &dummy : NULL);
+err_create:
+ rdma_restrack_put(&qp->res);
+ kfree(qp);
+ return ERR_PTR(ret);
+
+}
+
+/**
+ * ib_create_qp_user - Creates a QP associated with the specified protection
+ * domain.
+ * @dev: IB device
+ * @pd: The protection domain associated with the QP.
+ * @attr: A list of initial attributes required to create the
+ * QP. If QP creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created QP.
+ * @udata: User data
+ * @uobj: uverbs obect
+ * @caller: caller's build-time module name
+ */
+struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *attr,
+ struct ib_udata *udata,
+ struct ib_uqp_object *uobj, const char *caller)
+{
+ struct ib_qp *qp, *xrc_qp;
+
+ if (attr->qp_type == IB_QPT_XRC_TGT)
+ qp = create_qp(dev, pd, attr, NULL, NULL, caller);
+ else
+ qp = create_qp(dev, pd, attr, udata, uobj, NULL);
+ if (attr->qp_type != IB_QPT_XRC_TGT || IS_ERR(qp))
+ return qp;
+
+ xrc_qp = create_xrc_qp_user(qp, attr);
+ if (IS_ERR(xrc_qp)) {
+ ib_destroy_qp(qp);
+ return xrc_qp;
+ }
+
+ xrc_qp->uobject = uobj;
+ return xrc_qp;
+}
+EXPORT_SYMBOL(ib_create_qp_user);
+
+void ib_qp_usecnt_inc(struct ib_qp *qp)
+{
+ if (qp->pd)
+ atomic_inc(&qp->pd->usecnt);
+ if (qp->send_cq)
+ atomic_inc(&qp->send_cq->usecnt);
+ if (qp->recv_cq)
+ atomic_inc(&qp->recv_cq->usecnt);
+ if (qp->srq)
+ atomic_inc(&qp->srq->usecnt);
+ if (qp->rwq_ind_tbl)
+ atomic_inc(&qp->rwq_ind_tbl->usecnt);
+}
+EXPORT_SYMBOL(ib_qp_usecnt_inc);
+
+void ib_qp_usecnt_dec(struct ib_qp *qp)
+{
+ if (qp->rwq_ind_tbl)
+ atomic_dec(&qp->rwq_ind_tbl->usecnt);
+ if (qp->srq)
+ atomic_dec(&qp->srq->usecnt);
+ if (qp->recv_cq)
+ atomic_dec(&qp->recv_cq->usecnt);
+ if (qp->send_cq)
+ atomic_dec(&qp->send_cq->usecnt);
+ if (qp->pd)
+ atomic_dec(&qp->pd->usecnt);
+}
+EXPORT_SYMBOL(ib_qp_usecnt_dec);
+
+struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr,
+ const char *caller)
+{
+ struct ib_device *device = pd->device;
+ struct ib_qp *qp;
+ int ret;
+
+ /*
+ * If the callers is using the RDMA API calculate the resources
+ * needed for the RDMA READ/WRITE operations.
+ *
+ * Note that these callers need to pass in a port number.
+ */
+ if (qp_init_attr->cap.max_rdma_ctxs)
+ rdma_rw_init_qp(device, qp_init_attr);
+
+ qp = create_qp(device, pd, qp_init_attr, NULL, NULL, caller);
+ if (IS_ERR(qp))
+ return qp;
+
+ ib_qp_usecnt_inc(qp);
+
+ if (qp_init_attr->cap.max_rdma_ctxs) {
+ ret = rdma_rw_init_mrs(qp, qp_init_attr);
+ if (ret)
+ goto err;
+ }
+
+ /*
+ * Note: all hw drivers guarantee that max_send_sge is lower than
+ * the device RDMA WRITE SGE limit but not all hw drivers ensure that
+ * max_send_sge <= max_sge_rd.
+ */
+ qp->max_write_sge = qp_init_attr->cap.max_send_sge;
+ qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge,
+ device->attrs.max_sge_rd);
+ if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN)
+ qp->integrity_en = true;
+
+ return qp;
+
+err:
+ ib_destroy_qp(qp);
+ return ERR_PTR(ret);
+
+}
+EXPORT_SYMBOL(ib_create_qp_kernel);
+
+static const struct {
+ int valid;
+ enum ib_qp_attr_mask req_param[IB_QPT_MAX];
+ enum ib_qp_attr_mask opt_param[IB_QPT_MAX];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_RAW_PACKET] = IB_QP_PORT,
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_RTR] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_RC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_XRC_TGT] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_RC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ },
+ },
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = IB_QP_SQ_PSN,
+ [IB_QPT_UC] = IB_QP_SQ_PSN,
+ [IB_QPT_RC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT |
+ IB_QP_SQ_PSN),
+ [IB_QPT_SMI] = IB_QP_SQ_PSN,
+ [IB_QPT_GSI] = IB_QP_SQ_PSN,
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT,
+ }
+ }
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT,
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
+ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+ }
+ },
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 }
+ }
+};
+
+bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+ enum ib_qp_type type, enum ib_qp_attr_mask mask)
+{
+ enum ib_qp_attr_mask req_param, opt_param;
+
+ if (mask & IB_QP_CUR_STATE &&
+ cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+ cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+ return false;
+
+ if (!qp_state_table[cur_state][next_state].valid)
+ return false;
+
+ req_param = qp_state_table[cur_state][next_state].req_param[type];
+ opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+ if ((mask & req_param) != req_param)
+ return false;
+
+ if (mask & ~(req_param | opt_param | IB_QP_STATE))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL(ib_modify_qp_is_ok);
+
+/**
+ * ib_resolve_eth_dmac - Resolve destination mac address
+ * @device: Device to consider
+ * @ah_attr: address handle attribute which describes the
+ * source and destination parameters
+ * ib_resolve_eth_dmac() resolves destination mac address and L3 hop limit It
+ * returns 0 on success or appropriate error code. It initializes the
+ * necessary ah_attr fields when call is successful.
+ */
+static int ib_resolve_eth_dmac(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr)
+{
+ int ret = 0;
+
+ if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) {
+ __be32 addr = 0;
+
+ memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4);
+ ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac);
+ } else {
+ ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw,
+ (char *)ah_attr->roce.dmac);
+ }
+ } else {
+ ret = ib_resolve_unicast_gid_dmac(device, ah_attr);
+ }
+ return ret;
+}
+
+static bool is_qp_type_connected(const struct ib_qp *qp)
+{
+ return (qp->qp_type == IB_QPT_UC ||
+ qp->qp_type == IB_QPT_RC ||
+ qp->qp_type == IB_QPT_XRC_INI ||
+ qp->qp_type == IB_QPT_XRC_TGT);
+}
+
+/*
+ * IB core internal function to perform QP attributes modification.
+ */
+static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ u32 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ const struct ib_gid_attr *old_sgid_attr_av;
+ const struct ib_gid_attr *old_sgid_attr_alt_av;
+ int ret;
+
+ attr->xmit_slave = NULL;
+ if (attr_mask & IB_QP_AV) {
+ ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr,
+ &old_sgid_attr_av);
+ if (ret)
+ return ret;
+
+ if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE &&
+ is_qp_type_connected(qp)) {
+ struct net_device *slave;
+
+ /*
+ * If the user provided the qp_attr then we have to
+ * resolve it. Kerne users have to provide already
+ * resolved rdma_ah_attr's.
+ */
+ if (udata) {
+ ret = ib_resolve_eth_dmac(qp->device,
+ &attr->ah_attr);
+ if (ret)
+ goto out_av;
+ }
+ slave = rdma_lag_get_ah_roce_slave(qp->device,
+ &attr->ah_attr,
+ GFP_KERNEL);
+ if (IS_ERR(slave)) {
+ ret = PTR_ERR(slave);
+ goto out_av;
+ }
+ attr->xmit_slave = slave;
+ }
+ }
+ if (attr_mask & IB_QP_ALT_PATH) {
+ /*
+ * FIXME: This does not track the migration state, so if the
+ * user loads a new alternate path after the HW has migrated
+ * from primary->alternate we will keep the wrong
+ * references. This is OK for IB because the reference
+ * counting does not serve any functional purpose.
+ */
+ ret = rdma_fill_sgid_attr(qp->device, &attr->alt_ah_attr,
+ &old_sgid_attr_alt_av);
+ if (ret)
+ goto out_av;
+
+ /*
+ * Today the core code can only handle alternate paths and APM
+ * for IB. Ban them in roce mode.
+ */
+ if (!(rdma_protocol_ib(qp->device,
+ attr->alt_ah_attr.port_num) &&
+ rdma_protocol_ib(qp->device, port))) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (rdma_ib_or_roce(qp->device, port)) {
+ if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) {
+ dev_warn(&qp->device->dev,
+ "%s rq_psn overflow, masking to 24 bits\n",
+ __func__);
+ attr->rq_psn &= 0xffffff;
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) {
+ dev_warn(&qp->device->dev,
+ " %s sq_psn overflow, masking to 24 bits\n",
+ __func__);
+ attr->sq_psn &= 0xffffff;
+ }
+ }
+
+ /*
+ * Bind this qp to a counter automatically based on the rdma counter
+ * rules. This only set in RST2INIT with port specified
+ */
+ if (!qp->counter && (attr_mask & IB_QP_PORT) &&
+ ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT))
+ rdma_counter_bind_qp_auto(qp, attr->port_num);
+
+ ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
+ if (ret)
+ goto out;
+
+ if (attr_mask & IB_QP_PORT)
+ qp->port = attr->port_num;
+ if (attr_mask & IB_QP_AV)
+ qp->av_sgid_attr =
+ rdma_update_sgid_attr(&attr->ah_attr, qp->av_sgid_attr);
+ if (attr_mask & IB_QP_ALT_PATH)
+ qp->alt_path_sgid_attr = rdma_update_sgid_attr(
+ &attr->alt_ah_attr, qp->alt_path_sgid_attr);
+
+out:
+ if (attr_mask & IB_QP_ALT_PATH)
+ rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av);
+out_av:
+ if (attr_mask & IB_QP_AV) {
+ rdma_lag_put_ah_roce_slave(attr->xmit_slave);
+ rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av);
+ }
+ return ret;
+}
+
+/**
+ * ib_modify_qp_with_udata - Modifies the attributes for the specified QP.
+ * @ib_qp: The QP to modify.
+ * @attr: On input, specifies the QP attributes to modify. On output,
+ * the current values of selected QP attributes are returned.
+ * @attr_mask: A bit-mask used to specify which attributes of the QP
+ * are being modified.
+ * @udata: pointer to user's input output buffer information
+ * are being modified.
+ * It returns 0 on success and returns appropriate error code on error.
+ */
+int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ return _ib_modify_qp(ib_qp->real_qp, attr, attr_mask, udata);
+}
+EXPORT_SYMBOL(ib_modify_qp_with_udata);
+
+int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width)
+{
+ int rc;
+ u32 netdev_speed;
+ struct net_device *netdev;
+ struct ethtool_link_ksettings lksettings;
+
+ if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET)
+ return -EINVAL;
+
+ netdev = ib_device_get_netdev(dev, port_num);
+ if (!netdev)
+ return -ENODEV;
+
+ rtnl_lock();
+ rc = __ethtool_get_link_ksettings(netdev, &lksettings);
+ rtnl_unlock();
+
+ dev_put(netdev);
+
+ if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) {
+ netdev_speed = lksettings.base.speed;
+ } else {
+ netdev_speed = SPEED_1000;
+ pr_warn("%s speed is unknown, defaulting to %u\n", netdev->name,
+ netdev_speed);
+ }
+
+ if (netdev_speed <= SPEED_1000) {
+ *width = IB_WIDTH_1X;
+ *speed = IB_SPEED_SDR;
+ } else if (netdev_speed <= SPEED_10000) {
+ *width = IB_WIDTH_1X;
+ *speed = IB_SPEED_FDR10;
+ } else if (netdev_speed <= SPEED_20000) {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_DDR;
+ } else if (netdev_speed <= SPEED_25000) {
+ *width = IB_WIDTH_1X;
+ *speed = IB_SPEED_EDR;
+ } else if (netdev_speed <= SPEED_40000) {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_FDR10;
+ } else {
+ *width = IB_WIDTH_4X;
+ *speed = IB_SPEED_EDR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_get_eth_speed);
+
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask)
+{
+ return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ qp_attr->ah_attr.grh.sgid_attr = NULL;
+ qp_attr->alt_ah_attr.grh.sgid_attr = NULL;
+
+ return qp->device->ops.query_qp ?
+ qp->device->ops.query_qp(qp->real_qp, qp_attr, qp_attr_mask,
+ qp_init_attr) : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+int ib_close_qp(struct ib_qp *qp)
+{
+ struct ib_qp *real_qp;
+ unsigned long flags;
+
+ real_qp = qp->real_qp;
+ if (real_qp == qp)
+ return -EINVAL;
+
+ spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags);
+ list_del(&qp->open_list);
+ spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags);
+
+ atomic_dec(&real_qp->usecnt);
+ if (qp->qp_sec)
+ ib_close_shared_qp_security(qp->qp_sec);
+ kfree(qp);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_close_qp);
+
+static int __ib_destroy_shared_qp(struct ib_qp *qp)
+{
+ struct ib_xrcd *xrcd;
+ struct ib_qp *real_qp;
+ int ret;
+
+ real_qp = qp->real_qp;
+ xrcd = real_qp->xrcd;
+ down_write(&xrcd->tgt_qps_rwsem);
+ ib_close_qp(qp);
+ if (atomic_read(&real_qp->usecnt) == 0)
+ xa_erase(&xrcd->tgt_qps, real_qp->qp_num);
+ else
+ real_qp = NULL;
+ up_write(&xrcd->tgt_qps_rwsem);
+
+ if (real_qp) {
+ ret = ib_destroy_qp(real_qp);
+ if (!ret)
+ atomic_dec(&xrcd->usecnt);
+ }
+
+ return 0;
+}
+
+int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
+{
+ const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr;
+ const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr;
+ struct ib_qp_security *sec;
+ int ret;
+
+ WARN_ON_ONCE(qp->mrs_used > 0);
+
+ if (atomic_read(&qp->usecnt))
+ return -EBUSY;
+
+ if (qp->real_qp != qp)
+ return __ib_destroy_shared_qp(qp);
+
+ sec = qp->qp_sec;
+ if (sec)
+ ib_destroy_qp_security_begin(sec);
+
+ if (!qp->uobject)
+ rdma_rw_cleanup_mrs(qp);
+
+ rdma_counter_unbind_qp(qp, true);
+ ret = qp->device->ops.destroy_qp(qp, udata);
+ if (ret) {
+ if (sec)
+ ib_destroy_qp_security_abort(sec);
+ return ret;
+ }
+
+ if (alt_path_sgid_attr)
+ rdma_put_gid_attr(alt_path_sgid_attr);
+ if (av_sgid_attr)
+ rdma_put_gid_attr(av_sgid_attr);
+
+ ib_qp_usecnt_dec(qp);
+ if (sec)
+ ib_destroy_qp_security_end(sec);
+
+ rdma_restrack_del(&qp->res);
+ kfree(qp);
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_qp_user);
+
+/* Completion queues */
+
+struct ib_cq *__ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context,
+ const struct ib_cq_init_attr *cq_attr,
+ const char *caller)
+{
+ struct ib_cq *cq;
+ int ret;
+
+ cq = rdma_zalloc_drv_obj(device, ib_cq);
+ if (!cq)
+ return ERR_PTR(-ENOMEM);
+
+ cq->device = device;
+ cq->uobject = NULL;
+ cq->comp_handler = comp_handler;
+ cq->event_handler = event_handler;
+ cq->cq_context = cq_context;
+ atomic_set(&cq->usecnt, 0);
+
+ rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ);
+ rdma_restrack_set_name(&cq->res, caller);
+
+ ret = device->ops.create_cq(cq, cq_attr, NULL);
+ if (ret) {
+ rdma_restrack_put(&cq->res);
+ kfree(cq);
+ return ERR_PTR(ret);
+ }
+
+ rdma_restrack_add(&cq->res);
+ return cq;
+}
+EXPORT_SYMBOL(__ib_create_cq);
+
+int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+ if (cq->shared)
+ return -EOPNOTSUPP;
+
+ return cq->device->ops.modify_cq ?
+ cq->device->ops.modify_cq(cq, cq_count,
+ cq_period) : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(rdma_set_cq_moderation);
+
+int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
+{
+ int ret;
+
+ if (WARN_ON_ONCE(cq->shared))
+ return -EOPNOTSUPP;
+
+ if (atomic_read(&cq->usecnt))
+ return -EBUSY;
+
+ ret = cq->device->ops.destroy_cq(cq, udata);
+ if (ret)
+ return ret;
+
+ rdma_restrack_del(&cq->res);
+ kfree(cq);
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_cq_user);
+
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+ if (cq->shared)
+ return -EOPNOTSUPP;
+
+ return cq->device->ops.resize_cq ?
+ cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags)
+{
+ struct ib_mr *mr;
+
+ if (access_flags & IB_ACCESS_ON_DEMAND) {
+ if (!(pd->device->attrs.kernel_cap_flags &
+ IBK_ON_DEMAND_PAGING)) {
+ pr_debug("ODP support not available\n");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
+ access_flags, NULL);
+
+ if (IS_ERR(mr))
+ return mr;
+
+ mr->device = pd->device;
+ mr->type = IB_MR_TYPE_USER;
+ mr->pd = pd;
+ mr->dm = NULL;
+ atomic_inc(&pd->usecnt);
+ mr->iova = virt_addr;
+ mr->length = length;
+
+ rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_parent_name(&mr->res, &pd->res);
+ rdma_restrack_add(&mr->res);
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_reg_user_mr);
+
+int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+ u32 flags, struct ib_sge *sg_list, u32 num_sge)
+{
+ if (!pd->device->ops.advise_mr)
+ return -EOPNOTSUPP;
+
+ if (!num_sge)
+ return 0;
+
+ return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
+ NULL);
+}
+EXPORT_SYMBOL(ib_advise_mr);
+
+int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
+{
+ struct ib_pd *pd = mr->pd;
+ struct ib_dm *dm = mr->dm;
+ struct ib_sig_attrs *sig_attrs = mr->sig_attrs;
+ int ret;
+
+ trace_mr_dereg(mr);
+ rdma_restrack_del(&mr->res);
+ ret = mr->device->ops.dereg_mr(mr, udata);
+ if (!ret) {
+ atomic_dec(&pd->usecnt);
+ if (dm)
+ atomic_dec(&dm->usecnt);
+ kfree(sig_attrs);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dereg_mr_user);
+
+/**
+ * ib_alloc_mr() - Allocates a memory region
+ * @pd: protection domain associated with the region
+ * @mr_type: memory region type
+ * @max_num_sg: maximum sg entries available for registration.
+ *
+ * Notes:
+ * Memory registeration page/sg lists must not exceed max_num_sg.
+ * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
+ * max_num_sg * used_page_size.
+ *
+ */
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+ u32 max_num_sg)
+{
+ struct ib_mr *mr;
+
+ if (!pd->device->ops.alloc_mr) {
+ mr = ERR_PTR(-EOPNOTSUPP);
+ goto out;
+ }
+
+ if (mr_type == IB_MR_TYPE_INTEGRITY) {
+ WARN_ON_ONCE(1);
+ mr = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg);
+ if (IS_ERR(mr))
+ goto out;
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->dm = NULL;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ mr->need_inval = false;
+ mr->type = mr_type;
+ mr->sig_attrs = NULL;
+
+ rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_parent_name(&mr->res, &pd->res);
+ rdma_restrack_add(&mr->res);
+out:
+ trace_mr_alloc(pd, mr_type, max_num_sg, mr);
+ return mr;
+}
+EXPORT_SYMBOL(ib_alloc_mr);
+
+/**
+ * ib_alloc_mr_integrity() - Allocates an integrity memory region
+ * @pd: protection domain associated with the region
+ * @max_num_data_sg: maximum data sg entries available for registration
+ * @max_num_meta_sg: maximum metadata sg entries available for
+ * registration
+ *
+ * Notes:
+ * Memory registration page/sg lists must not exceed max_num_sg,
+ * also the integrity page/sg lists must not exceed max_num_meta_sg.
+ *
+ */
+struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
+ u32 max_num_data_sg,
+ u32 max_num_meta_sg)
+{
+ struct ib_mr *mr;
+ struct ib_sig_attrs *sig_attrs;
+
+ if (!pd->device->ops.alloc_mr_integrity ||
+ !pd->device->ops.map_mr_sg_pi) {
+ mr = ERR_PTR(-EOPNOTSUPP);
+ goto out;
+ }
+
+ if (!max_num_meta_sg) {
+ mr = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL);
+ if (!sig_attrs) {
+ mr = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg,
+ max_num_meta_sg);
+ if (IS_ERR(mr)) {
+ kfree(sig_attrs);
+ goto out;
+ }
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->dm = NULL;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ mr->need_inval = false;
+ mr->type = IB_MR_TYPE_INTEGRITY;
+ mr->sig_attrs = sig_attrs;
+
+ rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR);
+ rdma_restrack_parent_name(&mr->res, &pd->res);
+ rdma_restrack_add(&mr->res);
+out:
+ trace_mr_integ_alloc(pd, max_num_data_sg, max_num_meta_sg, mr);
+ return mr;
+}
+EXPORT_SYMBOL(ib_alloc_mr_integrity);
+
+/* Multicast groups */
+
+static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid)
+{
+ struct ib_qp_init_attr init_attr = {};
+ struct ib_qp_attr attr = {};
+ int num_eth_ports = 0;
+ unsigned int port;
+
+ /* If QP state >= init, it is assigned to a port and we can check this
+ * port only.
+ */
+ if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) {
+ if (attr.qp_state >= IB_QPS_INIT) {
+ if (rdma_port_get_link_layer(qp->device, attr.port_num) !=
+ IB_LINK_LAYER_INFINIBAND)
+ return true;
+ goto lid_check;
+ }
+ }
+
+ /* Can't get a quick answer, iterate over all ports */
+ rdma_for_each_port(qp->device, port)
+ if (rdma_port_get_link_layer(qp->device, port) !=
+ IB_LINK_LAYER_INFINIBAND)
+ num_eth_ports++;
+
+ /* If we have at lease one Ethernet port, RoCE annex declares that
+ * multicast LID should be ignored. We can't tell at this step if the
+ * QP belongs to an IB or Ethernet port.
+ */
+ if (num_eth_ports)
+ return true;
+
+ /* If all the ports are IB, we can check according to IB spec. */
+lid_check:
+ return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) ||
+ lid == be16_to_cpu(IB_LID_PERMISSIVE));
+}
+
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ int ret;
+
+ if (!qp->device->ops.attach_mcast)
+ return -EOPNOTSUPP;
+
+ if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
+ qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
+ return -EINVAL;
+
+ ret = qp->device->ops.attach_mcast(qp, gid, lid);
+ if (!ret)
+ atomic_inc(&qp->usecnt);
+ return ret;
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ int ret;
+
+ if (!qp->device->ops.detach_mcast)
+ return -EOPNOTSUPP;
+
+ if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) ||
+ qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid))
+ return -EINVAL;
+
+ ret = qp->device->ops.detach_mcast(qp, gid, lid);
+ if (!ret)
+ atomic_dec(&qp->usecnt);
+ return ret;
+}
+EXPORT_SYMBOL(ib_detach_mcast);
+
+/**
+ * ib_alloc_xrcd_user - Allocates an XRC domain.
+ * @device: The device on which to allocate the XRC domain.
+ * @inode: inode to connect XRCD
+ * @udata: Valid user data or NULL for kernel object
+ */
+struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device,
+ struct inode *inode, struct ib_udata *udata)
+{
+ struct ib_xrcd *xrcd;
+ int ret;
+
+ if (!device->ops.alloc_xrcd)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ xrcd = rdma_zalloc_drv_obj(device, ib_xrcd);
+ if (!xrcd)
+ return ERR_PTR(-ENOMEM);
+
+ xrcd->device = device;
+ xrcd->inode = inode;
+ atomic_set(&xrcd->usecnt, 0);
+ init_rwsem(&xrcd->tgt_qps_rwsem);
+ xa_init(&xrcd->tgt_qps);
+
+ ret = device->ops.alloc_xrcd(xrcd, udata);
+ if (ret)
+ goto err;
+ return xrcd;
+err:
+ kfree(xrcd);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_alloc_xrcd_user);
+
+/**
+ * ib_dealloc_xrcd_user - Deallocates an XRC domain.
+ * @xrcd: The XRC domain to deallocate.
+ * @udata: Valid user data or NULL for kernel object
+ */
+int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata)
+{
+ int ret;
+
+ if (atomic_read(&xrcd->usecnt))
+ return -EBUSY;
+
+ WARN_ON(!xa_empty(&xrcd->tgt_qps));
+ ret = xrcd->device->ops.dealloc_xrcd(xrcd, udata);
+ if (ret)
+ return ret;
+ kfree(xrcd);
+ return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd_user);
+
+/**
+ * ib_create_wq - Creates a WQ associated with the specified protection
+ * domain.
+ * @pd: The protection domain associated with the WQ.
+ * @wq_attr: A list of initial attributes required to create the
+ * WQ. If WQ creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created WQ.
+ *
+ * wq_attr->max_wr and wq_attr->max_sge determine
+ * the requested size of the WQ, and set to the actual values allocated
+ * on return.
+ * If ib_create_wq() succeeds, then max_wr and max_sge will always be
+ * at least as large as the requested values.
+ */
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *wq_attr)
+{
+ struct ib_wq *wq;
+
+ if (!pd->device->ops.create_wq)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ wq = pd->device->ops.create_wq(pd, wq_attr, NULL);
+ if (!IS_ERR(wq)) {
+ wq->event_handler = wq_attr->event_handler;
+ wq->wq_context = wq_attr->wq_context;
+ wq->wq_type = wq_attr->wq_type;
+ wq->cq = wq_attr->cq;
+ wq->device = pd->device;
+ wq->pd = pd;
+ wq->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&wq_attr->cq->usecnt);
+ atomic_set(&wq->usecnt, 0);
+ }
+ return wq;
+}
+EXPORT_SYMBOL(ib_create_wq);
+
+/**
+ * ib_destroy_wq_user - Destroys the specified user WQ.
+ * @wq: The WQ to destroy.
+ * @udata: Valid user data
+ */
+int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata)
+{
+ struct ib_cq *cq = wq->cq;
+ struct ib_pd *pd = wq->pd;
+ int ret;
+
+ if (atomic_read(&wq->usecnt))
+ return -EBUSY;
+
+ ret = wq->device->ops.destroy_wq(wq, udata);
+ if (ret)
+ return ret;
+
+ atomic_dec(&pd->usecnt);
+ atomic_dec(&cq->usecnt);
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_wq_user);
+
+int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
+ struct ib_mr_status *mr_status)
+{
+ if (!mr->device->ops.check_mr_status)
+ return -EOPNOTSUPP;
+
+ return mr->device->ops.check_mr_status(mr, check_mask, mr_status);
+}
+EXPORT_SYMBOL(ib_check_mr_status);
+
+int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port,
+ int state)
+{
+ if (!device->ops.set_vf_link_state)
+ return -EOPNOTSUPP;
+
+ return device->ops.set_vf_link_state(device, vf, port, state);
+}
+EXPORT_SYMBOL(ib_set_vf_link_state);
+
+int ib_get_vf_config(struct ib_device *device, int vf, u32 port,
+ struct ifla_vf_info *info)
+{
+ if (!device->ops.get_vf_config)
+ return -EOPNOTSUPP;
+
+ return device->ops.get_vf_config(device, vf, port, info);
+}
+EXPORT_SYMBOL(ib_get_vf_config);
+
+int ib_get_vf_stats(struct ib_device *device, int vf, u32 port,
+ struct ifla_vf_stats *stats)
+{
+ if (!device->ops.get_vf_stats)
+ return -EOPNOTSUPP;
+
+ return device->ops.get_vf_stats(device, vf, port, stats);
+}
+EXPORT_SYMBOL(ib_get_vf_stats);
+
+int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid,
+ int type)
+{
+ if (!device->ops.set_vf_guid)
+ return -EOPNOTSUPP;
+
+ return device->ops.set_vf_guid(device, vf, port, guid, type);
+}
+EXPORT_SYMBOL(ib_set_vf_guid);
+
+int ib_get_vf_guid(struct ib_device *device, int vf, u32 port,
+ struct ifla_vf_guid *node_guid,
+ struct ifla_vf_guid *port_guid)
+{
+ if (!device->ops.get_vf_guid)
+ return -EOPNOTSUPP;
+
+ return device->ops.get_vf_guid(device, vf, port, node_guid, port_guid);
+}
+EXPORT_SYMBOL(ib_get_vf_guid);
+/**
+ * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection
+ * information) and set an appropriate memory region for registration.
+ * @mr: memory region
+ * @data_sg: dma mapped scatterlist for data
+ * @data_sg_nents: number of entries in data_sg
+ * @data_sg_offset: offset in bytes into data_sg
+ * @meta_sg: dma mapped scatterlist for metadata
+ * @meta_sg_nents: number of entries in meta_sg
+ * @meta_sg_offset: offset in bytes into meta_sg
+ * @page_size: page vector desired page size
+ *
+ * Constraints:
+ * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY.
+ *
+ * Return: 0 on success.
+ *
+ * After this completes successfully, the memory region
+ * is ready for registration.
+ */
+int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
+ int data_sg_nents, unsigned int *data_sg_offset,
+ struct scatterlist *meta_sg, int meta_sg_nents,
+ unsigned int *meta_sg_offset, unsigned int page_size)
+{
+ if (unlikely(!mr->device->ops.map_mr_sg_pi ||
+ WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY)))
+ return -EOPNOTSUPP;
+
+ mr->page_size = page_size;
+
+ return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents,
+ data_sg_offset, meta_sg,
+ meta_sg_nents, meta_sg_offset);
+}
+EXPORT_SYMBOL(ib_map_mr_sg_pi);
+
+/**
+ * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list
+ * and set it the memory region.
+ * @mr: memory region
+ * @sg: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @sg_offset: offset in bytes into sg
+ * @page_size: page vector desired page size
+ *
+ * Constraints:
+ *
+ * - The first sg element is allowed to have an offset.
+ * - Each sg element must either be aligned to page_size or virtually
+ * contiguous to the previous element. In case an sg element has a
+ * non-contiguous offset, the mapping prefix will not include it.
+ * - The last sg element is allowed to have length less than page_size.
+ * - If sg_nents total byte length exceeds the mr max_num_sge * page_size
+ * then only max_num_sg entries will be mapped.
+ * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these
+ * constraints holds and the page_size argument is ignored.
+ *
+ * Returns the number of sg elements that were mapped to the memory region.
+ *
+ * After this completes successfully, the memory region
+ * is ready for registration.
+ */
+int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset, unsigned int page_size)
+{
+ if (unlikely(!mr->device->ops.map_mr_sg))
+ return -EOPNOTSUPP;
+
+ mr->page_size = page_size;
+
+ return mr->device->ops.map_mr_sg(mr, sg, sg_nents, sg_offset);
+}
+EXPORT_SYMBOL(ib_map_mr_sg);
+
+/**
+ * ib_sg_to_pages() - Convert the largest prefix of a sg list
+ * to a page vector
+ * @mr: memory region
+ * @sgl: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @sg_offset_p: ==== =======================================================
+ * IN start offset in bytes into sg
+ * OUT offset in bytes for element n of the sg of the first
+ * byte that has not been processed where n is the return
+ * value of this function.
+ * ==== =======================================================
+ * @set_page: driver page assignment function pointer
+ *
+ * Core service helper for drivers to convert the largest
+ * prefix of given sg list to a page vector. The sg list
+ * prefix converted is the prefix that meet the requirements
+ * of ib_map_mr_sg.
+ *
+ * Returns the number of sg elements that were assigned to
+ * a page vector.
+ */
+int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
+ unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64))
+{
+ struct scatterlist *sg;
+ u64 last_end_dma_addr = 0;
+ unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
+ unsigned int last_page_off = 0;
+ u64 page_mask = ~((u64)mr->page_size - 1);
+ int i, ret;
+
+ if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0])))
+ return -EINVAL;
+
+ mr->iova = sg_dma_address(&sgl[0]) + sg_offset;
+ mr->length = 0;
+
+ for_each_sg(sgl, sg, sg_nents, i) {
+ u64 dma_addr = sg_dma_address(sg) + sg_offset;
+ u64 prev_addr = dma_addr;
+ unsigned int dma_len = sg_dma_len(sg) - sg_offset;
+ u64 end_dma_addr = dma_addr + dma_len;
+ u64 page_addr = dma_addr & page_mask;
+
+ /*
+ * For the second and later elements, check whether either the
+ * end of element i-1 or the start of element i is not aligned
+ * on a page boundary.
+ */
+ if (i && (last_page_off != 0 || page_addr != dma_addr)) {
+ /* Stop mapping if there is a gap. */
+ if (last_end_dma_addr != dma_addr)
+ break;
+
+ /*
+ * Coalesce this element with the last. If it is small
+ * enough just update mr->length. Otherwise start
+ * mapping from the next page.
+ */
+ goto next_page;
+ }
+
+ do {
+ ret = set_page(mr, page_addr);
+ if (unlikely(ret < 0)) {
+ sg_offset = prev_addr - sg_dma_address(sg);
+ mr->length += prev_addr - dma_addr;
+ if (sg_offset_p)
+ *sg_offset_p = sg_offset;
+ return i || sg_offset ? i : ret;
+ }
+ prev_addr = page_addr;
+next_page:
+ page_addr += mr->page_size;
+ } while (page_addr < end_dma_addr);
+
+ mr->length += dma_len;
+ last_end_dma_addr = end_dma_addr;
+ last_page_off = end_dma_addr & ~page_mask;
+
+ sg_offset = 0;
+ }
+
+ if (sg_offset_p)
+ *sg_offset_p = 0;
+ return i;
+}
+EXPORT_SYMBOL(ib_sg_to_pages);
+
+struct ib_drain_cqe {
+ struct ib_cqe cqe;
+ struct completion done;
+};
+
+static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
+ cqe);
+
+ complete(&cqe->done);
+}
+
+/*
+ * Post a WR and block until its completion is reaped for the SQ.
+ */
+static void __ib_drain_sq(struct ib_qp *qp)
+{
+ struct ib_cq *cq = qp->send_cq;
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct ib_drain_cqe sdrain;
+ struct ib_rdma_wr swr = {
+ .wr = {
+ .next = NULL,
+ { .wr_cqe = &sdrain.cqe, },
+ .opcode = IB_WR_RDMA_WRITE,
+ },
+ };
+ int ret;
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
+ return;
+ }
+
+ sdrain.cqe.done = ib_drain_qp_done;
+ init_completion(&sdrain.done);
+
+ ret = ib_post_send(qp, &swr.wr, NULL);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
+ return;
+ }
+
+ if (cq->poll_ctx == IB_POLL_DIRECT)
+ while (wait_for_completion_timeout(&sdrain.done, HZ / 10) <= 0)
+ ib_process_cq_direct(cq, -1);
+ else
+ wait_for_completion(&sdrain.done);
+}
+
+/*
+ * Post a WR and block until its completion is reaped for the RQ.
+ */
+static void __ib_drain_rq(struct ib_qp *qp)
+{
+ struct ib_cq *cq = qp->recv_cq;
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct ib_drain_cqe rdrain;
+ struct ib_recv_wr rwr = {};
+ int ret;
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
+ return;
+ }
+
+ rwr.wr_cqe = &rdrain.cqe;
+ rdrain.cqe.done = ib_drain_qp_done;
+ init_completion(&rdrain.done);
+
+ ret = ib_post_recv(qp, &rwr, NULL);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
+ return;
+ }
+
+ if (cq->poll_ctx == IB_POLL_DIRECT)
+ while (wait_for_completion_timeout(&rdrain.done, HZ / 10) <= 0)
+ ib_process_cq_direct(cq, -1);
+ else
+ wait_for_completion(&rdrain.done);
+}
+
+/**
+ * ib_drain_sq() - Block until all SQ CQEs have been consumed by the
+ * application.
+ * @qp: queue pair to drain
+ *
+ * If the device has a provider-specific drain function, then
+ * call that. Otherwise call the generic drain function
+ * __ib_drain_sq().
+ *
+ * The caller must:
+ *
+ * ensure there is room in the CQ and SQ for the drain work request and
+ * completion.
+ *
+ * allocate the CQ using ib_alloc_cq().
+ *
+ * ensure that there are no other contexts that are posting WRs concurrently.
+ * Otherwise the drain is not guaranteed.
+ */
+void ib_drain_sq(struct ib_qp *qp)
+{
+ if (qp->device->ops.drain_sq)
+ qp->device->ops.drain_sq(qp);
+ else
+ __ib_drain_sq(qp);
+ trace_cq_drain_complete(qp->send_cq);
+}
+EXPORT_SYMBOL(ib_drain_sq);
+
+/**
+ * ib_drain_rq() - Block until all RQ CQEs have been consumed by the
+ * application.
+ * @qp: queue pair to drain
+ *
+ * If the device has a provider-specific drain function, then
+ * call that. Otherwise call the generic drain function
+ * __ib_drain_rq().
+ *
+ * The caller must:
+ *
+ * ensure there is room in the CQ and RQ for the drain work request and
+ * completion.
+ *
+ * allocate the CQ using ib_alloc_cq().
+ *
+ * ensure that there are no other contexts that are posting WRs concurrently.
+ * Otherwise the drain is not guaranteed.
+ */
+void ib_drain_rq(struct ib_qp *qp)
+{
+ if (qp->device->ops.drain_rq)
+ qp->device->ops.drain_rq(qp);
+ else
+ __ib_drain_rq(qp);
+ trace_cq_drain_complete(qp->recv_cq);
+}
+EXPORT_SYMBOL(ib_drain_rq);
+
+/**
+ * ib_drain_qp() - Block until all CQEs have been consumed by the
+ * application on both the RQ and SQ.
+ * @qp: queue pair to drain
+ *
+ * The caller must:
+ *
+ * ensure there is room in the CQ(s), SQ, and RQ for drain work requests
+ * and completions.
+ *
+ * allocate the CQs using ib_alloc_cq().
+ *
+ * ensure that there are no other contexts that are posting WRs concurrently.
+ * Otherwise the drain is not guaranteed.
+ */
+void ib_drain_qp(struct ib_qp *qp)
+{
+ ib_drain_sq(qp);
+ if (!qp->srq)
+ ib_drain_rq(qp);
+}
+EXPORT_SYMBOL(ib_drain_qp);
+
+struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num,
+ enum rdma_netdev_t type, const char *name,
+ unsigned char name_assign_type,
+ void (*setup)(struct net_device *))
+{
+ struct rdma_netdev_alloc_params params;
+ struct net_device *netdev;
+ int rc;
+
+ if (!device->ops.rdma_netdev_get_params)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ rc = device->ops.rdma_netdev_get_params(device, port_num, type,
+ &params);
+ if (rc)
+ return ERR_PTR(rc);
+
+ netdev = alloc_netdev_mqs(params.sizeof_priv, name, name_assign_type,
+ setup, params.txqs, params.rxqs);
+ if (!netdev)
+ return ERR_PTR(-ENOMEM);
+
+ return netdev;
+}
+EXPORT_SYMBOL(rdma_alloc_netdev);
+
+int rdma_init_netdev(struct ib_device *device, u32 port_num,
+ enum rdma_netdev_t type, const char *name,
+ unsigned char name_assign_type,
+ void (*setup)(struct net_device *),
+ struct net_device *netdev)
+{
+ struct rdma_netdev_alloc_params params;
+ int rc;
+
+ if (!device->ops.rdma_netdev_get_params)
+ return -EOPNOTSUPP;
+
+ rc = device->ops.rdma_netdev_get_params(device, port_num, type,
+ &params);
+ if (rc)
+ return rc;
+
+ return params.initialize_rdma_netdev(device, port_num,
+ netdev, params.param);
+}
+EXPORT_SYMBOL(rdma_init_netdev);
+
+void __rdma_block_iter_start(struct ib_block_iter *biter,
+ struct scatterlist *sglist, unsigned int nents,
+ unsigned long pgsz)
+{
+ memset(biter, 0, sizeof(struct ib_block_iter));
+ biter->__sg = sglist;
+ biter->__sg_nents = nents;
+
+ /* Driver provides best block size to use */
+ biter->__pg_bit = __fls(pgsz);
+}
+EXPORT_SYMBOL(__rdma_block_iter_start);
+
+bool __rdma_block_iter_next(struct ib_block_iter *biter)
+{
+ unsigned int block_offset;
+ unsigned int sg_delta;
+
+ if (!biter->__sg_nents || !biter->__sg)
+ return false;
+
+ biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
+ block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
+ sg_delta = BIT_ULL(biter->__pg_bit) - block_offset;
+
+ if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) {
+ biter->__sg_advance += sg_delta;
+ } else {
+ biter->__sg_advance = 0;
+ biter->__sg = sg_next(biter->__sg);
+ biter->__sg_nents--;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(__rdma_block_iter_next);
+
+/**
+ * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct
+ * for the drivers.
+ * @descs: array of static descriptors
+ * @num_counters: number of elements in array
+ * @lifespan: milliseconds between updates
+ */
+struct rdma_hw_stats *rdma_alloc_hw_stats_struct(
+ const struct rdma_stat_desc *descs, int num_counters,
+ unsigned long lifespan)
+{
+ struct rdma_hw_stats *stats;
+
+ stats = kzalloc(struct_size(stats, value, num_counters), GFP_KERNEL);
+ if (!stats)
+ return NULL;
+
+ stats->is_disabled = kcalloc(BITS_TO_LONGS(num_counters),
+ sizeof(*stats->is_disabled), GFP_KERNEL);
+ if (!stats->is_disabled)
+ goto err;
+
+ stats->descs = descs;
+ stats->num_counters = num_counters;
+ stats->lifespan = msecs_to_jiffies(lifespan);
+ mutex_init(&stats->lock);
+
+ return stats;
+
+err:
+ kfree(stats);
+ return NULL;
+}
+EXPORT_SYMBOL(rdma_alloc_hw_stats_struct);
+
+/**
+ * rdma_free_hw_stats_struct - Helper function to release rdma_hw_stats
+ * @stats: statistics to release
+ */
+void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats)
+{
+ if (!stats)
+ return;
+
+ kfree(stats->is_disabled);
+ kfree(stats);
+}
+EXPORT_SYMBOL(rdma_free_hw_stats_struct);