summaryrefslogtreecommitdiffstats
path: root/include/rdma
diff options
context:
space:
mode:
Diffstat (limited to 'include/rdma')
-rw-r--r--include/rdma/ib.h81
-rw-r--r--include/rdma/ib_addr.h247
-rw-r--r--include/rdma/ib_cache.h117
-rw-r--r--include/rdma/ib_cm.h568
-rw-r--r--include/rdma/ib_hdrs.h307
-rw-r--r--include/rdma/ib_mad.h819
-rw-r--r--include/rdma/ib_marshall.h28
-rw-r--r--include/rdma/ib_pack.h289
-rw-r--r--include/rdma/ib_pma.h130
-rw-r--r--include/rdma/ib_sa.h609
-rw-r--r--include/rdma/ib_smi.h158
-rw-r--r--include/rdma/ib_sysfs.h37
-rw-r--r--include/rdma/ib_umem.h207
-rw-r--r--include/rdma/ib_umem_odp.h115
-rw-r--r--include/rdma/ib_verbs.h4836
-rw-r--r--include/rdma/iba.h146
-rw-r--r--include/rdma/ibta_vol1_c12.h219
-rw-r--r--include/rdma/iw_cm.h205
-rw-r--r--include/rdma/iw_portmap.h65
-rw-r--r--include/rdma/lag.h23
-rw-r--r--include/rdma/mr_pool.h17
-rw-r--r--include/rdma/opa_addr.h91
-rw-r--r--include/rdma/opa_port_info.h385
-rw-r--r--include/rdma/opa_smi.h124
-rw-r--r--include/rdma/opa_vnic.h96
-rw-r--r--include/rdma/rdma_cm.h393
-rw-r--r--include/rdma/rdma_cm_ib.h27
-rw-r--r--include/rdma/rdma_counter.h68
-rw-r--r--include/rdma/rdma_netlink.h125
-rw-r--r--include/rdma/rdma_vt.h532
-rw-r--r--include/rdma/rdmavt_cq.h67
-rw-r--r--include/rdma/rdmavt_mr.h155
-rw-r--r--include/rdma/rdmavt_qp.h1003
-rw-r--r--include/rdma/restrack.h176
-rw-r--r--include/rdma/rw.h73
-rw-r--r--include/rdma/signature.h124
-rw-r--r--include/rdma/tid_rdma_defs.h108
-rw-r--r--include/rdma/uverbs_ioctl.h1017
-rw-r--r--include/rdma/uverbs_named_ioctl.h97
-rw-r--r--include/rdma/uverbs_std_types.h178
-rw-r--r--include/rdma/uverbs_types.h184
41 files changed, 14246 insertions, 0 deletions
diff --git a/include/rdma/ib.h b/include/rdma/ib.h
new file mode 100644
index 0000000000..f7c185ff7a
--- /dev/null
+++ b/include/rdma/ib.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2010 Intel Corporation. All rights reserved.
+ */
+
+#ifndef _RDMA_IB_H
+#define _RDMA_IB_H
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/cred.h>
+#include <linux/uaccess.h>
+#include <linux/fs.h>
+
+struct ib_addr {
+ union {
+ __u8 uib_addr8[16];
+ __be16 uib_addr16[8];
+ __be32 uib_addr32[4];
+ __be64 uib_addr64[2];
+ } ib_u;
+#define sib_addr8 ib_u.uib_addr8
+#define sib_addr16 ib_u.uib_addr16
+#define sib_addr32 ib_u.uib_addr32
+#define sib_addr64 ib_u.uib_addr64
+#define sib_raw ib_u.uib_addr8
+#define sib_subnet_prefix ib_u.uib_addr64[0]
+#define sib_interface_id ib_u.uib_addr64[1]
+};
+
+static inline bool ib_addr_any(const struct ib_addr *a)
+{
+ return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0);
+}
+
+static inline bool ib_addr_loopback(const struct ib_addr *a)
+{
+ return ((a->sib_addr32[0] | a->sib_addr32[1] |
+ a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0);
+}
+
+static inline void ib_addr_set(struct ib_addr *addr,
+ __be32 w1, __be32 w2, __be32 w3, __be32 w4)
+{
+ addr->sib_addr32[0] = w1;
+ addr->sib_addr32[1] = w2;
+ addr->sib_addr32[2] = w3;
+ addr->sib_addr32[3] = w4;
+}
+
+static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2)
+{
+ return memcmp(a1, a2, sizeof(struct ib_addr));
+}
+
+struct sockaddr_ib {
+ unsigned short int sib_family; /* AF_IB */
+ __be16 sib_pkey;
+ __be32 sib_flowinfo;
+ struct ib_addr sib_addr;
+ __be64 sib_sid;
+ __be64 sib_sid_mask;
+ __u64 sib_scope_id;
+};
+
+/*
+ * The IB interfaces that use write() as bi-directional ioctl() are
+ * fundamentally unsafe, since there are lots of ways to trigger "write()"
+ * calls from various contexts with elevated privileges. That includes the
+ * traditional suid executable error message writes, but also various kernel
+ * interfaces that can write to file descriptors.
+ *
+ * This function provides protection for the legacy API by restricting the
+ * calling context.
+ */
+static inline bool ib_safe_file_access(struct file *filp)
+{
+ return filp->f_cred == current_cred();
+}
+
+#endif /* _RDMA_IB_H */
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
new file mode 100644
index 0000000000..811a0f11d0
--- /dev/null
+++ b/include/rdma/ib_addr.h
@@ -0,0 +1,247 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ */
+
+#ifndef IB_ADDR_H
+#define IB_ADDR_H
+
+#include <linux/ethtool.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/socket.h>
+#include <linux/if_vlan.h>
+#include <net/ipv6.h>
+#include <net/if_inet6.h>
+#include <net/ip.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <net/net_namespace.h>
+
+/**
+ * struct rdma_dev_addr - Contains resolved RDMA hardware addresses
+ * @src_dev_addr: Source MAC address.
+ * @dst_dev_addr: Destination MAC address.
+ * @broadcast: Broadcast address of the device.
+ * @dev_type: The interface hardware type of the device.
+ * @bound_dev_if: An optional device interface index.
+ * @transport: The transport type used.
+ * @net: Network namespace containing the bound_dev_if net_dev.
+ * @sgid_attr: GID attribute to use for identified SGID
+ */
+struct rdma_dev_addr {
+ unsigned char src_dev_addr[MAX_ADDR_LEN];
+ unsigned char dst_dev_addr[MAX_ADDR_LEN];
+ unsigned char broadcast[MAX_ADDR_LEN];
+ unsigned short dev_type;
+ int bound_dev_if;
+ enum rdma_transport_type transport;
+ struct net *net;
+ const struct ib_gid_attr *sgid_attr;
+ enum rdma_network_type network;
+ int hoplimit;
+};
+
+/**
+ * rdma_translate_ip - Translate a local IP address to an RDMA hardware
+ * address.
+ *
+ * The dev_addr->net field must be initialized.
+ */
+int rdma_translate_ip(const struct sockaddr *addr,
+ struct rdma_dev_addr *dev_addr);
+
+/**
+ * rdma_resolve_ip - Resolve source and destination IP addresses to
+ * RDMA hardware addresses.
+ * @src_addr: An optional source address to use in the resolution. If a
+ * source address is not provided, a usable address will be returned via
+ * the callback.
+ * @dst_addr: The destination address to resolve.
+ * @addr: A reference to a data location that will receive the resolved
+ * addresses. The data location must remain valid until the callback has
+ * been invoked. The net field of the addr struct must be valid.
+ * @timeout_ms: Amount of time to wait for the address resolution to complete.
+ * @callback: Call invoked once address resolution has completed, timed out,
+ * or been canceled. A status of 0 indicates success.
+ * @resolve_by_gid_attr: Resolve the ip based on the GID attribute from
+ * rdma_dev_addr.
+ * @context: User-specified context associated with the call.
+ */
+int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr, unsigned long timeout_ms,
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context),
+ bool resolve_by_gid_attr, void *context);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr);
+
+int rdma_addr_size(const struct sockaddr *addr);
+int rdma_addr_size_in6(struct sockaddr_in6 *addr);
+int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr);
+
+static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
+{
+ return ((u16)dev_addr->broadcast[8] << 8) | (u16)dev_addr->broadcast[9];
+}
+
+static inline void ib_addr_set_pkey(struct rdma_dev_addr *dev_addr, u16 pkey)
+{
+ dev_addr->broadcast[8] = pkey >> 8;
+ dev_addr->broadcast[9] = (unsigned char) pkey;
+}
+
+static inline void ib_addr_get_mgid(struct rdma_dev_addr *dev_addr,
+ union ib_gid *gid)
+{
+ memcpy(gid, dev_addr->broadcast + 4, sizeof *gid);
+}
+
+static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr)
+{
+ return dev_addr->dev_type == ARPHRD_INFINIBAND ? 4 : 0;
+}
+
+static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev)
+{
+ return is_vlan_dev(dev) ? vlan_dev_vlan_id(dev) : 0xffff;
+}
+
+static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ ipv6_addr_set_v4mapped(((struct sockaddr_in *)
+ addr)->sin_addr.s_addr,
+ (struct in6_addr *)gid);
+ break;
+ case AF_INET6:
+ *(struct in6_addr *)&gid->raw =
+ ((struct sockaddr_in6 *)addr)->sin6_addr;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Important - sockaddr should be a union of sockaddr_in and sockaddr_in6 */
+static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid)
+{
+ if (ipv6_addr_v4mapped((struct in6_addr *)gid)) {
+ struct sockaddr_in *out_in = (struct sockaddr_in *)out;
+ memset(out_in, 0, sizeof(*out_in));
+ out_in->sin_family = AF_INET;
+ memcpy(&out_in->sin_addr.s_addr, gid->raw + 12, 4);
+ } else {
+ struct sockaddr_in6 *out_in = (struct sockaddr_in6 *)out;
+ memset(out_in, 0, sizeof(*out_in));
+ out_in->sin6_family = AF_INET6;
+ memcpy(&out_in->sin6_addr.s6_addr, gid->raw, 16);
+ }
+}
+
+/*
+ * rdma_get/set_sgid/dgid() APIs are applicable to IB, and iWarp.
+ * They are not applicable to RoCE.
+ * RoCE GIDs are derived from the IP addresses.
+ */
+static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+ memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr),
+ sizeof(*gid));
+}
+
+static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+ memcpy(dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid);
+}
+
+static inline void rdma_addr_get_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+ memcpy(gid, dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof *gid);
+}
+
+static inline void rdma_addr_set_dgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid)
+{
+ memcpy(dev_addr->dst_dev_addr + rdma_addr_gid_offset(dev_addr), gid, sizeof *gid);
+}
+
+static inline enum ib_mtu iboe_get_mtu(int mtu)
+{
+ /*
+ * Reduce IB headers from effective IBoE MTU.
+ */
+ mtu = mtu - (IB_GRH_BYTES + IB_UDP_BYTES + IB_BTH_BYTES +
+ IB_EXT_XRC_BYTES + IB_EXT_ATOMICETH_BYTES +
+ IB_ICRC_BYTES);
+
+ if (mtu >= ib_mtu_enum_to_int(IB_MTU_4096))
+ return IB_MTU_4096;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_2048))
+ return IB_MTU_2048;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_1024))
+ return IB_MTU_1024;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_512))
+ return IB_MTU_512;
+ else if (mtu >= ib_mtu_enum_to_int(IB_MTU_256))
+ return IB_MTU_256;
+ else
+ return 0;
+}
+
+static inline int rdma_link_local_addr(struct in6_addr *addr)
+{
+ if (addr->s6_addr32[0] == htonl(0xfe800000) &&
+ addr->s6_addr32[1] == 0)
+ return 1;
+
+ return 0;
+}
+
+static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac)
+{
+ memcpy(mac, &addr->s6_addr[8], 3);
+ memcpy(mac + 3, &addr->s6_addr[13], 3);
+ mac[0] ^= 2;
+}
+
+static inline int rdma_is_multicast_addr(struct in6_addr *addr)
+{
+ __be32 ipv4_addr;
+
+ if (addr->s6_addr[0] == 0xff)
+ return 1;
+
+ ipv4_addr = addr->s6_addr32[3];
+ return (ipv6_addr_v4mapped(addr) && ipv4_is_multicast(ipv4_addr));
+}
+
+static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac)
+{
+ int i;
+
+ mac[0] = 0x33;
+ mac[1] = 0x33;
+ for (i = 2; i < 6; ++i)
+ mac[i] = addr->s6_addr[i + 10];
+}
+
+static inline u16 rdma_get_vlan_id(union ib_gid *dgid)
+{
+ u16 vid;
+
+ vid = dgid->raw[11] << 8 | dgid->raw[12];
+ return vid < 0x1000 ? vid : 0xffff;
+}
+
+static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev)
+{
+ return is_vlan_dev(dev) ? vlan_dev_real_dev(dev) : NULL;
+}
+
+#endif /* IB_ADDR_H */
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
new file mode 100644
index 0000000000..226ae3702d
--- /dev/null
+++ b/include/rdma/ib_cache.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ */
+
+#ifndef _IB_CACHE_H
+#define _IB_CACHE_H
+
+#include <rdma/ib_verbs.h>
+
+int rdma_query_gid(struct ib_device *device, u32 port_num, int index,
+ union ib_gid *gid);
+void *rdma_read_gid_hw_context(const struct ib_gid_attr *attr);
+const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ struct net_device *ndev);
+const struct ib_gid_attr *rdma_find_gid_by_port(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ enum ib_gid_type gid_type,
+ u32 port,
+ struct net_device *ndev);
+const struct ib_gid_attr *rdma_find_gid_by_filter(
+ struct ib_device *device, const union ib_gid *gid, u32 port_num,
+ bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *,
+ void *),
+ void *context);
+
+int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr,
+ u16 *vlan_id, u8 *smac);
+struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr);
+
+/**
+ * ib_get_cached_pkey - Returns a cached PKey table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @index: The index into the cached PKey table to query.
+ * @pkey: The PKey value found at the specified index.
+ *
+ * ib_get_cached_pkey() fetches the specified PKey table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_pkey(struct ib_device *device_handle,
+ u32 port_num,
+ int index,
+ u16 *pkey);
+
+/**
+ * ib_find_cached_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the cached PKey table where the PKey was found.
+ *
+ * ib_find_cached_pkey() searches the specified PKey table in
+ * the local software cache.
+ */
+int ib_find_cached_pkey(struct ib_device *device,
+ u32 port_num,
+ u16 pkey,
+ u16 *index);
+
+/**
+ * ib_find_exact_cached_pkey - Returns the PKey table index where a specified
+ * PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit)
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the cached PKey table where the PKey was found.
+ *
+ * ib_find_exact_cached_pkey() searches the specified PKey table in
+ * the local software cache.
+ */
+int ib_find_exact_cached_pkey(struct ib_device *device,
+ u32 port_num,
+ u16 pkey,
+ u16 *index);
+
+/**
+ * ib_get_cached_lmc - Returns a cached lmc table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @lmc: The lmc value for the specified port for that device.
+ *
+ * ib_get_cached_lmc() fetches the specified lmc table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_lmc(struct ib_device *device,
+ u32 port_num,
+ u8 *lmc);
+
+/**
+ * ib_get_cached_port_state - Returns a cached port state table entry
+ * @device: The device to query.
+ * @port_num: The port number of the device to query.
+ * @port_state: port_state for the specified port for that device.
+ *
+ * ib_get_cached_port_state() fetches the specified port_state table entry stored in
+ * the local software cache.
+ */
+int ib_get_cached_port_state(struct ib_device *device,
+ u32 port_num,
+ enum ib_port_state *port_active);
+
+bool rdma_is_zero_gid(const union ib_gid *gid);
+const struct ib_gid_attr *rdma_get_gid_attr(struct ib_device *device,
+ u32 port_num, int index);
+void rdma_put_gid_attr(const struct ib_gid_attr *attr);
+void rdma_hold_gid_attr(const struct ib_gid_attr *attr);
+ssize_t rdma_query_gid_table(struct ib_device *device,
+ struct ib_uverbs_gid_entry *entries,
+ size_t max_entries);
+
+#endif /* _IB_CACHE_H */
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
new file mode 100644
index 0000000000..a2ac62b4a6
--- /dev/null
+++ b/include/rdma/ib_cm.h
@@ -0,0 +1,568 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2019, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef IB_CM_H
+#define IB_CM_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_sa.h>
+#include <rdma/rdma_cm.h>
+
+enum ib_cm_state {
+ IB_CM_IDLE,
+ IB_CM_LISTEN,
+ IB_CM_REQ_SENT,
+ IB_CM_REQ_RCVD,
+ IB_CM_MRA_REQ_SENT,
+ IB_CM_MRA_REQ_RCVD,
+ IB_CM_REP_SENT,
+ IB_CM_REP_RCVD,
+ IB_CM_MRA_REP_SENT,
+ IB_CM_MRA_REP_RCVD,
+ IB_CM_ESTABLISHED,
+ IB_CM_DREQ_SENT,
+ IB_CM_DREQ_RCVD,
+ IB_CM_TIMEWAIT,
+ IB_CM_SIDR_REQ_SENT,
+ IB_CM_SIDR_REQ_RCVD
+};
+
+enum ib_cm_lap_state {
+ IB_CM_LAP_UNINIT,
+ IB_CM_LAP_IDLE,
+ IB_CM_LAP_SENT,
+ IB_CM_LAP_RCVD,
+ IB_CM_MRA_LAP_SENT,
+ IB_CM_MRA_LAP_RCVD,
+};
+
+enum ib_cm_event_type {
+ IB_CM_REQ_ERROR,
+ IB_CM_REQ_RECEIVED,
+ IB_CM_REP_ERROR,
+ IB_CM_REP_RECEIVED,
+ IB_CM_RTU_RECEIVED,
+ IB_CM_USER_ESTABLISHED,
+ IB_CM_DREQ_ERROR,
+ IB_CM_DREQ_RECEIVED,
+ IB_CM_DREP_RECEIVED,
+ IB_CM_TIMEWAIT_EXIT,
+ IB_CM_MRA_RECEIVED,
+ IB_CM_REJ_RECEIVED,
+ IB_CM_LAP_ERROR,
+ IB_CM_LAP_RECEIVED,
+ IB_CM_APR_RECEIVED,
+ IB_CM_SIDR_REQ_ERROR,
+ IB_CM_SIDR_REQ_RECEIVED,
+ IB_CM_SIDR_REP_RECEIVED
+};
+
+enum ib_cm_data_size {
+ IB_CM_REQ_PRIVATE_DATA_SIZE = 92,
+ IB_CM_MRA_PRIVATE_DATA_SIZE = 222,
+ IB_CM_REJ_PRIVATE_DATA_SIZE = 148,
+ IB_CM_REP_PRIVATE_DATA_SIZE = 196,
+ IB_CM_RTU_PRIVATE_DATA_SIZE = 224,
+ IB_CM_DREQ_PRIVATE_DATA_SIZE = 220,
+ IB_CM_DREP_PRIVATE_DATA_SIZE = 224,
+ IB_CM_REJ_ARI_LENGTH = 72,
+ IB_CM_LAP_PRIVATE_DATA_SIZE = 168,
+ IB_CM_APR_PRIVATE_DATA_SIZE = 148,
+ IB_CM_APR_INFO_LENGTH = 72,
+ IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE = 216,
+ IB_CM_SIDR_REP_PRIVATE_DATA_SIZE = 136,
+ IB_CM_SIDR_REP_INFO_LENGTH = 72,
+};
+
+struct ib_cm_id;
+
+struct ib_cm_req_event_param {
+ struct ib_cm_id *listen_id;
+
+ /* P_Key that was used by the GMP's BTH header */
+ u16 bth_pkey;
+
+ u8 port;
+
+ struct sa_path_rec *primary_path;
+ struct sa_path_rec *alternate_path;
+
+ /*
+ * SGID attribute of the primary path. Currently only
+ * useful for RoCE. Alternate path GID attributes
+ * are not yet supported.
+ */
+ const struct ib_gid_attr *ppath_sgid_attr;
+
+ __be64 remote_ca_guid;
+ u32 remote_qkey;
+ u32 remote_qpn;
+ enum ib_qp_type qp_type;
+
+ u32 starting_psn;
+ u8 responder_resources;
+ u8 initiator_depth;
+ unsigned int local_cm_response_timeout:5;
+ unsigned int flow_control:1;
+ unsigned int remote_cm_response_timeout:5;
+ unsigned int retry_count:3;
+ unsigned int rnr_retry_count:3;
+ unsigned int srq:1;
+ struct rdma_ucm_ece ece;
+};
+
+struct ib_cm_rep_event_param {
+ __be64 remote_ca_guid;
+ u32 remote_qkey;
+ u32 remote_qpn;
+ u32 starting_psn;
+ u8 responder_resources;
+ u8 initiator_depth;
+ unsigned int target_ack_delay:5;
+ unsigned int failover_accepted:2;
+ unsigned int flow_control:1;
+ unsigned int rnr_retry_count:3;
+ unsigned int srq:1;
+ struct rdma_ucm_ece ece;
+};
+
+enum ib_cm_rej_reason {
+ IB_CM_REJ_NO_QP = 1,
+ IB_CM_REJ_NO_EEC = 2,
+ IB_CM_REJ_NO_RESOURCES = 3,
+ IB_CM_REJ_TIMEOUT = 4,
+ IB_CM_REJ_UNSUPPORTED = 5,
+ IB_CM_REJ_INVALID_COMM_ID = 6,
+ IB_CM_REJ_INVALID_COMM_INSTANCE = 7,
+ IB_CM_REJ_INVALID_SERVICE_ID = 8,
+ IB_CM_REJ_INVALID_TRANSPORT_TYPE = 9,
+ IB_CM_REJ_STALE_CONN = 10,
+ IB_CM_REJ_RDC_NOT_EXIST = 11,
+ IB_CM_REJ_INVALID_GID = 12,
+ IB_CM_REJ_INVALID_LID = 13,
+ IB_CM_REJ_INVALID_SL = 14,
+ IB_CM_REJ_INVALID_TRAFFIC_CLASS = 15,
+ IB_CM_REJ_INVALID_HOP_LIMIT = 16,
+ IB_CM_REJ_INVALID_PACKET_RATE = 17,
+ IB_CM_REJ_INVALID_ALT_GID = 18,
+ IB_CM_REJ_INVALID_ALT_LID = 19,
+ IB_CM_REJ_INVALID_ALT_SL = 20,
+ IB_CM_REJ_INVALID_ALT_TRAFFIC_CLASS = 21,
+ IB_CM_REJ_INVALID_ALT_HOP_LIMIT = 22,
+ IB_CM_REJ_INVALID_ALT_PACKET_RATE = 23,
+ IB_CM_REJ_PORT_CM_REDIRECT = 24,
+ IB_CM_REJ_PORT_REDIRECT = 25,
+ IB_CM_REJ_INVALID_MTU = 26,
+ IB_CM_REJ_INSUFFICIENT_RESP_RESOURCES = 27,
+ IB_CM_REJ_CONSUMER_DEFINED = 28,
+ IB_CM_REJ_INVALID_RNR_RETRY = 29,
+ IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID = 30,
+ IB_CM_REJ_INVALID_CLASS_VERSION = 31,
+ IB_CM_REJ_INVALID_FLOW_LABEL = 32,
+ IB_CM_REJ_INVALID_ALT_FLOW_LABEL = 33,
+ IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED = 35,
+};
+
+struct ib_cm_rej_event_param {
+ enum ib_cm_rej_reason reason;
+ void *ari;
+ u8 ari_length;
+};
+
+struct ib_cm_mra_event_param {
+ u8 service_timeout;
+};
+
+struct ib_cm_lap_event_param {
+ struct sa_path_rec *alternate_path;
+};
+
+enum ib_cm_apr_status {
+ IB_CM_APR_SUCCESS,
+ IB_CM_APR_INVALID_COMM_ID,
+ IB_CM_APR_UNSUPPORTED,
+ IB_CM_APR_REJECT,
+ IB_CM_APR_REDIRECT,
+ IB_CM_APR_IS_CURRENT,
+ IB_CM_APR_INVALID_QPN_EECN,
+ IB_CM_APR_INVALID_LID,
+ IB_CM_APR_INVALID_GID,
+ IB_CM_APR_INVALID_FLOW_LABEL,
+ IB_CM_APR_INVALID_TCLASS,
+ IB_CM_APR_INVALID_HOP_LIMIT,
+ IB_CM_APR_INVALID_PACKET_RATE,
+ IB_CM_APR_INVALID_SL
+};
+
+struct ib_cm_apr_event_param {
+ enum ib_cm_apr_status ap_status;
+ void *apr_info;
+ u8 info_len;
+};
+
+struct ib_cm_sidr_req_event_param {
+ struct ib_cm_id *listen_id;
+ __be64 service_id;
+
+ /*
+ * SGID attribute of the request. Currently only
+ * useful for RoCE.
+ */
+ const struct ib_gid_attr *sgid_attr;
+ /* P_Key that was used by the GMP's BTH header */
+ u16 bth_pkey;
+ u8 port;
+ u16 pkey;
+};
+
+enum ib_cm_sidr_status {
+ IB_SIDR_SUCCESS,
+ IB_SIDR_UNSUPPORTED,
+ IB_SIDR_REJECT,
+ IB_SIDR_NO_QP,
+ IB_SIDR_REDIRECT,
+ IB_SIDR_UNSUPPORTED_VERSION
+};
+
+struct ib_cm_sidr_rep_event_param {
+ enum ib_cm_sidr_status status;
+ u32 qkey;
+ u32 qpn;
+ void *info;
+ const struct ib_gid_attr *sgid_attr;
+ u8 info_len;
+};
+
+struct ib_cm_event {
+ enum ib_cm_event_type event;
+ union {
+ struct ib_cm_req_event_param req_rcvd;
+ struct ib_cm_rep_event_param rep_rcvd;
+ /* No data for RTU received events. */
+ struct ib_cm_rej_event_param rej_rcvd;
+ struct ib_cm_mra_event_param mra_rcvd;
+ struct ib_cm_lap_event_param lap_rcvd;
+ struct ib_cm_apr_event_param apr_rcvd;
+ /* No data for DREQ/DREP received events. */
+ struct ib_cm_sidr_req_event_param sidr_req_rcvd;
+ struct ib_cm_sidr_rep_event_param sidr_rep_rcvd;
+ enum ib_wc_status send_status;
+ } param;
+
+ void *private_data;
+};
+
+#define CM_REQ_ATTR_ID cpu_to_be16(0x0010)
+#define CM_MRA_ATTR_ID cpu_to_be16(0x0011)
+#define CM_REJ_ATTR_ID cpu_to_be16(0x0012)
+#define CM_REP_ATTR_ID cpu_to_be16(0x0013)
+#define CM_RTU_ATTR_ID cpu_to_be16(0x0014)
+#define CM_DREQ_ATTR_ID cpu_to_be16(0x0015)
+#define CM_DREP_ATTR_ID cpu_to_be16(0x0016)
+#define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017)
+#define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018)
+#define CM_LAP_ATTR_ID cpu_to_be16(0x0019)
+#define CM_APR_ATTR_ID cpu_to_be16(0x001A)
+
+/**
+ * ib_cm_handler - User-defined callback to process communication events.
+ * @cm_id: Communication identifier associated with the reported event.
+ * @event: Information about the communication event.
+ *
+ * IB_CM_REQ_RECEIVED and IB_CM_SIDR_REQ_RECEIVED communication events
+ * generated as a result of listen requests result in the allocation of a
+ * new @cm_id. The new @cm_id is returned to the user through this callback.
+ * Clients are responsible for destroying the new @cm_id. For peer-to-peer
+ * IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds
+ * to a user's existing communication identifier.
+ *
+ * Users may not call ib_destroy_cm_id while in the context of this callback;
+ * however, returning a non-zero value instructs the communication manager to
+ * destroy the @cm_id after the callback completes.
+ */
+typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id,
+ const struct ib_cm_event *event);
+
+struct ib_cm_id {
+ ib_cm_handler cm_handler;
+ void *context;
+ struct ib_device *device;
+ __be64 service_id;
+ enum ib_cm_state state; /* internal CM/debug use */
+ enum ib_cm_lap_state lap_state; /* internal CM/debug use */
+ __be32 local_id;
+ __be32 remote_id;
+ u32 remote_cm_qpn; /* 1 unless redirected */
+};
+
+/**
+ * ib_create_cm_id - Allocate a communication identifier.
+ * @device: Device associated with the cm_id. All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @context: User specified context associated with the communication
+ * identifier.
+ *
+ * Communication identifiers are used to track connection states, service
+ * ID resolution requests, and listen requests.
+ */
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ void *context);
+
+/**
+ * ib_destroy_cm_id - Destroy a connection identifier.
+ * @cm_id: Connection identifier to destroy.
+ *
+ * This call blocks until the connection identifier is destroyed.
+ */
+void ib_destroy_cm_id(struct ib_cm_id *cm_id);
+
+#define IB_SERVICE_ID_AGN_MASK cpu_to_be64(0xFF00000000000000ULL)
+#define IB_CM_ASSIGN_SERVICE_ID cpu_to_be64(0x0200000000000000ULL)
+#define IB_CMA_SERVICE_ID cpu_to_be64(0x0000000001000000ULL)
+#define IB_CMA_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFF000000ULL)
+#define IB_SDP_SERVICE_ID cpu_to_be64(0x0000000000010000ULL)
+#define IB_SDP_SERVICE_ID_MASK cpu_to_be64(0xFFFFFFFFFFFF0000ULL)
+
+/**
+ * ib_cm_listen - Initiates listening on the specified service ID for
+ * connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ */
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id);
+
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ __be64 service_id);
+
+struct ib_cm_req_param {
+ struct sa_path_rec *primary_path;
+ struct sa_path_rec *primary_path_inbound;
+ struct sa_path_rec *primary_path_outbound;
+ struct sa_path_rec *alternate_path;
+ const struct ib_gid_attr *ppath_sgid_attr;
+ __be64 service_id;
+ u32 qp_num;
+ enum ib_qp_type qp_type;
+ u32 starting_psn;
+ const void *private_data;
+ u8 private_data_len;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 remote_cm_response_timeout;
+ u8 flow_control;
+ u8 local_cm_response_timeout;
+ u8 retry_count;
+ u8 rnr_retry_count;
+ u8 max_cm_retries;
+ u8 srq;
+ struct rdma_ucm_ece ece;
+};
+
+/**
+ * ib_send_cm_req - Sends a connection request to the remote node.
+ * @cm_id: Connection identifier that will be associated with the
+ * connection request.
+ * @param: Connection request information needed to establish the
+ * connection.
+ */
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+ struct ib_cm_req_param *param);
+
+struct ib_cm_rep_param {
+ u32 qp_num;
+ u32 starting_psn;
+ const void *private_data;
+ u8 private_data_len;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 failover_accepted;
+ u8 flow_control;
+ u8 rnr_retry_count;
+ u8 srq;
+ struct rdma_ucm_ece ece;
+};
+
+/**
+ * ib_send_cm_rep - Sends a connection reply in response to a connection
+ * request.
+ * @cm_id: Connection identifier that will be associated with the
+ * connection request.
+ * @param: Connection reply information needed to establish the
+ * connection.
+ */
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_rep_param *param);
+
+/**
+ * ib_send_cm_rtu - Sends a connection ready to use message in response
+ * to a connection reply message.
+ * @cm_id: Connection identifier associated with the connection request.
+ * @private_data: Optional user-defined private data sent with the
+ * ready to use message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_send_cm_dreq - Sends a disconnection request for an existing
+ * connection.
+ * @cm_id: Connection identifier associated with the connection being
+ * released.
+ * @private_data: Optional user-defined private data sent with the
+ * disconnection request message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_send_cm_drep - Sends a disconnection reply to a disconnection request.
+ * @cm_id: Connection identifier associated with the connection being
+ * released.
+ * @private_data: Optional user-defined private data sent with the
+ * disconnection reply message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ *
+ * If the cm_id is in the correct state, the CM will transition the connection
+ * to the timewait state, even if an error occurs sending the DREP message.
+ */
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_cm_notify - Notifies the CM of an event reported to the consumer.
+ * @cm_id: Connection identifier to transition to established.
+ * @event: Type of event.
+ *
+ * This routine should be invoked by users to notify the CM of relevant
+ * communication events. Events that should be reported to the CM and
+ * when to report them are:
+ *
+ * IB_EVENT_COMM_EST - Used when a message is received on a connected
+ * QP before an RTU has been received.
+ * IB_EVENT_PATH_MIG - Notifies the CM that the connection has failed over
+ * to the alternate path.
+ */
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event);
+
+/**
+ * ib_send_cm_rej - Sends a connection rejection message to the
+ * remote node.
+ * @cm_id: Connection identifier associated with the connection being
+ * rejected.
+ * @reason: Reason for the connection request rejection.
+ * @ari: Optional additional rejection information.
+ * @ari_length: Size of the additional rejection information, in bytes.
+ * @private_data: Optional user-defined private data sent with the
+ * rejection message.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+ enum ib_cm_rej_reason reason,
+ void *ari,
+ u8 ari_length,
+ const void *private_data,
+ u8 private_data_len);
+
+#define IB_CM_MRA_FLAG_DELAY 0x80 /* Send MRA only after a duplicate msg */
+
+/**
+ * ib_send_cm_mra - Sends a message receipt acknowledgement to a connection
+ * message.
+ * @cm_id: Connection identifier associated with the connection message.
+ * @service_timeout: The lower 5-bits specify the maximum time required for
+ * the sender to reply to the connection message. The upper 3-bits
+ * specify additional control flags.
+ * @private_data: Optional user-defined private data sent with the
+ * message receipt acknowledgement.
+ * @private_data_len: Size of the private data buffer, in bytes.
+ */
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+ u8 service_timeout,
+ const void *private_data,
+ u8 private_data_len);
+
+/**
+ * ib_cm_init_qp_attr - Initializes the QP attributes for use in transitioning
+ * to a specified QP state.
+ * @cm_id: Communication identifier associated with the QP attributes to
+ * initialize.
+ * @qp_attr: On input, specifies the desired QP state. On output, the
+ * mandatory and desired optional attributes will be set in order to
+ * modify the QP to the specified state.
+ * @qp_attr_mask: The QP attribute mask that may be used to transition the
+ * QP to the specified state.
+ *
+ * Users must set the @qp_attr->qp_state to the desired QP state. This call
+ * will set all required attributes for the given transition, along with
+ * known optional attributes. Users may override the attributes returned from
+ * this call before calling ib_modify_qp.
+ */
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask);
+
+struct ib_cm_sidr_req_param {
+ struct sa_path_rec *path;
+ const struct ib_gid_attr *sgid_attr;
+ __be64 service_id;
+ unsigned long timeout_ms;
+ const void *private_data;
+ u8 private_data_len;
+ u8 max_cm_retries;
+};
+
+/**
+ * ib_send_cm_sidr_req - Sends a service ID resolution request to the
+ * remote node.
+ * @cm_id: Communication identifier that will be associated with the
+ * service ID resolution request.
+ * @param: Service ID resolution request information.
+ */
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_req_param *param);
+
+struct ib_cm_sidr_rep_param {
+ u32 qp_num;
+ u32 qkey;
+ enum ib_cm_sidr_status status;
+ const void *info;
+ u8 info_length;
+ const void *private_data;
+ u8 private_data_len;
+ struct rdma_ucm_ece ece;
+};
+
+/**
+ * ib_send_cm_sidr_rep - Sends a service ID resolution reply to the
+ * remote node.
+ * @cm_id: Communication identifier associated with the received service ID
+ * resolution request.
+ * @param: Service ID resolution reply information.
+ */
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_rep_param *param);
+
+/**
+ * ibcm_reject_msg - return a pointer to a reject message string.
+ * @reason: Value returned in the REJECT event status field.
+ */
+const char *__attribute_const__ ibcm_reject_msg(int reason);
+
+#endif /* IB_CM_H */
diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h
new file mode 100644
index 0000000000..8ae07c0ecd
--- /dev/null
+++ b/include/rdma/ib_hdrs.h
@@ -0,0 +1,307 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright(c) 2016 - 2018 Intel Corporation.
+ */
+
+#ifndef IB_HDRS_H
+#define IB_HDRS_H
+
+#include <linux/types.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+
+#define IB_SEQ_NAK (3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK 0x20
+#define IB_NAK_PSN_ERROR 0x60
+#define IB_NAK_INVALID_REQUEST 0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR 0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST 0x64
+
+#define IB_BTH_REQ_ACK BIT(31)
+#define IB_BTH_SOLICITED BIT(23)
+#define IB_BTH_MIG_REQ BIT(22)
+
+#define IB_GRH_VERSION 6
+#define IB_GRH_VERSION_MASK 0xF
+#define IB_GRH_VERSION_SHIFT 28
+#define IB_GRH_TCLASS_MASK 0xFF
+#define IB_GRH_TCLASS_SHIFT 20
+#define IB_GRH_FLOW_MASK 0xFFFFF
+#define IB_GRH_FLOW_SHIFT 0
+#define IB_GRH_NEXT_HDR 0x1B
+#define IB_FECN_SHIFT 31
+#define IB_FECN_MASK 1
+#define IB_FECN_SMASK BIT(IB_FECN_SHIFT)
+#define IB_BECN_SHIFT 30
+#define IB_BECN_MASK 1
+#define IB_BECN_SMASK BIT(IB_BECN_SHIFT)
+
+#define IB_AETH_CREDIT_SHIFT 24
+#define IB_AETH_CREDIT_MASK 0x1F
+#define IB_AETH_CREDIT_INVAL 0x1F
+#define IB_AETH_NAK_SHIFT 29
+#define IB_MSN_MASK 0xFFFFFF
+
+struct ib_reth {
+ __be64 vaddr; /* potentially unaligned */
+ __be32 rkey;
+ __be32 length;
+} __packed;
+
+struct ib_atomic_eth {
+ __be64 vaddr; /* potentially unaligned */
+ __be32 rkey;
+ __be64 swap_data; /* potentially unaligned */
+ __be64 compare_data; /* potentially unaligned */
+} __packed;
+
+#include <rdma/tid_rdma_defs.h>
+
+union ib_ehdrs {
+ struct {
+ __be32 deth[2];
+ __be32 imm_data;
+ } ud;
+ struct {
+ struct ib_reth reth;
+ __be32 imm_data;
+ } rc;
+ struct {
+ __be32 aeth;
+ __be64 atomic_ack_eth; /* potentially unaligned */
+ } __packed at;
+ __be32 imm_data;
+ __be32 aeth;
+ __be32 ieth;
+ struct ib_atomic_eth atomic_eth;
+ /* TID RDMA headers */
+ union {
+ struct tid_rdma_read_req r_req;
+ struct tid_rdma_read_resp r_rsp;
+ struct tid_rdma_write_req w_req;
+ struct tid_rdma_write_resp w_rsp;
+ struct tid_rdma_write_data w_data;
+ struct tid_rdma_resync resync;
+ struct tid_rdma_ack ack;
+ } tid_rdma;
+} __packed;
+
+struct ib_other_headers {
+ __be32 bth[3];
+ union ib_ehdrs u;
+} __packed;
+
+struct ib_header {
+ __be16 lrh[4];
+ union {
+ struct {
+ struct ib_grh grh;
+ struct ib_other_headers oth;
+ } l;
+ struct ib_other_headers oth;
+ } u;
+} __packed;
+
+/* accessors for unaligned __be64 items */
+
+static inline u64 ib_u64_get(__be64 *p)
+{
+ return get_unaligned_be64(p);
+}
+
+static inline void ib_u64_put(u64 val, __be64 *p)
+{
+ put_unaligned_be64(val, p);
+}
+
+static inline u64 get_ib_reth_vaddr(struct ib_reth *reth)
+{
+ return ib_u64_get(&reth->vaddr);
+}
+
+static inline void put_ib_reth_vaddr(u64 val, struct ib_reth *reth)
+{
+ ib_u64_put(val, &reth->vaddr);
+}
+
+static inline u64 get_ib_ateth_vaddr(struct ib_atomic_eth *ateth)
+{
+ return ib_u64_get(&ateth->vaddr);
+}
+
+static inline void put_ib_ateth_vaddr(u64 val, struct ib_atomic_eth *ateth)
+{
+ ib_u64_put(val, &ateth->vaddr);
+}
+
+static inline u64 get_ib_ateth_swap(struct ib_atomic_eth *ateth)
+{
+ return ib_u64_get(&ateth->swap_data);
+}
+
+static inline void put_ib_ateth_swap(u64 val, struct ib_atomic_eth *ateth)
+{
+ ib_u64_put(val, &ateth->swap_data);
+}
+
+static inline u64 get_ib_ateth_compare(struct ib_atomic_eth *ateth)
+{
+ return ib_u64_get(&ateth->compare_data);
+}
+
+static inline void put_ib_ateth_compare(u64 val, struct ib_atomic_eth *ateth)
+{
+ ib_u64_put(val, &ateth->compare_data);
+}
+
+/*
+ * 9B/IB Packet Format
+ */
+#define IB_LNH_MASK 3
+#define IB_SC_MASK 0xf
+#define IB_SC_SHIFT 12
+#define IB_SC5_MASK 0x10
+#define IB_SL_MASK 0xf
+#define IB_SL_SHIFT 4
+#define IB_SL_SHIFT 4
+#define IB_LVER_MASK 0xf
+#define IB_LVER_SHIFT 8
+
+static inline u8 ib_get_lnh(struct ib_header *hdr)
+{
+ return (be16_to_cpu(hdr->lrh[0]) & IB_LNH_MASK);
+}
+
+static inline u8 ib_get_sc(struct ib_header *hdr)
+{
+ return ((be16_to_cpu(hdr->lrh[0]) >> IB_SC_SHIFT) & IB_SC_MASK);
+}
+
+static inline bool ib_is_sc5(u16 sc5)
+{
+ return !!(sc5 & IB_SC5_MASK);
+}
+
+static inline u8 ib_get_sl(struct ib_header *hdr)
+{
+ return ((be16_to_cpu(hdr->lrh[0]) >> IB_SL_SHIFT) & IB_SL_MASK);
+}
+
+static inline u16 ib_get_dlid(struct ib_header *hdr)
+{
+ return (be16_to_cpu(hdr->lrh[1]));
+}
+
+static inline u16 ib_get_slid(struct ib_header *hdr)
+{
+ return (be16_to_cpu(hdr->lrh[3]));
+}
+
+static inline u8 ib_get_lver(struct ib_header *hdr)
+{
+ return (u8)((be16_to_cpu(hdr->lrh[0]) >> IB_LVER_SHIFT) &
+ IB_LVER_MASK);
+}
+
+static inline u32 ib_get_qkey(struct ib_other_headers *ohdr)
+{
+ return be32_to_cpu(ohdr->u.ud.deth[0]);
+}
+
+static inline u32 ib_get_sqpn(struct ib_other_headers *ohdr)
+{
+ return ((be32_to_cpu(ohdr->u.ud.deth[1])) & IB_QPN_MASK);
+}
+
+/*
+ * BTH
+ */
+#define IB_BTH_OPCODE_MASK 0xff
+#define IB_BTH_OPCODE_SHIFT 24
+#define IB_BTH_PAD_MASK 3
+#define IB_BTH_PKEY_MASK 0xffff
+#define IB_BTH_PAD_SHIFT 20
+#define IB_BTH_A_MASK 1
+#define IB_BTH_A_SHIFT 31
+#define IB_BTH_M_MASK 1
+#define IB_BTH_M_SHIFT 22
+#define IB_BTH_SE_MASK 1
+#define IB_BTH_SE_SHIFT 23
+#define IB_BTH_TVER_MASK 0xf
+#define IB_BTH_TVER_SHIFT 16
+#define IB_BTH_OPCODE_CNP 0x81
+
+static inline u8 ib_bth_get_pad(struct ib_other_headers *ohdr)
+{
+ return ((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_PAD_SHIFT) &
+ IB_BTH_PAD_MASK);
+}
+
+static inline u16 ib_bth_get_pkey(struct ib_other_headers *ohdr)
+{
+ return (be32_to_cpu(ohdr->bth[0]) & IB_BTH_PKEY_MASK);
+}
+
+static inline u8 ib_bth_get_opcode(struct ib_other_headers *ohdr)
+{
+ return ((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_OPCODE_SHIFT) &
+ IB_BTH_OPCODE_MASK);
+}
+
+static inline u8 ib_bth_get_ackreq(struct ib_other_headers *ohdr)
+{
+ return (u8)((be32_to_cpu(ohdr->bth[2]) >> IB_BTH_A_SHIFT) &
+ IB_BTH_A_MASK);
+}
+
+static inline u8 ib_bth_get_migreq(struct ib_other_headers *ohdr)
+{
+ return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_M_SHIFT) &
+ IB_BTH_M_MASK);
+}
+
+static inline u8 ib_bth_get_se(struct ib_other_headers *ohdr)
+{
+ return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_SE_SHIFT) &
+ IB_BTH_SE_MASK);
+}
+
+static inline u32 ib_bth_get_psn(struct ib_other_headers *ohdr)
+{
+ return (u32)(be32_to_cpu(ohdr->bth[2]));
+}
+
+static inline u32 ib_bth_get_qpn(struct ib_other_headers *ohdr)
+{
+ return (u32)((be32_to_cpu(ohdr->bth[1])) & IB_QPN_MASK);
+}
+
+static inline bool ib_bth_get_becn(struct ib_other_headers *ohdr)
+{
+ return (ohdr->bth[1]) & cpu_to_be32(IB_BECN_SMASK);
+}
+
+static inline bool ib_bth_get_fecn(struct ib_other_headers *ohdr)
+{
+ return (ohdr->bth[1]) & cpu_to_be32(IB_FECN_SMASK);
+}
+
+static inline u8 ib_bth_get_tver(struct ib_other_headers *ohdr)
+{
+ return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_TVER_SHIFT) &
+ IB_BTH_TVER_MASK);
+}
+
+static inline bool ib_bth_is_solicited(struct ib_other_headers *ohdr)
+{
+ return ohdr->bth[0] & cpu_to_be32(IB_BTH_SOLICITED);
+}
+
+static inline bool ib_bth_is_migration(struct ib_other_headers *ohdr)
+{
+ return ohdr->bth[0] & cpu_to_be32(IB_BTH_MIG_REQ);
+}
+#endif /* IB_HDRS_H */
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
new file mode 100644
index 0000000000..2e3843b761
--- /dev/null
+++ b/include/rdma/ib_mad.h
@@ -0,0 +1,819 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2006 Voltaire Corporation. All rights reserved.
+ */
+
+#ifndef IB_MAD_H
+#define IB_MAD_H
+
+#include <linux/list.h>
+
+#include <rdma/ib_verbs.h>
+#include <uapi/rdma/ib_user_mad.h>
+
+/* Management base versions */
+#define IB_MGMT_BASE_VERSION 1
+#define OPA_MGMT_BASE_VERSION 0x80
+
+#define OPA_SM_CLASS_VERSION 0x80
+
+/* Management classes */
+#define IB_MGMT_CLASS_SUBN_LID_ROUTED 0x01
+#define IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE 0x81
+#define IB_MGMT_CLASS_SUBN_ADM 0x03
+#define IB_MGMT_CLASS_PERF_MGMT 0x04
+#define IB_MGMT_CLASS_BM 0x05
+#define IB_MGMT_CLASS_DEVICE_MGMT 0x06
+#define IB_MGMT_CLASS_CM 0x07
+#define IB_MGMT_CLASS_SNMP 0x08
+#define IB_MGMT_CLASS_DEVICE_ADM 0x10
+#define IB_MGMT_CLASS_BOOT_MGMT 0x11
+#define IB_MGMT_CLASS_BIS 0x12
+#define IB_MGMT_CLASS_CONG_MGMT 0x21
+#define IB_MGMT_CLASS_VENDOR_RANGE2_START 0x30
+#define IB_MGMT_CLASS_VENDOR_RANGE2_END 0x4F
+
+#define IB_OPENIB_OUI (0x001405)
+
+/* Management methods */
+#define IB_MGMT_METHOD_GET 0x01
+#define IB_MGMT_METHOD_SET 0x02
+#define IB_MGMT_METHOD_GET_RESP 0x81
+#define IB_MGMT_METHOD_SEND 0x03
+#define IB_MGMT_METHOD_TRAP 0x05
+#define IB_MGMT_METHOD_REPORT 0x06
+#define IB_MGMT_METHOD_REPORT_RESP 0x86
+#define IB_MGMT_METHOD_TRAP_REPRESS 0x07
+
+#define IB_MGMT_METHOD_RESP 0x80
+#define IB_BM_ATTR_MOD_RESP cpu_to_be32(1)
+
+#define IB_MGMT_MAX_METHODS 128
+
+/* MAD Status field bit masks */
+#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000
+#define IB_MGMT_MAD_STATUS_BUSY 0x0001
+#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002
+#define IB_MGMT_MAD_STATUS_BAD_VERSION 0x0004
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008
+#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c
+#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c
+
+/* RMPP information */
+#define IB_MGMT_RMPP_VERSION 1
+
+#define IB_MGMT_RMPP_TYPE_DATA 1
+#define IB_MGMT_RMPP_TYPE_ACK 2
+#define IB_MGMT_RMPP_TYPE_STOP 3
+#define IB_MGMT_RMPP_TYPE_ABORT 4
+
+#define IB_MGMT_RMPP_FLAG_ACTIVE 1
+#define IB_MGMT_RMPP_FLAG_FIRST (1<<1)
+#define IB_MGMT_RMPP_FLAG_LAST (1<<2)
+
+#define IB_MGMT_RMPP_NO_RESPTIME 0x1F
+
+#define IB_MGMT_RMPP_STATUS_SUCCESS 0
+#define IB_MGMT_RMPP_STATUS_RESX 1
+#define IB_MGMT_RMPP_STATUS_ABORT_MIN 118
+#define IB_MGMT_RMPP_STATUS_T2L 118
+#define IB_MGMT_RMPP_STATUS_BAD_LEN 119
+#define IB_MGMT_RMPP_STATUS_BAD_SEG 120
+#define IB_MGMT_RMPP_STATUS_BADT 121
+#define IB_MGMT_RMPP_STATUS_W2S 122
+#define IB_MGMT_RMPP_STATUS_S2B 123
+#define IB_MGMT_RMPP_STATUS_BAD_STATUS 124
+#define IB_MGMT_RMPP_STATUS_UNV 125
+#define IB_MGMT_RMPP_STATUS_TMR 126
+#define IB_MGMT_RMPP_STATUS_UNSPEC 127
+#define IB_MGMT_RMPP_STATUS_ABORT_MAX 127
+
+#define IB_QP0 0
+#define IB_QP1 cpu_to_be32(1)
+#define IB_QP1_QKEY 0x80010000
+#define IB_QP_SET_QKEY 0x80000000
+
+#define IB_DEFAULT_PKEY_PARTIAL 0x7FFF
+#define IB_DEFAULT_PKEY_FULL 0xFFFF
+
+/*
+ * Generic trap/notice types
+ */
+#define IB_NOTICE_TYPE_FATAL 0x80
+#define IB_NOTICE_TYPE_URGENT 0x81
+#define IB_NOTICE_TYPE_SECURITY 0x82
+#define IB_NOTICE_TYPE_SM 0x83
+#define IB_NOTICE_TYPE_INFO 0x84
+
+/*
+ * Generic trap/notice producers
+ */
+#define IB_NOTICE_PROD_CA cpu_to_be16(1)
+#define IB_NOTICE_PROD_SWITCH cpu_to_be16(2)
+#define IB_NOTICE_PROD_ROUTER cpu_to_be16(3)
+#define IB_NOTICE_PROD_CLASS_MGR cpu_to_be16(4)
+
+enum {
+ IB_MGMT_MAD_HDR = 24,
+ IB_MGMT_MAD_DATA = 232,
+ IB_MGMT_RMPP_HDR = 36,
+ IB_MGMT_RMPP_DATA = 220,
+ IB_MGMT_VENDOR_HDR = 40,
+ IB_MGMT_VENDOR_DATA = 216,
+ IB_MGMT_SA_HDR = 56,
+ IB_MGMT_SA_DATA = 200,
+ IB_MGMT_DEVICE_HDR = 64,
+ IB_MGMT_DEVICE_DATA = 192,
+ IB_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + IB_MGMT_MAD_DATA,
+ OPA_MGMT_MAD_DATA = 2024,
+ OPA_MGMT_RMPP_DATA = 2012,
+ OPA_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + OPA_MGMT_MAD_DATA,
+};
+
+struct ib_mad_hdr {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ __be16 status;
+ __be16 class_specific;
+ __be64 tid;
+ __be16 attr_id;
+ __be16 resv;
+ __be32 attr_mod;
+};
+
+struct ib_rmpp_hdr {
+ u8 rmpp_version;
+ u8 rmpp_type;
+ u8 rmpp_rtime_flags;
+ u8 rmpp_status;
+ __be32 seg_num;
+ __be32 paylen_newwin;
+};
+
+typedef u64 __bitwise ib_sa_comp_mask;
+
+#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << (n)))
+
+/*
+ * ib_sa_hdr and ib_sa_mad structures must be packed because they have
+ * 64-bit fields that are only 32-bit aligned. 64-bit architectures will
+ * lay them out wrong otherwise. (And unfortunately they are sent on
+ * the wire so we can't change the layout)
+ */
+struct ib_sa_hdr {
+ __be64 sm_key;
+ __be16 attr_offset;
+ __be16 reserved;
+ ib_sa_comp_mask comp_mask;
+} __packed;
+
+struct ib_mad {
+ struct ib_mad_hdr mad_hdr;
+ u8 data[IB_MGMT_MAD_DATA];
+};
+
+struct opa_mad {
+ struct ib_mad_hdr mad_hdr;
+ u8 data[OPA_MGMT_MAD_DATA];
+};
+
+struct ib_rmpp_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ u8 data[IB_MGMT_RMPP_DATA];
+};
+
+struct opa_rmpp_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ u8 data[OPA_MGMT_RMPP_DATA];
+};
+
+struct ib_sa_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ struct ib_sa_hdr sa_hdr;
+ u8 data[IB_MGMT_SA_DATA];
+} __packed;
+
+struct ib_vendor_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ u8 reserved;
+ u8 oui[3];
+ u8 data[IB_MGMT_VENDOR_DATA];
+};
+
+#define IB_MGMT_CLASSPORTINFO_ATTR_ID cpu_to_be16(0x0001)
+
+#define IB_CLASS_PORT_INFO_RESP_TIME_MASK 0x1F
+#define IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE 5
+
+struct ib_class_port_info {
+ u8 base_version;
+ u8 class_version;
+ __be16 capability_mask;
+ /* 27 bits for cap_mask2, 5 bits for resp_time */
+ __be32 cap_mask2_resp_time;
+ u8 redirect_gid[16];
+ __be32 redirect_tcslfl;
+ __be16 redirect_lid;
+ __be16 redirect_pkey;
+ __be32 redirect_qp;
+ __be32 redirect_qkey;
+ u8 trap_gid[16];
+ __be32 trap_tcslfl;
+ __be16 trap_lid;
+ __be16 trap_pkey;
+ __be32 trap_hlqp;
+ __be32 trap_qkey;
+};
+
+/* PortInfo CapabilityMask */
+enum ib_port_capability_mask_bits {
+ IB_PORT_SM = 1 << 1,
+ IB_PORT_NOTICE_SUP = 1 << 2,
+ IB_PORT_TRAP_SUP = 1 << 3,
+ IB_PORT_OPT_IPD_SUP = 1 << 4,
+ IB_PORT_AUTO_MIGR_SUP = 1 << 5,
+ IB_PORT_SL_MAP_SUP = 1 << 6,
+ IB_PORT_MKEY_NVRAM = 1 << 7,
+ IB_PORT_PKEY_NVRAM = 1 << 8,
+ IB_PORT_LED_INFO_SUP = 1 << 9,
+ IB_PORT_SM_DISABLED = 1 << 10,
+ IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11,
+ IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12,
+ IB_PORT_EXTENDED_SPEEDS_SUP = 1 << 14,
+ IB_PORT_CAP_MASK2_SUP = 1 << 15,
+ IB_PORT_CM_SUP = 1 << 16,
+ IB_PORT_SNMP_TUNNEL_SUP = 1 << 17,
+ IB_PORT_REINIT_SUP = 1 << 18,
+ IB_PORT_DEVICE_MGMT_SUP = 1 << 19,
+ IB_PORT_VENDOR_CLASS_SUP = 1 << 20,
+ IB_PORT_DR_NOTICE_SUP = 1 << 21,
+ IB_PORT_CAP_MASK_NOTICE_SUP = 1 << 22,
+ IB_PORT_BOOT_MGMT_SUP = 1 << 23,
+ IB_PORT_LINK_LATENCY_SUP = 1 << 24,
+ IB_PORT_CLIENT_REG_SUP = 1 << 25,
+ IB_PORT_OTHER_LOCAL_CHANGES_SUP = 1 << 26,
+ IB_PORT_LINK_SPEED_WIDTH_TABLE_SUP = 1 << 27,
+ IB_PORT_VENDOR_SPECIFIC_MADS_TABLE_SUP = 1 << 28,
+ IB_PORT_MCAST_PKEY_TRAP_SUPPRESSION_SUP = 1 << 29,
+ IB_PORT_MCAST_FDB_TOP_SUP = 1 << 30,
+ IB_PORT_HIERARCHY_INFO_SUP = 1ULL << 31,
+};
+
+enum ib_port_capability_mask2_bits {
+ IB_PORT_SET_NODE_DESC_SUP = 1 << 0,
+ IB_PORT_EX_PORT_INFO_EX_SUP = 1 << 1,
+ IB_PORT_VIRT_SUP = 1 << 2,
+ IB_PORT_SWITCH_PORT_STATE_TABLE_SUP = 1 << 3,
+ IB_PORT_LINK_WIDTH_2X_SUP = 1 << 4,
+ IB_PORT_LINK_SPEED_HDR_SUP = 1 << 5,
+ IB_PORT_LINK_SPEED_NDR_SUP = 1 << 10,
+};
+
+#define OPA_CLASS_PORT_INFO_PR_SUPPORT BIT(26)
+
+struct opa_class_port_info {
+ u8 base_version;
+ u8 class_version;
+ __be16 cap_mask;
+ __be32 cap_mask2_resp_time;
+
+ u8 redirect_gid[16];
+ __be32 redirect_tc_fl;
+ __be32 redirect_lid;
+ __be32 redirect_sl_qp;
+ __be32 redirect_qkey;
+
+ u8 trap_gid[16];
+ __be32 trap_tc_fl;
+ __be32 trap_lid;
+ __be32 trap_hl_qp;
+ __be32 trap_qkey;
+
+ __be16 trap_pkey;
+ __be16 redirect_pkey;
+
+ u8 trap_sl_rsvd;
+ u8 reserved[3];
+} __packed;
+
+/**
+ * ib_get_cpi_resp_time - Returns the resp_time value from
+ * cap_mask2_resp_time in ib_class_port_info.
+ * @cpi: A struct ib_class_port_info mad.
+ */
+static inline u8 ib_get_cpi_resp_time(struct ib_class_port_info *cpi)
+{
+ return (u8)(be32_to_cpu(cpi->cap_mask2_resp_time) &
+ IB_CLASS_PORT_INFO_RESP_TIME_MASK);
+}
+
+/**
+ * ib_set_cpi_resptime - Sets the response time in an
+ * ib_class_port_info mad.
+ * @cpi: A struct ib_class_port_info.
+ * @rtime: The response time to set.
+ */
+static inline void ib_set_cpi_resp_time(struct ib_class_port_info *cpi,
+ u8 rtime)
+{
+ cpi->cap_mask2_resp_time =
+ (cpi->cap_mask2_resp_time &
+ cpu_to_be32(~IB_CLASS_PORT_INFO_RESP_TIME_MASK)) |
+ cpu_to_be32(rtime & IB_CLASS_PORT_INFO_RESP_TIME_MASK);
+}
+
+/**
+ * ib_get_cpi_capmask2 - Returns the capmask2 value from
+ * cap_mask2_resp_time in ib_class_port_info.
+ * @cpi: A struct ib_class_port_info mad.
+ */
+static inline u32 ib_get_cpi_capmask2(struct ib_class_port_info *cpi)
+{
+ return (be32_to_cpu(cpi->cap_mask2_resp_time) >>
+ IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE);
+}
+
+/**
+ * ib_set_cpi_capmask2 - Sets the capmask2 in an
+ * ib_class_port_info mad.
+ * @cpi: A struct ib_class_port_info.
+ * @capmask2: The capmask2 to set.
+ */
+static inline void ib_set_cpi_capmask2(struct ib_class_port_info *cpi,
+ u32 capmask2)
+{
+ cpi->cap_mask2_resp_time =
+ (cpi->cap_mask2_resp_time &
+ cpu_to_be32(IB_CLASS_PORT_INFO_RESP_TIME_MASK)) |
+ cpu_to_be32(capmask2 <<
+ IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE);
+}
+
+/**
+ * opa_get_cpi_capmask2 - Returns the capmask2 value from
+ * cap_mask2_resp_time in ib_class_port_info.
+ * @cpi: A struct opa_class_port_info mad.
+ */
+static inline u32 opa_get_cpi_capmask2(struct opa_class_port_info *cpi)
+{
+ return (be32_to_cpu(cpi->cap_mask2_resp_time) >>
+ IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE);
+}
+
+struct ib_mad_notice_attr {
+ u8 generic_type;
+ u8 prod_type_msb;
+ __be16 prod_type_lsb;
+ __be16 trap_num;
+ __be16 issuer_lid;
+ __be16 toggle_count;
+
+ union {
+ struct {
+ u8 details[54];
+ } raw_data;
+
+ struct {
+ __be16 reserved;
+ __be16 lid; /* where violation happened */
+ u8 port_num; /* where violation happened */
+ } __packed ntc_129_131;
+
+ struct {
+ __be16 reserved;
+ __be16 lid; /* LID where change occurred */
+ u8 reserved2;
+ u8 local_changes; /* low bit - local changes */
+ __be32 new_cap_mask; /* new capability mask */
+ u8 reserved3;
+ u8 change_flags; /* low 3 bits only */
+ } __packed ntc_144;
+
+ struct {
+ __be16 reserved;
+ __be16 lid; /* lid where sys guid changed */
+ __be16 reserved2;
+ __be64 new_sys_guid;
+ } __packed ntc_145;
+
+ struct {
+ __be16 reserved;
+ __be16 lid;
+ __be16 dr_slid;
+ u8 method;
+ u8 reserved2;
+ __be16 attr_id;
+ __be32 attr_mod;
+ __be64 mkey;
+ u8 reserved3;
+ u8 dr_trunc_hop;
+ u8 dr_rtn_path[30];
+ } __packed ntc_256;
+
+ struct {
+ __be16 reserved;
+ __be16 lid1;
+ __be16 lid2;
+ __be32 key;
+ __be32 sl_qp1; /* SL: high 4 bits */
+ __be32 qp2; /* high 8 bits reserved */
+ union ib_gid gid1;
+ union ib_gid gid2;
+ } __packed ntc_257_258;
+
+ } details;
+};
+
+/**
+ * ib_mad_send_buf - MAD data buffer and work request for sends.
+ * @next: A pointer used to chain together MADs for posting.
+ * @mad: References an allocated MAD data buffer for MADs that do not have
+ * RMPP active. For MADs using RMPP, references the common and management
+ * class specific headers.
+ * @mad_agent: MAD agent that allocated the buffer.
+ * @ah: The address handle to use when sending the MAD.
+ * @context: User-controlled context fields.
+ * @hdr_len: Indicates the size of the data header of the MAD. This length
+ * includes the common MAD, RMPP, and class specific headers.
+ * @data_len: Indicates the total size of user-transferred data.
+ * @seg_count: The number of RMPP segments allocated for this send.
+ * @seg_size: Size of the data in each RMPP segment. This does not include
+ * class specific headers.
+ * @seg_rmpp_size: Size of each RMPP segment including the class specific
+ * headers.
+ * @timeout_ms: Time to wait for a response.
+ * @retries: Number of times to retry a request for a response. For MADs
+ * using RMPP, this applies per window. On completion, returns the number
+ * of retries needed to complete the transfer.
+ *
+ * Users are responsible for initializing the MAD buffer itself, with the
+ * exception of any RMPP header. Additional segment buffer space allocated
+ * beyond data_len is padding.
+ */
+struct ib_mad_send_buf {
+ struct ib_mad_send_buf *next;
+ void *mad;
+ struct ib_mad_agent *mad_agent;
+ struct ib_ah *ah;
+ void *context[2];
+ int hdr_len;
+ int data_len;
+ int seg_count;
+ int seg_size;
+ int seg_rmpp_size;
+ int timeout_ms;
+ int retries;
+};
+
+/**
+ * ib_response_mad - Returns if the specified MAD has been generated in
+ * response to a sent request or trap.
+ */
+int ib_response_mad(const struct ib_mad_hdr *hdr);
+
+/**
+ * ib_get_rmpp_resptime - Returns the RMPP response time.
+ * @rmpp_hdr: An RMPP header.
+ */
+static inline u8 ib_get_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr)
+{
+ return rmpp_hdr->rmpp_rtime_flags >> 3;
+}
+
+/**
+ * ib_get_rmpp_flags - Returns the RMPP flags.
+ * @rmpp_hdr: An RMPP header.
+ */
+static inline u8 ib_get_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr)
+{
+ return rmpp_hdr->rmpp_rtime_flags & 0x7;
+}
+
+/**
+ * ib_set_rmpp_resptime - Sets the response time in an RMPP header.
+ * @rmpp_hdr: An RMPP header.
+ * @rtime: The response time to set.
+ */
+static inline void ib_set_rmpp_resptime(struct ib_rmpp_hdr *rmpp_hdr, u8 rtime)
+{
+ rmpp_hdr->rmpp_rtime_flags = ib_get_rmpp_flags(rmpp_hdr) | (rtime << 3);
+}
+
+/**
+ * ib_set_rmpp_flags - Sets the flags in an RMPP header.
+ * @rmpp_hdr: An RMPP header.
+ * @flags: The flags to set.
+ */
+static inline void ib_set_rmpp_flags(struct ib_rmpp_hdr *rmpp_hdr, u8 flags)
+{
+ rmpp_hdr->rmpp_rtime_flags = (rmpp_hdr->rmpp_rtime_flags & 0xF8) |
+ (flags & 0x7);
+}
+
+struct ib_mad_agent;
+struct ib_mad_send_wc;
+struct ib_mad_recv_wc;
+
+/**
+ * ib_mad_send_handler - callback handler for a sent MAD.
+ * @mad_agent: MAD agent that sent the MAD.
+ * @mad_send_wc: Send work completion information on the sent MAD.
+ */
+typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc);
+
+/**
+ * ib_mad_recv_handler - callback handler for a received MAD.
+ * @mad_agent: MAD agent requesting the received MAD.
+ * @send_buf: Send buffer if found, else NULL
+ * @mad_recv_wc: Received work completion information on the received MAD.
+ *
+ * MADs received in response to a send request operation will be handed to
+ * the user before the send operation completes. All data buffers given
+ * to registered agents through this routine are owned by the receiving
+ * client.
+ */
+typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_mad_agent - Used to track MAD registration with the access layer.
+ * @device: Reference to device registration is on.
+ * @qp: Reference to QP used for sending and receiving MADs.
+ * @mr: Memory region for system memory usable for DMA.
+ * @recv_handler: Callback handler for a received MAD.
+ * @send_handler: Callback handler for a sent MAD.
+ * @context: User-specified context associated with this registration.
+ * @hi_tid: Access layer assigned transaction ID for this client.
+ * Unsolicited MADs sent by this client will have the upper 32-bits
+ * of their TID set to this value.
+ * @flags: registration flags
+ * @port_num: Port number on which QP is registered
+ * @rmpp_version: If set, indicates the RMPP version used by this agent.
+ */
+enum {
+ IB_MAD_USER_RMPP = IB_USER_MAD_USER_RMPP,
+};
+struct ib_mad_agent {
+ struct ib_device *device;
+ struct ib_qp *qp;
+ ib_mad_recv_handler recv_handler;
+ ib_mad_send_handler send_handler;
+ void *context;
+ u32 hi_tid;
+ u32 flags;
+ void *security;
+ struct list_head mad_agent_sec_list;
+ u8 port_num;
+ u8 rmpp_version;
+ bool smp_allowed;
+};
+
+/**
+ * ib_mad_send_wc - MAD send completion information.
+ * @send_buf: Send MAD data buffer associated with the send MAD request.
+ * @status: Completion status.
+ * @vendor_err: Optional vendor error information returned with a failed
+ * request.
+ */
+struct ib_mad_send_wc {
+ struct ib_mad_send_buf *send_buf;
+ enum ib_wc_status status;
+ u32 vendor_err;
+};
+
+/**
+ * ib_mad_recv_buf - received MAD buffer information.
+ * @list: Reference to next data buffer for a received RMPP MAD.
+ * @grh: References a data buffer containing the global route header.
+ * The data refereced by this buffer is only valid if the GRH is
+ * valid.
+ * @mad: References the start of the received MAD.
+ */
+struct ib_mad_recv_buf {
+ struct list_head list;
+ struct ib_grh *grh;
+ union {
+ struct ib_mad *mad;
+ struct opa_mad *opa_mad;
+ };
+};
+
+/**
+ * ib_mad_recv_wc - received MAD information.
+ * @wc: Completion information for the received data.
+ * @recv_buf: Specifies the location of the received data buffer(s).
+ * @rmpp_list: Specifies a list of RMPP reassembled received MAD buffers.
+ * @mad_len: The length of the received MAD, without duplicated headers.
+ * @mad_seg_size: The size of individual MAD segments
+ *
+ * For received response, the wr_id contains a pointer to the ib_mad_send_buf
+ * for the corresponding send request.
+ */
+struct ib_mad_recv_wc {
+ struct ib_wc *wc;
+ struct ib_mad_recv_buf recv_buf;
+ struct list_head rmpp_list;
+ int mad_len;
+ size_t mad_seg_size;
+};
+
+/**
+ * ib_mad_reg_req - MAD registration request
+ * @mgmt_class: Indicates which management class of MADs should be receive
+ * by the caller. This field is only required if the user wishes to
+ * receive unsolicited MADs, otherwise it should be 0.
+ * @mgmt_class_version: Indicates which version of MADs for the given
+ * management class to receive.
+ * @oui: Indicates IEEE OUI when mgmt_class is a vendor class
+ * in the range from 0x30 to 0x4f. Otherwise not used.
+ * @method_mask: The caller will receive unsolicited MADs for any method
+ * where @method_mask = 1.
+ *
+ */
+struct ib_mad_reg_req {
+ u8 mgmt_class;
+ u8 mgmt_class_version;
+ u8 oui[3];
+ DECLARE_BITMAP(method_mask, IB_MGMT_MAX_METHODS);
+};
+
+/**
+ * ib_register_mad_agent - Register to send/receive MADs.
+ * @device: The device to register with.
+ * @port_num: The port on the specified device to use.
+ * @qp_type: Specifies which QP to access. Must be either
+ * IB_QPT_SMI or IB_QPT_GSI.
+ * @mad_reg_req: Specifies which unsolicited MADs should be received
+ * by the caller. This parameter may be NULL if the caller only
+ * wishes to receive solicited responses.
+ * @rmpp_version: If set, indicates that the client will send
+ * and receive MADs that contain the RMPP header for the given version.
+ * If set to 0, indicates that RMPP is not used by this client.
+ * @send_handler: The completion callback routine invoked after a send
+ * request has completed.
+ * @recv_handler: The completion callback routine invoked for a received
+ * MAD.
+ * @context: User specified context associated with the registration.
+ * @registration_flags: Registration flags to set for this agent
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+ u32 port_num,
+ enum ib_qp_type qp_type,
+ struct ib_mad_reg_req *mad_reg_req,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context,
+ u32 registration_flags);
+/**
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services.
+ * @mad_agent: Corresponding MAD registration request to deregister.
+ *
+ * After invoking this routine, MAD services are no longer usable by the
+ * client on the associated QP.
+ */
+void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent);
+
+/**
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ * with the registered client.
+ * @send_buf: Specifies the information needed to send the MAD(s).
+ * @bad_send_buf: Specifies the MAD on which an error was encountered. This
+ * parameter is optional if only a single MAD is posted.
+ *
+ * Sent MADs are not guaranteed to complete in the order that they were posted.
+ *
+ * If the MAD requires RMPP, the data buffer should contain a single copy
+ * of the common MAD, RMPP, and class specific headers, followed by the class
+ * defined data. If the class defined data would not divide evenly into
+ * RMPP segments, then space must be allocated at the end of the referenced
+ * buffer for any required padding. To indicate the amount of class defined
+ * data being transferred, the paylen_newwin field in the RMPP header should
+ * be set to the size of the class specific header plus the amount of class
+ * defined data being transferred. The paylen_newwin field should be
+ * specified in network-byte order.
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_buf **bad_send_buf);
+
+
+/**
+ * ib_free_recv_mad - Returns data buffers used to receive a MAD.
+ * @mad_recv_wc: Work completion information for a received MAD.
+ *
+ * Clients receiving MADs through their ib_mad_recv_handler must call this
+ * routine to return the work completion buffers to the access layer.
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc);
+
+/**
+ * ib_modify_mad - Modifies an outstanding send MAD operation.
+ * @send_buf: Indicates the MAD to modify.
+ * @timeout_ms: New timeout value for sent MAD.
+ *
+ * This call will reset the timeout value for a sent MAD to the specified
+ * value.
+ */
+int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms);
+
+/**
+ * ib_cancel_mad - Cancels an outstanding send MAD operation.
+ * @send_buf: Indicates the MAD to cancel.
+ *
+ * MADs will be returned to the user through the corresponding
+ * ib_mad_send_handler.
+ */
+static inline void ib_cancel_mad(struct ib_mad_send_buf *send_buf)
+{
+ ib_modify_mad(send_buf, 0);
+}
+
+/**
+ * ib_create_send_mad - Allocate and initialize a data buffer and work request
+ * for sending a MAD.
+ * @mad_agent: Specifies the registered MAD service to associate with the MAD.
+ * @remote_qpn: Specifies the QPN of the receiving node.
+ * @pkey_index: Specifies which PKey the MAD will be sent using. This field
+ * is valid only if the remote_qpn is QP 1.
+ * @rmpp_active: Indicates if the send will enable RMPP.
+ * @hdr_len: Indicates the size of the data header of the MAD. This length
+ * should include the common MAD header, RMPP header, plus any class
+ * specific header.
+ * @data_len: Indicates the size of any user-transferred data. The call will
+ * automatically adjust the allocated buffer size to account for any
+ * additional padding that may be necessary.
+ * @gfp_mask: GFP mask used for the memory allocation.
+ * @base_version: Base Version of this MAD
+ *
+ * This routine allocates a MAD for sending. The returned MAD send buffer
+ * will reference a data buffer usable for sending a MAD, along
+ * with an initialized work request structure. Users may modify the returned
+ * MAD data buffer before posting the send.
+ *
+ * The returned MAD header, class specific headers, and any padding will be
+ * cleared. Users are responsible for initializing the common MAD header,
+ * any class specific header, and MAD data area.
+ * If @rmpp_active is set, the RMPP header will be initialized for sending.
+ */
+struct ib_mad_send_buf *ib_create_send_mad(struct ib_mad_agent *mad_agent,
+ u32 remote_qpn, u16 pkey_index,
+ int rmpp_active,
+ int hdr_len, int data_len,
+ gfp_t gfp_mask,
+ u8 base_version);
+
+/**
+ * ib_is_mad_class_rmpp - returns whether given management class
+ * supports RMPP.
+ * @mgmt_class: management class
+ *
+ * This routine returns whether the management class supports RMPP.
+ */
+int ib_is_mad_class_rmpp(u8 mgmt_class);
+
+/**
+ * ib_get_mad_data_offset - returns the data offset for a given
+ * management class.
+ * @mgmt_class: management class
+ *
+ * This routine returns the data offset in the MAD for the management
+ * class requested.
+ */
+int ib_get_mad_data_offset(u8 mgmt_class);
+
+/**
+ * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment.
+ * @send_buf: Previously allocated send data buffer.
+ * @seg_num: number of segment to return
+ *
+ * This routine returns a pointer to the data buffer of an RMPP MAD.
+ * Users must provide synchronization to @send_buf around this call.
+ */
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num);
+
+/**
+ * ib_free_send_mad - Returns data buffers used to send a MAD.
+ * @send_buf: Previously allocated send data buffer.
+ */
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf);
+
+/**
+ * ib_mad_kernel_rmpp_agent - Returns if the agent is performing RMPP.
+ * @agent: the agent in question
+ * @return: true if agent is performing rmpp, false otherwise.
+ */
+int ib_mad_kernel_rmpp_agent(const struct ib_mad_agent *agent);
+
+#endif /* IB_MAD_H */
diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h
new file mode 100644
index 0000000000..1838869aad
--- /dev/null
+++ b/include/rdma/ib_marshall.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ */
+
+#ifndef IB_USER_MARSHALL_H
+#define IB_USER_MARSHALL_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_user_sa.h>
+
+void ib_copy_qp_attr_to_user(struct ib_device *device,
+ struct ib_uverbs_qp_attr *dst,
+ struct ib_qp_attr *src);
+
+void ib_copy_ah_attr_to_user(struct ib_device *device,
+ struct ib_uverbs_ah_attr *dst,
+ struct rdma_ah_attr *src);
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+ struct sa_path_rec *src);
+
+void ib_copy_path_rec_from_user(struct sa_path_rec *dst,
+ struct ib_user_path_rec *src);
+
+#endif /* IB_USER_MARSHALL_H */
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
new file mode 100644
index 0000000000..b8c56d7dc3
--- /dev/null
+++ b/include/rdma/ib_pack.h
@@ -0,0 +1,289 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ */
+
+#ifndef IB_PACK_H
+#define IB_PACK_H
+
+#include <rdma/ib_verbs.h>
+#include <uapi/linux/if_ether.h>
+
+enum {
+ IB_LRH_BYTES = 8,
+ IB_ETH_BYTES = 14,
+ IB_VLAN_BYTES = 4,
+ IB_GRH_BYTES = 40,
+ IB_IP4_BYTES = 20,
+ IB_UDP_BYTES = 8,
+ IB_BTH_BYTES = 12,
+ IB_DETH_BYTES = 8,
+ IB_EXT_ATOMICETH_BYTES = 28,
+ IB_EXT_XRC_BYTES = 4,
+ IB_ICRC_BYTES = 4
+};
+
+struct ib_field {
+ size_t struct_offset_bytes;
+ size_t struct_size_bytes;
+ int offset_words;
+ int offset_bits;
+ int size_bits;
+ char *field_name;
+};
+
+#define RESERVED \
+ .field_name = "reserved"
+
+/*
+ * This macro cleans up the definitions of constants for BTH opcodes.
+ * It is used to define constants such as IB_OPCODE_UD_SEND_ONLY,
+ * which becomes IB_OPCODE_UD + IB_OPCODE_SEND_ONLY, and this gives
+ * the correct value.
+ *
+ * In short, user code should use the constants defined using the
+ * macro rather than worrying about adding together other constants.
+*/
+#define IB_OPCODE(transport, op) \
+ IB_OPCODE_ ## transport ## _ ## op = \
+ IB_OPCODE_ ## transport + IB_OPCODE_ ## op
+
+enum {
+ /* transport types -- just used to define real constants */
+ IB_OPCODE_RC = 0x00,
+ IB_OPCODE_UC = 0x20,
+ IB_OPCODE_RD = 0x40,
+ IB_OPCODE_UD = 0x60,
+ /* per IBTA 1.3 vol 1 Table 38, A10.3.2 */
+ IB_OPCODE_CNP = 0x80,
+ /* Manufacturer specific */
+ IB_OPCODE_MSP = 0xe0,
+
+ /* operations -- just used to define real constants */
+ IB_OPCODE_SEND_FIRST = 0x00,
+ IB_OPCODE_SEND_MIDDLE = 0x01,
+ IB_OPCODE_SEND_LAST = 0x02,
+ IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03,
+ IB_OPCODE_SEND_ONLY = 0x04,
+ IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05,
+ IB_OPCODE_RDMA_WRITE_FIRST = 0x06,
+ IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07,
+ IB_OPCODE_RDMA_WRITE_LAST = 0x08,
+ IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09,
+ IB_OPCODE_RDMA_WRITE_ONLY = 0x0a,
+ IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b,
+ IB_OPCODE_RDMA_READ_REQUEST = 0x0c,
+ IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d,
+ IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e,
+ IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f,
+ IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10,
+ IB_OPCODE_ACKNOWLEDGE = 0x11,
+ IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
+ IB_OPCODE_COMPARE_SWAP = 0x13,
+ IB_OPCODE_FETCH_ADD = 0x14,
+ /* opcode 0x15 is reserved */
+ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16,
+ IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17,
+ IB_OPCODE_FLUSH = 0x1C,
+ IB_OPCODE_ATOMIC_WRITE = 0x1D,
+
+ /* real constants follow -- see comment about above IB_OPCODE()
+ macro for more details */
+
+ /* RC */
+ IB_OPCODE(RC, SEND_FIRST),
+ IB_OPCODE(RC, SEND_MIDDLE),
+ IB_OPCODE(RC, SEND_LAST),
+ IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, SEND_ONLY),
+ IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_FIRST),
+ IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RC, RDMA_WRITE_LAST),
+ IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_READ_REQUEST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RC, ACKNOWLEDGE),
+ IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RC, COMPARE_SWAP),
+ IB_OPCODE(RC, FETCH_ADD),
+ IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
+ IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
+ IB_OPCODE(RC, FLUSH),
+ IB_OPCODE(RC, ATOMIC_WRITE),
+
+ /* UC */
+ IB_OPCODE(UC, SEND_FIRST),
+ IB_OPCODE(UC, SEND_MIDDLE),
+ IB_OPCODE(UC, SEND_LAST),
+ IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, SEND_ONLY),
+ IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_FIRST),
+ IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(UC, RDMA_WRITE_LAST),
+ IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+ /* RD */
+ IB_OPCODE(RD, SEND_FIRST),
+ IB_OPCODE(RD, SEND_MIDDLE),
+ IB_OPCODE(RD, SEND_LAST),
+ IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, SEND_ONLY),
+ IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_FIRST),
+ IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RD, RDMA_WRITE_LAST),
+ IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_READ_REQUEST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RD, ACKNOWLEDGE),
+ IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RD, COMPARE_SWAP),
+ IB_OPCODE(RD, FETCH_ADD),
+ IB_OPCODE(RD, FLUSH),
+
+ /* UD */
+ IB_OPCODE(UD, SEND_ONLY),
+ IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+};
+
+enum {
+ IB_LNH_RAW = 0,
+ IB_LNH_IP = 1,
+ IB_LNH_IBA_LOCAL = 2,
+ IB_LNH_IBA_GLOBAL = 3
+};
+
+struct ib_unpacked_lrh {
+ u8 virtual_lane;
+ u8 link_version;
+ u8 service_level;
+ u8 link_next_header;
+ __be16 destination_lid;
+ __be16 packet_length;
+ __be16 source_lid;
+};
+
+struct ib_unpacked_grh {
+ u8 ip_version;
+ u8 traffic_class;
+ __be32 flow_label;
+ __be16 payload_length;
+ u8 next_header;
+ u8 hop_limit;
+ union ib_gid source_gid;
+ union ib_gid destination_gid;
+};
+
+struct ib_unpacked_bth {
+ u8 opcode;
+ u8 solicited_event;
+ u8 mig_req;
+ u8 pad_count;
+ u8 transport_header_version;
+ __be16 pkey;
+ __be32 destination_qpn;
+ u8 ack_req;
+ __be32 psn;
+};
+
+struct ib_unpacked_deth {
+ __be32 qkey;
+ __be32 source_qpn;
+};
+
+struct ib_unpacked_eth {
+ u8 dmac_h[4];
+ u8 dmac_l[2];
+ u8 smac_h[2];
+ u8 smac_l[4];
+ __be16 type;
+};
+
+struct ib_unpacked_ip4 {
+ u8 ver;
+ u8 hdr_len;
+ u8 tos;
+ __be16 tot_len;
+ __be16 id;
+ __be16 frag_off;
+ u8 ttl;
+ u8 protocol;
+ __sum16 check;
+ __be32 saddr;
+ __be32 daddr;
+};
+
+struct ib_unpacked_udp {
+ __be16 sport;
+ __be16 dport;
+ __be16 length;
+ __be16 csum;
+};
+
+struct ib_unpacked_vlan {
+ __be16 tag;
+ __be16 type;
+};
+
+struct ib_ud_header {
+ int lrh_present;
+ struct ib_unpacked_lrh lrh;
+ int eth_present;
+ struct ib_unpacked_eth eth;
+ int vlan_present;
+ struct ib_unpacked_vlan vlan;
+ int grh_present;
+ struct ib_unpacked_grh grh;
+ int ipv4_present;
+ struct ib_unpacked_ip4 ip4;
+ int udp_present;
+ struct ib_unpacked_udp udp;
+ struct ib_unpacked_bth bth;
+ struct ib_unpacked_deth deth;
+ int immediate_present;
+ __be32 immediate_data;
+};
+
+void ib_pack(const struct ib_field *desc,
+ int desc_len,
+ void *structure,
+ void *buf);
+
+void ib_unpack(const struct ib_field *desc,
+ int desc_len,
+ void *buf,
+ void *structure);
+
+__sum16 ib_ud_ip4_csum(struct ib_ud_header *header);
+
+int ib_ud_header_init(int payload_bytes,
+ int lrh_present,
+ int eth_present,
+ int vlan_present,
+ int grh_present,
+ int ip_version,
+ int udp_present,
+ int immediate_present,
+ struct ib_ud_header *header);
+
+int ib_ud_header_pack(struct ib_ud_header *header,
+ void *buf);
+
+int ib_ud_header_unpack(void *buf,
+ struct ib_ud_header *header);
+
+#endif /* IB_PACK_H */
diff --git a/include/rdma/ib_pma.h b/include/rdma/ib_pma.h
new file mode 100644
index 0000000000..44c6182037
--- /dev/null
+++ b/include/rdma/ib_pma.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ */
+
+#ifndef IB_PMA_H
+#define IB_PMA_H
+
+#include <rdma/ib_mad.h>
+
+/*
+ * PMA class portinfo capability mask bits
+ */
+#define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8)
+#define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9)
+#define IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF cpu_to_be16(1 << 10)
+#define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12)
+
+#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001)
+#define IB_PMA_PORT_SAMPLES_CONTROL cpu_to_be16(0x0010)
+#define IB_PMA_PORT_SAMPLES_RESULT cpu_to_be16(0x0011)
+#define IB_PMA_PORT_COUNTERS cpu_to_be16(0x0012)
+#define IB_PMA_PORT_COUNTERS_EXT cpu_to_be16(0x001D)
+#define IB_PMA_PORT_SAMPLES_RESULT_EXT cpu_to_be16(0x001E)
+
+struct ib_pma_mad {
+ struct ib_mad_hdr mad_hdr;
+ u8 reserved[40];
+ u8 data[192];
+} __packed;
+
+struct ib_pma_portsamplescontrol {
+ u8 opcode;
+ u8 port_select;
+ u8 tick;
+ u8 counter_width; /* resv: 7:3, counter width: 2:0 */
+ __be32 counter_mask0_9; /* 2, 10 3-bit fields */
+ __be16 counter_mask10_14; /* 1, 5 3-bit fields */
+ u8 sample_mechanisms;
+ u8 sample_status; /* only lower 2 bits */
+ __be64 option_mask;
+ __be64 vendor_mask;
+ __be32 sample_start;
+ __be32 sample_interval;
+ __be16 tag;
+ __be16 counter_select[15];
+ __be32 reserved1;
+ __be64 samples_only_option_mask;
+ __be32 reserved2[28];
+};
+
+struct ib_pma_portsamplesresult {
+ __be16 tag;
+ __be16 sample_status; /* only lower 2 bits */
+ __be32 counter[15];
+};
+
+struct ib_pma_portsamplesresult_ext {
+ __be16 tag;
+ __be16 sample_status; /* only lower 2 bits */
+ __be32 extended_width; /* only upper 2 bits */
+ __be64 counter[15];
+};
+
+struct ib_pma_portcounters {
+ u8 reserved;
+ u8 port_select;
+ __be16 counter_select;
+ __be16 symbol_error_counter;
+ u8 link_error_recovery_counter;
+ u8 link_downed_counter;
+ __be16 port_rcv_errors;
+ __be16 port_rcv_remphys_errors;
+ __be16 port_rcv_switch_relay_errors;
+ __be16 port_xmit_discards;
+ u8 port_xmit_constraint_errors;
+ u8 port_rcv_constraint_errors;
+ u8 reserved1;
+ u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+ __be16 reserved2;
+ __be16 vl15_dropped;
+ __be32 port_xmit_data;
+ __be32 port_rcv_data;
+ __be32 port_xmit_packets;
+ __be32 port_rcv_packets;
+ __be32 port_xmit_wait;
+} __packed;
+
+
+#define IB_PMA_SEL_SYMBOL_ERROR cpu_to_be16(0x0001)
+#define IB_PMA_SEL_LINK_ERROR_RECOVERY cpu_to_be16(0x0002)
+#define IB_PMA_SEL_LINK_DOWNED cpu_to_be16(0x0004)
+#define IB_PMA_SEL_PORT_RCV_ERRORS cpu_to_be16(0x0008)
+#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS cpu_to_be16(0x0010)
+#define IB_PMA_SEL_PORT_XMIT_DISCARDS cpu_to_be16(0x0040)
+#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS cpu_to_be16(0x0200)
+#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS cpu_to_be16(0x0400)
+#define IB_PMA_SEL_PORT_VL15_DROPPED cpu_to_be16(0x0800)
+#define IB_PMA_SEL_PORT_XMIT_DATA cpu_to_be16(0x1000)
+#define IB_PMA_SEL_PORT_RCV_DATA cpu_to_be16(0x2000)
+#define IB_PMA_SEL_PORT_XMIT_PACKETS cpu_to_be16(0x4000)
+#define IB_PMA_SEL_PORT_RCV_PACKETS cpu_to_be16(0x8000)
+
+struct ib_pma_portcounters_ext {
+ u8 reserved;
+ u8 port_select;
+ __be16 counter_select;
+ __be32 reserved1;
+ __be64 port_xmit_data;
+ __be64 port_rcv_data;
+ __be64 port_xmit_packets;
+ __be64 port_rcv_packets;
+ __be64 port_unicast_xmit_packets;
+ __be64 port_unicast_rcv_packets;
+ __be64 port_multicast_xmit_packets;
+ __be64 port_multicast_rcv_packets;
+} __packed;
+
+#define IB_PMA_SELX_PORT_XMIT_DATA cpu_to_be16(0x0001)
+#define IB_PMA_SELX_PORT_RCV_DATA cpu_to_be16(0x0002)
+#define IB_PMA_SELX_PORT_XMIT_PACKETS cpu_to_be16(0x0004)
+#define IB_PMA_SELX_PORT_RCV_PACKETS cpu_to_be16(0x0008)
+#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS cpu_to_be16(0x0010)
+#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS cpu_to_be16(0x0020)
+#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS cpu_to_be16(0x0040)
+#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS cpu_to_be16(0x0080)
+
+#endif /* IB_PMA_H */
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
new file mode 100644
index 0000000000..b46353fc53
--- /dev/null
+++ b/include/rdma/ib_sa.h
@@ -0,0 +1,609 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ */
+
+#ifndef IB_SA_H
+#define IB_SA_H
+
+#include <linux/completion.h>
+#include <linux/compiler.h>
+
+#include <linux/atomic.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_addr.h>
+#include <rdma/opa_addr.h>
+
+enum {
+ IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */
+
+ IB_SA_METHOD_GET_TABLE = 0x12,
+ IB_SA_METHOD_GET_TABLE_RESP = 0x92,
+ IB_SA_METHOD_DELETE = 0x15,
+ IB_SA_METHOD_DELETE_RESP = 0x95,
+ IB_SA_METHOD_GET_MULTI = 0x14,
+ IB_SA_METHOD_GET_MULTI_RESP = 0x94,
+ IB_SA_METHOD_GET_TRACE_TBL = 0x13
+};
+
+#define OPA_SA_CLASS_VERSION 0x80
+enum {
+ IB_SA_ATTR_CLASS_PORTINFO = 0x01,
+ IB_SA_ATTR_NOTICE = 0x02,
+ IB_SA_ATTR_INFORM_INFO = 0x03,
+ IB_SA_ATTR_NODE_REC = 0x11,
+ IB_SA_ATTR_PORT_INFO_REC = 0x12,
+ IB_SA_ATTR_SL2VL_REC = 0x13,
+ IB_SA_ATTR_SWITCH_REC = 0x14,
+ IB_SA_ATTR_LINEAR_FDB_REC = 0x15,
+ IB_SA_ATTR_RANDOM_FDB_REC = 0x16,
+ IB_SA_ATTR_MCAST_FDB_REC = 0x17,
+ IB_SA_ATTR_SM_INFO_REC = 0x18,
+ IB_SA_ATTR_LINK_REC = 0x20,
+ IB_SA_ATTR_GUID_INFO_REC = 0x30,
+ IB_SA_ATTR_SERVICE_REC = 0x31,
+ IB_SA_ATTR_PARTITION_REC = 0x33,
+ IB_SA_ATTR_PATH_REC = 0x35,
+ IB_SA_ATTR_VL_ARB_REC = 0x36,
+ IB_SA_ATTR_MC_MEMBER_REC = 0x38,
+ IB_SA_ATTR_TRACE_REC = 0x39,
+ IB_SA_ATTR_MULTI_PATH_REC = 0x3a,
+ IB_SA_ATTR_SERVICE_ASSOC_REC = 0x3b,
+ IB_SA_ATTR_INFORM_INFO_REC = 0xf3
+};
+
+enum ib_sa_selector {
+ IB_SA_GT = 0,
+ IB_SA_LT = 1,
+ IB_SA_EQ = 2,
+ /*
+ * The meaning of "best" depends on the attribute: for
+ * example, for MTU best will return the largest available
+ * MTU, while for packet life time, best will return the
+ * smallest available life time.
+ */
+ IB_SA_BEST = 3
+};
+
+/*
+ * There are 4 types of join states:
+ * FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember.
+ * The order corresponds to JoinState bits in MCMemberRecord.
+ */
+enum ib_sa_mc_join_states {
+ FULLMEMBER_JOIN,
+ NONMEMBER_JOIN,
+ SENDONLY_NONMEBER_JOIN,
+ SENDONLY_FULLMEMBER_JOIN,
+ NUM_JOIN_MEMBERSHIP_TYPES,
+};
+
+#define IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT BIT(12)
+
+/*
+ * Structures for SA records are named "struct ib_sa_xxx_rec." No
+ * attempt is made to pack structures to match the physical layout of
+ * SA records in SA MADs; all packing and unpacking is handled by the
+ * SA query code.
+ *
+ * For a record with structure ib_sa_xxx_rec, the naming convention
+ * for the component mask value for field yyy is IB_SA_XXX_REC_YYY (we
+ * never use different abbreviations or otherwise change the spelling
+ * of xxx/yyy between ib_sa_xxx_rec.yyy and IB_SA_XXX_REC_YYY).
+ *
+ * Reserved rows are indicated with comments to help maintainability.
+ */
+
+#define IB_SA_PATH_REC_SERVICE_ID (IB_SA_COMP_MASK( 0) |\
+ IB_SA_COMP_MASK( 1))
+#define IB_SA_PATH_REC_DGID IB_SA_COMP_MASK( 2)
+#define IB_SA_PATH_REC_SGID IB_SA_COMP_MASK( 3)
+#define IB_SA_PATH_REC_DLID IB_SA_COMP_MASK( 4)
+#define IB_SA_PATH_REC_SLID IB_SA_COMP_MASK( 5)
+#define IB_SA_PATH_REC_RAW_TRAFFIC IB_SA_COMP_MASK( 6)
+/* reserved: 7 */
+#define IB_SA_PATH_REC_FLOW_LABEL IB_SA_COMP_MASK( 8)
+#define IB_SA_PATH_REC_HOP_LIMIT IB_SA_COMP_MASK( 9)
+#define IB_SA_PATH_REC_TRAFFIC_CLASS IB_SA_COMP_MASK(10)
+#define IB_SA_PATH_REC_REVERSIBLE IB_SA_COMP_MASK(11)
+#define IB_SA_PATH_REC_NUMB_PATH IB_SA_COMP_MASK(12)
+#define IB_SA_PATH_REC_PKEY IB_SA_COMP_MASK(13)
+#define IB_SA_PATH_REC_QOS_CLASS IB_SA_COMP_MASK(14)
+#define IB_SA_PATH_REC_SL IB_SA_COMP_MASK(15)
+#define IB_SA_PATH_REC_MTU_SELECTOR IB_SA_COMP_MASK(16)
+#define IB_SA_PATH_REC_MTU IB_SA_COMP_MASK(17)
+#define IB_SA_PATH_REC_RATE_SELECTOR IB_SA_COMP_MASK(18)
+#define IB_SA_PATH_REC_RATE IB_SA_COMP_MASK(19)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(20)
+#define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21)
+#define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22)
+
+enum sa_path_rec_type {
+ SA_PATH_REC_TYPE_IB,
+ SA_PATH_REC_TYPE_ROCE_V1,
+ SA_PATH_REC_TYPE_ROCE_V2,
+ SA_PATH_REC_TYPE_OPA
+};
+
+struct sa_path_rec_ib {
+ __be16 dlid;
+ __be16 slid;
+ u8 raw_traffic;
+};
+
+/**
+ * struct sa_path_rec_roce - RoCE specific portion of the path record entry
+ * @route_resolved: When set, it indicates that this route is already
+ * resolved for this path record entry.
+ * @dmac: Destination mac address for the given DGID entry
+ * of the path record entry.
+ */
+struct sa_path_rec_roce {
+ bool route_resolved;
+ u8 dmac[ETH_ALEN];
+};
+
+struct sa_path_rec_opa {
+ __be32 dlid;
+ __be32 slid;
+ u8 raw_traffic;
+ u8 l2_8B;
+ u8 l2_10B;
+ u8 l2_9B;
+ u8 l2_16B;
+ u8 qos_type;
+ u8 qos_priority;
+};
+
+struct sa_path_rec {
+ union ib_gid dgid;
+ union ib_gid sgid;
+ __be64 service_id;
+ /* reserved */
+ __be32 flow_label;
+ u8 hop_limit;
+ u8 traffic_class;
+ u8 reversible;
+ u8 numb_path;
+ __be16 pkey;
+ __be16 qos_class;
+ u8 sl;
+ u8 mtu_selector;
+ u8 mtu;
+ u8 rate_selector;
+ u8 rate;
+ u8 packet_life_time_selector;
+ u8 packet_life_time;
+ u8 preference;
+ union {
+ struct sa_path_rec_ib ib;
+ struct sa_path_rec_roce roce;
+ struct sa_path_rec_opa opa;
+ };
+ enum sa_path_rec_type rec_type;
+ u32 flags;
+};
+
+static inline enum ib_gid_type
+ sa_conv_pathrec_to_gid_type(struct sa_path_rec *rec)
+{
+ switch (rec->rec_type) {
+ case SA_PATH_REC_TYPE_ROCE_V1:
+ return IB_GID_TYPE_ROCE;
+ case SA_PATH_REC_TYPE_ROCE_V2:
+ return IB_GID_TYPE_ROCE_UDP_ENCAP;
+ default:
+ return IB_GID_TYPE_IB;
+ }
+}
+
+static inline enum sa_path_rec_type
+ sa_conv_gid_to_pathrec_type(enum ib_gid_type type)
+{
+ switch (type) {
+ case IB_GID_TYPE_ROCE:
+ return SA_PATH_REC_TYPE_ROCE_V1;
+ case IB_GID_TYPE_ROCE_UDP_ENCAP:
+ return SA_PATH_REC_TYPE_ROCE_V2;
+ default:
+ return SA_PATH_REC_TYPE_IB;
+ }
+}
+
+static inline void path_conv_opa_to_ib(struct sa_path_rec *ib,
+ struct sa_path_rec *opa)
+{
+ if ((be32_to_cpu(opa->opa.dlid) >=
+ be16_to_cpu(IB_MULTICAST_LID_BASE)) ||
+ (be32_to_cpu(opa->opa.slid) >=
+ be16_to_cpu(IB_MULTICAST_LID_BASE))) {
+ /* Create OPA GID and zero out the LID */
+ ib->dgid.global.interface_id
+ = OPA_MAKE_ID(be32_to_cpu(opa->opa.dlid));
+ ib->dgid.global.subnet_prefix
+ = opa->dgid.global.subnet_prefix;
+ ib->sgid.global.interface_id
+ = OPA_MAKE_ID(be32_to_cpu(opa->opa.slid));
+ ib->dgid.global.subnet_prefix
+ = opa->dgid.global.subnet_prefix;
+ ib->ib.dlid = 0;
+
+ ib->ib.slid = 0;
+ } else {
+ ib->ib.dlid = htons(ntohl(opa->opa.dlid));
+ ib->ib.slid = htons(ntohl(opa->opa.slid));
+ }
+ ib->service_id = opa->service_id;
+ ib->ib.raw_traffic = opa->opa.raw_traffic;
+}
+
+static inline void path_conv_ib_to_opa(struct sa_path_rec *opa,
+ struct sa_path_rec *ib)
+{
+ __be32 slid, dlid;
+
+ if ((ib_is_opa_gid(&ib->sgid)) ||
+ (ib_is_opa_gid(&ib->dgid))) {
+ slid = htonl(opa_get_lid_from_gid(&ib->sgid));
+ dlid = htonl(opa_get_lid_from_gid(&ib->dgid));
+ } else {
+ slid = htonl(ntohs(ib->ib.slid));
+ dlid = htonl(ntohs(ib->ib.dlid));
+ }
+ opa->opa.slid = slid;
+ opa->opa.dlid = dlid;
+ opa->service_id = ib->service_id;
+ opa->opa.raw_traffic = ib->ib.raw_traffic;
+}
+
+/* Convert from OPA to IB path record */
+static inline void sa_convert_path_opa_to_ib(struct sa_path_rec *dest,
+ struct sa_path_rec *src)
+{
+ if (src->rec_type != SA_PATH_REC_TYPE_OPA)
+ return;
+
+ *dest = *src;
+ dest->rec_type = SA_PATH_REC_TYPE_IB;
+ path_conv_opa_to_ib(dest, src);
+}
+
+/* Convert from IB to OPA path record */
+static inline void sa_convert_path_ib_to_opa(struct sa_path_rec *dest,
+ struct sa_path_rec *src)
+{
+ if (src->rec_type != SA_PATH_REC_TYPE_IB)
+ return;
+
+ /* Do a structure copy and overwrite the relevant fields */
+ *dest = *src;
+ dest->rec_type = SA_PATH_REC_TYPE_OPA;
+ path_conv_ib_to_opa(dest, src);
+}
+
+#define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0)
+#define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1)
+#define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2)
+#define IB_SA_MCMEMBER_REC_MLID IB_SA_COMP_MASK( 3)
+#define IB_SA_MCMEMBER_REC_MTU_SELECTOR IB_SA_COMP_MASK( 4)
+#define IB_SA_MCMEMBER_REC_MTU IB_SA_COMP_MASK( 5)
+#define IB_SA_MCMEMBER_REC_TRAFFIC_CLASS IB_SA_COMP_MASK( 6)
+#define IB_SA_MCMEMBER_REC_PKEY IB_SA_COMP_MASK( 7)
+#define IB_SA_MCMEMBER_REC_RATE_SELECTOR IB_SA_COMP_MASK( 8)
+#define IB_SA_MCMEMBER_REC_RATE IB_SA_COMP_MASK( 9)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR IB_SA_COMP_MASK(10)
+#define IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(11)
+#define IB_SA_MCMEMBER_REC_SL IB_SA_COMP_MASK(12)
+#define IB_SA_MCMEMBER_REC_FLOW_LABEL IB_SA_COMP_MASK(13)
+#define IB_SA_MCMEMBER_REC_HOP_LIMIT IB_SA_COMP_MASK(14)
+#define IB_SA_MCMEMBER_REC_SCOPE IB_SA_COMP_MASK(15)
+#define IB_SA_MCMEMBER_REC_JOIN_STATE IB_SA_COMP_MASK(16)
+#define IB_SA_MCMEMBER_REC_PROXY_JOIN IB_SA_COMP_MASK(17)
+
+struct ib_sa_mcmember_rec {
+ union ib_gid mgid;
+ union ib_gid port_gid;
+ __be32 qkey;
+ __be16 mlid;
+ u8 mtu_selector;
+ u8 mtu;
+ u8 traffic_class;
+ __be16 pkey;
+ u8 rate_selector;
+ u8 rate;
+ u8 packet_life_time_selector;
+ u8 packet_life_time;
+ u8 sl;
+ __be32 flow_label;
+ u8 hop_limit;
+ u8 scope;
+ u8 join_state;
+ u8 proxy_join;
+};
+
+/* Service Record Component Mask Sec 15.2.5.14 Ver 1.1 */
+#define IB_SA_SERVICE_REC_SERVICE_ID IB_SA_COMP_MASK( 0)
+#define IB_SA_SERVICE_REC_SERVICE_GID IB_SA_COMP_MASK( 1)
+#define IB_SA_SERVICE_REC_SERVICE_PKEY IB_SA_COMP_MASK( 2)
+/* reserved: 3 */
+#define IB_SA_SERVICE_REC_SERVICE_LEASE IB_SA_COMP_MASK( 4)
+#define IB_SA_SERVICE_REC_SERVICE_KEY IB_SA_COMP_MASK( 5)
+#define IB_SA_SERVICE_REC_SERVICE_NAME IB_SA_COMP_MASK( 6)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_0 IB_SA_COMP_MASK( 7)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_1 IB_SA_COMP_MASK( 8)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_2 IB_SA_COMP_MASK( 9)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_3 IB_SA_COMP_MASK(10)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_4 IB_SA_COMP_MASK(11)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_5 IB_SA_COMP_MASK(12)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_6 IB_SA_COMP_MASK(13)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_7 IB_SA_COMP_MASK(14)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_8 IB_SA_COMP_MASK(15)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_9 IB_SA_COMP_MASK(16)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_10 IB_SA_COMP_MASK(17)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_11 IB_SA_COMP_MASK(18)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_12 IB_SA_COMP_MASK(19)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_13 IB_SA_COMP_MASK(20)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_14 IB_SA_COMP_MASK(21)
+#define IB_SA_SERVICE_REC_SERVICE_DATA8_15 IB_SA_COMP_MASK(22)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_0 IB_SA_COMP_MASK(23)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_1 IB_SA_COMP_MASK(24)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_2 IB_SA_COMP_MASK(25)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_3 IB_SA_COMP_MASK(26)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_4 IB_SA_COMP_MASK(27)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_5 IB_SA_COMP_MASK(28)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_6 IB_SA_COMP_MASK(29)
+#define IB_SA_SERVICE_REC_SERVICE_DATA16_7 IB_SA_COMP_MASK(30)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_0 IB_SA_COMP_MASK(31)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_1 IB_SA_COMP_MASK(32)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_2 IB_SA_COMP_MASK(33)
+#define IB_SA_SERVICE_REC_SERVICE_DATA32_3 IB_SA_COMP_MASK(34)
+#define IB_SA_SERVICE_REC_SERVICE_DATA64_0 IB_SA_COMP_MASK(35)
+#define IB_SA_SERVICE_REC_SERVICE_DATA64_1 IB_SA_COMP_MASK(36)
+
+#define IB_DEFAULT_SERVICE_LEASE 0xFFFFFFFF
+
+#define IB_SA_GUIDINFO_REC_LID IB_SA_COMP_MASK(0)
+#define IB_SA_GUIDINFO_REC_BLOCK_NUM IB_SA_COMP_MASK(1)
+#define IB_SA_GUIDINFO_REC_RES1 IB_SA_COMP_MASK(2)
+#define IB_SA_GUIDINFO_REC_RES2 IB_SA_COMP_MASK(3)
+#define IB_SA_GUIDINFO_REC_GID0 IB_SA_COMP_MASK(4)
+#define IB_SA_GUIDINFO_REC_GID1 IB_SA_COMP_MASK(5)
+#define IB_SA_GUIDINFO_REC_GID2 IB_SA_COMP_MASK(6)
+#define IB_SA_GUIDINFO_REC_GID3 IB_SA_COMP_MASK(7)
+#define IB_SA_GUIDINFO_REC_GID4 IB_SA_COMP_MASK(8)
+#define IB_SA_GUIDINFO_REC_GID5 IB_SA_COMP_MASK(9)
+#define IB_SA_GUIDINFO_REC_GID6 IB_SA_COMP_MASK(10)
+#define IB_SA_GUIDINFO_REC_GID7 IB_SA_COMP_MASK(11)
+
+struct ib_sa_guidinfo_rec {
+ __be16 lid;
+ u8 block_num;
+ /* reserved */
+ u8 res1;
+ __be32 res2;
+ u8 guid_info_list[64];
+};
+
+struct ib_sa_client {
+ atomic_t users;
+ struct completion comp;
+};
+
+/**
+ * ib_sa_register_client - Register an SA client.
+ */
+void ib_sa_register_client(struct ib_sa_client *client);
+
+/**
+ * ib_sa_unregister_client - Deregister an SA client.
+ * @client: Client object to deregister.
+ */
+void ib_sa_unregister_client(struct ib_sa_client *client);
+
+struct ib_sa_query;
+
+void ib_sa_cancel_query(int id, struct ib_sa_query *query);
+
+int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device,
+ u32 port_num, struct sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask, unsigned long timeout_ms,
+ gfp_t gfp_mask,
+ void (*callback)(int status, struct sa_path_rec *resp,
+ unsigned int num_prs, void *context),
+ void *context, struct ib_sa_query **query);
+
+struct ib_sa_multicast {
+ struct ib_sa_mcmember_rec rec;
+ ib_sa_comp_mask comp_mask;
+ int (*callback)(int status,
+ struct ib_sa_multicast *multicast);
+ void *context;
+};
+
+/**
+ * ib_sa_join_multicast - Initiates a join request to the specified multicast
+ * group.
+ * @client: SA client
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ * group.
+ * @rec: SA multicast member record specifying group attributes.
+ * @comp_mask: Component mask indicating which group attributes of %rec are
+ * valid.
+ * @gfp_mask: GFP mask for memory allocations.
+ * @callback: User callback invoked once the join operation completes.
+ * @context: User specified context stored with the ib_sa_multicast structure.
+ *
+ * This call initiates a multicast join request with the SA for the specified
+ * multicast group. If the join operation is started successfully, it returns
+ * an ib_sa_multicast structure that is used to track the multicast operation.
+ * Users must free this structure by calling ib_free_multicast, even if the
+ * join operation later fails. (The callback status is non-zero.)
+ *
+ * If the join operation fails; status will be non-zero, with the following
+ * failures possible:
+ * -ETIMEDOUT: The request timed out.
+ * -EIO: An error occurred sending the query.
+ * -EINVAL: The MCMemberRecord values differed from the existing group's.
+ * -ENETRESET: Indicates that an fatal error has occurred on the multicast
+ * group, and the user must rejoin the group to continue using it.
+ */
+struct ib_sa_multicast *ib_sa_join_multicast(struct ib_sa_client *client,
+ struct ib_device *device,
+ u32 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+ int (*callback)(int status,
+ struct ib_sa_multicast
+ *multicast),
+ void *context);
+
+/**
+ * ib_free_multicast - Frees the multicast tracking structure, and releases
+ * any reference on the multicast group.
+ * @multicast: Multicast tracking structure allocated by ib_join_multicast.
+ *
+ * This call blocks until the multicast identifier is destroyed. It may
+ * not be called from within the multicast callback; however, returning a non-
+ * zero value from the callback will result in destroying the multicast
+ * tracking structure.
+ */
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast);
+
+/**
+ * ib_get_mcmember_rec - Looks up a multicast member record by its MGID and
+ * returns it if found.
+ * @device: Device associated with the multicast group.
+ * @port_num: Port on the specified device to associate with the multicast
+ * group.
+ * @mgid: MGID of multicast group.
+ * @rec: Location to copy SA multicast member record.
+ */
+int ib_sa_get_mcmember_rec(struct ib_device *device, u32 port_num,
+ union ib_gid *mgid, struct ib_sa_mcmember_rec *rec);
+
+/**
+ * ib_init_ah_from_mcmember - Initialize address handle attributes based on
+ * an SA multicast member record.
+ */
+int ib_init_ah_from_mcmember(struct ib_device *device, u32 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ struct net_device *ndev,
+ enum ib_gid_type gid_type,
+ struct rdma_ah_attr *ah_attr);
+
+int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num,
+ struct sa_path_rec *rec,
+ struct rdma_ah_attr *ah_attr,
+ const struct ib_gid_attr *sgid_attr);
+
+/**
+ * ib_sa_pack_path - Conert a path record from struct ib_sa_path_rec
+ * to IB MAD wire format.
+ */
+void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute);
+
+/**
+ * ib_sa_unpack_path - Convert a path record from MAD format to struct
+ * ib_sa_path_rec.
+ */
+void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec);
+
+/* Support GuidInfoRecord */
+int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u32 port_num,
+ struct ib_sa_guidinfo_rec *rec,
+ ib_sa_comp_mask comp_mask, u8 method,
+ unsigned long timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_guidinfo_rec *resp,
+ void *context),
+ void *context, struct ib_sa_query **sa_query);
+
+static inline bool sa_path_is_roce(struct sa_path_rec *rec)
+{
+ return ((rec->rec_type == SA_PATH_REC_TYPE_ROCE_V1) ||
+ (rec->rec_type == SA_PATH_REC_TYPE_ROCE_V2));
+}
+
+static inline bool sa_path_is_opa(struct sa_path_rec *rec)
+{
+ return (rec->rec_type == SA_PATH_REC_TYPE_OPA);
+}
+
+static inline void sa_path_set_slid(struct sa_path_rec *rec, u32 slid)
+{
+ if (rec->rec_type == SA_PATH_REC_TYPE_IB)
+ rec->ib.slid = cpu_to_be16(slid);
+ else if (rec->rec_type == SA_PATH_REC_TYPE_OPA)
+ rec->opa.slid = cpu_to_be32(slid);
+}
+
+static inline void sa_path_set_dlid(struct sa_path_rec *rec, u32 dlid)
+{
+ if (rec->rec_type == SA_PATH_REC_TYPE_IB)
+ rec->ib.dlid = cpu_to_be16(dlid);
+ else if (rec->rec_type == SA_PATH_REC_TYPE_OPA)
+ rec->opa.dlid = cpu_to_be32(dlid);
+}
+
+static inline void sa_path_set_raw_traffic(struct sa_path_rec *rec,
+ u8 raw_traffic)
+{
+ if (rec->rec_type == SA_PATH_REC_TYPE_IB)
+ rec->ib.raw_traffic = raw_traffic;
+ else if (rec->rec_type == SA_PATH_REC_TYPE_OPA)
+ rec->opa.raw_traffic = raw_traffic;
+}
+
+static inline __be32 sa_path_get_slid(struct sa_path_rec *rec)
+{
+ if (rec->rec_type == SA_PATH_REC_TYPE_IB)
+ return htonl(ntohs(rec->ib.slid));
+ else if (rec->rec_type == SA_PATH_REC_TYPE_OPA)
+ return rec->opa.slid;
+ return 0;
+}
+
+static inline __be32 sa_path_get_dlid(struct sa_path_rec *rec)
+{
+ if (rec->rec_type == SA_PATH_REC_TYPE_IB)
+ return htonl(ntohs(rec->ib.dlid));
+ else if (rec->rec_type == SA_PATH_REC_TYPE_OPA)
+ return rec->opa.dlid;
+ return 0;
+}
+
+static inline u8 sa_path_get_raw_traffic(struct sa_path_rec *rec)
+{
+ if (rec->rec_type == SA_PATH_REC_TYPE_IB)
+ return rec->ib.raw_traffic;
+ else if (rec->rec_type == SA_PATH_REC_TYPE_OPA)
+ return rec->opa.raw_traffic;
+ return 0;
+}
+
+static inline void sa_path_set_dmac(struct sa_path_rec *rec, u8 *dmac)
+{
+ if (sa_path_is_roce(rec))
+ memcpy(rec->roce.dmac, dmac, ETH_ALEN);
+}
+
+static inline void sa_path_set_dmac_zero(struct sa_path_rec *rec)
+{
+ if (sa_path_is_roce(rec))
+ eth_zero_addr(rec->roce.dmac);
+}
+
+static inline u8 *sa_path_get_dmac(struct sa_path_rec *rec)
+{
+ if (sa_path_is_roce(rec))
+ return rec->roce.dmac;
+ return NULL;
+}
+#endif /* IB_SA_H */
diff --git a/include/rdma/ib_smi.h b/include/rdma/ib_smi.h
new file mode 100644
index 0000000000..fc16b826b2
--- /dev/null
+++ b/include/rdma/ib_smi.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ */
+
+#ifndef IB_SMI_H
+#define IB_SMI_H
+
+#include <rdma/ib_mad.h>
+
+#define IB_SMP_DATA_SIZE 64
+#define IB_SMP_MAX_PATH_HOPS 64
+
+struct ib_smp {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ __be16 status;
+ u8 hop_ptr;
+ u8 hop_cnt;
+ __be64 tid;
+ __be16 attr_id;
+ __be16 resv;
+ __be32 attr_mod;
+ __be64 mkey;
+ __be16 dr_slid;
+ __be16 dr_dlid;
+ u8 reserved[28];
+ u8 data[IB_SMP_DATA_SIZE];
+ u8 initial_path[IB_SMP_MAX_PATH_HOPS];
+ u8 return_path[IB_SMP_MAX_PATH_HOPS];
+} __packed;
+
+#define IB_SMP_DIRECTION cpu_to_be16(0x8000)
+
+/* Subnet management attributes */
+#define IB_SMP_ATTR_NOTICE cpu_to_be16(0x0002)
+#define IB_SMP_ATTR_NODE_DESC cpu_to_be16(0x0010)
+#define IB_SMP_ATTR_NODE_INFO cpu_to_be16(0x0011)
+#define IB_SMP_ATTR_SWITCH_INFO cpu_to_be16(0x0012)
+#define IB_SMP_ATTR_GUID_INFO cpu_to_be16(0x0014)
+#define IB_SMP_ATTR_PORT_INFO cpu_to_be16(0x0015)
+#define IB_SMP_ATTR_PKEY_TABLE cpu_to_be16(0x0016)
+#define IB_SMP_ATTR_SL_TO_VL_TABLE cpu_to_be16(0x0017)
+#define IB_SMP_ATTR_VL_ARB_TABLE cpu_to_be16(0x0018)
+#define IB_SMP_ATTR_LINEAR_FORWARD_TABLE cpu_to_be16(0x0019)
+#define IB_SMP_ATTR_RANDOM_FORWARD_TABLE cpu_to_be16(0x001A)
+#define IB_SMP_ATTR_MCAST_FORWARD_TABLE cpu_to_be16(0x001B)
+#define IB_SMP_ATTR_SM_INFO cpu_to_be16(0x0020)
+#define IB_SMP_ATTR_VENDOR_DIAG cpu_to_be16(0x0030)
+#define IB_SMP_ATTR_LED_INFO cpu_to_be16(0x0031)
+#define IB_SMP_ATTR_VENDOR_MASK cpu_to_be16(0xFF00)
+
+struct ib_port_info {
+ __be64 mkey;
+ __be64 gid_prefix;
+ __be16 lid;
+ __be16 sm_lid;
+ __be32 cap_mask;
+ __be16 diag_code;
+ __be16 mkey_lease_period;
+ u8 local_port_num;
+ u8 link_width_enabled;
+ u8 link_width_supported;
+ u8 link_width_active;
+ u8 linkspeed_portstate; /* 4 bits, 4 bits */
+ u8 portphysstate_linkdown; /* 4 bits, 4 bits */
+ u8 mkeyprot_resv_lmc; /* 2 bits, 3, 3 */
+ u8 linkspeedactive_enabled; /* 4 bits, 4 bits */
+ u8 neighbormtu_mastersmsl; /* 4 bits, 4 bits */
+ u8 vlcap_inittype; /* 4 bits, 4 bits */
+ u8 vl_high_limit;
+ u8 vl_arb_high_cap;
+ u8 vl_arb_low_cap;
+ u8 inittypereply_mtucap; /* 4 bits, 4 bits */
+ u8 vlstallcnt_hoqlife; /* 3 bits, 5 bits */
+ u8 operationalvl_pei_peo_fpi_fpo; /* 4 bits, 1, 1, 1, 1 */
+ __be16 mkey_violations;
+ __be16 pkey_violations;
+ __be16 qkey_violations;
+ u8 guid_cap;
+ u8 clientrereg_resv_subnetto; /* 1 bit, 2 bits, 5 */
+ u8 resv_resptimevalue; /* 3 bits, 5 bits */
+ u8 localphyerrors_overrunerrors; /* 4 bits, 4 bits */
+ __be16 max_credit_hint;
+ u8 resv;
+ u8 link_roundtrip_latency[3];
+};
+
+struct ib_node_info {
+ u8 base_version;
+ u8 class_version;
+ u8 node_type;
+ u8 num_ports;
+ __be64 sys_guid;
+ __be64 node_guid;
+ __be64 port_guid;
+ __be16 partition_cap;
+ __be16 device_id;
+ __be32 revision;
+ u8 local_port_num;
+ u8 vendor_id[3];
+} __packed;
+
+struct ib_vl_weight_elem {
+ u8 vl; /* IB: VL is low 4 bits, upper 4 bits reserved */
+ /* OPA: VL is low 5 bits, upper 3 bits reserved */
+ u8 weight;
+};
+
+static inline u8
+ib_get_smp_direction(struct ib_smp *smp)
+{
+ return ((smp->status & IB_SMP_DIRECTION) == IB_SMP_DIRECTION);
+}
+
+/*
+ * SM Trap/Notice numbers
+ */
+#define IB_NOTICE_TRAP_LLI_THRESH cpu_to_be16(129)
+#define IB_NOTICE_TRAP_EBO_THRESH cpu_to_be16(130)
+#define IB_NOTICE_TRAP_FLOW_UPDATE cpu_to_be16(131)
+#define IB_NOTICE_TRAP_CAP_MASK_CHG cpu_to_be16(144)
+#define IB_NOTICE_TRAP_SYS_GUID_CHG cpu_to_be16(145)
+#define IB_NOTICE_TRAP_BAD_MKEY cpu_to_be16(256)
+#define IB_NOTICE_TRAP_BAD_PKEY cpu_to_be16(257)
+#define IB_NOTICE_TRAP_BAD_QKEY cpu_to_be16(258)
+
+/*
+ * Other local changes flags (trap 144).
+ */
+#define IB_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */
+#define IB_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */
+#define IB_NOTICE_TRAP_NODE_DESC_CHG 0x01
+
+/*
+ * M_Key volation flags in dr_trunc_hop (trap 256).
+ */
+#define IB_NOTICE_TRAP_DR_NOTICE 0x80
+#define IB_NOTICE_TRAP_DR_TRUNC 0x40
+
+/**
+ * ib_init_query_mad - Initialize query MAD.
+ * @mad: MAD to initialize.
+ */
+static inline void ib_init_query_mad(struct ib_smp *mad)
+{
+ mad->base_version = IB_MGMT_BASE_VERSION;
+ mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ mad->class_version = 1;
+ mad->method = IB_MGMT_METHOD_GET;
+}
+#endif /* IB_SMI_H */
diff --git a/include/rdma/ib_sysfs.h b/include/rdma/ib_sysfs.h
new file mode 100644
index 0000000000..3b77cfd74d
--- /dev/null
+++ b/include/rdma/ib_sysfs.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (c) 2021 Mellanox Technologies Ltd. All rights reserved.
+ */
+#ifndef DEF_RDMA_IB_SYSFS_H
+#define DEF_RDMA_IB_SYSFS_H
+
+#include <linux/sysfs.h>
+
+struct ib_device;
+
+struct ib_port_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *attr, char *buf);
+ ssize_t (*store)(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attribute *attr, const char *buf,
+ size_t count);
+};
+
+#define IB_PORT_ATTR_RW(_name) \
+ struct ib_port_attribute ib_port_attr_##_name = __ATTR_RW(_name)
+
+#define IB_PORT_ATTR_ADMIN_RW(_name) \
+ struct ib_port_attribute ib_port_attr_##_name = \
+ __ATTR_RW_MODE(_name, 0600)
+
+#define IB_PORT_ATTR_RO(_name) \
+ struct ib_port_attribute ib_port_attr_##_name = __ATTR_RO(_name)
+
+#define IB_PORT_ATTR_WO(_name) \
+ struct ib_port_attribute ib_port_attr_##_name = __ATTR_WO(_name)
+
+struct ib_device *ib_port_sysfs_get_ibdev_kobj(struct kobject *kobj,
+ u32 *port_num);
+
+#endif
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
new file mode 100644
index 0000000000..565a850445
--- /dev/null
+++ b/include/rdma/ib_umem.h
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2020 Intel Corporation. All rights reserved.
+ */
+
+#ifndef IB_UMEM_H
+#define IB_UMEM_H
+
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
+
+struct ib_ucontext;
+struct ib_umem_odp;
+struct dma_buf_attach_ops;
+
+struct ib_umem {
+ struct ib_device *ibdev;
+ struct mm_struct *owning_mm;
+ u64 iova;
+ size_t length;
+ unsigned long address;
+ u32 writable : 1;
+ u32 is_odp : 1;
+ u32 is_dmabuf : 1;
+ struct sg_append_table sgt_append;
+};
+
+struct ib_umem_dmabuf {
+ struct ib_umem umem;
+ struct dma_buf_attachment *attach;
+ struct sg_table *sgt;
+ struct scatterlist *first_sg;
+ struct scatterlist *last_sg;
+ unsigned long first_sg_offset;
+ unsigned long last_sg_trim;
+ void *private;
+ u8 pinned : 1;
+};
+
+static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem)
+{
+ return container_of(umem, struct ib_umem_dmabuf, umem);
+}
+
+/* Returns the offset of the umem start relative to the first page. */
+static inline int ib_umem_offset(struct ib_umem *umem)
+{
+ return umem->address & ~PAGE_MASK;
+}
+
+static inline unsigned long ib_umem_dma_offset(struct ib_umem *umem,
+ unsigned long pgsz)
+{
+ return (sg_dma_address(umem->sgt_append.sgt.sgl) + ib_umem_offset(umem)) &
+ (pgsz - 1);
+}
+
+static inline size_t ib_umem_num_dma_blocks(struct ib_umem *umem,
+ unsigned long pgsz)
+{
+ return (size_t)((ALIGN(umem->iova + umem->length, pgsz) -
+ ALIGN_DOWN(umem->iova, pgsz))) /
+ pgsz;
+}
+
+static inline size_t ib_umem_num_pages(struct ib_umem *umem)
+{
+ return ib_umem_num_dma_blocks(umem, PAGE_SIZE);
+}
+
+static inline void __rdma_umem_block_iter_start(struct ib_block_iter *biter,
+ struct ib_umem *umem,
+ unsigned long pgsz)
+{
+ __rdma_block_iter_start(biter, umem->sgt_append.sgt.sgl,
+ umem->sgt_append.sgt.nents, pgsz);
+ biter->__sg_advance = ib_umem_offset(umem) & ~(pgsz - 1);
+ biter->__sg_numblocks = ib_umem_num_dma_blocks(umem, pgsz);
+}
+
+static inline bool __rdma_umem_block_iter_next(struct ib_block_iter *biter)
+{
+ return __rdma_block_iter_next(biter) && biter->__sg_numblocks--;
+}
+
+/**
+ * rdma_umem_for_each_dma_block - iterate over contiguous DMA blocks of the umem
+ * @umem: umem to iterate over
+ * @pgsz: Page size to split the list into
+ *
+ * pgsz must be <= PAGE_SIZE or computed by ib_umem_find_best_pgsz(). The
+ * returned DMA blocks will be aligned to pgsz and span the range:
+ * ALIGN_DOWN(umem->address, pgsz) to ALIGN(umem->address + umem->length, pgsz)
+ *
+ * Performs exactly ib_umem_num_dma_blocks() iterations.
+ */
+#define rdma_umem_for_each_dma_block(umem, biter, pgsz) \
+ for (__rdma_umem_block_iter_start(biter, umem, pgsz); \
+ __rdma_umem_block_iter_next(biter);)
+
+#ifdef CONFIG_INFINIBAND_USER_MEM
+
+struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
+ size_t size, int access);
+void ib_umem_release(struct ib_umem *umem);
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+ size_t length);
+unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
+ unsigned long pgsz_bitmap,
+ unsigned long virt);
+
+/**
+ * ib_umem_find_best_pgoff - Find best HW page size
+ *
+ * @umem: umem struct
+ * @pgsz_bitmap bitmap of HW supported page sizes
+ * @pgoff_bitmask: Mask of bits that can be represented with an offset
+ *
+ * This is very similar to ib_umem_find_best_pgsz() except instead of accepting
+ * an IOVA it accepts a bitmask specifying what address bits can be represented
+ * with a page offset.
+ *
+ * For instance if the HW has multiple page sizes, requires 64 byte alignemnt,
+ * and can support aligned offsets up to 4032 then pgoff_bitmask would be
+ * "111111000000".
+ *
+ * If the pgoff_bitmask requires either alignment in the low bit or an
+ * unavailable page size for the high bits, this function returns 0.
+ */
+static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
+ unsigned long pgsz_bitmap,
+ u64 pgoff_bitmask)
+{
+ struct scatterlist *sg = umem->sgt_append.sgt.sgl;
+ dma_addr_t dma_addr;
+
+ dma_addr = sg_dma_address(sg) + (umem->address & ~PAGE_MASK);
+ return ib_umem_find_best_pgsz(umem, pgsz_bitmap,
+ dma_addr & pgoff_bitmask);
+}
+
+struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device,
+ unsigned long offset, size_t size,
+ int fd, int access,
+ const struct dma_buf_attach_ops *ops);
+struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device,
+ unsigned long offset,
+ size_t size, int fd,
+ int access);
+int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf);
+void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf);
+void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf);
+
+#else /* CONFIG_INFINIBAND_USER_MEM */
+
+#include <linux/err.h>
+
+static inline struct ib_umem *ib_umem_get(struct ib_device *device,
+ unsigned long addr, size_t size,
+ int access)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+static inline void ib_umem_release(struct ib_umem *umem) { }
+static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+ size_t length) {
+ return -EOPNOTSUPP;
+}
+static inline unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
+ unsigned long pgsz_bitmap,
+ unsigned long virt)
+{
+ return 0;
+}
+static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem,
+ unsigned long pgsz_bitmap,
+ u64 pgoff_bitmask)
+{
+ return 0;
+}
+static inline
+struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device,
+ unsigned long offset,
+ size_t size, int fd,
+ int access,
+ struct dma_buf_attach_ops *ops)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+static inline struct ib_umem_dmabuf *
+ib_umem_dmabuf_get_pinned(struct ib_device *device, unsigned long offset,
+ size_t size, int fd, int access)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf)
+{
+ return -EOPNOTSUPP;
+}
+static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { }
+static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) { }
+
+#endif /* CONFIG_INFINIBAND_USER_MEM */
+#endif /* IB_UMEM_H */
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
new file mode 100644
index 0000000000..0844c1d05a
--- /dev/null
+++ b/include/rdma/ib_umem_odp.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef IB_UMEM_ODP_H
+#define IB_UMEM_ODP_H
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_verbs.h>
+
+struct ib_umem_odp {
+ struct ib_umem umem;
+ struct mmu_interval_notifier notifier;
+ struct pid *tgid;
+
+ /* An array of the pfns included in the on-demand paging umem. */
+ unsigned long *pfn_list;
+
+ /*
+ * An array with DMA addresses mapped for pfns in pfn_list.
+ * The lower two bits designate access permissions.
+ * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT.
+ */
+ dma_addr_t *dma_list;
+ /*
+ * The umem_mutex protects the page_list and dma_list fields of an ODP
+ * umem, allowing only a single thread to map/unmap pages. The mutex
+ * also protects access to the mmu notifier counters.
+ */
+ struct mutex umem_mutex;
+ void *private; /* for the HW driver to use. */
+
+ int npages;
+
+ /*
+ * An implicit odp umem cannot be DMA mapped, has 0 length, and serves
+ * only as an anchor for the driver to hold onto the per_mm. FIXME:
+ * This should be removed and drivers should work with the per_mm
+ * directly.
+ */
+ bool is_implicit_odp;
+
+ unsigned int page_shift;
+};
+
+static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
+{
+ return container_of(umem, struct ib_umem_odp, umem);
+}
+
+/* Returns the first page of an ODP umem. */
+static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp)
+{
+ return umem_odp->notifier.interval_tree.start;
+}
+
+/* Returns the address of the page after the last one of an ODP umem. */
+static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp)
+{
+ return umem_odp->notifier.interval_tree.last + 1;
+}
+
+static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
+{
+ return (ib_umem_end(umem_odp) - ib_umem_start(umem_odp)) >>
+ umem_odp->page_shift;
+}
+
+/*
+ * The lower 2 bits of the DMA address signal the R/W permissions for
+ * the entry. To upgrade the permissions, provide the appropriate
+ * bitmask to the map_dma_pages function.
+ *
+ * Be aware that upgrading a mapped address might result in change of
+ * the DMA address for the page.
+ */
+#define ODP_READ_ALLOWED_BIT (1<<0ULL)
+#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
+
+#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+
+struct ib_umem_odp *
+ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size,
+ int access, const struct mmu_interval_notifier_ops *ops);
+struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
+ int access);
+struct ib_umem_odp *
+ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, unsigned long addr,
+ size_t size,
+ const struct mmu_interval_notifier_ops *ops);
+void ib_umem_odp_release(struct ib_umem_odp *umem_odp);
+
+int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 start_offset,
+ u64 bcnt, u64 access_mask, bool fault);
+
+void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
+ u64 bound);
+
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
+static inline struct ib_umem_odp *
+ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size,
+ int access, const struct mmu_interval_notifier_ops *ops)
+{
+ return ERR_PTR(-EINVAL);
+}
+
+static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {}
+
+#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
+#endif /* IB_UMEM_ODP_H */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
new file mode 100644
index 0000000000..62f9d126a7
--- /dev/null
+++ b/include/rdma/ib_verbs.h
@@ -0,0 +1,4836 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2020 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ */
+
+#ifndef IB_VERBS_H
+#define IB_VERBS_H
+
+#include <linux/ethtool.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+#include <linux/workqueue.h>
+#include <linux/irq_poll.h>
+#include <uapi/linux/if_ether.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/refcount.h>
+#include <linux/if_link.h>
+#include <linux/atomic.h>
+#include <linux/mmu_notifier.h>
+#include <linux/uaccess.h>
+#include <linux/cgroup_rdma.h>
+#include <linux/irqflags.h>
+#include <linux/preempt.h>
+#include <linux/dim.h>
+#include <uapi/rdma/ib_user_verbs.h>
+#include <rdma/rdma_counter.h>
+#include <rdma/restrack.h>
+#include <rdma/signature.h>
+#include <uapi/rdma/rdma_user_ioctl.h>
+#include <uapi/rdma/ib_user_ioctl_verbs.h>
+
+#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN
+
+struct ib_umem_odp;
+struct ib_uqp_object;
+struct ib_usrq_object;
+struct ib_uwq_object;
+struct rdma_cm_id;
+struct ib_port;
+struct hw_stats_device_data;
+
+extern struct workqueue_struct *ib_wq;
+extern struct workqueue_struct *ib_comp_wq;
+extern struct workqueue_struct *ib_comp_unbound_wq;
+
+struct ib_ucq_object;
+
+__printf(3, 4) __cold
+void ibdev_printk(const char *level, const struct ib_device *ibdev,
+ const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_alert(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_crit(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_err(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_warn(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_notice(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_info(const struct ib_device *ibdev, const char *format, ...);
+
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
+#define ibdev_dbg(__dev, format, args...) \
+ dynamic_ibdev_dbg(__dev, format, ##args)
+#else
+__printf(2, 3) __cold
+static inline
+void ibdev_dbg(const struct ib_device *ibdev, const char *format, ...) {}
+#endif
+
+#define ibdev_level_ratelimited(ibdev_level, ibdev, fmt, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ ibdev_level(ibdev, fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define ibdev_emerg_ratelimited(ibdev, fmt, ...) \
+ ibdev_level_ratelimited(ibdev_emerg, ibdev, fmt, ##__VA_ARGS__)
+#define ibdev_alert_ratelimited(ibdev, fmt, ...) \
+ ibdev_level_ratelimited(ibdev_alert, ibdev, fmt, ##__VA_ARGS__)
+#define ibdev_crit_ratelimited(ibdev, fmt, ...) \
+ ibdev_level_ratelimited(ibdev_crit, ibdev, fmt, ##__VA_ARGS__)
+#define ibdev_err_ratelimited(ibdev, fmt, ...) \
+ ibdev_level_ratelimited(ibdev_err, ibdev, fmt, ##__VA_ARGS__)
+#define ibdev_warn_ratelimited(ibdev, fmt, ...) \
+ ibdev_level_ratelimited(ibdev_warn, ibdev, fmt, ##__VA_ARGS__)
+#define ibdev_notice_ratelimited(ibdev, fmt, ...) \
+ ibdev_level_ratelimited(ibdev_notice, ibdev, fmt, ##__VA_ARGS__)
+#define ibdev_info_ratelimited(ibdev, fmt, ...) \
+ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__)
+
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
+/* descriptor check is first to prevent flooding with "callbacks suppressed" */
+#define ibdev_dbg_ratelimited(ibdev, fmt, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
+ if (DYNAMIC_DEBUG_BRANCH(descriptor) && __ratelimit(&_rs)) \
+ __dynamic_ibdev_dbg(&descriptor, ibdev, fmt, \
+ ##__VA_ARGS__); \
+} while (0)
+#else
+__printf(2, 3) __cold
+static inline
+void ibdev_dbg_ratelimited(const struct ib_device *ibdev, const char *format, ...) {}
+#endif
+
+union ib_gid {
+ u8 raw[16];
+ struct {
+ __be64 subnet_prefix;
+ __be64 interface_id;
+ } global;
+};
+
+extern union ib_gid zgid;
+
+enum ib_gid_type {
+ IB_GID_TYPE_IB = IB_UVERBS_GID_TYPE_IB,
+ IB_GID_TYPE_ROCE = IB_UVERBS_GID_TYPE_ROCE_V1,
+ IB_GID_TYPE_ROCE_UDP_ENCAP = IB_UVERBS_GID_TYPE_ROCE_V2,
+ IB_GID_TYPE_SIZE
+};
+
+#define ROCE_V2_UDP_DPORT 4791
+struct ib_gid_attr {
+ struct net_device __rcu *ndev;
+ struct ib_device *device;
+ union ib_gid gid;
+ enum ib_gid_type gid_type;
+ u16 index;
+ u32 port_num;
+};
+
+enum {
+ /* set the local administered indication */
+ IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2,
+};
+
+enum rdma_transport_type {
+ RDMA_TRANSPORT_IB,
+ RDMA_TRANSPORT_IWARP,
+ RDMA_TRANSPORT_USNIC,
+ RDMA_TRANSPORT_USNIC_UDP,
+ RDMA_TRANSPORT_UNSPECIFIED,
+};
+
+enum rdma_protocol_type {
+ RDMA_PROTOCOL_IB,
+ RDMA_PROTOCOL_IBOE,
+ RDMA_PROTOCOL_IWARP,
+ RDMA_PROTOCOL_USNIC_UDP
+};
+
+__attribute_const__ enum rdma_transport_type
+rdma_node_get_transport(unsigned int node_type);
+
+enum rdma_network_type {
+ RDMA_NETWORK_IB,
+ RDMA_NETWORK_ROCE_V1,
+ RDMA_NETWORK_IPV4,
+ RDMA_NETWORK_IPV6
+};
+
+static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type)
+{
+ if (network_type == RDMA_NETWORK_IPV4 ||
+ network_type == RDMA_NETWORK_IPV6)
+ return IB_GID_TYPE_ROCE_UDP_ENCAP;
+ else if (network_type == RDMA_NETWORK_ROCE_V1)
+ return IB_GID_TYPE_ROCE;
+ else
+ return IB_GID_TYPE_IB;
+}
+
+static inline enum rdma_network_type
+rdma_gid_attr_network_type(const struct ib_gid_attr *attr)
+{
+ if (attr->gid_type == IB_GID_TYPE_IB)
+ return RDMA_NETWORK_IB;
+
+ if (attr->gid_type == IB_GID_TYPE_ROCE)
+ return RDMA_NETWORK_ROCE_V1;
+
+ if (ipv6_addr_v4mapped((struct in6_addr *)&attr->gid))
+ return RDMA_NETWORK_IPV4;
+ else
+ return RDMA_NETWORK_IPV6;
+}
+
+enum rdma_link_layer {
+ IB_LINK_LAYER_UNSPECIFIED,
+ IB_LINK_LAYER_INFINIBAND,
+ IB_LINK_LAYER_ETHERNET,
+};
+
+enum ib_device_cap_flags {
+ IB_DEVICE_RESIZE_MAX_WR = IB_UVERBS_DEVICE_RESIZE_MAX_WR,
+ IB_DEVICE_BAD_PKEY_CNTR = IB_UVERBS_DEVICE_BAD_PKEY_CNTR,
+ IB_DEVICE_BAD_QKEY_CNTR = IB_UVERBS_DEVICE_BAD_QKEY_CNTR,
+ IB_DEVICE_RAW_MULTI = IB_UVERBS_DEVICE_RAW_MULTI,
+ IB_DEVICE_AUTO_PATH_MIG = IB_UVERBS_DEVICE_AUTO_PATH_MIG,
+ IB_DEVICE_CHANGE_PHY_PORT = IB_UVERBS_DEVICE_CHANGE_PHY_PORT,
+ IB_DEVICE_UD_AV_PORT_ENFORCE = IB_UVERBS_DEVICE_UD_AV_PORT_ENFORCE,
+ IB_DEVICE_CURR_QP_STATE_MOD = IB_UVERBS_DEVICE_CURR_QP_STATE_MOD,
+ IB_DEVICE_SHUTDOWN_PORT = IB_UVERBS_DEVICE_SHUTDOWN_PORT,
+ /* IB_DEVICE_INIT_TYPE = IB_UVERBS_DEVICE_INIT_TYPE, (not in use) */
+ IB_DEVICE_PORT_ACTIVE_EVENT = IB_UVERBS_DEVICE_PORT_ACTIVE_EVENT,
+ IB_DEVICE_SYS_IMAGE_GUID = IB_UVERBS_DEVICE_SYS_IMAGE_GUID,
+ IB_DEVICE_RC_RNR_NAK_GEN = IB_UVERBS_DEVICE_RC_RNR_NAK_GEN,
+ IB_DEVICE_SRQ_RESIZE = IB_UVERBS_DEVICE_SRQ_RESIZE,
+ IB_DEVICE_N_NOTIFY_CQ = IB_UVERBS_DEVICE_N_NOTIFY_CQ,
+
+ /* Reserved, old SEND_W_INV = 1 << 16,*/
+ IB_DEVICE_MEM_WINDOW = IB_UVERBS_DEVICE_MEM_WINDOW,
+ /*
+ * Devices should set IB_DEVICE_UD_IP_SUM if they support
+ * insertion of UDP and TCP checksum on outgoing UD IPoIB
+ * messages and can verify the validity of checksum for
+ * incoming messages. Setting this flag implies that the
+ * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode.
+ */
+ IB_DEVICE_UD_IP_CSUM = IB_UVERBS_DEVICE_UD_IP_CSUM,
+ IB_DEVICE_XRC = IB_UVERBS_DEVICE_XRC,
+
+ /*
+ * This device supports the IB "base memory management extension",
+ * which includes support for fast registrations (IB_WR_REG_MR,
+ * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should
+ * also be set by any iWarp device which must support FRs to comply
+ * to the iWarp verbs spec. iWarp devices also support the
+ * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the
+ * stag.
+ */
+ IB_DEVICE_MEM_MGT_EXTENSIONS = IB_UVERBS_DEVICE_MEM_MGT_EXTENSIONS,
+ IB_DEVICE_MEM_WINDOW_TYPE_2A = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2A,
+ IB_DEVICE_MEM_WINDOW_TYPE_2B = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2B,
+ IB_DEVICE_RC_IP_CSUM = IB_UVERBS_DEVICE_RC_IP_CSUM,
+ /* Deprecated. Please use IB_RAW_PACKET_CAP_IP_CSUM. */
+ IB_DEVICE_RAW_IP_CSUM = IB_UVERBS_DEVICE_RAW_IP_CSUM,
+ IB_DEVICE_MANAGED_FLOW_STEERING =
+ IB_UVERBS_DEVICE_MANAGED_FLOW_STEERING,
+ /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */
+ IB_DEVICE_RAW_SCATTER_FCS = IB_UVERBS_DEVICE_RAW_SCATTER_FCS,
+ /* The device supports padding incoming writes to cacheline. */
+ IB_DEVICE_PCI_WRITE_END_PADDING =
+ IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING,
+ /* Placement type attributes */
+ IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL,
+ IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT,
+ IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE,
+};
+
+enum ib_kernel_cap_flags {
+ /*
+ * This device supports a per-device lkey or stag that can be
+ * used without performing a memory registration for the local
+ * memory. Note that ULPs should never check this flag, but
+ * instead of use the local_dma_lkey flag in the ib_pd structure,
+ * which will always contain a usable lkey.
+ */
+ IBK_LOCAL_DMA_LKEY = 1 << 0,
+ /* IB_QP_CREATE_INTEGRITY_EN is supported to implement T10-PI */
+ IBK_INTEGRITY_HANDOVER = 1 << 1,
+ /* IB_ACCESS_ON_DEMAND is supported during reg_user_mr() */
+ IBK_ON_DEMAND_PAGING = 1 << 2,
+ /* IB_MR_TYPE_SG_GAPS is supported */
+ IBK_SG_GAPS_REG = 1 << 3,
+ /* Driver supports RDMA_NLDEV_CMD_DELLINK */
+ IBK_ALLOW_USER_UNREG = 1 << 4,
+
+ /* ipoib will use IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK */
+ IBK_BLOCK_MULTICAST_LOOPBACK = 1 << 5,
+ /* iopib will use IB_QP_CREATE_IPOIB_UD_LSO for its QPs */
+ IBK_UD_TSO = 1 << 6,
+ /* iopib will use the device ops:
+ * get_vf_config
+ * get_vf_guid
+ * get_vf_stats
+ * set_vf_guid
+ * set_vf_link_state
+ */
+ IBK_VIRTUAL_FUNCTION = 1 << 7,
+ /* ipoib will use IB_QP_CREATE_NETDEV_USE for its QPs */
+ IBK_RDMA_NETDEV_OPA = 1 << 8,
+};
+
+enum ib_atomic_cap {
+ IB_ATOMIC_NONE,
+ IB_ATOMIC_HCA,
+ IB_ATOMIC_GLOB
+};
+
+enum ib_odp_general_cap_bits {
+ IB_ODP_SUPPORT = 1 << 0,
+ IB_ODP_SUPPORT_IMPLICIT = 1 << 1,
+};
+
+enum ib_odp_transport_cap_bits {
+ IB_ODP_SUPPORT_SEND = 1 << 0,
+ IB_ODP_SUPPORT_RECV = 1 << 1,
+ IB_ODP_SUPPORT_WRITE = 1 << 2,
+ IB_ODP_SUPPORT_READ = 1 << 3,
+ IB_ODP_SUPPORT_ATOMIC = 1 << 4,
+ IB_ODP_SUPPORT_SRQ_RECV = 1 << 5,
+};
+
+struct ib_odp_caps {
+ uint64_t general_caps;
+ struct {
+ uint32_t rc_odp_caps;
+ uint32_t uc_odp_caps;
+ uint32_t ud_odp_caps;
+ uint32_t xrc_odp_caps;
+ } per_transport_caps;
+};
+
+struct ib_rss_caps {
+ /* Corresponding bit will be set if qp type from
+ * 'enum ib_qp_type' is supported, e.g.
+ * supported_qpts |= 1 << IB_QPT_UD
+ */
+ u32 supported_qpts;
+ u32 max_rwq_indirection_tables;
+ u32 max_rwq_indirection_table_size;
+};
+
+enum ib_tm_cap_flags {
+ /* Support tag matching with rendezvous offload for RC transport */
+ IB_TM_CAP_RNDV_RC = 1 << 0,
+};
+
+struct ib_tm_caps {
+ /* Max size of RNDV header */
+ u32 max_rndv_hdr_size;
+ /* Max number of entries in tag matching list */
+ u32 max_num_tags;
+ /* From enum ib_tm_cap_flags */
+ u32 flags;
+ /* Max number of outstanding list operations */
+ u32 max_ops;
+ /* Max number of SGE in tag matching entry */
+ u32 max_sge;
+};
+
+struct ib_cq_init_attr {
+ unsigned int cqe;
+ u32 comp_vector;
+ u32 flags;
+};
+
+enum ib_cq_attr_mask {
+ IB_CQ_MODERATE = 1 << 0,
+};
+
+struct ib_cq_caps {
+ u16 max_cq_moderation_count;
+ u16 max_cq_moderation_period;
+};
+
+struct ib_dm_mr_attr {
+ u64 length;
+ u64 offset;
+ u32 access_flags;
+};
+
+struct ib_dm_alloc_attr {
+ u64 length;
+ u32 alignment;
+ u32 flags;
+};
+
+struct ib_device_attr {
+ u64 fw_ver;
+ __be64 sys_image_guid;
+ u64 max_mr_size;
+ u64 page_size_cap;
+ u32 vendor_id;
+ u32 vendor_part_id;
+ u32 hw_ver;
+ int max_qp;
+ int max_qp_wr;
+ u64 device_cap_flags;
+ u64 kernel_cap_flags;
+ int max_send_sge;
+ int max_recv_sge;
+ int max_sge_rd;
+ int max_cq;
+ int max_cqe;
+ int max_mr;
+ int max_pd;
+ int max_qp_rd_atom;
+ int max_ee_rd_atom;
+ int max_res_rd_atom;
+ int max_qp_init_rd_atom;
+ int max_ee_init_rd_atom;
+ enum ib_atomic_cap atomic_cap;
+ enum ib_atomic_cap masked_atomic_cap;
+ int max_ee;
+ int max_rdd;
+ int max_mw;
+ int max_raw_ipv6_qp;
+ int max_raw_ethy_qp;
+ int max_mcast_grp;
+ int max_mcast_qp_attach;
+ int max_total_mcast_qp_attach;
+ int max_ah;
+ int max_srq;
+ int max_srq_wr;
+ int max_srq_sge;
+ unsigned int max_fast_reg_page_list_len;
+ unsigned int max_pi_fast_reg_page_list_len;
+ u16 max_pkeys;
+ u8 local_ca_ack_delay;
+ int sig_prot_cap;
+ int sig_guard_cap;
+ struct ib_odp_caps odp_caps;
+ uint64_t timestamp_mask;
+ uint64_t hca_core_clock; /* in KHZ */
+ struct ib_rss_caps rss_caps;
+ u32 max_wq_type_rq;
+ u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */
+ struct ib_tm_caps tm_caps;
+ struct ib_cq_caps cq_caps;
+ u64 max_dm_size;
+ /* Max entries for sgl for optimized performance per READ */
+ u32 max_sgl_rd;
+};
+
+enum ib_mtu {
+ IB_MTU_256 = 1,
+ IB_MTU_512 = 2,
+ IB_MTU_1024 = 3,
+ IB_MTU_2048 = 4,
+ IB_MTU_4096 = 5
+};
+
+enum opa_mtu {
+ OPA_MTU_8192 = 6,
+ OPA_MTU_10240 = 7
+};
+
+static inline int ib_mtu_enum_to_int(enum ib_mtu mtu)
+{
+ switch (mtu) {
+ case IB_MTU_256: return 256;
+ case IB_MTU_512: return 512;
+ case IB_MTU_1024: return 1024;
+ case IB_MTU_2048: return 2048;
+ case IB_MTU_4096: return 4096;
+ default: return -1;
+ }
+}
+
+static inline enum ib_mtu ib_mtu_int_to_enum(int mtu)
+{
+ if (mtu >= 4096)
+ return IB_MTU_4096;
+ else if (mtu >= 2048)
+ return IB_MTU_2048;
+ else if (mtu >= 1024)
+ return IB_MTU_1024;
+ else if (mtu >= 512)
+ return IB_MTU_512;
+ else
+ return IB_MTU_256;
+}
+
+static inline int opa_mtu_enum_to_int(enum opa_mtu mtu)
+{
+ switch (mtu) {
+ case OPA_MTU_8192:
+ return 8192;
+ case OPA_MTU_10240:
+ return 10240;
+ default:
+ return(ib_mtu_enum_to_int((enum ib_mtu)mtu));
+ }
+}
+
+static inline enum opa_mtu opa_mtu_int_to_enum(int mtu)
+{
+ if (mtu >= 10240)
+ return OPA_MTU_10240;
+ else if (mtu >= 8192)
+ return OPA_MTU_8192;
+ else
+ return ((enum opa_mtu)ib_mtu_int_to_enum(mtu));
+}
+
+enum ib_port_state {
+ IB_PORT_NOP = 0,
+ IB_PORT_DOWN = 1,
+ IB_PORT_INIT = 2,
+ IB_PORT_ARMED = 3,
+ IB_PORT_ACTIVE = 4,
+ IB_PORT_ACTIVE_DEFER = 5
+};
+
+enum ib_port_phys_state {
+ IB_PORT_PHYS_STATE_SLEEP = 1,
+ IB_PORT_PHYS_STATE_POLLING = 2,
+ IB_PORT_PHYS_STATE_DISABLED = 3,
+ IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING = 4,
+ IB_PORT_PHYS_STATE_LINK_UP = 5,
+ IB_PORT_PHYS_STATE_LINK_ERROR_RECOVERY = 6,
+ IB_PORT_PHYS_STATE_PHY_TEST = 7,
+};
+
+enum ib_port_width {
+ IB_WIDTH_1X = 1,
+ IB_WIDTH_2X = 16,
+ IB_WIDTH_4X = 2,
+ IB_WIDTH_8X = 4,
+ IB_WIDTH_12X = 8
+};
+
+static inline int ib_width_enum_to_int(enum ib_port_width width)
+{
+ switch (width) {
+ case IB_WIDTH_1X: return 1;
+ case IB_WIDTH_2X: return 2;
+ case IB_WIDTH_4X: return 4;
+ case IB_WIDTH_8X: return 8;
+ case IB_WIDTH_12X: return 12;
+ default: return -1;
+ }
+}
+
+enum ib_port_speed {
+ IB_SPEED_SDR = 1,
+ IB_SPEED_DDR = 2,
+ IB_SPEED_QDR = 4,
+ IB_SPEED_FDR10 = 8,
+ IB_SPEED_FDR = 16,
+ IB_SPEED_EDR = 32,
+ IB_SPEED_HDR = 64,
+ IB_SPEED_NDR = 128,
+};
+
+enum ib_stat_flag {
+ IB_STAT_FLAG_OPTIONAL = 1 << 0,
+};
+
+/**
+ * struct rdma_stat_desc
+ * @name - The name of the counter
+ * @flags - Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL
+ * @priv - Driver private information; Core code should not use
+ */
+struct rdma_stat_desc {
+ const char *name;
+ unsigned int flags;
+ const void *priv;
+};
+
+/**
+ * struct rdma_hw_stats
+ * @lock - Mutex to protect parallel write access to lifespan and values
+ * of counters, which are 64bits and not guaranteed to be written
+ * atomicaly on 32bits systems.
+ * @timestamp - Used by the core code to track when the last update was
+ * @lifespan - Used by the core code to determine how old the counters
+ * should be before being updated again. Stored in jiffies, defaults
+ * to 10 milliseconds, drivers can override the default be specifying
+ * their own value during their allocation routine.
+ * @descs - Array of pointers to static descriptors used for the counters
+ * in directory.
+ * @is_disabled - A bitmap to indicate each counter is currently disabled
+ * or not.
+ * @num_counters - How many hardware counters there are. If name is
+ * shorter than this number, a kernel oops will result. Driver authors
+ * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters)
+ * in their code to prevent this.
+ * @value - Array of u64 counters that are accessed by the sysfs code and
+ * filled in by the drivers get_stats routine
+ */
+struct rdma_hw_stats {
+ struct mutex lock; /* Protect lifespan and values[] */
+ unsigned long timestamp;
+ unsigned long lifespan;
+ const struct rdma_stat_desc *descs;
+ unsigned long *is_disabled;
+ int num_counters;
+ u64 value[];
+};
+
+#define RDMA_HW_STATS_DEFAULT_LIFESPAN 10
+
+struct rdma_hw_stats *rdma_alloc_hw_stats_struct(
+ const struct rdma_stat_desc *descs, int num_counters,
+ unsigned long lifespan);
+
+void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats);
+
+/* Define bits for the various functionality this port needs to be supported by
+ * the core.
+ */
+/* Management 0x00000FFF */
+#define RDMA_CORE_CAP_IB_MAD 0x00000001
+#define RDMA_CORE_CAP_IB_SMI 0x00000002
+#define RDMA_CORE_CAP_IB_CM 0x00000004
+#define RDMA_CORE_CAP_IW_CM 0x00000008
+#define RDMA_CORE_CAP_IB_SA 0x00000010
+#define RDMA_CORE_CAP_OPA_MAD 0x00000020
+
+/* Address format 0x000FF000 */
+#define RDMA_CORE_CAP_AF_IB 0x00001000
+#define RDMA_CORE_CAP_ETH_AH 0x00002000
+#define RDMA_CORE_CAP_OPA_AH 0x00004000
+#define RDMA_CORE_CAP_IB_GRH_REQUIRED 0x00008000
+
+/* Protocol 0xFFF00000 */
+#define RDMA_CORE_CAP_PROT_IB 0x00100000
+#define RDMA_CORE_CAP_PROT_ROCE 0x00200000
+#define RDMA_CORE_CAP_PROT_IWARP 0x00400000
+#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000
+#define RDMA_CORE_CAP_PROT_RAW_PACKET 0x01000000
+#define RDMA_CORE_CAP_PROT_USNIC 0x02000000
+
+#define RDMA_CORE_PORT_IB_GRH_REQUIRED (RDMA_CORE_CAP_IB_GRH_REQUIRED \
+ | RDMA_CORE_CAP_PROT_ROCE \
+ | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP)
+
+#define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \
+ | RDMA_CORE_CAP_IB_MAD \
+ | RDMA_CORE_CAP_IB_SMI \
+ | RDMA_CORE_CAP_IB_CM \
+ | RDMA_CORE_CAP_IB_SA \
+ | RDMA_CORE_CAP_AF_IB)
+#define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \
+ | RDMA_CORE_CAP_IB_MAD \
+ | RDMA_CORE_CAP_IB_CM \
+ | RDMA_CORE_CAP_AF_IB \
+ | RDMA_CORE_CAP_ETH_AH)
+#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \
+ (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \
+ | RDMA_CORE_CAP_IB_MAD \
+ | RDMA_CORE_CAP_IB_CM \
+ | RDMA_CORE_CAP_AF_IB \
+ | RDMA_CORE_CAP_ETH_AH)
+#define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \
+ | RDMA_CORE_CAP_IW_CM)
+#define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \
+ | RDMA_CORE_CAP_OPA_MAD)
+
+#define RDMA_CORE_PORT_RAW_PACKET (RDMA_CORE_CAP_PROT_RAW_PACKET)
+
+#define RDMA_CORE_PORT_USNIC (RDMA_CORE_CAP_PROT_USNIC)
+
+struct ib_port_attr {
+ u64 subnet_prefix;
+ enum ib_port_state state;
+ enum ib_mtu max_mtu;
+ enum ib_mtu active_mtu;
+ u32 phys_mtu;
+ int gid_tbl_len;
+ unsigned int ip_gids:1;
+ /* This is the value from PortInfo CapabilityMask, defined by IBA */
+ u32 port_cap_flags;
+ u32 max_msg_sz;
+ u32 bad_pkey_cntr;
+ u32 qkey_viol_cntr;
+ u16 pkey_tbl_len;
+ u32 sm_lid;
+ u32 lid;
+ u8 lmc;
+ u8 max_vl_num;
+ u8 sm_sl;
+ u8 subnet_timeout;
+ u8 init_type_reply;
+ u8 active_width;
+ u16 active_speed;
+ u8 phys_state;
+ u16 port_cap_flags2;
+};
+
+enum ib_device_modify_flags {
+ IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0,
+ IB_DEVICE_MODIFY_NODE_DESC = 1 << 1
+};
+
+#define IB_DEVICE_NODE_DESC_MAX 64
+
+struct ib_device_modify {
+ u64 sys_image_guid;
+ char node_desc[IB_DEVICE_NODE_DESC_MAX];
+};
+
+enum ib_port_modify_flags {
+ IB_PORT_SHUTDOWN = 1,
+ IB_PORT_INIT_TYPE = (1<<2),
+ IB_PORT_RESET_QKEY_CNTR = (1<<3),
+ IB_PORT_OPA_MASK_CHG = (1<<4)
+};
+
+struct ib_port_modify {
+ u32 set_port_cap_mask;
+ u32 clr_port_cap_mask;
+ u8 init_type;
+};
+
+enum ib_event_type {
+ IB_EVENT_CQ_ERR,
+ IB_EVENT_QP_FATAL,
+ IB_EVENT_QP_REQ_ERR,
+ IB_EVENT_QP_ACCESS_ERR,
+ IB_EVENT_COMM_EST,
+ IB_EVENT_SQ_DRAINED,
+ IB_EVENT_PATH_MIG,
+ IB_EVENT_PATH_MIG_ERR,
+ IB_EVENT_DEVICE_FATAL,
+ IB_EVENT_PORT_ACTIVE,
+ IB_EVENT_PORT_ERR,
+ IB_EVENT_LID_CHANGE,
+ IB_EVENT_PKEY_CHANGE,
+ IB_EVENT_SM_CHANGE,
+ IB_EVENT_SRQ_ERR,
+ IB_EVENT_SRQ_LIMIT_REACHED,
+ IB_EVENT_QP_LAST_WQE_REACHED,
+ IB_EVENT_CLIENT_REREGISTER,
+ IB_EVENT_GID_CHANGE,
+ IB_EVENT_WQ_FATAL,
+};
+
+const char *__attribute_const__ ib_event_msg(enum ib_event_type event);
+
+struct ib_event {
+ struct ib_device *device;
+ union {
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+ struct ib_srq *srq;
+ struct ib_wq *wq;
+ u32 port_num;
+ } element;
+ enum ib_event_type event;
+};
+
+struct ib_event_handler {
+ struct ib_device *device;
+ void (*handler)(struct ib_event_handler *, struct ib_event *);
+ struct list_head list;
+};
+
+#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \
+ do { \
+ (_ptr)->device = _device; \
+ (_ptr)->handler = _handler; \
+ INIT_LIST_HEAD(&(_ptr)->list); \
+ } while (0)
+
+struct ib_global_route {
+ const struct ib_gid_attr *sgid_attr;
+ union ib_gid dgid;
+ u32 flow_label;
+ u8 sgid_index;
+ u8 hop_limit;
+ u8 traffic_class;
+};
+
+struct ib_grh {
+ __be32 version_tclass_flow;
+ __be16 paylen;
+ u8 next_hdr;
+ u8 hop_limit;
+ union ib_gid sgid;
+ union ib_gid dgid;
+};
+
+union rdma_network_hdr {
+ struct ib_grh ibgrh;
+ struct {
+ /* The IB spec states that if it's IPv4, the header
+ * is located in the last 20 bytes of the header.
+ */
+ u8 reserved[20];
+ struct iphdr roce4grh;
+ };
+};
+
+#define IB_QPN_MASK 0xFFFFFF
+
+enum {
+ IB_MULTICAST_QPN = 0xffffff
+};
+
+#define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF)
+#define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000)
+
+enum ib_ah_flags {
+ IB_AH_GRH = 1
+};
+
+enum ib_rate {
+ IB_RATE_PORT_CURRENT = 0,
+ IB_RATE_2_5_GBPS = 2,
+ IB_RATE_5_GBPS = 5,
+ IB_RATE_10_GBPS = 3,
+ IB_RATE_20_GBPS = 6,
+ IB_RATE_30_GBPS = 4,
+ IB_RATE_40_GBPS = 7,
+ IB_RATE_60_GBPS = 8,
+ IB_RATE_80_GBPS = 9,
+ IB_RATE_120_GBPS = 10,
+ IB_RATE_14_GBPS = 11,
+ IB_RATE_56_GBPS = 12,
+ IB_RATE_112_GBPS = 13,
+ IB_RATE_168_GBPS = 14,
+ IB_RATE_25_GBPS = 15,
+ IB_RATE_100_GBPS = 16,
+ IB_RATE_200_GBPS = 17,
+ IB_RATE_300_GBPS = 18,
+ IB_RATE_28_GBPS = 19,
+ IB_RATE_50_GBPS = 20,
+ IB_RATE_400_GBPS = 21,
+ IB_RATE_600_GBPS = 22,
+};
+
+/**
+ * ib_rate_to_mult - Convert the IB rate enum to a multiple of the
+ * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be
+ * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
+ * @rate: rate to convert.
+ */
+__attribute_const__ int ib_rate_to_mult(enum ib_rate rate);
+
+/**
+ * ib_rate_to_mbps - Convert the IB rate enum to Mbps.
+ * For example, IB_RATE_2_5_GBPS will be converted to 2500.
+ * @rate: rate to convert.
+ */
+__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);
+
+
+/**
+ * enum ib_mr_type - memory region type
+ * @IB_MR_TYPE_MEM_REG: memory region that is used for
+ * normal registration
+ * @IB_MR_TYPE_SG_GAPS: memory region that is capable to
+ * register any arbitrary sg lists (without
+ * the normal mr constraints - see
+ * ib_map_mr_sg)
+ * @IB_MR_TYPE_DM: memory region that is used for device
+ * memory registration
+ * @IB_MR_TYPE_USER: memory region that is used for the user-space
+ * application
+ * @IB_MR_TYPE_DMA: memory region that is used for DMA operations
+ * without address translations (VA=PA)
+ * @IB_MR_TYPE_INTEGRITY: memory region that is used for
+ * data integrity operations
+ */
+enum ib_mr_type {
+ IB_MR_TYPE_MEM_REG,
+ IB_MR_TYPE_SG_GAPS,
+ IB_MR_TYPE_DM,
+ IB_MR_TYPE_USER,
+ IB_MR_TYPE_DMA,
+ IB_MR_TYPE_INTEGRITY,
+};
+
+enum ib_mr_status_check {
+ IB_MR_CHECK_SIG_STATUS = 1,
+};
+
+/**
+ * struct ib_mr_status - Memory region status container
+ *
+ * @fail_status: Bitmask of MR checks status. For each
+ * failed check a corresponding status bit is set.
+ * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS
+ * failure.
+ */
+struct ib_mr_status {
+ u32 fail_status;
+ struct ib_sig_err sig_err;
+};
+
+/**
+ * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate
+ * enum.
+ * @mult: multiple to convert.
+ */
+__attribute_const__ enum ib_rate mult_to_ib_rate(int mult);
+
+struct rdma_ah_init_attr {
+ struct rdma_ah_attr *ah_attr;
+ u32 flags;
+ struct net_device *xmit_slave;
+};
+
+enum rdma_ah_attr_type {
+ RDMA_AH_ATTR_TYPE_UNDEFINED,
+ RDMA_AH_ATTR_TYPE_IB,
+ RDMA_AH_ATTR_TYPE_ROCE,
+ RDMA_AH_ATTR_TYPE_OPA,
+};
+
+struct ib_ah_attr {
+ u16 dlid;
+ u8 src_path_bits;
+};
+
+struct roce_ah_attr {
+ u8 dmac[ETH_ALEN];
+};
+
+struct opa_ah_attr {
+ u32 dlid;
+ u8 src_path_bits;
+ bool make_grd;
+};
+
+struct rdma_ah_attr {
+ struct ib_global_route grh;
+ u8 sl;
+ u8 static_rate;
+ u32 port_num;
+ u8 ah_flags;
+ enum rdma_ah_attr_type type;
+ union {
+ struct ib_ah_attr ib;
+ struct roce_ah_attr roce;
+ struct opa_ah_attr opa;
+ };
+};
+
+enum ib_wc_status {
+ IB_WC_SUCCESS,
+ IB_WC_LOC_LEN_ERR,
+ IB_WC_LOC_QP_OP_ERR,
+ IB_WC_LOC_EEC_OP_ERR,
+ IB_WC_LOC_PROT_ERR,
+ IB_WC_WR_FLUSH_ERR,
+ IB_WC_MW_BIND_ERR,
+ IB_WC_BAD_RESP_ERR,
+ IB_WC_LOC_ACCESS_ERR,
+ IB_WC_REM_INV_REQ_ERR,
+ IB_WC_REM_ACCESS_ERR,
+ IB_WC_REM_OP_ERR,
+ IB_WC_RETRY_EXC_ERR,
+ IB_WC_RNR_RETRY_EXC_ERR,
+ IB_WC_LOC_RDD_VIOL_ERR,
+ IB_WC_REM_INV_RD_REQ_ERR,
+ IB_WC_REM_ABORT_ERR,
+ IB_WC_INV_EECN_ERR,
+ IB_WC_INV_EEC_STATE_ERR,
+ IB_WC_FATAL_ERR,
+ IB_WC_RESP_TIMEOUT_ERR,
+ IB_WC_GENERAL_ERR
+};
+
+const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status);
+
+enum ib_wc_opcode {
+ IB_WC_SEND = IB_UVERBS_WC_SEND,
+ IB_WC_RDMA_WRITE = IB_UVERBS_WC_RDMA_WRITE,
+ IB_WC_RDMA_READ = IB_UVERBS_WC_RDMA_READ,
+ IB_WC_COMP_SWAP = IB_UVERBS_WC_COMP_SWAP,
+ IB_WC_FETCH_ADD = IB_UVERBS_WC_FETCH_ADD,
+ IB_WC_BIND_MW = IB_UVERBS_WC_BIND_MW,
+ IB_WC_LOCAL_INV = IB_UVERBS_WC_LOCAL_INV,
+ IB_WC_LSO = IB_UVERBS_WC_TSO,
+ IB_WC_ATOMIC_WRITE = IB_UVERBS_WC_ATOMIC_WRITE,
+ IB_WC_REG_MR,
+ IB_WC_MASKED_COMP_SWAP,
+ IB_WC_MASKED_FETCH_ADD,
+ IB_WC_FLUSH = IB_UVERBS_WC_FLUSH,
+/*
+ * Set value of IB_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IB_WC_RECV).
+ */
+ IB_WC_RECV = 1 << 7,
+ IB_WC_RECV_RDMA_WITH_IMM
+};
+
+enum ib_wc_flags {
+ IB_WC_GRH = 1,
+ IB_WC_WITH_IMM = (1<<1),
+ IB_WC_WITH_INVALIDATE = (1<<2),
+ IB_WC_IP_CSUM_OK = (1<<3),
+ IB_WC_WITH_SMAC = (1<<4),
+ IB_WC_WITH_VLAN = (1<<5),
+ IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6),
+};
+
+struct ib_wc {
+ union {
+ u64 wr_id;
+ struct ib_cqe *wr_cqe;
+ };
+ enum ib_wc_status status;
+ enum ib_wc_opcode opcode;
+ u32 vendor_err;
+ u32 byte_len;
+ struct ib_qp *qp;
+ union {
+ __be32 imm_data;
+ u32 invalidate_rkey;
+ } ex;
+ u32 src_qp;
+ u32 slid;
+ int wc_flags;
+ u16 pkey_index;
+ u8 sl;
+ u8 dlid_path_bits;
+ u32 port_num; /* valid only for DR SMPs on switches */
+ u8 smac[ETH_ALEN];
+ u16 vlan_id;
+ u8 network_hdr_type;
+};
+
+enum ib_cq_notify_flags {
+ IB_CQ_SOLICITED = 1 << 0,
+ IB_CQ_NEXT_COMP = 1 << 1,
+ IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP,
+ IB_CQ_REPORT_MISSED_EVENTS = 1 << 2,
+};
+
+enum ib_srq_type {
+ IB_SRQT_BASIC = IB_UVERBS_SRQT_BASIC,
+ IB_SRQT_XRC = IB_UVERBS_SRQT_XRC,
+ IB_SRQT_TM = IB_UVERBS_SRQT_TM,
+};
+
+static inline bool ib_srq_has_cq(enum ib_srq_type srq_type)
+{
+ return srq_type == IB_SRQT_XRC ||
+ srq_type == IB_SRQT_TM;
+}
+
+enum ib_srq_attr_mask {
+ IB_SRQ_MAX_WR = 1 << 0,
+ IB_SRQ_LIMIT = 1 << 1,
+};
+
+struct ib_srq_attr {
+ u32 max_wr;
+ u32 max_sge;
+ u32 srq_limit;
+};
+
+struct ib_srq_init_attr {
+ void (*event_handler)(struct ib_event *, void *);
+ void *srq_context;
+ struct ib_srq_attr attr;
+ enum ib_srq_type srq_type;
+
+ struct {
+ struct ib_cq *cq;
+ union {
+ struct {
+ struct ib_xrcd *xrcd;
+ } xrc;
+
+ struct {
+ u32 max_num_tags;
+ } tag_matching;
+ };
+ } ext;
+};
+
+struct ib_qp_cap {
+ u32 max_send_wr;
+ u32 max_recv_wr;
+ u32 max_send_sge;
+ u32 max_recv_sge;
+ u32 max_inline_data;
+
+ /*
+ * Maximum number of rdma_rw_ctx structures in flight at a time.
+ * ib_create_qp() will calculate the right amount of neededed WRs
+ * and MRs based on this.
+ */
+ u32 max_rdma_ctxs;
+};
+
+enum ib_sig_type {
+ IB_SIGNAL_ALL_WR,
+ IB_SIGNAL_REQ_WR
+};
+
+enum ib_qp_type {
+ /*
+ * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+ * here (and in that order) since the MAD layer uses them as
+ * indices into a 2-entry table.
+ */
+ IB_QPT_SMI,
+ IB_QPT_GSI,
+
+ IB_QPT_RC = IB_UVERBS_QPT_RC,
+ IB_QPT_UC = IB_UVERBS_QPT_UC,
+ IB_QPT_UD = IB_UVERBS_QPT_UD,
+ IB_QPT_RAW_IPV6,
+ IB_QPT_RAW_ETHERTYPE,
+ IB_QPT_RAW_PACKET = IB_UVERBS_QPT_RAW_PACKET,
+ IB_QPT_XRC_INI = IB_UVERBS_QPT_XRC_INI,
+ IB_QPT_XRC_TGT = IB_UVERBS_QPT_XRC_TGT,
+ IB_QPT_MAX,
+ IB_QPT_DRIVER = IB_UVERBS_QPT_DRIVER,
+ /* Reserve a range for qp types internal to the low level driver.
+ * These qp types will not be visible at the IB core layer, so the
+ * IB_QPT_MAX usages should not be affected in the core layer
+ */
+ IB_QPT_RESERVED1 = 0x1000,
+ IB_QPT_RESERVED2,
+ IB_QPT_RESERVED3,
+ IB_QPT_RESERVED4,
+ IB_QPT_RESERVED5,
+ IB_QPT_RESERVED6,
+ IB_QPT_RESERVED7,
+ IB_QPT_RESERVED8,
+ IB_QPT_RESERVED9,
+ IB_QPT_RESERVED10,
+};
+
+enum ib_qp_create_flags {
+ IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0,
+ IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK =
+ IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
+ IB_QP_CREATE_CROSS_CHANNEL = 1 << 2,
+ IB_QP_CREATE_MANAGED_SEND = 1 << 3,
+ IB_QP_CREATE_MANAGED_RECV = 1 << 4,
+ IB_QP_CREATE_NETIF_QP = 1 << 5,
+ IB_QP_CREATE_INTEGRITY_EN = 1 << 6,
+ IB_QP_CREATE_NETDEV_USE = 1 << 7,
+ IB_QP_CREATE_SCATTER_FCS =
+ IB_UVERBS_QP_CREATE_SCATTER_FCS,
+ IB_QP_CREATE_CVLAN_STRIPPING =
+ IB_UVERBS_QP_CREATE_CVLAN_STRIPPING,
+ IB_QP_CREATE_SOURCE_QPN = 1 << 10,
+ IB_QP_CREATE_PCI_WRITE_END_PADDING =
+ IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING,
+ /* reserve bits 26-31 for low level drivers' internal use */
+ IB_QP_CREATE_RESERVED_START = 1 << 26,
+ IB_QP_CREATE_RESERVED_END = 1 << 31,
+};
+
+/*
+ * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler
+ * callback to destroy the passed in QP.
+ */
+
+struct ib_qp_init_attr {
+ /* This callback occurs in workqueue context */
+ void (*event_handler)(struct ib_event *, void *);
+
+ void *qp_context;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+ struct ib_srq *srq;
+ struct ib_xrcd *xrcd; /* XRC TGT QPs only */
+ struct ib_qp_cap cap;
+ enum ib_sig_type sq_sig_type;
+ enum ib_qp_type qp_type;
+ u32 create_flags;
+
+ /*
+ * Only needed for special QP types, or when using the RW API.
+ */
+ u32 port_num;
+ struct ib_rwq_ind_table *rwq_ind_tbl;
+ u32 source_qpn;
+};
+
+struct ib_qp_open_attr {
+ void (*event_handler)(struct ib_event *, void *);
+ void *qp_context;
+ u32 qp_num;
+ enum ib_qp_type qp_type;
+};
+
+enum ib_rnr_timeout {
+ IB_RNR_TIMER_655_36 = 0,
+ IB_RNR_TIMER_000_01 = 1,
+ IB_RNR_TIMER_000_02 = 2,
+ IB_RNR_TIMER_000_03 = 3,
+ IB_RNR_TIMER_000_04 = 4,
+ IB_RNR_TIMER_000_06 = 5,
+ IB_RNR_TIMER_000_08 = 6,
+ IB_RNR_TIMER_000_12 = 7,
+ IB_RNR_TIMER_000_16 = 8,
+ IB_RNR_TIMER_000_24 = 9,
+ IB_RNR_TIMER_000_32 = 10,
+ IB_RNR_TIMER_000_48 = 11,
+ IB_RNR_TIMER_000_64 = 12,
+ IB_RNR_TIMER_000_96 = 13,
+ IB_RNR_TIMER_001_28 = 14,
+ IB_RNR_TIMER_001_92 = 15,
+ IB_RNR_TIMER_002_56 = 16,
+ IB_RNR_TIMER_003_84 = 17,
+ IB_RNR_TIMER_005_12 = 18,
+ IB_RNR_TIMER_007_68 = 19,
+ IB_RNR_TIMER_010_24 = 20,
+ IB_RNR_TIMER_015_36 = 21,
+ IB_RNR_TIMER_020_48 = 22,
+ IB_RNR_TIMER_030_72 = 23,
+ IB_RNR_TIMER_040_96 = 24,
+ IB_RNR_TIMER_061_44 = 25,
+ IB_RNR_TIMER_081_92 = 26,
+ IB_RNR_TIMER_122_88 = 27,
+ IB_RNR_TIMER_163_84 = 28,
+ IB_RNR_TIMER_245_76 = 29,
+ IB_RNR_TIMER_327_68 = 30,
+ IB_RNR_TIMER_491_52 = 31
+};
+
+enum ib_qp_attr_mask {
+ IB_QP_STATE = 1,
+ IB_QP_CUR_STATE = (1<<1),
+ IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2),
+ IB_QP_ACCESS_FLAGS = (1<<3),
+ IB_QP_PKEY_INDEX = (1<<4),
+ IB_QP_PORT = (1<<5),
+ IB_QP_QKEY = (1<<6),
+ IB_QP_AV = (1<<7),
+ IB_QP_PATH_MTU = (1<<8),
+ IB_QP_TIMEOUT = (1<<9),
+ IB_QP_RETRY_CNT = (1<<10),
+ IB_QP_RNR_RETRY = (1<<11),
+ IB_QP_RQ_PSN = (1<<12),
+ IB_QP_MAX_QP_RD_ATOMIC = (1<<13),
+ IB_QP_ALT_PATH = (1<<14),
+ IB_QP_MIN_RNR_TIMER = (1<<15),
+ IB_QP_SQ_PSN = (1<<16),
+ IB_QP_MAX_DEST_RD_ATOMIC = (1<<17),
+ IB_QP_PATH_MIG_STATE = (1<<18),
+ IB_QP_CAP = (1<<19),
+ IB_QP_DEST_QPN = (1<<20),
+ IB_QP_RESERVED1 = (1<<21),
+ IB_QP_RESERVED2 = (1<<22),
+ IB_QP_RESERVED3 = (1<<23),
+ IB_QP_RESERVED4 = (1<<24),
+ IB_QP_RATE_LIMIT = (1<<25),
+
+ IB_QP_ATTR_STANDARD_BITS = GENMASK(20, 0),
+};
+
+enum ib_qp_state {
+ IB_QPS_RESET,
+ IB_QPS_INIT,
+ IB_QPS_RTR,
+ IB_QPS_RTS,
+ IB_QPS_SQD,
+ IB_QPS_SQE,
+ IB_QPS_ERR
+};
+
+enum ib_mig_state {
+ IB_MIG_MIGRATED,
+ IB_MIG_REARM,
+ IB_MIG_ARMED
+};
+
+enum ib_mw_type {
+ IB_MW_TYPE_1 = 1,
+ IB_MW_TYPE_2 = 2
+};
+
+struct ib_qp_attr {
+ enum ib_qp_state qp_state;
+ enum ib_qp_state cur_qp_state;
+ enum ib_mtu path_mtu;
+ enum ib_mig_state path_mig_state;
+ u32 qkey;
+ u32 rq_psn;
+ u32 sq_psn;
+ u32 dest_qp_num;
+ int qp_access_flags;
+ struct ib_qp_cap cap;
+ struct rdma_ah_attr ah_attr;
+ struct rdma_ah_attr alt_ah_attr;
+ u16 pkey_index;
+ u16 alt_pkey_index;
+ u8 en_sqd_async_notify;
+ u8 sq_draining;
+ u8 max_rd_atomic;
+ u8 max_dest_rd_atomic;
+ u8 min_rnr_timer;
+ u32 port_num;
+ u8 timeout;
+ u8 retry_cnt;
+ u8 rnr_retry;
+ u32 alt_port_num;
+ u8 alt_timeout;
+ u32 rate_limit;
+ struct net_device *xmit_slave;
+};
+
+enum ib_wr_opcode {
+ /* These are shared with userspace */
+ IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE,
+ IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM,
+ IB_WR_SEND = IB_UVERBS_WR_SEND,
+ IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM,
+ IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ,
+ IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP,
+ IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD,
+ IB_WR_BIND_MW = IB_UVERBS_WR_BIND_MW,
+ IB_WR_LSO = IB_UVERBS_WR_TSO,
+ IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV,
+ IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV,
+ IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV,
+ IB_WR_MASKED_ATOMIC_CMP_AND_SWP =
+ IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
+ IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
+ IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
+ IB_WR_FLUSH = IB_UVERBS_WR_FLUSH,
+ IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE,
+
+ /* These are kernel only and can not be issued by userspace */
+ IB_WR_REG_MR = 0x20,
+ IB_WR_REG_MR_INTEGRITY,
+
+ /* reserve values for low level drivers' internal use.
+ * These values will not be used at all in the ib core layer.
+ */
+ IB_WR_RESERVED1 = 0xf0,
+ IB_WR_RESERVED2,
+ IB_WR_RESERVED3,
+ IB_WR_RESERVED4,
+ IB_WR_RESERVED5,
+ IB_WR_RESERVED6,
+ IB_WR_RESERVED7,
+ IB_WR_RESERVED8,
+ IB_WR_RESERVED9,
+ IB_WR_RESERVED10,
+};
+
+enum ib_send_flags {
+ IB_SEND_FENCE = 1,
+ IB_SEND_SIGNALED = (1<<1),
+ IB_SEND_SOLICITED = (1<<2),
+ IB_SEND_INLINE = (1<<3),
+ IB_SEND_IP_CSUM = (1<<4),
+
+ /* reserve bits 26-31 for low level drivers' internal use */
+ IB_SEND_RESERVED_START = (1 << 26),
+ IB_SEND_RESERVED_END = (1 << 31),
+};
+
+struct ib_sge {
+ u64 addr;
+ u32 length;
+ u32 lkey;
+};
+
+struct ib_cqe {
+ void (*done)(struct ib_cq *cq, struct ib_wc *wc);
+};
+
+struct ib_send_wr {
+ struct ib_send_wr *next;
+ union {
+ u64 wr_id;
+ struct ib_cqe *wr_cqe;
+ };
+ struct ib_sge *sg_list;
+ int num_sge;
+ enum ib_wr_opcode opcode;
+ int send_flags;
+ union {
+ __be32 imm_data;
+ u32 invalidate_rkey;
+ } ex;
+};
+
+struct ib_rdma_wr {
+ struct ib_send_wr wr;
+ u64 remote_addr;
+ u32 rkey;
+};
+
+static inline const struct ib_rdma_wr *rdma_wr(const struct ib_send_wr *wr)
+{
+ return container_of(wr, struct ib_rdma_wr, wr);
+}
+
+struct ib_atomic_wr {
+ struct ib_send_wr wr;
+ u64 remote_addr;
+ u64 compare_add;
+ u64 swap;
+ u64 compare_add_mask;
+ u64 swap_mask;
+ u32 rkey;
+};
+
+static inline const struct ib_atomic_wr *atomic_wr(const struct ib_send_wr *wr)
+{
+ return container_of(wr, struct ib_atomic_wr, wr);
+}
+
+struct ib_ud_wr {
+ struct ib_send_wr wr;
+ struct ib_ah *ah;
+ void *header;
+ int hlen;
+ int mss;
+ u32 remote_qpn;
+ u32 remote_qkey;
+ u16 pkey_index; /* valid for GSI only */
+ u32 port_num; /* valid for DR SMPs on switch only */
+};
+
+static inline const struct ib_ud_wr *ud_wr(const struct ib_send_wr *wr)
+{
+ return container_of(wr, struct ib_ud_wr, wr);
+}
+
+struct ib_reg_wr {
+ struct ib_send_wr wr;
+ struct ib_mr *mr;
+ u32 key;
+ int access;
+};
+
+static inline const struct ib_reg_wr *reg_wr(const struct ib_send_wr *wr)
+{
+ return container_of(wr, struct ib_reg_wr, wr);
+}
+
+struct ib_recv_wr {
+ struct ib_recv_wr *next;
+ union {
+ u64 wr_id;
+ struct ib_cqe *wr_cqe;
+ };
+ struct ib_sge *sg_list;
+ int num_sge;
+};
+
+enum ib_access_flags {
+ IB_ACCESS_LOCAL_WRITE = IB_UVERBS_ACCESS_LOCAL_WRITE,
+ IB_ACCESS_REMOTE_WRITE = IB_UVERBS_ACCESS_REMOTE_WRITE,
+ IB_ACCESS_REMOTE_READ = IB_UVERBS_ACCESS_REMOTE_READ,
+ IB_ACCESS_REMOTE_ATOMIC = IB_UVERBS_ACCESS_REMOTE_ATOMIC,
+ IB_ACCESS_MW_BIND = IB_UVERBS_ACCESS_MW_BIND,
+ IB_ZERO_BASED = IB_UVERBS_ACCESS_ZERO_BASED,
+ IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND,
+ IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB,
+ IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING,
+ IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL,
+ IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT,
+
+ IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE,
+ IB_ACCESS_SUPPORTED =
+ ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL,
+};
+
+/*
+ * XXX: these are apparently used for ->rereg_user_mr, no idea why they
+ * are hidden here instead of a uapi header!
+ */
+enum ib_mr_rereg_flags {
+ IB_MR_REREG_TRANS = 1,
+ IB_MR_REREG_PD = (1<<1),
+ IB_MR_REREG_ACCESS = (1<<2),
+ IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1)
+};
+
+struct ib_umem;
+
+enum rdma_remove_reason {
+ /*
+ * Userspace requested uobject deletion or initial try
+ * to remove uobject via cleanup. Call could fail
+ */
+ RDMA_REMOVE_DESTROY,
+ /* Context deletion. This call should delete the actual object itself */
+ RDMA_REMOVE_CLOSE,
+ /* Driver is being hot-unplugged. This call should delete the actual object itself */
+ RDMA_REMOVE_DRIVER_REMOVE,
+ /* uobj is being cleaned-up before being committed */
+ RDMA_REMOVE_ABORT,
+ /* The driver failed to destroy the uobject and is being disconnected */
+ RDMA_REMOVE_DRIVER_FAILURE,
+};
+
+struct ib_rdmacg_object {
+#ifdef CONFIG_CGROUP_RDMA
+ struct rdma_cgroup *cg; /* owner rdma cgroup */
+#endif
+};
+
+struct ib_ucontext {
+ struct ib_device *device;
+ struct ib_uverbs_file *ufile;
+
+ struct ib_rdmacg_object cg_obj;
+ /*
+ * Implementation details of the RDMA core, don't use in drivers:
+ */
+ struct rdma_restrack_entry res;
+ struct xarray mmap_xa;
+};
+
+struct ib_uobject {
+ u64 user_handle; /* handle given to us by userspace */
+ /* ufile & ucontext owning this object */
+ struct ib_uverbs_file *ufile;
+ /* FIXME, save memory: ufile->context == context */
+ struct ib_ucontext *context; /* associated user context */
+ void *object; /* containing object */
+ struct list_head list; /* link to context's list */
+ struct ib_rdmacg_object cg_obj; /* rdmacg object */
+ int id; /* index into kernel idr */
+ struct kref ref;
+ atomic_t usecnt; /* protects exclusive access */
+ struct rcu_head rcu; /* kfree_rcu() overhead */
+
+ const struct uverbs_api_object *uapi_object;
+};
+
+struct ib_udata {
+ const void __user *inbuf;
+ void __user *outbuf;
+ size_t inlen;
+ size_t outlen;
+};
+
+struct ib_pd {
+ u32 local_dma_lkey;
+ u32 flags;
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ atomic_t usecnt; /* count all resources */
+
+ u32 unsafe_global_rkey;
+
+ /*
+ * Implementation details of the RDMA core, don't use in drivers:
+ */
+ struct ib_mr *__internal_mr;
+ struct rdma_restrack_entry res;
+};
+
+struct ib_xrcd {
+ struct ib_device *device;
+ atomic_t usecnt; /* count all exposed resources */
+ struct inode *inode;
+ struct rw_semaphore tgt_qps_rwsem;
+ struct xarray tgt_qps;
+};
+
+struct ib_ah {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_uobject *uobject;
+ const struct ib_gid_attr *sgid_attr;
+ enum rdma_ah_attr_type type;
+};
+
+typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
+
+enum ib_poll_context {
+ IB_POLL_SOFTIRQ, /* poll from softirq context */
+ IB_POLL_WORKQUEUE, /* poll from workqueue */
+ IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */
+ IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE,
+
+ IB_POLL_DIRECT, /* caller context, no hw completions */
+};
+
+struct ib_cq {
+ struct ib_device *device;
+ struct ib_ucq_object *uobject;
+ ib_comp_handler comp_handler;
+ void (*event_handler)(struct ib_event *, void *);
+ void *cq_context;
+ int cqe;
+ unsigned int cqe_used;
+ atomic_t usecnt; /* count number of work queues */
+ enum ib_poll_context poll_ctx;
+ struct ib_wc *wc;
+ struct list_head pool_entry;
+ union {
+ struct irq_poll iop;
+ struct work_struct work;
+ };
+ struct workqueue_struct *comp_wq;
+ struct dim *dim;
+
+ /* updated only by trace points */
+ ktime_t timestamp;
+ u8 interrupt:1;
+ u8 shared:1;
+ unsigned int comp_vector;
+
+ /*
+ * Implementation details of the RDMA core, don't use in drivers:
+ */
+ struct rdma_restrack_entry res;
+};
+
+struct ib_srq {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_usrq_object *uobject;
+ void (*event_handler)(struct ib_event *, void *);
+ void *srq_context;
+ enum ib_srq_type srq_type;
+ atomic_t usecnt;
+
+ struct {
+ struct ib_cq *cq;
+ union {
+ struct {
+ struct ib_xrcd *xrcd;
+ u32 srq_num;
+ } xrc;
+ };
+ } ext;
+
+ /*
+ * Implementation details of the RDMA core, don't use in drivers:
+ */
+ struct rdma_restrack_entry res;
+};
+
+enum ib_raw_packet_caps {
+ /*
+ * Strip cvlan from incoming packet and report it in the matching work
+ * completion is supported.
+ */
+ IB_RAW_PACKET_CAP_CVLAN_STRIPPING =
+ IB_UVERBS_RAW_PACKET_CAP_CVLAN_STRIPPING,
+ /*
+ * Scatter FCS field of an incoming packet to host memory is supported.
+ */
+ IB_RAW_PACKET_CAP_SCATTER_FCS = IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS,
+ /* Checksum offloads are supported (for both send and receive). */
+ IB_RAW_PACKET_CAP_IP_CSUM = IB_UVERBS_RAW_PACKET_CAP_IP_CSUM,
+ /*
+ * When a packet is received for an RQ with no receive WQEs, the
+ * packet processing is delayed.
+ */
+ IB_RAW_PACKET_CAP_DELAY_DROP = IB_UVERBS_RAW_PACKET_CAP_DELAY_DROP,
+};
+
+enum ib_wq_type {
+ IB_WQT_RQ = IB_UVERBS_WQT_RQ,
+};
+
+enum ib_wq_state {
+ IB_WQS_RESET,
+ IB_WQS_RDY,
+ IB_WQS_ERR
+};
+
+struct ib_wq {
+ struct ib_device *device;
+ struct ib_uwq_object *uobject;
+ void *wq_context;
+ void (*event_handler)(struct ib_event *, void *);
+ struct ib_pd *pd;
+ struct ib_cq *cq;
+ u32 wq_num;
+ enum ib_wq_state state;
+ enum ib_wq_type wq_type;
+ atomic_t usecnt;
+};
+
+enum ib_wq_flags {
+ IB_WQ_FLAGS_CVLAN_STRIPPING = IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING,
+ IB_WQ_FLAGS_SCATTER_FCS = IB_UVERBS_WQ_FLAGS_SCATTER_FCS,
+ IB_WQ_FLAGS_DELAY_DROP = IB_UVERBS_WQ_FLAGS_DELAY_DROP,
+ IB_WQ_FLAGS_PCI_WRITE_END_PADDING =
+ IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING,
+};
+
+struct ib_wq_init_attr {
+ void *wq_context;
+ enum ib_wq_type wq_type;
+ u32 max_wr;
+ u32 max_sge;
+ struct ib_cq *cq;
+ void (*event_handler)(struct ib_event *, void *);
+ u32 create_flags; /* Use enum ib_wq_flags */
+};
+
+enum ib_wq_attr_mask {
+ IB_WQ_STATE = 1 << 0,
+ IB_WQ_CUR_STATE = 1 << 1,
+ IB_WQ_FLAGS = 1 << 2,
+};
+
+struct ib_wq_attr {
+ enum ib_wq_state wq_state;
+ enum ib_wq_state curr_wq_state;
+ u32 flags; /* Use enum ib_wq_flags */
+ u32 flags_mask; /* Use enum ib_wq_flags */
+};
+
+struct ib_rwq_ind_table {
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ atomic_t usecnt;
+ u32 ind_tbl_num;
+ u32 log_ind_tbl_size;
+ struct ib_wq **ind_tbl;
+};
+
+struct ib_rwq_ind_table_init_attr {
+ u32 log_ind_tbl_size;
+ /* Each entry is a pointer to Receive Work Queue */
+ struct ib_wq **ind_tbl;
+};
+
+enum port_pkey_state {
+ IB_PORT_PKEY_NOT_VALID = 0,
+ IB_PORT_PKEY_VALID = 1,
+ IB_PORT_PKEY_LISTED = 2,
+};
+
+struct ib_qp_security;
+
+struct ib_port_pkey {
+ enum port_pkey_state state;
+ u16 pkey_index;
+ u32 port_num;
+ struct list_head qp_list;
+ struct list_head to_error_list;
+ struct ib_qp_security *sec;
+};
+
+struct ib_ports_pkeys {
+ struct ib_port_pkey main;
+ struct ib_port_pkey alt;
+};
+
+struct ib_qp_security {
+ struct ib_qp *qp;
+ struct ib_device *dev;
+ /* Hold this mutex when changing port and pkey settings. */
+ struct mutex mutex;
+ struct ib_ports_pkeys *ports_pkeys;
+ /* A list of all open shared QP handles. Required to enforce security
+ * properly for all users of a shared QP.
+ */
+ struct list_head shared_qp_list;
+ void *security;
+ bool destroying;
+ atomic_t error_list_count;
+ struct completion error_complete;
+ int error_comps_pending;
+};
+
+/*
+ * @max_write_sge: Maximum SGE elements per RDMA WRITE request.
+ * @max_read_sge: Maximum SGE elements per RDMA READ request.
+ */
+struct ib_qp {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+ spinlock_t mr_lock;
+ int mrs_used;
+ struct list_head rdma_mrs;
+ struct list_head sig_mrs;
+ struct ib_srq *srq;
+ struct ib_xrcd *xrcd; /* XRC TGT QPs only */
+ struct list_head xrcd_list;
+
+ /* count times opened, mcast attaches, flow attaches */
+ atomic_t usecnt;
+ struct list_head open_list;
+ struct ib_qp *real_qp;
+ struct ib_uqp_object *uobject;
+ void (*event_handler)(struct ib_event *, void *);
+ void *qp_context;
+ /* sgid_attrs associated with the AV's */
+ const struct ib_gid_attr *av_sgid_attr;
+ const struct ib_gid_attr *alt_path_sgid_attr;
+ u32 qp_num;
+ u32 max_write_sge;
+ u32 max_read_sge;
+ enum ib_qp_type qp_type;
+ struct ib_rwq_ind_table *rwq_ind_tbl;
+ struct ib_qp_security *qp_sec;
+ u32 port;
+
+ bool integrity_en;
+ /*
+ * Implementation details of the RDMA core, don't use in drivers:
+ */
+ struct rdma_restrack_entry res;
+
+ /* The counter the qp is bind to */
+ struct rdma_counter *counter;
+};
+
+struct ib_dm {
+ struct ib_device *device;
+ u32 length;
+ u32 flags;
+ struct ib_uobject *uobject;
+ atomic_t usecnt;
+};
+
+struct ib_mr {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ u32 lkey;
+ u32 rkey;
+ u64 iova;
+ u64 length;
+ unsigned int page_size;
+ enum ib_mr_type type;
+ bool need_inval;
+ union {
+ struct ib_uobject *uobject; /* user */
+ struct list_head qp_entry; /* FR */
+ };
+
+ struct ib_dm *dm;
+ struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */
+ /*
+ * Implementation details of the RDMA core, don't use in drivers:
+ */
+ struct rdma_restrack_entry res;
+};
+
+struct ib_mw {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_uobject *uobject;
+ u32 rkey;
+ enum ib_mw_type type;
+};
+
+/* Supported steering options */
+enum ib_flow_attr_type {
+ /* steering according to rule specifications */
+ IB_FLOW_ATTR_NORMAL = 0x0,
+ /* default unicast and multicast rule -
+ * receive all Eth traffic which isn't steered to any QP
+ */
+ IB_FLOW_ATTR_ALL_DEFAULT = 0x1,
+ /* default multicast rule -
+ * receive all Eth multicast traffic which isn't steered to any QP
+ */
+ IB_FLOW_ATTR_MC_DEFAULT = 0x2,
+ /* sniffer rule - receive all port traffic */
+ IB_FLOW_ATTR_SNIFFER = 0x3
+};
+
+/* Supported steering header types */
+enum ib_flow_spec_type {
+ /* L2 headers*/
+ IB_FLOW_SPEC_ETH = 0x20,
+ IB_FLOW_SPEC_IB = 0x22,
+ /* L3 header*/
+ IB_FLOW_SPEC_IPV4 = 0x30,
+ IB_FLOW_SPEC_IPV6 = 0x31,
+ IB_FLOW_SPEC_ESP = 0x34,
+ /* L4 headers*/
+ IB_FLOW_SPEC_TCP = 0x40,
+ IB_FLOW_SPEC_UDP = 0x41,
+ IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50,
+ IB_FLOW_SPEC_GRE = 0x51,
+ IB_FLOW_SPEC_MPLS = 0x60,
+ IB_FLOW_SPEC_INNER = 0x100,
+ /* Actions */
+ IB_FLOW_SPEC_ACTION_TAG = 0x1000,
+ IB_FLOW_SPEC_ACTION_DROP = 0x1001,
+ IB_FLOW_SPEC_ACTION_HANDLE = 0x1002,
+ IB_FLOW_SPEC_ACTION_COUNT = 0x1003,
+};
+#define IB_FLOW_SPEC_LAYER_MASK 0xF0
+#define IB_FLOW_SPEC_SUPPORT_LAYERS 10
+
+enum ib_flow_flags {
+ IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */
+ IB_FLOW_ATTR_FLAGS_EGRESS = 1UL << 2, /* Egress flow */
+ IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 3 /* Must be last */
+};
+
+struct ib_flow_eth_filter {
+ u8 dst_mac[6];
+ u8 src_mac[6];
+ __be16 ether_type;
+ __be16 vlan_tag;
+ /* Must be last */
+ u8 real_sz[];
+};
+
+struct ib_flow_spec_eth {
+ u32 type;
+ u16 size;
+ struct ib_flow_eth_filter val;
+ struct ib_flow_eth_filter mask;
+};
+
+struct ib_flow_ib_filter {
+ __be16 dlid;
+ __u8 sl;
+ /* Must be last */
+ u8 real_sz[];
+};
+
+struct ib_flow_spec_ib {
+ u32 type;
+ u16 size;
+ struct ib_flow_ib_filter val;
+ struct ib_flow_ib_filter mask;
+};
+
+/* IPv4 header flags */
+enum ib_ipv4_flags {
+ IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */
+ IB_IPV4_MORE_FRAG = 0X4 /* For All fragmented packets except the
+ last have this flag set */
+};
+
+struct ib_flow_ipv4_filter {
+ __be32 src_ip;
+ __be32 dst_ip;
+ u8 proto;
+ u8 tos;
+ u8 ttl;
+ u8 flags;
+ /* Must be last */
+ u8 real_sz[];
+};
+
+struct ib_flow_spec_ipv4 {
+ u32 type;
+ u16 size;
+ struct ib_flow_ipv4_filter val;
+ struct ib_flow_ipv4_filter mask;
+};
+
+struct ib_flow_ipv6_filter {
+ u8 src_ip[16];
+ u8 dst_ip[16];
+ __be32 flow_label;
+ u8 next_hdr;
+ u8 traffic_class;
+ u8 hop_limit;
+ /* Must be last */
+ u8 real_sz[];
+};
+
+struct ib_flow_spec_ipv6 {
+ u32 type;
+ u16 size;
+ struct ib_flow_ipv6_filter val;
+ struct ib_flow_ipv6_filter mask;
+};
+
+struct ib_flow_tcp_udp_filter {
+ __be16 dst_port;
+ __be16 src_port;
+ /* Must be last */
+ u8 real_sz[];
+};
+
+struct ib_flow_spec_tcp_udp {
+ u32 type;
+ u16 size;
+ struct ib_flow_tcp_udp_filter val;
+ struct ib_flow_tcp_udp_filter mask;
+};
+
+struct ib_flow_tunnel_filter {
+ __be32 tunnel_id;
+ u8 real_sz[];
+};
+
+/* ib_flow_spec_tunnel describes the Vxlan tunnel
+ * the tunnel_id from val has the vni value
+ */
+struct ib_flow_spec_tunnel {
+ u32 type;
+ u16 size;
+ struct ib_flow_tunnel_filter val;
+ struct ib_flow_tunnel_filter mask;
+};
+
+struct ib_flow_esp_filter {
+ __be32 spi;
+ __be32 seq;
+ /* Must be last */
+ u8 real_sz[];
+};
+
+struct ib_flow_spec_esp {
+ u32 type;
+ u16 size;
+ struct ib_flow_esp_filter val;
+ struct ib_flow_esp_filter mask;
+};
+
+struct ib_flow_gre_filter {
+ __be16 c_ks_res0_ver;
+ __be16 protocol;
+ __be32 key;
+ /* Must be last */
+ u8 real_sz[];
+};
+
+struct ib_flow_spec_gre {
+ u32 type;
+ u16 size;
+ struct ib_flow_gre_filter val;
+ struct ib_flow_gre_filter mask;
+};
+
+struct ib_flow_mpls_filter {
+ __be32 tag;
+ /* Must be last */
+ u8 real_sz[];
+};
+
+struct ib_flow_spec_mpls {
+ u32 type;
+ u16 size;
+ struct ib_flow_mpls_filter val;
+ struct ib_flow_mpls_filter mask;
+};
+
+struct ib_flow_spec_action_tag {
+ enum ib_flow_spec_type type;
+ u16 size;
+ u32 tag_id;
+};
+
+struct ib_flow_spec_action_drop {
+ enum ib_flow_spec_type type;
+ u16 size;
+};
+
+struct ib_flow_spec_action_handle {
+ enum ib_flow_spec_type type;
+ u16 size;
+ struct ib_flow_action *act;
+};
+
+enum ib_counters_description {
+ IB_COUNTER_PACKETS,
+ IB_COUNTER_BYTES,
+};
+
+struct ib_flow_spec_action_count {
+ enum ib_flow_spec_type type;
+ u16 size;
+ struct ib_counters *counters;
+};
+
+union ib_flow_spec {
+ struct {
+ u32 type;
+ u16 size;
+ };
+ struct ib_flow_spec_eth eth;
+ struct ib_flow_spec_ib ib;
+ struct ib_flow_spec_ipv4 ipv4;
+ struct ib_flow_spec_tcp_udp tcp_udp;
+ struct ib_flow_spec_ipv6 ipv6;
+ struct ib_flow_spec_tunnel tunnel;
+ struct ib_flow_spec_esp esp;
+ struct ib_flow_spec_gre gre;
+ struct ib_flow_spec_mpls mpls;
+ struct ib_flow_spec_action_tag flow_tag;
+ struct ib_flow_spec_action_drop drop;
+ struct ib_flow_spec_action_handle action;
+ struct ib_flow_spec_action_count flow_count;
+};
+
+struct ib_flow_attr {
+ enum ib_flow_attr_type type;
+ u16 size;
+ u16 priority;
+ u32 flags;
+ u8 num_of_specs;
+ u32 port;
+ union ib_flow_spec flows[];
+};
+
+struct ib_flow {
+ struct ib_qp *qp;
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+};
+
+enum ib_flow_action_type {
+ IB_FLOW_ACTION_UNSPECIFIED,
+ IB_FLOW_ACTION_ESP = 1,
+};
+
+struct ib_flow_action_attrs_esp_keymats {
+ enum ib_uverbs_flow_action_esp_keymat protocol;
+ union {
+ struct ib_uverbs_flow_action_esp_keymat_aes_gcm aes_gcm;
+ } keymat;
+};
+
+struct ib_flow_action_attrs_esp_replays {
+ enum ib_uverbs_flow_action_esp_replay protocol;
+ union {
+ struct ib_uverbs_flow_action_esp_replay_bmp bmp;
+ } replay;
+};
+
+enum ib_flow_action_attrs_esp_flags {
+ /* All user-space flags at the top: Use enum ib_uverbs_flow_action_esp_flags
+ * This is done in order to share the same flags between user-space and
+ * kernel and spare an unnecessary translation.
+ */
+
+ /* Kernel flags */
+ IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED = 1ULL << 32,
+ IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS = 1ULL << 33,
+};
+
+struct ib_flow_spec_list {
+ struct ib_flow_spec_list *next;
+ union ib_flow_spec spec;
+};
+
+struct ib_flow_action_attrs_esp {
+ struct ib_flow_action_attrs_esp_keymats *keymat;
+ struct ib_flow_action_attrs_esp_replays *replay;
+ struct ib_flow_spec_list *encap;
+ /* Used only if IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED is enabled.
+ * Value of 0 is a valid value.
+ */
+ u32 esn;
+ u32 spi;
+ u32 seq;
+ u32 tfc_pad;
+ /* Use enum ib_flow_action_attrs_esp_flags */
+ u64 flags;
+ u64 hard_limit_pkts;
+};
+
+struct ib_flow_action {
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ enum ib_flow_action_type type;
+ atomic_t usecnt;
+};
+
+struct ib_mad;
+
+enum ib_process_mad_flags {
+ IB_MAD_IGNORE_MKEY = 1,
+ IB_MAD_IGNORE_BKEY = 2,
+ IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY
+};
+
+enum ib_mad_result {
+ IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */
+ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */
+ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */
+ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */
+};
+
+struct ib_port_cache {
+ u64 subnet_prefix;
+ struct ib_pkey_cache *pkey;
+ struct ib_gid_table *gid;
+ u8 lmc;
+ enum ib_port_state port_state;
+};
+
+struct ib_port_immutable {
+ int pkey_tbl_len;
+ int gid_tbl_len;
+ u32 core_cap_flags;
+ u32 max_mad_size;
+};
+
+struct ib_port_data {
+ struct ib_device *ib_dev;
+
+ struct ib_port_immutable immutable;
+
+ spinlock_t pkey_list_lock;
+
+ spinlock_t netdev_lock;
+
+ struct list_head pkey_list;
+
+ struct ib_port_cache cache;
+
+ struct net_device __rcu *netdev;
+ netdevice_tracker netdev_tracker;
+ struct hlist_node ndev_hash_link;
+ struct rdma_port_counter port_counter;
+ struct ib_port *sysfs;
+};
+
+/* rdma netdev type - specifies protocol type */
+enum rdma_netdev_t {
+ RDMA_NETDEV_OPA_VNIC,
+ RDMA_NETDEV_IPOIB,
+};
+
+/**
+ * struct rdma_netdev - rdma netdev
+ * For cases where netstack interfacing is required.
+ */
+struct rdma_netdev {
+ void *clnt_priv;
+ struct ib_device *hca;
+ u32 port_num;
+ int mtu;
+
+ /*
+ * cleanup function must be specified.
+ * FIXME: This is only used for OPA_VNIC and that usage should be
+ * removed too.
+ */
+ void (*free_rdma_netdev)(struct net_device *netdev);
+
+ /* control functions */
+ void (*set_id)(struct net_device *netdev, int id);
+ /* send packet */
+ int (*send)(struct net_device *dev, struct sk_buff *skb,
+ struct ib_ah *address, u32 dqpn);
+ /* multicast */
+ int (*attach_mcast)(struct net_device *dev, struct ib_device *hca,
+ union ib_gid *gid, u16 mlid,
+ int set_qkey, u32 qkey);
+ int (*detach_mcast)(struct net_device *dev, struct ib_device *hca,
+ union ib_gid *gid, u16 mlid);
+ /* timeout */
+ void (*tx_timeout)(struct net_device *dev, unsigned int txqueue);
+};
+
+struct rdma_netdev_alloc_params {
+ size_t sizeof_priv;
+ unsigned int txqs;
+ unsigned int rxqs;
+ void *param;
+
+ int (*initialize_rdma_netdev)(struct ib_device *device, u32 port_num,
+ struct net_device *netdev, void *param);
+};
+
+struct ib_odp_counters {
+ atomic64_t faults;
+ atomic64_t invalidations;
+ atomic64_t prefetch;
+};
+
+struct ib_counters {
+ struct ib_device *device;
+ struct ib_uobject *uobject;
+ /* num of objects attached */
+ atomic_t usecnt;
+};
+
+struct ib_counters_read_attr {
+ u64 *counters_buff;
+ u32 ncounters;
+ u32 flags; /* use enum ib_read_counters_flags */
+};
+
+struct uverbs_attr_bundle;
+struct iw_cm_id;
+struct iw_cm_conn_param;
+
+#define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member) \
+ .size_##ib_struct = \
+ (sizeof(struct drv_struct) + \
+ BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) + \
+ BUILD_BUG_ON_ZERO( \
+ !__same_type(((struct drv_struct *)NULL)->member, \
+ struct ib_struct)))
+
+#define rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, gfp) \
+ ((struct ib_type *)rdma_zalloc_obj(ib_dev, ib_dev->ops.size_##ib_type, \
+ gfp, false))
+
+#define rdma_zalloc_drv_obj_numa(ib_dev, ib_type) \
+ ((struct ib_type *)rdma_zalloc_obj(ib_dev, ib_dev->ops.size_##ib_type, \
+ GFP_KERNEL, true))
+
+#define rdma_zalloc_drv_obj(ib_dev, ib_type) \
+ rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, GFP_KERNEL)
+
+#define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct
+
+struct rdma_user_mmap_entry {
+ struct kref ref;
+ struct ib_ucontext *ucontext;
+ unsigned long start_pgoff;
+ size_t npages;
+ bool driver_removed;
+};
+
+/* Return the offset (in bytes) the user should pass to libc's mmap() */
+static inline u64
+rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry)
+{
+ return (u64)entry->start_pgoff << PAGE_SHIFT;
+}
+
+/**
+ * struct ib_device_ops - InfiniBand device operations
+ * This structure defines all the InfiniBand device operations, providers will
+ * need to define the supported operations, otherwise they will be set to null.
+ */
+struct ib_device_ops {
+ struct module *owner;
+ enum rdma_driver_id driver_id;
+ u32 uverbs_abi_ver;
+ unsigned int uverbs_no_driver_id_binding:1;
+
+ /*
+ * NOTE: New drivers should not make use of device_group; instead new
+ * device parameter should be exposed via netlink command. This
+ * mechanism exists only for existing drivers.
+ */
+ const struct attribute_group *device_group;
+ const struct attribute_group **port_groups;
+
+ int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr,
+ const struct ib_send_wr **bad_send_wr);
+ int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
+ const struct ib_recv_wr **bad_recv_wr);
+ void (*drain_rq)(struct ib_qp *qp);
+ void (*drain_sq)(struct ib_qp *qp);
+ int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
+ int (*peek_cq)(struct ib_cq *cq, int wc_cnt);
+ int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+ int (*post_srq_recv)(struct ib_srq *srq,
+ const struct ib_recv_wr *recv_wr,
+ const struct ib_recv_wr **bad_recv_wr);
+ int (*process_mad)(struct ib_device *device, int process_mad_flags,
+ u32 port_num, const struct ib_wc *in_wc,
+ const struct ib_grh *in_grh,
+ const struct ib_mad *in_mad, struct ib_mad *out_mad,
+ size_t *out_mad_size, u16 *out_mad_pkey_index);
+ int (*query_device)(struct ib_device *device,
+ struct ib_device_attr *device_attr,
+ struct ib_udata *udata);
+ int (*modify_device)(struct ib_device *device, int device_modify_mask,
+ struct ib_device_modify *device_modify);
+ void (*get_dev_fw_str)(struct ib_device *device, char *str);
+ const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev,
+ int comp_vector);
+ int (*query_port)(struct ib_device *device, u32 port_num,
+ struct ib_port_attr *port_attr);
+ int (*modify_port)(struct ib_device *device, u32 port_num,
+ int port_modify_mask,
+ struct ib_port_modify *port_modify);
+ /**
+ * The following mandatory functions are used only at device
+ * registration. Keep functions such as these at the end of this
+ * structure to avoid cache line misses when accessing struct ib_device
+ * in fast paths.
+ */
+ int (*get_port_immutable)(struct ib_device *device, u32 port_num,
+ struct ib_port_immutable *immutable);
+ enum rdma_link_layer (*get_link_layer)(struct ib_device *device,
+ u32 port_num);
+ /**
+ * When calling get_netdev, the HW vendor's driver should return the
+ * net device of device @device at port @port_num or NULL if such
+ * a net device doesn't exist. The vendor driver should call dev_hold
+ * on this net device. The HW vendor's device driver must guarantee
+ * that this function returns NULL before the net device has finished
+ * NETDEV_UNREGISTER state.
+ */
+ struct net_device *(*get_netdev)(struct ib_device *device,
+ u32 port_num);
+ /**
+ * rdma netdev operation
+ *
+ * Driver implementing alloc_rdma_netdev or rdma_netdev_get_params
+ * must return -EOPNOTSUPP if it doesn't support the specified type.
+ */
+ struct net_device *(*alloc_rdma_netdev)(
+ struct ib_device *device, u32 port_num, enum rdma_netdev_t type,
+ const char *name, unsigned char name_assign_type,
+ void (*setup)(struct net_device *));
+
+ int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num,
+ enum rdma_netdev_t type,
+ struct rdma_netdev_alloc_params *params);
+ /**
+ * query_gid should be return GID value for @device, when @port_num
+ * link layer is either IB or iWarp. It is no-op if @port_num port
+ * is RoCE link layer.
+ */
+ int (*query_gid)(struct ib_device *device, u32 port_num, int index,
+ union ib_gid *gid);
+ /**
+ * When calling add_gid, the HW vendor's driver should add the gid
+ * of device of port at gid index available at @attr. Meta-info of
+ * that gid (for example, the network device related to this gid) is
+ * available at @attr. @context allows the HW vendor driver to store
+ * extra information together with a GID entry. The HW vendor driver may
+ * allocate memory to contain this information and store it in @context
+ * when a new GID entry is written to. Params are consistent until the
+ * next call of add_gid or delete_gid. The function should return 0 on
+ * success or error otherwise. The function could be called
+ * concurrently for different ports. This function is only called when
+ * roce_gid_table is used.
+ */
+ int (*add_gid)(const struct ib_gid_attr *attr, void **context);
+ /**
+ * When calling del_gid, the HW vendor's driver should delete the
+ * gid of device @device at gid index gid_index of port port_num
+ * available in @attr.
+ * Upon the deletion of a GID entry, the HW vendor must free any
+ * allocated memory. The caller will clear @context afterwards.
+ * This function is only called when roce_gid_table is used.
+ */
+ int (*del_gid)(const struct ib_gid_attr *attr, void **context);
+ int (*query_pkey)(struct ib_device *device, u32 port_num, u16 index,
+ u16 *pkey);
+ int (*alloc_ucontext)(struct ib_ucontext *context,
+ struct ib_udata *udata);
+ void (*dealloc_ucontext)(struct ib_ucontext *context);
+ int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
+ /**
+ * This will be called once refcount of an entry in mmap_xa reaches
+ * zero. The type of the memory that was mapped may differ between
+ * entries and is opaque to the rdma_user_mmap interface.
+ * Therefore needs to be implemented by the driver in mmap_free.
+ */
+ void (*mmap_free)(struct rdma_user_mmap_entry *entry);
+ void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
+ int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
+ int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
+ int (*create_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr,
+ struct ib_udata *udata);
+ int (*create_user_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr,
+ struct ib_udata *udata);
+ int (*modify_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
+ int (*query_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
+ int (*destroy_ah)(struct ib_ah *ah, u32 flags);
+ int (*create_srq)(struct ib_srq *srq,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+ int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask,
+ struct ib_udata *udata);
+ int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+ int (*destroy_srq)(struct ib_srq *srq, struct ib_udata *udata);
+ int (*create_qp)(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr,
+ struct ib_udata *udata);
+ int (*modify_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_udata *udata);
+ int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+ int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata);
+ int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata);
+ int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+ int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
+ int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata);
+ struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags);
+ struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata);
+ struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset,
+ u64 length, u64 virt_addr, int fd,
+ int mr_access_flags,
+ struct ib_udata *udata);
+ struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start,
+ u64 length, u64 virt_addr,
+ int mr_access_flags, struct ib_pd *pd,
+ struct ib_udata *udata);
+ int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata);
+ struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type,
+ u32 max_num_sg);
+ struct ib_mr *(*alloc_mr_integrity)(struct ib_pd *pd,
+ u32 max_num_data_sg,
+ u32 max_num_meta_sg);
+ int (*advise_mr)(struct ib_pd *pd,
+ enum ib_uverbs_advise_mr_advice advice, u32 flags,
+ struct ib_sge *sg_list, u32 num_sge,
+ struct uverbs_attr_bundle *attrs);
+
+ /*
+ * Kernel users should universally support relaxed ordering (RO), as
+ * they are designed to read data only after observing the CQE and use
+ * the DMA API correctly.
+ *
+ * Some drivers implicitly enable RO if platform supports it.
+ */
+ int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset);
+ int (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
+ struct ib_mr_status *mr_status);
+ int (*alloc_mw)(struct ib_mw *mw, struct ib_udata *udata);
+ int (*dealloc_mw)(struct ib_mw *mw);
+ int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+ int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+ int (*alloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata);
+ int (*dealloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata);
+ struct ib_flow *(*create_flow)(struct ib_qp *qp,
+ struct ib_flow_attr *flow_attr,
+ struct ib_udata *udata);
+ int (*destroy_flow)(struct ib_flow *flow_id);
+ int (*destroy_flow_action)(struct ib_flow_action *action);
+ int (*set_vf_link_state)(struct ib_device *device, int vf, u32 port,
+ int state);
+ int (*get_vf_config)(struct ib_device *device, int vf, u32 port,
+ struct ifla_vf_info *ivf);
+ int (*get_vf_stats)(struct ib_device *device, int vf, u32 port,
+ struct ifla_vf_stats *stats);
+ int (*get_vf_guid)(struct ib_device *device, int vf, u32 port,
+ struct ifla_vf_guid *node_guid,
+ struct ifla_vf_guid *port_guid);
+ int (*set_vf_guid)(struct ib_device *device, int vf, u32 port, u64 guid,
+ int type);
+ struct ib_wq *(*create_wq)(struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr,
+ struct ib_udata *udata);
+ int (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata);
+ int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr,
+ u32 wq_attr_mask, struct ib_udata *udata);
+ int (*create_rwq_ind_table)(struct ib_rwq_ind_table *ib_rwq_ind_table,
+ struct ib_rwq_ind_table_init_attr *init_attr,
+ struct ib_udata *udata);
+ int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table);
+ struct ib_dm *(*alloc_dm)(struct ib_device *device,
+ struct ib_ucontext *context,
+ struct ib_dm_alloc_attr *attr,
+ struct uverbs_attr_bundle *attrs);
+ int (*dealloc_dm)(struct ib_dm *dm, struct uverbs_attr_bundle *attrs);
+ struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm,
+ struct ib_dm_mr_attr *attr,
+ struct uverbs_attr_bundle *attrs);
+ int (*create_counters)(struct ib_counters *counters,
+ struct uverbs_attr_bundle *attrs);
+ int (*destroy_counters)(struct ib_counters *counters);
+ int (*read_counters)(struct ib_counters *counters,
+ struct ib_counters_read_attr *counters_read_attr,
+ struct uverbs_attr_bundle *attrs);
+ int (*map_mr_sg_pi)(struct ib_mr *mr, struct scatterlist *data_sg,
+ int data_sg_nents, unsigned int *data_sg_offset,
+ struct scatterlist *meta_sg, int meta_sg_nents,
+ unsigned int *meta_sg_offset);
+
+ /**
+ * alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and
+ * fill in the driver initialized data. The struct is kfree()'ed by
+ * the sysfs core when the device is removed. A lifespan of -1 in the
+ * return struct tells the core to set a default lifespan.
+ */
+ struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device);
+ struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device,
+ u32 port_num);
+ /**
+ * get_hw_stats - Fill in the counter value(s) in the stats struct.
+ * @index - The index in the value array we wish to have updated, or
+ * num_counters if we want all stats updated
+ * Return codes -
+ * < 0 - Error, no counters updated
+ * index - Updated the single counter pointed to by index
+ * num_counters - Updated all counters (will reset the timestamp
+ * and prevent further calls for lifespan milliseconds)
+ * Drivers are allowed to update all counters in leiu of just the
+ * one given in index at their option
+ */
+ int (*get_hw_stats)(struct ib_device *device,
+ struct rdma_hw_stats *stats, u32 port, int index);
+
+ /**
+ * modify_hw_stat - Modify the counter configuration
+ * @enable: true/false when enable/disable a counter
+ * Return codes - 0 on success or error code otherwise.
+ */
+ int (*modify_hw_stat)(struct ib_device *device, u32 port,
+ unsigned int counter_index, bool enable);
+ /**
+ * Allows rdma drivers to add their own restrack attributes.
+ */
+ int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr);
+ int (*fill_res_mr_entry_raw)(struct sk_buff *msg, struct ib_mr *ibmr);
+ int (*fill_res_cq_entry)(struct sk_buff *msg, struct ib_cq *ibcq);
+ int (*fill_res_cq_entry_raw)(struct sk_buff *msg, struct ib_cq *ibcq);
+ int (*fill_res_qp_entry)(struct sk_buff *msg, struct ib_qp *ibqp);
+ int (*fill_res_qp_entry_raw)(struct sk_buff *msg, struct ib_qp *ibqp);
+ int (*fill_res_cm_id_entry)(struct sk_buff *msg, struct rdma_cm_id *id);
+
+ /* Device lifecycle callbacks */
+ /*
+ * Called after the device becomes registered, before clients are
+ * attached
+ */
+ int (*enable_driver)(struct ib_device *dev);
+ /*
+ * This is called as part of ib_dealloc_device().
+ */
+ void (*dealloc_driver)(struct ib_device *dev);
+
+ /* iWarp CM callbacks */
+ void (*iw_add_ref)(struct ib_qp *qp);
+ void (*iw_rem_ref)(struct ib_qp *qp);
+ struct ib_qp *(*iw_get_qp)(struct ib_device *device, int qpn);
+ int (*iw_connect)(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *conn_param);
+ int (*iw_accept)(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *conn_param);
+ int (*iw_reject)(struct iw_cm_id *cm_id, const void *pdata,
+ u8 pdata_len);
+ int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog);
+ int (*iw_destroy_listen)(struct iw_cm_id *cm_id);
+ /**
+ * counter_bind_qp - Bind a QP to a counter.
+ * @counter - The counter to be bound. If counter->id is zero then
+ * the driver needs to allocate a new counter and set counter->id
+ */
+ int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp);
+ /**
+ * counter_unbind_qp - Unbind the qp from the dynamically-allocated
+ * counter and bind it onto the default one
+ */
+ int (*counter_unbind_qp)(struct ib_qp *qp);
+ /**
+ * counter_dealloc -De-allocate the hw counter
+ */
+ int (*counter_dealloc)(struct rdma_counter *counter);
+ /**
+ * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in
+ * the driver initialized data.
+ */
+ struct rdma_hw_stats *(*counter_alloc_stats)(
+ struct rdma_counter *counter);
+ /**
+ * counter_update_stats - Query the stats value of this counter
+ */
+ int (*counter_update_stats)(struct rdma_counter *counter);
+
+ /**
+ * Allows rdma drivers to add their own restrack attributes
+ * dumped via 'rdma stat' iproute2 command.
+ */
+ int (*fill_stat_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr);
+
+ /* query driver for its ucontext properties */
+ int (*query_ucontext)(struct ib_ucontext *context,
+ struct uverbs_attr_bundle *attrs);
+
+ /*
+ * Provide NUMA node. This API exists for rdmavt/hfi1 only.
+ * Everyone else relies on Linux memory management model.
+ */
+ int (*get_numa_node)(struct ib_device *dev);
+
+ DECLARE_RDMA_OBJ_SIZE(ib_ah);
+ DECLARE_RDMA_OBJ_SIZE(ib_counters);
+ DECLARE_RDMA_OBJ_SIZE(ib_cq);
+ DECLARE_RDMA_OBJ_SIZE(ib_mw);
+ DECLARE_RDMA_OBJ_SIZE(ib_pd);
+ DECLARE_RDMA_OBJ_SIZE(ib_qp);
+ DECLARE_RDMA_OBJ_SIZE(ib_rwq_ind_table);
+ DECLARE_RDMA_OBJ_SIZE(ib_srq);
+ DECLARE_RDMA_OBJ_SIZE(ib_ucontext);
+ DECLARE_RDMA_OBJ_SIZE(ib_xrcd);
+};
+
+struct ib_core_device {
+ /* device must be the first element in structure until,
+ * union of ib_core_device and device exists in ib_device.
+ */
+ struct device dev;
+ possible_net_t rdma_net;
+ struct kobject *ports_kobj;
+ struct list_head port_list;
+ struct ib_device *owner; /* reach back to owner ib_device */
+};
+
+struct rdma_restrack_root;
+struct ib_device {
+ /* Do not access @dma_device directly from ULP nor from HW drivers. */
+ struct device *dma_device;
+ struct ib_device_ops ops;
+ char name[IB_DEVICE_NAME_MAX];
+ struct rcu_head rcu_head;
+
+ struct list_head event_handler_list;
+ /* Protects event_handler_list */
+ struct rw_semaphore event_handler_rwsem;
+
+ /* Protects QP's event_handler calls and open_qp list */
+ spinlock_t qp_open_list_lock;
+
+ struct rw_semaphore client_data_rwsem;
+ struct xarray client_data;
+ struct mutex unregistration_lock;
+
+ /* Synchronize GID, Pkey cache entries, subnet prefix, LMC */
+ rwlock_t cache_lock;
+ /**
+ * port_data is indexed by port number
+ */
+ struct ib_port_data *port_data;
+
+ int num_comp_vectors;
+
+ union {
+ struct device dev;
+ struct ib_core_device coredev;
+ };
+
+ /* First group is for device attributes,
+ * Second group is for driver provided attributes (optional).
+ * Third group is for the hw_stats
+ * It is a NULL terminated array.
+ */
+ const struct attribute_group *groups[4];
+
+ u64 uverbs_cmd_mask;
+
+ char node_desc[IB_DEVICE_NODE_DESC_MAX];
+ __be64 node_guid;
+ u32 local_dma_lkey;
+ u16 is_switch:1;
+ /* Indicates kernel verbs support, should not be used in drivers */
+ u16 kverbs_provider:1;
+ /* CQ adaptive moderation (RDMA DIM) */
+ u16 use_cq_dim:1;
+ u8 node_type;
+ u32 phys_port_cnt;
+ struct ib_device_attr attrs;
+ struct hw_stats_device_data *hw_stats_data;
+
+#ifdef CONFIG_CGROUP_RDMA
+ struct rdmacg_device cg_device;
+#endif
+
+ u32 index;
+
+ spinlock_t cq_pools_lock;
+ struct list_head cq_pools[IB_POLL_LAST_POOL_TYPE + 1];
+
+ struct rdma_restrack_root *res;
+
+ const struct uapi_definition *driver_def;
+
+ /*
+ * Positive refcount indicates that the device is currently
+ * registered and cannot be unregistered.
+ */
+ refcount_t refcount;
+ struct completion unreg_completion;
+ struct work_struct unregistration_work;
+
+ const struct rdma_link_ops *link_ops;
+
+ /* Protects compat_devs xarray modifications */
+ struct mutex compat_devs_mutex;
+ /* Maintains compat devices for each net namespace */
+ struct xarray compat_devs;
+
+ /* Used by iWarp CM */
+ char iw_ifname[IFNAMSIZ];
+ u32 iw_driver_flags;
+ u32 lag_flags;
+};
+
+static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size,
+ gfp_t gfp, bool is_numa_aware)
+{
+ if (is_numa_aware && dev->ops.get_numa_node)
+ return kzalloc_node(size, gfp, dev->ops.get_numa_node(dev));
+
+ return kzalloc(size, gfp);
+}
+
+struct ib_client_nl_info;
+struct ib_client {
+ const char *name;
+ int (*add)(struct ib_device *ibdev);
+ void (*remove)(struct ib_device *, void *client_data);
+ void (*rename)(struct ib_device *dev, void *client_data);
+ int (*get_nl_info)(struct ib_device *ibdev, void *client_data,
+ struct ib_client_nl_info *res);
+ int (*get_global_nl_info)(struct ib_client_nl_info *res);
+
+ /* Returns the net_dev belonging to this ib_client and matching the
+ * given parameters.
+ * @dev: An RDMA device that the net_dev use for communication.
+ * @port: A physical port number on the RDMA device.
+ * @pkey: P_Key that the net_dev uses if applicable.
+ * @gid: A GID that the net_dev uses to communicate.
+ * @addr: An IP address the net_dev is configured with.
+ * @client_data: The device's client data set by ib_set_client_data().
+ *
+ * An ib_client that implements a net_dev on top of RDMA devices
+ * (such as IP over IB) should implement this callback, allowing the
+ * rdma_cm module to find the right net_dev for a given request.
+ *
+ * The caller is responsible for calling dev_put on the returned
+ * netdev. */
+ struct net_device *(*get_net_dev_by_params)(
+ struct ib_device *dev,
+ u32 port,
+ u16 pkey,
+ const union ib_gid *gid,
+ const struct sockaddr *addr,
+ void *client_data);
+
+ refcount_t uses;
+ struct completion uses_zero;
+ u32 client_id;
+
+ /* kverbs are not required by the client */
+ u8 no_kverbs_req:1;
+};
+
+/*
+ * IB block DMA iterator
+ *
+ * Iterates the DMA-mapped SGL in contiguous memory blocks aligned
+ * to a HW supported page size.
+ */
+struct ib_block_iter {
+ /* internal states */
+ struct scatterlist *__sg; /* sg holding the current aligned block */
+ dma_addr_t __dma_addr; /* unaligned DMA address of this block */
+ size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */
+ unsigned int __sg_nents; /* number of SG entries */
+ unsigned int __sg_advance; /* number of bytes to advance in sg in next step */
+ unsigned int __pg_bit; /* alignment of current block */
+};
+
+struct ib_device *_ib_alloc_device(size_t size);
+#define ib_alloc_device(drv_struct, member) \
+ container_of(_ib_alloc_device(sizeof(struct drv_struct) + \
+ BUILD_BUG_ON_ZERO(offsetof( \
+ struct drv_struct, member))), \
+ struct drv_struct, member)
+
+void ib_dealloc_device(struct ib_device *device);
+
+void ib_get_device_fw_str(struct ib_device *device, char *str);
+
+int ib_register_device(struct ib_device *device, const char *name,
+ struct device *dma_device);
+void ib_unregister_device(struct ib_device *device);
+void ib_unregister_driver(enum rdma_driver_id driver_id);
+void ib_unregister_device_and_put(struct ib_device *device);
+void ib_unregister_device_queued(struct ib_device *ib_dev);
+
+int ib_register_client (struct ib_client *client);
+void ib_unregister_client(struct ib_client *client);
+
+void __rdma_block_iter_start(struct ib_block_iter *biter,
+ struct scatterlist *sglist,
+ unsigned int nents,
+ unsigned long pgsz);
+bool __rdma_block_iter_next(struct ib_block_iter *biter);
+
+/**
+ * rdma_block_iter_dma_address - get the aligned dma address of the current
+ * block held by the block iterator.
+ * @biter: block iterator holding the memory block
+ */
+static inline dma_addr_t
+rdma_block_iter_dma_address(struct ib_block_iter *biter)
+{
+ return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1);
+}
+
+/**
+ * rdma_for_each_block - iterate over contiguous memory blocks of the sg list
+ * @sglist: sglist to iterate over
+ * @biter: block iterator holding the memory block
+ * @nents: maximum number of sg entries to iterate over
+ * @pgsz: best HW supported page size to use
+ *
+ * Callers may use rdma_block_iter_dma_address() to get each
+ * blocks aligned DMA address.
+ */
+#define rdma_for_each_block(sglist, biter, nents, pgsz) \
+ for (__rdma_block_iter_start(biter, sglist, nents, \
+ pgsz); \
+ __rdma_block_iter_next(biter);)
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns the client context data set with
+ * ib_set_client_data(). This can only be called while the client is
+ * registered to the device, once the ib_client remove() callback returns this
+ * cannot be called.
+ */
+static inline void *ib_get_client_data(struct ib_device *device,
+ struct ib_client *client)
+{
+ return xa_load(&device->client_data, client->client_id);
+}
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data);
+void ib_set_device_ops(struct ib_device *device,
+ const struct ib_device_ops *ops);
+
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+ unsigned long pfn, unsigned long size, pgprot_t prot,
+ struct rdma_user_mmap_entry *entry);
+int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
+ struct rdma_user_mmap_entry *entry,
+ size_t length);
+int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext,
+ struct rdma_user_mmap_entry *entry,
+ size_t length, u32 min_pgoff,
+ u32 max_pgoff);
+
+static inline int
+rdma_user_mmap_entry_insert_exact(struct ib_ucontext *ucontext,
+ struct rdma_user_mmap_entry *entry,
+ size_t length, u32 pgoff)
+{
+ return rdma_user_mmap_entry_insert_range(ucontext, entry, length, pgoff,
+ pgoff);
+}
+
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
+ unsigned long pgoff);
+struct rdma_user_mmap_entry *
+rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
+ struct vm_area_struct *vma);
+void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry);
+
+void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry);
+
+static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
+{
+ return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0;
+}
+
+static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
+{
+ return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
+}
+
+static inline bool ib_is_buffer_cleared(const void __user *p,
+ size_t len)
+{
+ bool ret;
+ u8 *buf;
+
+ if (len > USHRT_MAX)
+ return false;
+
+ buf = memdup_user(p, len);
+ if (IS_ERR(buf))
+ return false;
+
+ ret = !memchr_inv(buf, 0, len);
+ kfree(buf);
+ return ret;
+}
+
+static inline bool ib_is_udata_cleared(struct ib_udata *udata,
+ size_t offset,
+ size_t len)
+{
+ return ib_is_buffer_cleared(udata->inbuf + offset, len);
+}
+
+/**
+ * ib_modify_qp_is_ok - Check that the supplied attribute mask
+ * contains all required attributes and no attributes not allowed for
+ * the given QP state transition.
+ * @cur_state: Current QP state
+ * @next_state: Next QP state
+ * @type: QP type
+ * @mask: Mask of supplied QP attributes
+ *
+ * This function is a helper function that a low-level driver's
+ * modify_qp method can use to validate the consumer's input. It
+ * checks that cur_state and next_state are valid QP states, that a
+ * transition from cur_state to next_state is allowed by the IB spec,
+ * and that the attribute mask supplied is allowed for the transition.
+ */
+bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+ enum ib_qp_type type, enum ib_qp_attr_mask mask);
+
+void ib_register_event_handler(struct ib_event_handler *event_handler);
+void ib_unregister_event_handler(struct ib_event_handler *event_handler);
+void ib_dispatch_event(const struct ib_event *event);
+
+int ib_query_port(struct ib_device *device,
+ u32 port_num, struct ib_port_attr *port_attr);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
+ u32 port_num);
+
+/**
+ * rdma_cap_ib_switch - Check if the device is IB switch
+ * @device: Device to check
+ *
+ * Device driver is responsible for setting is_switch bit on
+ * in ib_device structure at init time.
+ *
+ * Return: true if the device is IB switch.
+ */
+static inline bool rdma_cap_ib_switch(const struct ib_device *device)
+{
+ return device->is_switch;
+}
+
+/**
+ * rdma_start_port - Return the first valid port number for the device
+ * specified
+ *
+ * @device: Device to be checked
+ *
+ * Return start port number
+ */
+static inline u32 rdma_start_port(const struct ib_device *device)
+{
+ return rdma_cap_ib_switch(device) ? 0 : 1;
+}
+
+/**
+ * rdma_for_each_port - Iterate over all valid port numbers of the IB device
+ * @device - The struct ib_device * to iterate over
+ * @iter - The unsigned int to store the port number
+ */
+#define rdma_for_each_port(device, iter) \
+ for (iter = rdma_start_port(device + \
+ BUILD_BUG_ON_ZERO(!__same_type(u32, \
+ iter))); \
+ iter <= rdma_end_port(device); iter++)
+
+/**
+ * rdma_end_port - Return the last valid port number for the device
+ * specified
+ *
+ * @device: Device to be checked
+ *
+ * Return last port number
+ */
+static inline u32 rdma_end_port(const struct ib_device *device)
+{
+ return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt;
+}
+
+static inline int rdma_is_port_valid(const struct ib_device *device,
+ unsigned int port)
+{
+ return (port >= rdma_start_port(device) &&
+ port <= rdma_end_port(device));
+}
+
+static inline bool rdma_is_grh_required(const struct ib_device *device,
+ u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_PORT_IB_GRH_REQUIRED;
+}
+
+static inline bool rdma_protocol_ib(const struct ib_device *device,
+ u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_PROT_IB;
+}
+
+static inline bool rdma_protocol_roce(const struct ib_device *device,
+ u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP);
+}
+
+static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device,
+ u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP;
+}
+
+static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device,
+ u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_PROT_ROCE;
+}
+
+static inline bool rdma_protocol_iwarp(const struct ib_device *device,
+ u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_PROT_IWARP;
+}
+
+static inline bool rdma_ib_or_roce(const struct ib_device *device,
+ u32 port_num)
+{
+ return rdma_protocol_ib(device, port_num) ||
+ rdma_protocol_roce(device, port_num);
+}
+
+static inline bool rdma_protocol_raw_packet(const struct ib_device *device,
+ u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_PROT_RAW_PACKET;
+}
+
+static inline bool rdma_protocol_usnic(const struct ib_device *device,
+ u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_PROT_USNIC;
+}
+
+/**
+ * rdma_cap_ib_mad - Check if the port of a device supports Infiniband
+ * Management Datagrams.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Management Datagrams (MAD) are a required part of the InfiniBand
+ * specification and are supported on all InfiniBand devices. A slightly
+ * extended version are also supported on OPA interfaces.
+ *
+ * Return: true if the port supports sending/receiving of MAD packets.
+ */
+static inline bool rdma_cap_ib_mad(const struct ib_device *device, u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_IB_MAD;
+}
+
+/**
+ * rdma_cap_opa_mad - Check if the port of device provides support for OPA
+ * Management Datagrams.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Intel OmniPath devices extend and/or replace the InfiniBand Management
+ * datagrams with their own versions. These OPA MADs share many but not all of
+ * the characteristics of InfiniBand MADs.
+ *
+ * OPA MADs differ in the following ways:
+ *
+ * 1) MADs are variable size up to 2K
+ * IBTA defined MADs remain fixed at 256 bytes
+ * 2) OPA SMPs must carry valid PKeys
+ * 3) OPA SMP packets are a different format
+ *
+ * Return: true if the port supports OPA MAD packet formats.
+ */
+static inline bool rdma_cap_opa_mad(struct ib_device *device, u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_OPA_MAD;
+}
+
+/**
+ * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband
+ * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI).
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Each InfiniBand node is required to provide a Subnet Management Agent
+ * that the subnet manager can access. Prior to the fabric being fully
+ * configured by the subnet manager, the SMA is accessed via a well known
+ * interface called the Subnet Management Interface (SMI). This interface
+ * uses directed route packets to communicate with the SM to get around the
+ * chicken and egg problem of the SM needing to know what's on the fabric
+ * in order to configure the fabric, and needing to configure the fabric in
+ * order to send packets to the devices on the fabric. These directed
+ * route packets do not need the fabric fully configured in order to reach
+ * their destination. The SMI is the only method allowed to send
+ * directed route packets on an InfiniBand fabric.
+ *
+ * Return: true if the port provides an SMI.
+ */
+static inline bool rdma_cap_ib_smi(const struct ib_device *device, u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_IB_SMI;
+}
+
+/**
+ * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband
+ * Communication Manager.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * The InfiniBand Communication Manager is one of many pre-defined General
+ * Service Agents (GSA) that are accessed via the General Service
+ * Interface (GSI). It's role is to facilitate establishment of connections
+ * between nodes as well as other management related tasks for established
+ * connections.
+ *
+ * Return: true if the port supports an IB CM (this does not guarantee that
+ * a CM is actually running however).
+ */
+static inline bool rdma_cap_ib_cm(const struct ib_device *device, u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_IB_CM;
+}
+
+/**
+ * rdma_cap_iw_cm - Check if the port of device has the capability IWARP
+ * Communication Manager.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Similar to above, but specific to iWARP connections which have a different
+ * managment protocol than InfiniBand.
+ *
+ * Return: true if the port supports an iWARP CM (this does not guarantee that
+ * a CM is actually running however).
+ */
+static inline bool rdma_cap_iw_cm(const struct ib_device *device, u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_IW_CM;
+}
+
+/**
+ * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband
+ * Subnet Administration.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * An InfiniBand Subnet Administration (SA) service is a pre-defined General
+ * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand
+ * fabrics, devices should resolve routes to other hosts by contacting the
+ * SA to query the proper route.
+ *
+ * Return: true if the port should act as a client to the fabric Subnet
+ * Administration interface. This does not imply that the SA service is
+ * running locally.
+ */
+static inline bool rdma_cap_ib_sa(const struct ib_device *device, u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_IB_SA;
+}
+
+/**
+ * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband
+ * Multicast.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * InfiniBand multicast registration is more complex than normal IPv4 or
+ * IPv6 multicast registration. Each Host Channel Adapter must register
+ * with the Subnet Manager when it wishes to join a multicast group. It
+ * should do so only once regardless of how many queue pairs it subscribes
+ * to this group. And it should leave the group only after all queue pairs
+ * attached to the group have been detached.
+ *
+ * Return: true if the port must undertake the additional adminstrative
+ * overhead of registering/unregistering with the SM and tracking of the
+ * total number of queue pairs attached to the multicast group.
+ */
+static inline bool rdma_cap_ib_mcast(const struct ib_device *device,
+ u32 port_num)
+{
+ return rdma_cap_ib_sa(device, port_num);
+}
+
+/**
+ * rdma_cap_af_ib - Check if the port of device has the capability
+ * Native Infiniband Address.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default
+ * GID. RoCE uses a different mechanism, but still generates a GID via
+ * a prescribed mechanism and port specific data.
+ *
+ * Return: true if the port uses a GID address to identify devices on the
+ * network.
+ */
+static inline bool rdma_cap_af_ib(const struct ib_device *device, u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_AF_IB;
+}
+
+/**
+ * rdma_cap_eth_ah - Check if the port of device has the capability
+ * Ethernet Address Handle.
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * RoCE is InfiniBand over Ethernet, and it uses a well defined technique
+ * to fabricate GIDs over Ethernet/IP specific addresses native to the
+ * port. Normally, packet headers are generated by the sending host
+ * adapter, but when sending connectionless datagrams, we must manually
+ * inject the proper headers for the fabric we are communicating over.
+ *
+ * Return: true if we are running as a RoCE port and must force the
+ * addition of a Global Route Header built from our Ethernet Address
+ * Handle into our header list for connectionless packets.
+ */
+static inline bool rdma_cap_eth_ah(const struct ib_device *device, u32 port_num)
+{
+ return device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_ETH_AH;
+}
+
+/**
+ * rdma_cap_opa_ah - Check if the port of device supports
+ * OPA Address handles
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * Return: true if we are running on an OPA device which supports
+ * the extended OPA addressing.
+ */
+static inline bool rdma_cap_opa_ah(struct ib_device *device, u32 port_num)
+{
+ return (device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH;
+}
+
+/**
+ * rdma_max_mad_size - Return the max MAD size required by this RDMA Port.
+ *
+ * @device: Device
+ * @port_num: Port number
+ *
+ * This MAD size includes the MAD headers and MAD payload. No other headers
+ * are included.
+ *
+ * Return the max MAD size required by the Port. Will return 0 if the port
+ * does not support MADs
+ */
+static inline size_t rdma_max_mad_size(const struct ib_device *device,
+ u32 port_num)
+{
+ return device->port_data[port_num].immutable.max_mad_size;
+}
+
+/**
+ * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table
+ * @device: Device to check
+ * @port_num: Port number to check
+ *
+ * RoCE GID table mechanism manages the various GIDs for a device.
+ *
+ * NOTE: if allocating the port's GID table has failed, this call will still
+ * return true, but any RoCE GID table API will fail.
+ *
+ * Return: true if the port uses RoCE GID table mechanism in order to manage
+ * its GIDs.
+ */
+static inline bool rdma_cap_roce_gid_table(const struct ib_device *device,
+ u32 port_num)
+{
+ return rdma_protocol_roce(device, port_num) &&
+ device->ops.add_gid && device->ops.del_gid;
+}
+
+/*
+ * Check if the device supports READ W/ INVALIDATE.
+ */
+static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num)
+{
+ /*
+ * iWarp drivers must support READ W/ INVALIDATE. No other protocol
+ * has support for it yet.
+ */
+ return rdma_protocol_iwarp(dev, port_num);
+}
+
+/**
+ * rdma_core_cap_opa_port - Return whether the RDMA Port is OPA or not.
+ * @device: Device
+ * @port_num: 1 based Port number
+ *
+ * Return true if port is an Intel OPA port , false if not
+ */
+static inline bool rdma_core_cap_opa_port(struct ib_device *device,
+ u32 port_num)
+{
+ return (device->port_data[port_num].immutable.core_cap_flags &
+ RDMA_CORE_PORT_INTEL_OPA) == RDMA_CORE_PORT_INTEL_OPA;
+}
+
+/**
+ * rdma_mtu_enum_to_int - Return the mtu of the port as an integer value.
+ * @device: Device
+ * @port_num: Port number
+ * @mtu: enum value of MTU
+ *
+ * Return the MTU size supported by the port as an integer value. Will return
+ * -1 if enum value of mtu is not supported.
+ */
+static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port,
+ int mtu)
+{
+ if (rdma_core_cap_opa_port(device, port))
+ return opa_mtu_enum_to_int((enum opa_mtu)mtu);
+ else
+ return ib_mtu_enum_to_int((enum ib_mtu)mtu);
+}
+
+/**
+ * rdma_mtu_from_attr - Return the mtu of the port from the port attribute.
+ * @device: Device
+ * @port_num: Port number
+ * @attr: port attribute
+ *
+ * Return the MTU size supported by the port as an integer value.
+ */
+static inline int rdma_mtu_from_attr(struct ib_device *device, u32 port,
+ struct ib_port_attr *attr)
+{
+ if (rdma_core_cap_opa_port(device, port))
+ return attr->phys_mtu;
+ else
+ return ib_mtu_enum_to_int(attr->max_mtu);
+}
+
+int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port,
+ int state);
+int ib_get_vf_config(struct ib_device *device, int vf, u32 port,
+ struct ifla_vf_info *info);
+int ib_get_vf_stats(struct ib_device *device, int vf, u32 port,
+ struct ifla_vf_stats *stats);
+int ib_get_vf_guid(struct ib_device *device, int vf, u32 port,
+ struct ifla_vf_guid *node_guid,
+ struct ifla_vf_guid *port_guid);
+int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid,
+ int type);
+
+int ib_query_pkey(struct ib_device *device,
+ u32 port_num, u16 index, u16 *pkey);
+
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify);
+
+int ib_modify_port(struct ib_device *device,
+ u32 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify);
+
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u32 *port_num, u16 *index);
+
+int ib_find_pkey(struct ib_device *device,
+ u32 port_num, u16 pkey, u16 *index);
+
+enum ib_pd_flags {
+ /*
+ * Create a memory registration for all memory in the system and place
+ * the rkey for it into pd->unsafe_global_rkey. This can be used by
+ * ULPs to avoid the overhead of dynamic MRs.
+ *
+ * This flag is generally considered unsafe and must only be used in
+ * extremly trusted environments. Every use of it will log a warning
+ * in the kernel log.
+ */
+ IB_PD_UNSAFE_GLOBAL_RKEY = 0x01,
+};
+
+struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
+ const char *caller);
+
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ * @flags: protection domain flags
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ *
+ * Every PD has a local_dma_lkey which can be used as the lkey value for local
+ * memory operations.
+ */
+#define ib_alloc_pd(device, flags) \
+ __ib_alloc_pd((device), (flags), KBUILD_MODNAME)
+
+int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata);
+
+/**
+ * ib_dealloc_pd - Deallocate kernel PD
+ * @pd: The protection domain
+ *
+ * NOTE: for user PD use ib_dealloc_pd_user with valid udata!
+ */
+static inline void ib_dealloc_pd(struct ib_pd *pd)
+{
+ int ret = ib_dealloc_pd_user(pd, NULL);
+
+ WARN_ONCE(ret, "Destroy of kernel PD shouldn't fail");
+}
+
+enum rdma_create_ah_flags {
+ /* In a sleepable context */
+ RDMA_CREATE_AH_SLEEPABLE = BIT(0),
+};
+
+/**
+ * rdma_create_ah - Creates an address handle for the given address vector.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ * @flags: Create address handle flags (see enum rdma_create_ah_flags).
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
+ u32 flags);
+
+/**
+ * rdma_create_user_ah - Creates an address handle for the given address vector.
+ * It resolves destination mac address for ah attribute of RoCE type.
+ * @pd: The protection domain associated with the address handle.
+ * @ah_attr: The attributes of the address vector.
+ * @udata: pointer to user's input output buffer information need by
+ * provider driver.
+ *
+ * It returns 0 on success and returns appropriate error code on error.
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *rdma_create_user_ah(struct ib_pd *pd,
+ struct rdma_ah_attr *ah_attr,
+ struct ib_udata *udata);
+/**
+ * ib_get_gids_from_rdma_hdr - Get sgid and dgid from GRH or IPv4 header
+ * work completion.
+ * @hdr: the L3 header to parse
+ * @net_type: type of header to parse
+ * @sgid: place to store source gid
+ * @dgid: place to store destination gid
+ */
+int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr,
+ enum rdma_network_type net_type,
+ union ib_gid *sgid, union ib_gid *dgid);
+
+/**
+ * ib_get_rdma_header_version - Get the header version
+ * @hdr: the L3 header to parse
+ */
+int ib_get_rdma_header_version(const union rdma_network_hdr *hdr);
+
+/**
+ * ib_init_ah_attr_from_wc - Initializes address handle attributes from a
+ * work completion.
+ * @device: Device on which the received message arrived.
+ * @port_num: Port on which the received message arrived.
+ * @wc: Work completion associated with the received message.
+ * @grh: References the received global route header. This parameter is
+ * ignored unless the work completion indicates that the GRH is valid.
+ * @ah_attr: Returned attributes that can be used when creating an address
+ * handle for replying to the message.
+ * When ib_init_ah_attr_from_wc() returns success,
+ * (a) for IB link layer it optionally contains a reference to SGID attribute
+ * when GRH is present for IB link layer.
+ * (b) for RoCE link layer it contains a reference to SGID attribute.
+ * User must invoke rdma_cleanup_ah_attr_gid_attr() to release reference to SGID
+ * attributes which are initialized using ib_init_ah_attr_from_wc().
+ *
+ */
+int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num,
+ const struct ib_wc *wc, const struct ib_grh *grh,
+ struct rdma_ah_attr *ah_attr);
+
+/**
+ * ib_create_ah_from_wc - Creates an address handle associated with the
+ * sender of the specified work completion.
+ * @pd: The protection domain associated with the address handle.
+ * @wc: Work completion information associated with a received message.
+ * @grh: References the received global route header. This parameter is
+ * ignored unless the work completion indicates that the GRH is valid.
+ * @port_num: The outbound port number to associate with the address.
+ *
+ * The address handle is used to reference a local or global destination
+ * in all UD QP post sends.
+ */
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
+ const struct ib_grh *grh, u32 port_num);
+
+/**
+ * rdma_modify_ah - Modifies the address vector associated with an address
+ * handle.
+ * @ah: The address handle to modify.
+ * @ah_attr: The new address vector attributes to associate with the
+ * address handle.
+ */
+int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
+
+/**
+ * rdma_query_ah - Queries the address vector associated with an address
+ * handle.
+ * @ah: The address handle to query.
+ * @ah_attr: The address vector attributes associated with the address
+ * handle.
+ */
+int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
+
+enum rdma_destroy_ah_flags {
+ /* In a sleepable context */
+ RDMA_DESTROY_AH_SLEEPABLE = BIT(0),
+};
+
+/**
+ * rdma_destroy_ah_user - Destroys an address handle.
+ * @ah: The address handle to destroy.
+ * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags).
+ * @udata: Valid user data or NULL for kernel objects
+ */
+int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata);
+
+/**
+ * rdma_destroy_ah - Destroys an kernel address handle.
+ * @ah: The address handle to destroy.
+ * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags).
+ *
+ * NOTE: for user ah use rdma_destroy_ah_user with valid udata!
+ */
+static inline void rdma_destroy_ah(struct ib_ah *ah, u32 flags)
+{
+ int ret = rdma_destroy_ah_user(ah, flags, NULL);
+
+ WARN_ONCE(ret, "Destroy of kernel AH shouldn't fail");
+}
+
+struct ib_srq *ib_create_srq_user(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_usrq_object *uobject,
+ struct ib_udata *udata);
+static inline struct ib_srq *
+ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr)
+{
+ if (!pd->device->ops.create_srq)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return ib_create_srq_user(pd, srq_init_attr, NULL, NULL);
+}
+
+/**
+ * ib_modify_srq - Modifies the attributes for the specified SRQ.
+ * @srq: The SRQ to modify.
+ * @srq_attr: On input, specifies the SRQ attributes to modify. On output,
+ * the current values of selected SRQ attributes are returned.
+ * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ
+ * are being modified.
+ *
+ * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or
+ * IB_SRQ_LIMIT to set the SRQ's limit and request notification when
+ * the number of receives queued drops below the limit.
+ */
+int ib_modify_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask);
+
+/**
+ * ib_query_srq - Returns the attribute list and current values for the
+ * specified SRQ.
+ * @srq: The SRQ to query.
+ * @srq_attr: The attributes of the specified SRQ.
+ */
+int ib_query_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr);
+
+/**
+ * ib_destroy_srq_user - Destroys the specified SRQ.
+ * @srq: The SRQ to destroy.
+ * @udata: Valid user data or NULL for kernel objects
+ */
+int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata);
+
+/**
+ * ib_destroy_srq - Destroys the specified kernel SRQ.
+ * @srq: The SRQ to destroy.
+ *
+ * NOTE: for user srq use ib_destroy_srq_user with valid udata!
+ */
+static inline void ib_destroy_srq(struct ib_srq *srq)
+{
+ int ret = ib_destroy_srq_user(srq, NULL);
+
+ WARN_ONCE(ret, "Destroy of kernel SRQ shouldn't fail");
+}
+
+/**
+ * ib_post_srq_recv - Posts a list of work requests to the specified SRQ.
+ * @srq: The SRQ to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_srq_recv(struct ib_srq *srq,
+ const struct ib_recv_wr *recv_wr,
+ const struct ib_recv_wr **bad_recv_wr)
+{
+ const struct ib_recv_wr *dummy;
+
+ return srq->device->ops.post_srq_recv(srq, recv_wr,
+ bad_recv_wr ? : &dummy);
+}
+
+struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr,
+ const char *caller);
+/**
+ * ib_create_qp - Creates a kernel QP associated with the specific protection
+ * domain.
+ * @pd: The protection domain associated with the QP.
+ * @init_attr: A list of initial attributes required to create the
+ * QP. If QP creation succeeds, then the attributes are updated to
+ * the actual capabilities of the created QP.
+ */
+static inline struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr)
+{
+ return ib_create_qp_kernel(pd, init_attr, KBUILD_MODNAME);
+}
+
+/**
+ * ib_modify_qp_with_udata - Modifies the attributes for the specified QP.
+ * @qp: The QP to modify.
+ * @attr: On input, specifies the QP attributes to modify. On output,
+ * the current values of selected QP attributes are returned.
+ * @attr_mask: A bit-mask used to specify which attributes of the QP
+ * are being modified.
+ * @udata: pointer to user's input output buffer information
+ * are being modified.
+ * It returns 0 on success and returns appropriate error code on error.
+ */
+int ib_modify_qp_with_udata(struct ib_qp *qp,
+ struct ib_qp_attr *attr,
+ int attr_mask,
+ struct ib_udata *udata);
+
+/**
+ * ib_modify_qp - Modifies the attributes for the specified QP and then
+ * transitions the QP to the given state.
+ * @qp: The QP to modify.
+ * @qp_attr: On input, specifies the QP attributes to modify. On output,
+ * the current values of selected QP attributes are returned.
+ * @qp_attr_mask: A bit-mask used to specify which attributes of the QP
+ * are being modified.
+ */
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask);
+
+/**
+ * ib_query_qp - Returns the attribute list and current values for the
+ * specified QP.
+ * @qp: The QP to query.
+ * @qp_attr: The attributes of the specified QP.
+ * @qp_attr_mask: A bit-mask used to select specific attributes to query.
+ * @qp_init_attr: Additional attributes of the selected QP.
+ *
+ * The qp_attr_mask may be used to limit the query to gathering only the
+ * selected attributes.
+ */
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * ib_destroy_qp - Destroys the specified QP.
+ * @qp: The QP to destroy.
+ * @udata: Valid udata or NULL for kernel objects
+ */
+int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata);
+
+/**
+ * ib_destroy_qp - Destroys the specified kernel QP.
+ * @qp: The QP to destroy.
+ *
+ * NOTE: for user qp use ib_destroy_qp_user with valid udata!
+ */
+static inline int ib_destroy_qp(struct ib_qp *qp)
+{
+ return ib_destroy_qp_user(qp, NULL);
+}
+
+/**
+ * ib_open_qp - Obtain a reference to an existing sharable QP.
+ * @xrcd - XRC domain
+ * @qp_open_attr: Attributes identifying the QP to open.
+ *
+ * Returns a reference to a sharable QP.
+ */
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+ struct ib_qp_open_attr *qp_open_attr);
+
+/**
+ * ib_close_qp - Release an external reference to a QP.
+ * @qp: The QP handle to release
+ *
+ * The opened QP handle is released by the caller. The underlying
+ * shared QP is not destroyed until all internal references are released.
+ */
+int ib_close_qp(struct ib_qp *qp);
+
+/**
+ * ib_post_send - Posts a list of work requests to the send queue of
+ * the specified QP.
+ * @qp: The QP to post the work request on.
+ * @send_wr: A list of work requests to post on the send queue.
+ * @bad_send_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ *
+ * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate
+ * error is returned, the QP state shall not be affected,
+ * ib_post_send() will return an immediate error after queueing any
+ * earlier work requests in the list.
+ */
+static inline int ib_post_send(struct ib_qp *qp,
+ const struct ib_send_wr *send_wr,
+ const struct ib_send_wr **bad_send_wr)
+{
+ const struct ib_send_wr *dummy;
+
+ return qp->device->ops.post_send(qp, send_wr, bad_send_wr ? : &dummy);
+}
+
+/**
+ * ib_post_recv - Posts a list of work requests to the receive queue of
+ * the specified QP.
+ * @qp: The QP to post the work request on.
+ * @recv_wr: A list of work requests to post on the receive queue.
+ * @bad_recv_wr: On an immediate failure, this parameter will reference
+ * the work request that failed to be posted on the QP.
+ */
+static inline int ib_post_recv(struct ib_qp *qp,
+ const struct ib_recv_wr *recv_wr,
+ const struct ib_recv_wr **bad_recv_wr)
+{
+ const struct ib_recv_wr *dummy;
+
+ return qp->device->ops.post_recv(qp, recv_wr, bad_recv_wr ? : &dummy);
+}
+
+struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe,
+ int comp_vector, enum ib_poll_context poll_ctx,
+ const char *caller);
+static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+ int nr_cqe, int comp_vector,
+ enum ib_poll_context poll_ctx)
+{
+ return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx,
+ KBUILD_MODNAME);
+}
+
+struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
+ int nr_cqe, enum ib_poll_context poll_ctx,
+ const char *caller);
+
+/**
+ * ib_alloc_cq_any: Allocate kernel CQ
+ * @dev: The IB device
+ * @private: Private data attached to the CQE
+ * @nr_cqe: Number of CQEs in the CQ
+ * @poll_ctx: Context used for polling the CQ
+ */
+static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev,
+ void *private, int nr_cqe,
+ enum ib_poll_context poll_ctx)
+{
+ return __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx,
+ KBUILD_MODNAME);
+}
+
+void ib_free_cq(struct ib_cq *cq);
+int ib_process_cq_direct(struct ib_cq *cq, int budget);
+
+/**
+ * ib_create_cq - Creates a CQ on the specified device.
+ * @device: The device on which to create the CQ.
+ * @comp_handler: A user-specified callback that is invoked when a
+ * completion event occurs on the CQ.
+ * @event_handler: A user-specified callback that is invoked when an
+ * asynchronous event not associated with a completion occurs on the CQ.
+ * @cq_context: Context associated with the CQ returned to the user via
+ * the associated completion and event handlers.
+ * @cq_attr: The attributes the CQ should be created upon.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+struct ib_cq *__ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context,
+ const struct ib_cq_init_attr *cq_attr,
+ const char *caller);
+#define ib_create_cq(device, cmp_hndlr, evt_hndlr, cq_ctxt, cq_attr) \
+ __ib_create_cq((device), (cmp_hndlr), (evt_hndlr), (cq_ctxt), (cq_attr), KBUILD_MODNAME)
+
+/**
+ * ib_resize_cq - Modifies the capacity of the CQ.
+ * @cq: The CQ to resize.
+ * @cqe: The minimum size of the CQ.
+ *
+ * Users can examine the cq structure to determine the actual CQ size.
+ */
+int ib_resize_cq(struct ib_cq *cq, int cqe);
+
+/**
+ * rdma_set_cq_moderation - Modifies moderation params of the CQ
+ * @cq: The CQ to modify.
+ * @cq_count: number of CQEs that will trigger an event
+ * @cq_period: max period of time in usec before triggering an event
+ *
+ */
+int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+
+/**
+ * ib_destroy_cq_user - Destroys the specified CQ.
+ * @cq: The CQ to destroy.
+ * @udata: Valid user data or NULL for kernel objects
+ */
+int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata);
+
+/**
+ * ib_destroy_cq - Destroys the specified kernel CQ.
+ * @cq: The CQ to destroy.
+ *
+ * NOTE: for user cq use ib_destroy_cq_user with valid udata!
+ */
+static inline void ib_destroy_cq(struct ib_cq *cq)
+{
+ int ret = ib_destroy_cq_user(cq, NULL);
+
+ WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail");
+}
+
+/**
+ * ib_poll_cq - poll a CQ for completion(s)
+ * @cq:the CQ being polled
+ * @num_entries:maximum number of completions to return
+ * @wc:array of at least @num_entries &struct ib_wc where completions
+ * will be returned
+ *
+ * Poll a CQ for (possibly multiple) completions. If the return value
+ * is < 0, an error occurred. If the return value is >= 0, it is the
+ * number of completions returned. If the return value is
+ * non-negative and < num_entries, then the CQ was emptied.
+ */
+static inline int ib_poll_cq(struct ib_cq *cq, int num_entries,
+ struct ib_wc *wc)
+{
+ return cq->device->ops.poll_cq(cq, num_entries, wc);
+}
+
+/**
+ * ib_req_notify_cq - Request completion notification on a CQ.
+ * @cq: The CQ to generate an event for.
+ * @flags:
+ * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP
+ * to request an event on the next solicited event or next work
+ * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS
+ * may also be |ed in to request a hint about missed events, as
+ * described below.
+ *
+ * Return Value:
+ * < 0 means an error occurred while requesting notification
+ * == 0 means notification was requested successfully, and if
+ * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events
+ * were missed and it is safe to wait for another event. In
+ * this case is it guaranteed that any work completions added
+ * to the CQ since the last CQ poll will trigger a completion
+ * notification event.
+ * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed
+ * in. It means that the consumer must poll the CQ again to
+ * make sure it is empty to avoid missing an event because of a
+ * race between requesting notification and an entry being
+ * added to the CQ. This return value means it is possible
+ * (but not guaranteed) that a work completion has been added
+ * to the CQ since the last poll without triggering a
+ * completion notification event.
+ */
+static inline int ib_req_notify_cq(struct ib_cq *cq,
+ enum ib_cq_notify_flags flags)
+{
+ return cq->device->ops.req_notify_cq(cq, flags);
+}
+
+struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
+ int comp_vector_hint,
+ enum ib_poll_context poll_ctx);
+
+void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe);
+
+/*
+ * Drivers that don't need a DMA mapping at the RDMA layer, set dma_device to
+ * NULL. This causes the ib_dma* helpers to just stash the kernel virtual
+ * address into the dma address.
+ */
+static inline bool ib_uses_virt_dma(struct ib_device *dev)
+{
+ return IS_ENABLED(CONFIG_INFINIBAND_VIRT_DMA) && !dev->dma_device;
+}
+
+/*
+ * Check if a IB device's underlying DMA mapping supports P2PDMA transfers.
+ */
+static inline bool ib_dma_pci_p2p_dma_supported(struct ib_device *dev)
+{
+ if (ib_uses_virt_dma(dev))
+ return false;
+
+ return dma_pci_p2pdma_supported(dev->dma_device);
+}
+
+/**
+ * ib_virt_dma_to_ptr - Convert a dma_addr to a kernel pointer
+ * @dma_addr: The DMA address
+ *
+ * Used by ib_uses_virt_dma() devices to get back to the kernel pointer after
+ * going through the dma_addr marshalling.
+ */
+static inline void *ib_virt_dma_to_ptr(u64 dma_addr)
+{
+ /* virt_dma mode maps the kvs's directly into the dma addr */
+ return (void *)(uintptr_t)dma_addr;
+}
+
+/**
+ * ib_virt_dma_to_page - Convert a dma_addr to a struct page
+ * @dma_addr: The DMA address
+ *
+ * Used by ib_uses_virt_dma() device to get back to the struct page after going
+ * through the dma_addr marshalling.
+ */
+static inline struct page *ib_virt_dma_to_page(u64 dma_addr)
+{
+ return virt_to_page(ib_virt_dma_to_ptr(dma_addr));
+}
+
+/**
+ * ib_dma_mapping_error - check a DMA addr for error
+ * @dev: The device for which the dma_addr was created
+ * @dma_addr: The DMA address to check
+ */
+static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ if (ib_uses_virt_dma(dev))
+ return 0;
+ return dma_mapping_error(dev->dma_device, dma_addr);
+}
+
+/**
+ * ib_dma_map_single - Map a kernel virtual address to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @cpu_addr: The kernel virtual address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline u64 ib_dma_map_single(struct ib_device *dev,
+ void *cpu_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (ib_uses_virt_dma(dev))
+ return (uintptr_t)cpu_addr;
+ return dma_map_single(dev->dma_device, cpu_addr, size, direction);
+}
+
+/**
+ * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single()
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_single(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (!ib_uses_virt_dma(dev))
+ dma_unmap_single(dev->dma_device, addr, size, direction);
+}
+
+/**
+ * ib_dma_map_page - Map a physical page to DMA address
+ * @dev: The device for which the dma_addr is to be created
+ * @page: The page to be mapped
+ * @offset: The offset within the page
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline u64 ib_dma_map_page(struct ib_device *dev,
+ struct page *page,
+ unsigned long offset,
+ size_t size,
+ enum dma_data_direction direction)
+{
+ if (ib_uses_virt_dma(dev))
+ return (uintptr_t)(page_address(page) + offset);
+ return dma_map_page(dev->dma_device, page, offset, size, direction);
+}
+
+/**
+ * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page()
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_page(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ if (!ib_uses_virt_dma(dev))
+ dma_unmap_page(dev->dma_device, addr, size, direction);
+}
+
+int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents);
+static inline int ib_dma_map_sg_attrs(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction,
+ unsigned long dma_attrs)
+{
+ if (ib_uses_virt_dma(dev))
+ return ib_dma_virt_map_sg(dev, sg, nents);
+ return dma_map_sg_attrs(dev->dma_device, sg, nents, direction,
+ dma_attrs);
+}
+
+static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction,
+ unsigned long dma_attrs)
+{
+ if (!ib_uses_virt_dma(dev))
+ dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction,
+ dma_attrs);
+}
+
+/**
+ * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses
+ * @dev: The device for which the DMA addresses are to be created
+ * @sg: The sg_table object describing the buffer
+ * @direction: The direction of the DMA
+ * @attrs: Optional DMA attributes for the map operation
+ */
+static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev,
+ struct sg_table *sgt,
+ enum dma_data_direction direction,
+ unsigned long dma_attrs)
+{
+ int nents;
+
+ if (ib_uses_virt_dma(dev)) {
+ nents = ib_dma_virt_map_sg(dev, sgt->sgl, sgt->orig_nents);
+ if (!nents)
+ return -EIO;
+ sgt->nents = nents;
+ return 0;
+ }
+ return dma_map_sgtable(dev->dma_device, sgt, direction, dma_attrs);
+}
+
+static inline void ib_dma_unmap_sgtable_attrs(struct ib_device *dev,
+ struct sg_table *sgt,
+ enum dma_data_direction direction,
+ unsigned long dma_attrs)
+{
+ if (!ib_uses_virt_dma(dev))
+ dma_unmap_sgtable(dev->dma_device, sgt, direction, dma_attrs);
+}
+
+/**
+ * ib_dma_map_sg - Map a scatter/gather list to DMA addresses
+ * @dev: The device for which the DMA addresses are to be created
+ * @sg: The array of scatter/gather entries
+ * @nents: The number of scatter/gather entries
+ * @direction: The direction of the DMA
+ */
+static inline int ib_dma_map_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ return ib_dma_map_sg_attrs(dev, sg, nents, direction, 0);
+}
+
+/**
+ * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses
+ * @dev: The device for which the DMA addresses were created
+ * @sg: The array of scatter/gather entries
+ * @nents: The number of scatter/gather entries
+ * @direction: The direction of the DMA
+ */
+static inline void ib_dma_unmap_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ ib_dma_unmap_sg_attrs(dev, sg, nents, direction, 0);
+}
+
+/**
+ * ib_dma_max_seg_size - Return the size limit of a single DMA transfer
+ * @dev: The device to query
+ *
+ * The returned value represents a size in bytes.
+ */
+static inline unsigned int ib_dma_max_seg_size(struct ib_device *dev)
+{
+ if (ib_uses_virt_dma(dev))
+ return UINT_MAX;
+ return dma_get_max_seg_size(dev->dma_device);
+}
+
+/**
+ * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @dir: The direction of the DMA
+ */
+static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev,
+ u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+ if (!ib_uses_virt_dma(dev))
+ dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+/**
+ * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device
+ * @dev: The device for which the DMA address was created
+ * @addr: The DMA address
+ * @size: The size of the region in bytes
+ * @dir: The direction of the DMA
+ */
+static inline void ib_dma_sync_single_for_device(struct ib_device *dev,
+ u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+ if (!ib_uses_virt_dma(dev))
+ dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+/* ib_reg_user_mr - register a memory region for virtual addresses from kernel
+ * space. This function should be called when 'current' is the owning MM.
+ */
+struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags);
+
+/* ib_advise_mr - give an advice about an address range in a memory region */
+int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+ u32 flags, struct ib_sge *sg_list, u32 num_sge);
+/**
+ * ib_dereg_mr_user - Deregisters a memory region and removes it from the
+ * HCA translation table.
+ * @mr: The memory region to deregister.
+ * @udata: Valid user data or NULL for kernel object
+ *
+ * This function can fail, if the memory region has memory windows bound to it.
+ */
+int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata);
+
+/**
+ * ib_dereg_mr - Deregisters a kernel memory region and removes it from the
+ * HCA translation table.
+ * @mr: The memory region to deregister.
+ *
+ * This function can fail, if the memory region has memory windows bound to it.
+ *
+ * NOTE: for user mr use ib_dereg_mr_user with valid udata!
+ */
+static inline int ib_dereg_mr(struct ib_mr *mr)
+{
+ return ib_dereg_mr_user(mr, NULL);
+}
+
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+ u32 max_num_sg);
+
+struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
+ u32 max_num_data_sg,
+ u32 max_num_meta_sg);
+
+/**
+ * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
+ * R_Key and L_Key.
+ * @mr - struct ib_mr pointer to be updated.
+ * @newkey - new key to be used.
+ */
+static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
+{
+ mr->lkey = (mr->lkey & 0xffffff00) | newkey;
+ mr->rkey = (mr->rkey & 0xffffff00) | newkey;
+}
+
+/**
+ * ib_inc_rkey - increments the key portion of the given rkey. Can be used
+ * for calculating a new rkey for type 2 memory windows.
+ * @rkey - the rkey to increment.
+ */
+static inline u32 ib_inc_rkey(u32 rkey)
+{
+ const u32 mask = 0x000000ff;
+ return ((rkey + 1) & mask) | (rkey & ~mask);
+}
+
+/**
+ * ib_attach_mcast - Attaches the specified QP to a multicast group.
+ * @qp: QP to attach to the multicast group. The QP must be type
+ * IB_QPT_UD.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ *
+ * In order to send and receive multicast packets, subnet
+ * administration must have created the multicast group and configured
+ * the fabric appropriately. The port associated with the specified
+ * QP must also be a member of the multicast group.
+ */
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+/**
+ * ib_detach_mcast - Detaches the specified QP from a multicast group.
+ * @qp: QP to detach from the multicast group.
+ * @gid: Multicast group GID.
+ * @lid: Multicast group LID in host byte order.
+ */
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device,
+ struct inode *inode, struct ib_udata *udata);
+int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);
+
+static inline int ib_check_mr_access(struct ib_device *ib_dev,
+ unsigned int flags)
+{
+ u64 device_cap = ib_dev->attrs.device_cap_flags;
+
+ /*
+ * Local write permission is required if remote write or
+ * remote atomic permission is also requested.
+ */
+ if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) &&
+ !(flags & IB_ACCESS_LOCAL_WRITE))
+ return -EINVAL;
+
+ if (flags & ~IB_ACCESS_SUPPORTED)
+ return -EINVAL;
+
+ if (flags & IB_ACCESS_ON_DEMAND &&
+ !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
+ return -EOPNOTSUPP;
+
+ if ((flags & IB_ACCESS_FLUSH_GLOBAL &&
+ !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) ||
+ (flags & IB_ACCESS_FLUSH_PERSISTENT &&
+ !(device_cap & IB_DEVICE_FLUSH_PERSISTENT)))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static inline bool ib_access_writable(int access_flags)
+{
+ /*
+ * We have writable memory backing the MR if any of the following
+ * access flags are set. "Local write" and "remote write" obviously
+ * require write access. "Remote atomic" can do things like fetch and
+ * add, which will modify memory, and "MW bind" can change permissions
+ * by binding a window.
+ */
+ return access_flags &
+ (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND);
+}
+
+/**
+ * ib_check_mr_status: lightweight check of MR status.
+ * This routine may provide status checks on a selected
+ * ib_mr. first use is for signature status check.
+ *
+ * @mr: A memory region.
+ * @check_mask: Bitmask of which checks to perform from
+ * ib_mr_status_check enumeration.
+ * @mr_status: The container of relevant status checks.
+ * failed checks will be indicated in the status bitmask
+ * and the relevant info shall be in the error item.
+ */
+int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
+ struct ib_mr_status *mr_status);
+
+/**
+ * ib_device_try_get: Hold a registration lock
+ * device: The device to lock
+ *
+ * A device under an active registration lock cannot become unregistered. It
+ * is only possible to obtain a registration lock on a device that is fully
+ * registered, otherwise this function returns false.
+ *
+ * The registration lock is only necessary for actions which require the
+ * device to still be registered. Uses that only require the device pointer to
+ * be valid should use get_device(&ibdev->dev) to hold the memory.
+ *
+ */
+static inline bool ib_device_try_get(struct ib_device *dev)
+{
+ return refcount_inc_not_zero(&dev->refcount);
+}
+
+void ib_device_put(struct ib_device *device);
+struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
+ enum rdma_driver_id driver_id);
+struct ib_device *ib_device_get_by_name(const char *name,
+ enum rdma_driver_id driver_id);
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port,
+ u16 pkey, const union ib_gid *gid,
+ const struct sockaddr *addr);
+int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
+ unsigned int port);
+struct ib_wq *ib_create_wq(struct ib_pd *pd,
+ struct ib_wq_init_attr *init_attr);
+int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata);
+
+int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset, unsigned int page_size);
+int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
+ int data_sg_nents, unsigned int *data_sg_offset,
+ struct scatterlist *meta_sg, int meta_sg_nents,
+ unsigned int *meta_sg_offset, unsigned int page_size);
+
+static inline int
+ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
+ unsigned int *sg_offset, unsigned int page_size)
+{
+ int n;
+
+ n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size);
+ mr->iova = 0;
+
+ return n;
+}
+
+int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
+ unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64));
+
+void ib_drain_rq(struct ib_qp *qp);
+void ib_drain_sq(struct ib_qp *qp);
+void ib_drain_qp(struct ib_qp *qp);
+
+int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed,
+ u8 *width);
+
+static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr)
+{
+ if (attr->type == RDMA_AH_ATTR_TYPE_ROCE)
+ return attr->roce.dmac;
+ return NULL;
+}
+
+static inline void rdma_ah_set_dlid(struct rdma_ah_attr *attr, u32 dlid)
+{
+ if (attr->type == RDMA_AH_ATTR_TYPE_IB)
+ attr->ib.dlid = (u16)dlid;
+ else if (attr->type == RDMA_AH_ATTR_TYPE_OPA)
+ attr->opa.dlid = dlid;
+}
+
+static inline u32 rdma_ah_get_dlid(const struct rdma_ah_attr *attr)
+{
+ if (attr->type == RDMA_AH_ATTR_TYPE_IB)
+ return attr->ib.dlid;
+ else if (attr->type == RDMA_AH_ATTR_TYPE_OPA)
+ return attr->opa.dlid;
+ return 0;
+}
+
+static inline void rdma_ah_set_sl(struct rdma_ah_attr *attr, u8 sl)
+{
+ attr->sl = sl;
+}
+
+static inline u8 rdma_ah_get_sl(const struct rdma_ah_attr *attr)
+{
+ return attr->sl;
+}
+
+static inline void rdma_ah_set_path_bits(struct rdma_ah_attr *attr,
+ u8 src_path_bits)
+{
+ if (attr->type == RDMA_AH_ATTR_TYPE_IB)
+ attr->ib.src_path_bits = src_path_bits;
+ else if (attr->type == RDMA_AH_ATTR_TYPE_OPA)
+ attr->opa.src_path_bits = src_path_bits;
+}
+
+static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr)
+{
+ if (attr->type == RDMA_AH_ATTR_TYPE_IB)
+ return attr->ib.src_path_bits;
+ else if (attr->type == RDMA_AH_ATTR_TYPE_OPA)
+ return attr->opa.src_path_bits;
+ return 0;
+}
+
+static inline void rdma_ah_set_make_grd(struct rdma_ah_attr *attr,
+ bool make_grd)
+{
+ if (attr->type == RDMA_AH_ATTR_TYPE_OPA)
+ attr->opa.make_grd = make_grd;
+}
+
+static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr)
+{
+ if (attr->type == RDMA_AH_ATTR_TYPE_OPA)
+ return attr->opa.make_grd;
+ return false;
+}
+
+static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u32 port_num)
+{
+ attr->port_num = port_num;
+}
+
+static inline u32 rdma_ah_get_port_num(const struct rdma_ah_attr *attr)
+{
+ return attr->port_num;
+}
+
+static inline void rdma_ah_set_static_rate(struct rdma_ah_attr *attr,
+ u8 static_rate)
+{
+ attr->static_rate = static_rate;
+}
+
+static inline u8 rdma_ah_get_static_rate(const struct rdma_ah_attr *attr)
+{
+ return attr->static_rate;
+}
+
+static inline void rdma_ah_set_ah_flags(struct rdma_ah_attr *attr,
+ enum ib_ah_flags flag)
+{
+ attr->ah_flags = flag;
+}
+
+static inline enum ib_ah_flags
+ rdma_ah_get_ah_flags(const struct rdma_ah_attr *attr)
+{
+ return attr->ah_flags;
+}
+
+static inline const struct ib_global_route
+ *rdma_ah_read_grh(const struct rdma_ah_attr *attr)
+{
+ return &attr->grh;
+}
+
+/*To retrieve and modify the grh */
+static inline struct ib_global_route
+ *rdma_ah_retrieve_grh(struct rdma_ah_attr *attr)
+{
+ return &attr->grh;
+}
+
+static inline void rdma_ah_set_dgid_raw(struct rdma_ah_attr *attr, void *dgid)
+{
+ struct ib_global_route *grh = rdma_ah_retrieve_grh(attr);
+
+ memcpy(grh->dgid.raw, dgid, sizeof(grh->dgid));
+}
+
+static inline void rdma_ah_set_subnet_prefix(struct rdma_ah_attr *attr,
+ __be64 prefix)
+{
+ struct ib_global_route *grh = rdma_ah_retrieve_grh(attr);
+
+ grh->dgid.global.subnet_prefix = prefix;
+}
+
+static inline void rdma_ah_set_interface_id(struct rdma_ah_attr *attr,
+ __be64 if_id)
+{
+ struct ib_global_route *grh = rdma_ah_retrieve_grh(attr);
+
+ grh->dgid.global.interface_id = if_id;
+}
+
+static inline void rdma_ah_set_grh(struct rdma_ah_attr *attr,
+ union ib_gid *dgid, u32 flow_label,
+ u8 sgid_index, u8 hop_limit,
+ u8 traffic_class)
+{
+ struct ib_global_route *grh = rdma_ah_retrieve_grh(attr);
+
+ attr->ah_flags = IB_AH_GRH;
+ if (dgid)
+ grh->dgid = *dgid;
+ grh->flow_label = flow_label;
+ grh->sgid_index = sgid_index;
+ grh->hop_limit = hop_limit;
+ grh->traffic_class = traffic_class;
+ grh->sgid_attr = NULL;
+}
+
+void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr);
+void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid,
+ u32 flow_label, u8 hop_limit, u8 traffic_class,
+ const struct ib_gid_attr *sgid_attr);
+void rdma_copy_ah_attr(struct rdma_ah_attr *dest,
+ const struct rdma_ah_attr *src);
+void rdma_replace_ah_attr(struct rdma_ah_attr *old,
+ const struct rdma_ah_attr *new);
+void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src);
+
+/**
+ * rdma_ah_find_type - Return address handle type.
+ *
+ * @dev: Device to be checked
+ * @port_num: Port number
+ */
+static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev,
+ u32 port_num)
+{
+ if (rdma_protocol_roce(dev, port_num))
+ return RDMA_AH_ATTR_TYPE_ROCE;
+ if (rdma_protocol_ib(dev, port_num)) {
+ if (rdma_cap_opa_ah(dev, port_num))
+ return RDMA_AH_ATTR_TYPE_OPA;
+ return RDMA_AH_ATTR_TYPE_IB;
+ }
+
+ return RDMA_AH_ATTR_TYPE_UNDEFINED;
+}
+
+/**
+ * ib_lid_cpu16 - Return lid in 16bit CPU encoding.
+ * In the current implementation the only way to
+ * get the 32bit lid is from other sources for OPA.
+ * For IB, lids will always be 16bits so cast the
+ * value accordingly.
+ *
+ * @lid: A 32bit LID
+ */
+static inline u16 ib_lid_cpu16(u32 lid)
+{
+ WARN_ON_ONCE(lid & 0xFFFF0000);
+ return (u16)lid;
+}
+
+/**
+ * ib_lid_be16 - Return lid in 16bit BE encoding.
+ *
+ * @lid: A 32bit LID
+ */
+static inline __be16 ib_lid_be16(u32 lid)
+{
+ WARN_ON_ONCE(lid & 0xFFFF0000);
+ return cpu_to_be16((u16)lid);
+}
+
+/**
+ * ib_get_vector_affinity - Get the affinity mappings of a given completion
+ * vector
+ * @device: the rdma device
+ * @comp_vector: index of completion vector
+ *
+ * Returns NULL on failure, otherwise a corresponding cpu map of the
+ * completion vector (returns all-cpus map if the device driver doesn't
+ * implement get_vector_affinity).
+ */
+static inline const struct cpumask *
+ib_get_vector_affinity(struct ib_device *device, int comp_vector)
+{
+ if (comp_vector < 0 || comp_vector >= device->num_comp_vectors ||
+ !device->ops.get_vector_affinity)
+ return NULL;
+
+ return device->ops.get_vector_affinity(device, comp_vector);
+
+}
+
+/**
+ * rdma_roce_rescan_device - Rescan all of the network devices in the system
+ * and add their gids, as needed, to the relevant RoCE devices.
+ *
+ * @device: the rdma device
+ */
+void rdma_roce_rescan_device(struct ib_device *ibdev);
+
+struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile);
+
+int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs);
+
+struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num,
+ enum rdma_netdev_t type, const char *name,
+ unsigned char name_assign_type,
+ void (*setup)(struct net_device *));
+
+int rdma_init_netdev(struct ib_device *device, u32 port_num,
+ enum rdma_netdev_t type, const char *name,
+ unsigned char name_assign_type,
+ void (*setup)(struct net_device *),
+ struct net_device *netdev);
+
+/**
+ * rdma_device_to_ibdev - Get ib_device pointer from device pointer
+ *
+ * @device: device pointer for which ib_device pointer to retrieve
+ *
+ * rdma_device_to_ibdev() retrieves ib_device pointer from device.
+ *
+ */
+static inline struct ib_device *rdma_device_to_ibdev(struct device *device)
+{
+ struct ib_core_device *coredev =
+ container_of(device, struct ib_core_device, dev);
+
+ return coredev->owner;
+}
+
+/**
+ * ibdev_to_node - return the NUMA node for a given ib_device
+ * @dev: device to get the NUMA node for.
+ */
+static inline int ibdev_to_node(struct ib_device *ibdev)
+{
+ struct device *parent = ibdev->dev.parent;
+
+ if (!parent)
+ return NUMA_NO_NODE;
+ return dev_to_node(parent);
+}
+
+/**
+ * rdma_device_to_drv_device - Helper macro to reach back to driver's
+ * ib_device holder structure from device pointer.
+ *
+ * NOTE: New drivers should not make use of this API; This API is only for
+ * existing drivers who have exposed sysfs entries using
+ * ops->device_group.
+ */
+#define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member) \
+ container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member)
+
+bool rdma_dev_access_netns(const struct ib_device *device,
+ const struct net *net);
+
+#define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000)
+#define IB_ROCE_UDP_ENCAP_VALID_PORT_MAX (0xFFFF)
+#define IB_GRH_FLOWLABEL_MASK (0x000FFFFF)
+
+/**
+ * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based
+ * on the flow_label
+ *
+ * This function will convert the 20 bit flow_label input to a valid RoCE v2
+ * UDP src port 14 bit value. All RoCE V2 drivers should use this same
+ * convention.
+ */
+static inline u16 rdma_flow_label_to_udp_sport(u32 fl)
+{
+ u32 fl_low = fl & 0x03fff, fl_high = fl & 0xFC000;
+
+ fl_low ^= fl_high >> 14;
+ return (u16)(fl_low | IB_ROCE_UDP_ENCAP_VALID_PORT_MIN);
+}
+
+/**
+ * rdma_calc_flow_label - generate a RDMA symmetric flow label value based on
+ * local and remote qpn values
+ *
+ * This function folded the multiplication results of two qpns, 24 bit each,
+ * fields, and converts it to a 20 bit results.
+ *
+ * This function will create symmetric flow_label value based on the local
+ * and remote qpn values. this will allow both the requester and responder
+ * to calculate the same flow_label for a given connection.
+ *
+ * This helper function should be used by driver in case the upper layer
+ * provide a zero flow_label value. This is to improve entropy of RDMA
+ * traffic in the network.
+ */
+static inline u32 rdma_calc_flow_label(u32 lqpn, u32 rqpn)
+{
+ u64 v = (u64)lqpn * rqpn;
+
+ v ^= v >> 20;
+ v ^= v >> 40;
+
+ return (u32)(v & IB_GRH_FLOWLABEL_MASK);
+}
+
+/**
+ * rdma_get_udp_sport - Calculate and set UDP source port based on the flow
+ * label. If flow label is not defined in GRH then
+ * calculate it based on lqpn/rqpn.
+ *
+ * @fl: flow label from GRH
+ * @lqpn: local qp number
+ * @rqpn: remote qp number
+ */
+static inline u16 rdma_get_udp_sport(u32 fl, u32 lqpn, u32 rqpn)
+{
+ if (!fl)
+ fl = rdma_calc_flow_label(lqpn, rqpn);
+
+ return rdma_flow_label_to_udp_sport(fl);
+}
+
+const struct ib_port_immutable*
+ib_port_immutable_read(struct ib_device *dev, unsigned int port);
+#endif /* IB_VERBS_H */
diff --git a/include/rdma/iba.h b/include/rdma/iba.h
new file mode 100644
index 0000000000..6a1115b02a
--- /dev/null
+++ b/include/rdma/iba.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved.
+ */
+#ifndef _IBA_DEFS_H_
+#define _IBA_DEFS_H_
+
+#include <linux/kernel.h>
+#include <linux/bitfield.h>
+#include <asm/unaligned.h>
+
+static inline u32 _iba_get8(const u8 *ptr)
+{
+ return *ptr;
+}
+
+static inline void _iba_set8(u8 *ptr, u32 mask, u32 prep_value)
+{
+ *ptr = (*ptr & ~mask) | prep_value;
+}
+
+static inline u16 _iba_get16(const __be16 *ptr)
+{
+ return be16_to_cpu(*ptr);
+}
+
+static inline void _iba_set16(__be16 *ptr, u16 mask, u16 prep_value)
+{
+ *ptr = cpu_to_be16((be16_to_cpu(*ptr) & ~mask) | prep_value);
+}
+
+static inline u32 _iba_get32(const __be32 *ptr)
+{
+ return be32_to_cpu(*ptr);
+}
+
+static inline void _iba_set32(__be32 *ptr, u32 mask, u32 prep_value)
+{
+ *ptr = cpu_to_be32((be32_to_cpu(*ptr) & ~mask) | prep_value);
+}
+
+static inline u64 _iba_get64(const __be64 *ptr)
+{
+ /*
+ * The mads are constructed so that 32 bit and smaller are naturally
+ * aligned, everything larger has a max alignment of 4 bytes.
+ */
+ return be64_to_cpu(get_unaligned(ptr));
+}
+
+static inline void _iba_set64(__be64 *ptr, u64 mask, u64 prep_value)
+{
+ put_unaligned(cpu_to_be64((_iba_get64(ptr) & ~mask) | prep_value), ptr);
+}
+
+#define _IBA_SET(field_struct, field_offset, field_mask, num_bits, ptr, value) \
+ ({ \
+ field_struct *_ptr = ptr; \
+ _iba_set##num_bits((void *)_ptr + (field_offset), field_mask, \
+ FIELD_PREP(field_mask, value)); \
+ })
+#define IBA_SET(field, ptr, value) _IBA_SET(field, ptr, value)
+
+#define _IBA_GET_MEM_PTR(field_struct, field_offset, type, num_bits, ptr) \
+ ({ \
+ field_struct *_ptr = ptr; \
+ (type *)((void *)_ptr + (field_offset)); \
+ })
+#define IBA_GET_MEM_PTR(field, ptr) _IBA_GET_MEM_PTR(field, ptr)
+
+/* FIXME: A set should always set the entire field, meaning we should zero the trailing bytes */
+#define _IBA_SET_MEM(field_struct, field_offset, type, num_bits, ptr, in, \
+ bytes) \
+ ({ \
+ const type *_in_ptr = in; \
+ WARN_ON(bytes * 8 > num_bits); \
+ if (in && bytes) \
+ memcpy(_IBA_GET_MEM_PTR(field_struct, field_offset, \
+ type, num_bits, ptr), \
+ _in_ptr, bytes); \
+ })
+#define IBA_SET_MEM(field, ptr, in, bytes) _IBA_SET_MEM(field, ptr, in, bytes)
+
+#define _IBA_GET(field_struct, field_offset, field_mask, num_bits, ptr) \
+ ({ \
+ const field_struct *_ptr = ptr; \
+ (u##num_bits) FIELD_GET( \
+ field_mask, _iba_get##num_bits((const void *)_ptr + \
+ (field_offset))); \
+ })
+#define IBA_GET(field, ptr) _IBA_GET(field, ptr)
+
+#define _IBA_GET_MEM(field_struct, field_offset, type, num_bits, ptr, out, \
+ bytes) \
+ ({ \
+ type *_out_ptr = out; \
+ WARN_ON(bytes * 8 > num_bits); \
+ if (out && bytes) \
+ memcpy(_out_ptr, \
+ _IBA_GET_MEM_PTR(field_struct, field_offset, \
+ type, num_bits, ptr), \
+ bytes); \
+ })
+#define IBA_GET_MEM(field, ptr, out, bytes) _IBA_GET_MEM(field, ptr, out, bytes)
+
+/*
+ * The generated list becomes the parameters to the macros, the order is:
+ * - struct this applies to
+ * - starting offset of the max
+ * - GENMASK or GENMASK_ULL in CPU order
+ * - The width of data the mask operations should work on, in bits
+ */
+
+/*
+ * Extraction using a tabular description like table 106. bit_offset is from
+ * the Byte[Bit] notation.
+ */
+#define IBA_FIELD_BLOC(field_struct, byte_offset, bit_offset, num_bits) \
+ field_struct, byte_offset, \
+ GENMASK(7 - (bit_offset), 7 - (bit_offset) - (num_bits - 1)), \
+ 8
+#define IBA_FIELD8_LOC(field_struct, byte_offset, num_bits) \
+ IBA_FIELD_BLOC(field_struct, byte_offset, 0, num_bits)
+
+#define IBA_FIELD16_LOC(field_struct, byte_offset, num_bits) \
+ field_struct, (byte_offset)&0xFFFE, \
+ GENMASK(15 - (((byte_offset) % 2) * 8), \
+ 15 - (((byte_offset) % 2) * 8) - (num_bits - 1)), \
+ 16
+
+#define IBA_FIELD32_LOC(field_struct, byte_offset, num_bits) \
+ field_struct, (byte_offset)&0xFFFC, \
+ GENMASK(31 - (((byte_offset) % 4) * 8), \
+ 31 - (((byte_offset) % 4) * 8) - (num_bits - 1)), \
+ 32
+
+#define IBA_FIELD64_LOC(field_struct, byte_offset) \
+ field_struct, byte_offset, GENMASK_ULL(63, 0), 64
+/*
+ * In IBTA spec, everything that is more than 64bits is multiple
+ * of bytes without leftover bits.
+ */
+#define IBA_FIELD_MLOC(field_struct, byte_offset, num_bits, type) \
+ field_struct, byte_offset, type, num_bits
+
+#endif /* _IBA_DEFS_H_ */
diff --git a/include/rdma/ibta_vol1_c12.h b/include/rdma/ibta_vol1_c12.h
new file mode 100644
index 0000000000..960c86bec7
--- /dev/null
+++ b/include/rdma/ibta_vol1_c12.h
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved.
+ *
+ * This file is IBTA volume 1, chapter 12 declarations:
+ * CHAPTER 12: COMMUNICATION MANAGEMENT
+ */
+#ifndef _IBTA_VOL1_C12_H_
+#define _IBTA_VOL1_C12_H_
+
+#include <rdma/iba.h>
+
+#define CM_FIELD_BLOC(field_struct, byte_offset, bits_offset, width) \
+ IBA_FIELD_BLOC(field_struct, \
+ (byte_offset + sizeof(struct ib_mad_hdr)), bits_offset, \
+ width)
+#define CM_FIELD8_LOC(field_struct, byte_offset, width) \
+ IBA_FIELD8_LOC(field_struct, \
+ (byte_offset + sizeof(struct ib_mad_hdr)), width)
+#define CM_FIELD16_LOC(field_struct, byte_offset, width) \
+ IBA_FIELD16_LOC(field_struct, \
+ (byte_offset + sizeof(struct ib_mad_hdr)), width)
+#define CM_FIELD32_LOC(field_struct, byte_offset, width) \
+ IBA_FIELD32_LOC(field_struct, \
+ (byte_offset + sizeof(struct ib_mad_hdr)), width)
+#define CM_FIELD64_LOC(field_struct, byte_offset) \
+ IBA_FIELD64_LOC(field_struct, (byte_offset + sizeof(struct ib_mad_hdr)))
+#define CM_FIELD_MLOC(field_struct, byte_offset, width, type) \
+ IBA_FIELD_MLOC(field_struct, \
+ (byte_offset + sizeof(struct ib_mad_hdr)), width, type)
+#define CM_STRUCT(field_struct, total_len) \
+ field_struct \
+ { \
+ struct ib_mad_hdr hdr; \
+ u32 _data[(total_len) / 32 + \
+ BUILD_BUG_ON_ZERO((total_len) % 32 != 0)]; \
+ }
+
+/* Table 106 REQ Message Contents */
+#define CM_REQ_LOCAL_COMM_ID CM_FIELD32_LOC(struct cm_req_msg, 0, 32)
+#define CM_REQ_VENDOR_ID CM_FIELD32_LOC(struct cm_req_msg, 5, 24)
+#define CM_REQ_SERVICE_ID CM_FIELD64_LOC(struct cm_req_msg, 8)
+#define CM_REQ_LOCAL_CA_GUID CM_FIELD64_LOC(struct cm_req_msg, 16)
+#define CM_REQ_LOCAL_Q_KEY CM_FIELD32_LOC(struct cm_req_msg, 28, 32)
+#define CM_REQ_LOCAL_QPN CM_FIELD32_LOC(struct cm_req_msg, 32, 24)
+#define CM_REQ_RESPONDER_RESOURCES CM_FIELD8_LOC(struct cm_req_msg, 35, 8)
+#define CM_REQ_LOCAL_EECN CM_FIELD32_LOC(struct cm_req_msg, 36, 24)
+#define CM_REQ_INITIATOR_DEPTH CM_FIELD8_LOC(struct cm_req_msg, 39, 8)
+#define CM_REQ_REMOTE_EECN CM_FIELD32_LOC(struct cm_req_msg, 40, 24)
+#define CM_REQ_REMOTE_CM_RESPONSE_TIMEOUT \
+ CM_FIELD8_LOC(struct cm_req_msg, 43, 5)
+#define CM_REQ_TRANSPORT_SERVICE_TYPE CM_FIELD_BLOC(struct cm_req_msg, 43, 5, 2)
+#define CM_REQ_END_TO_END_FLOW_CONTROL \
+ CM_FIELD_BLOC(struct cm_req_msg, 43, 7, 1)
+#define CM_REQ_STARTING_PSN CM_FIELD32_LOC(struct cm_req_msg, 44, 24)
+#define CM_REQ_LOCAL_CM_RESPONSE_TIMEOUT CM_FIELD8_LOC(struct cm_req_msg, 47, 5)
+#define CM_REQ_RETRY_COUNT CM_FIELD_BLOC(struct cm_req_msg, 47, 5, 3)
+#define CM_REQ_PARTITION_KEY CM_FIELD16_LOC(struct cm_req_msg, 48, 16)
+#define CM_REQ_PATH_PACKET_PAYLOAD_MTU CM_FIELD8_LOC(struct cm_req_msg, 50, 4)
+#define CM_REQ_RDC_EXISTS CM_FIELD_BLOC(struct cm_req_msg, 50, 4, 1)
+#define CM_REQ_RNR_RETRY_COUNT CM_FIELD_BLOC(struct cm_req_msg, 50, 5, 3)
+#define CM_REQ_MAX_CM_RETRIES CM_FIELD8_LOC(struct cm_req_msg, 51, 4)
+#define CM_REQ_SRQ CM_FIELD_BLOC(struct cm_req_msg, 51, 4, 1)
+#define CM_REQ_EXTENDED_TRANSPORT_TYPE \
+ CM_FIELD_BLOC(struct cm_req_msg, 51, 5, 3)
+#define CM_REQ_PRIMARY_LOCAL_PORT_LID CM_FIELD16_LOC(struct cm_req_msg, 52, 16)
+#define CM_REQ_PRIMARY_REMOTE_PORT_LID CM_FIELD16_LOC(struct cm_req_msg, 54, 16)
+#define CM_REQ_PRIMARY_LOCAL_PORT_GID \
+ CM_FIELD_MLOC(struct cm_req_msg, 56, 128, union ib_gid)
+#define CM_REQ_PRIMARY_REMOTE_PORT_GID \
+ CM_FIELD_MLOC(struct cm_req_msg, 72, 128, union ib_gid)
+#define CM_REQ_PRIMARY_FLOW_LABEL CM_FIELD32_LOC(struct cm_req_msg, 88, 20)
+#define CM_REQ_PRIMARY_PACKET_RATE CM_FIELD_BLOC(struct cm_req_msg, 91, 2, 6)
+#define CM_REQ_PRIMARY_TRAFFIC_CLASS CM_FIELD8_LOC(struct cm_req_msg, 92, 8)
+#define CM_REQ_PRIMARY_HOP_LIMIT CM_FIELD8_LOC(struct cm_req_msg, 93, 8)
+#define CM_REQ_PRIMARY_SL CM_FIELD8_LOC(struct cm_req_msg, 94, 4)
+#define CM_REQ_PRIMARY_SUBNET_LOCAL CM_FIELD_BLOC(struct cm_req_msg, 94, 4, 1)
+#define CM_REQ_PRIMARY_LOCAL_ACK_TIMEOUT CM_FIELD8_LOC(struct cm_req_msg, 95, 5)
+#define CM_REQ_ALTERNATE_LOCAL_PORT_LID \
+ CM_FIELD16_LOC(struct cm_req_msg, 96, 16)
+#define CM_REQ_ALTERNATE_REMOTE_PORT_LID \
+ CM_FIELD16_LOC(struct cm_req_msg, 98, 16)
+#define CM_REQ_ALTERNATE_LOCAL_PORT_GID \
+ CM_FIELD_MLOC(struct cm_req_msg, 100, 128, union ib_gid)
+#define CM_REQ_ALTERNATE_REMOTE_PORT_GID \
+ CM_FIELD_MLOC(struct cm_req_msg, 116, 128, union ib_gid)
+#define CM_REQ_ALTERNATE_FLOW_LABEL CM_FIELD32_LOC(struct cm_req_msg, 132, 20)
+#define CM_REQ_ALTERNATE_PACKET_RATE CM_FIELD_BLOC(struct cm_req_msg, 135, 2, 6)
+#define CM_REQ_ALTERNATE_TRAFFIC_CLASS CM_FIELD8_LOC(struct cm_req_msg, 136, 8)
+#define CM_REQ_ALTERNATE_HOP_LIMIT CM_FIELD8_LOC(struct cm_req_msg, 137, 8)
+#define CM_REQ_ALTERNATE_SL CM_FIELD8_LOC(struct cm_req_msg, 138, 4)
+#define CM_REQ_ALTERNATE_SUBNET_LOCAL \
+ CM_FIELD_BLOC(struct cm_req_msg, 138, 4, 1)
+#define CM_REQ_ALTERNATE_LOCAL_ACK_TIMEOUT \
+ CM_FIELD8_LOC(struct cm_req_msg, 139, 5)
+#define CM_REQ_SAP_SUPPORTED CM_FIELD_BLOC(struct cm_req_msg, 139, 5, 1)
+#define CM_REQ_PRIVATE_DATA CM_FIELD_MLOC(struct cm_req_msg, 140, 736, void)
+CM_STRUCT(struct cm_req_msg, 140 * 8 + 736);
+
+/* Table 107 MRA Message Contents */
+#define CM_MRA_LOCAL_COMM_ID CM_FIELD32_LOC(struct cm_mra_msg, 0, 32)
+#define CM_MRA_REMOTE_COMM_ID CM_FIELD32_LOC(struct cm_mra_msg, 4, 32)
+#define CM_MRA_MESSAGE_MRAED CM_FIELD8_LOC(struct cm_mra_msg, 8, 2)
+#define CM_MRA_SERVICE_TIMEOUT CM_FIELD8_LOC(struct cm_mra_msg, 9, 5)
+#define CM_MRA_PRIVATE_DATA CM_FIELD_MLOC(struct cm_mra_msg, 10, 1776, void)
+CM_STRUCT(struct cm_mra_msg, 10 * 8 + 1776);
+
+/* Table 108 REJ Message Contents */
+#define CM_REJ_LOCAL_COMM_ID CM_FIELD32_LOC(struct cm_rej_msg, 0, 32)
+#define CM_REJ_REMOTE_COMM_ID CM_FIELD32_LOC(struct cm_rej_msg, 4, 32)
+#define CM_REJ_MESSAGE_REJECTED CM_FIELD8_LOC(struct cm_rej_msg, 8, 2)
+#define CM_REJ_REJECTED_INFO_LENGTH CM_FIELD8_LOC(struct cm_rej_msg, 9, 7)
+#define CM_REJ_REASON CM_FIELD16_LOC(struct cm_rej_msg, 10, 16)
+#define CM_REJ_ARI CM_FIELD_MLOC(struct cm_rej_msg, 12, 576, void)
+#define CM_REJ_PRIVATE_DATA CM_FIELD_MLOC(struct cm_rej_msg, 84, 1184, void)
+CM_STRUCT(struct cm_rej_msg, 84 * 8 + 1184);
+
+/* Table 110 REP Message Contents */
+#define CM_REP_LOCAL_COMM_ID CM_FIELD32_LOC(struct cm_rep_msg, 0, 32)
+#define CM_REP_REMOTE_COMM_ID CM_FIELD32_LOC(struct cm_rep_msg, 4, 32)
+#define CM_REP_LOCAL_Q_KEY CM_FIELD32_LOC(struct cm_rep_msg, 8, 32)
+#define CM_REP_LOCAL_QPN CM_FIELD32_LOC(struct cm_rep_msg, 12, 24)
+#define CM_REP_VENDOR_ID_H CM_FIELD8_LOC(struct cm_rep_msg, 15, 8)
+#define CM_REP_LOCAL_EE_CONTEXT_NUMBER CM_FIELD32_LOC(struct cm_rep_msg, 16, 24)
+#define CM_REP_VENDOR_ID_M CM_FIELD8_LOC(struct cm_rep_msg, 19, 8)
+#define CM_REP_STARTING_PSN CM_FIELD32_LOC(struct cm_rep_msg, 20, 24)
+#define CM_REP_VENDOR_ID_L CM_FIELD8_LOC(struct cm_rep_msg, 23, 8)
+#define CM_REP_RESPONDER_RESOURCES CM_FIELD8_LOC(struct cm_rep_msg, 24, 8)
+#define CM_REP_INITIATOR_DEPTH CM_FIELD8_LOC(struct cm_rep_msg, 25, 8)
+#define CM_REP_TARGET_ACK_DELAY CM_FIELD8_LOC(struct cm_rep_msg, 26, 5)
+#define CM_REP_FAILOVER_ACCEPTED CM_FIELD_BLOC(struct cm_rep_msg, 26, 5, 2)
+#define CM_REP_END_TO_END_FLOW_CONTROL \
+ CM_FIELD_BLOC(struct cm_rep_msg, 26, 7, 1)
+#define CM_REP_RNR_RETRY_COUNT CM_FIELD8_LOC(struct cm_rep_msg, 27, 3)
+#define CM_REP_SRQ CM_FIELD_BLOC(struct cm_rep_msg, 27, 3, 1)
+#define CM_REP_LOCAL_CA_GUID CM_FIELD64_LOC(struct cm_rep_msg, 28)
+#define CM_REP_PRIVATE_DATA CM_FIELD_MLOC(struct cm_rep_msg, 36, 1568, void)
+CM_STRUCT(struct cm_rep_msg, 36 * 8 + 1568);
+
+/* Table 111 RTU Message Contents */
+#define CM_RTU_LOCAL_COMM_ID CM_FIELD32_LOC(struct cm_rtu_msg, 0, 32)
+#define CM_RTU_REMOTE_COMM_ID CM_FIELD32_LOC(struct cm_rtu_msg, 4, 32)
+#define CM_RTU_PRIVATE_DATA CM_FIELD_MLOC(struct cm_rtu_msg, 8, 1792, void)
+CM_STRUCT(struct cm_rtu_msg, 8 * 8 + 1792);
+
+/* Table 112 DREQ Message Contents */
+#define CM_DREQ_LOCAL_COMM_ID CM_FIELD32_LOC(struct cm_dreq_msg, 0, 32)
+#define CM_DREQ_REMOTE_COMM_ID CM_FIELD32_LOC(struct cm_dreq_msg, 4, 32)
+#define CM_DREQ_REMOTE_QPN_EECN CM_FIELD32_LOC(struct cm_dreq_msg, 8, 24)
+#define CM_DREQ_PRIVATE_DATA CM_FIELD_MLOC(struct cm_dreq_msg, 12, 1760, void)
+CM_STRUCT(struct cm_dreq_msg, 12 * 8 + 1760);
+
+/* Table 113 DREP Message Contents */
+#define CM_DREP_LOCAL_COMM_ID CM_FIELD32_LOC(struct cm_drep_msg, 0, 32)
+#define CM_DREP_REMOTE_COMM_ID CM_FIELD32_LOC(struct cm_drep_msg, 4, 32)
+#define CM_DREP_PRIVATE_DATA CM_FIELD_MLOC(struct cm_drep_msg, 8, 1792, void)
+CM_STRUCT(struct cm_drep_msg, 8 * 8 + 1792);
+
+/* Table 115 LAP Message Contents */
+#define CM_LAP_LOCAL_COMM_ID CM_FIELD32_LOC(struct cm_lap_msg, 0, 32)
+#define CM_LAP_REMOTE_COMM_ID CM_FIELD32_LOC(struct cm_lap_msg, 4, 32)
+#define CM_LAP_REMOTE_QPN_EECN CM_FIELD32_LOC(struct cm_lap_msg, 12, 24)
+#define CM_LAP_REMOTE_CM_RESPONSE_TIMEOUT \
+ CM_FIELD8_LOC(struct cm_lap_msg, 15, 5)
+#define CM_LAP_ALTERNATE_LOCAL_PORT_LID \
+ CM_FIELD16_LOC(struct cm_lap_msg, 20, 16)
+#define CM_LAP_ALTERNATE_REMOTE_PORT_LID \
+ CM_FIELD16_LOC(struct cm_lap_msg, 22, 16)
+#define CM_LAP_ALTERNATE_LOCAL_PORT_GID \
+ CM_FIELD_MLOC(struct cm_lap_msg, 24, 128, union ib_gid)
+#define CM_LAP_ALTERNATE_REMOTE_PORT_GID \
+ CM_FIELD_MLOC(struct cm_lap_msg, 40, 128, union ib_gid)
+#define CM_LAP_ALTERNATE_FLOW_LABEL CM_FIELD32_LOC(struct cm_lap_msg, 56, 20)
+#define CM_LAP_ALTERNATE_TRAFFIC_CLASS CM_FIELD8_LOC(struct cm_lap_msg, 59, 8)
+#define CM_LAP_ALTERNATE_HOP_LIMIT CM_FIELD8_LOC(struct cm_lap_msg, 60, 8)
+#define CM_LAP_ALTERNATE_PACKET_RATE CM_FIELD_BLOC(struct cm_lap_msg, 61, 2, 6)
+#define CM_LAP_ALTERNATE_SL CM_FIELD8_LOC(struct cm_lap_msg, 62, 4)
+#define CM_LAP_ALTERNATE_SUBNET_LOCAL CM_FIELD_BLOC(struct cm_lap_msg, 62, 4, 1)
+#define CM_LAP_ALTERNATE_LOCAL_ACK_TIMEOUT \
+ CM_FIELD8_LOC(struct cm_lap_msg, 63, 5)
+#define CM_LAP_PRIVATE_DATA CM_FIELD_MLOC(struct cm_lap_msg, 64, 1344, void)
+CM_STRUCT(struct cm_lap_msg, 64 * 8 + 1344);
+
+/* Table 116 APR Message Contents */
+#define CM_APR_LOCAL_COMM_ID CM_FIELD32_LOC(struct cm_apr_msg, 0, 32)
+#define CM_APR_REMOTE_COMM_ID CM_FIELD32_LOC(struct cm_apr_msg, 4, 32)
+#define CM_APR_ADDITIONAL_INFORMATION_LENGTH \
+ CM_FIELD8_LOC(struct cm_apr_msg, 8, 8)
+#define CM_APR_AR_STATUS CM_FIELD8_LOC(struct cm_apr_msg, 9, 8)
+#define CM_APR_ADDITIONAL_INFORMATION \
+ CM_FIELD_MLOC(struct cm_apr_msg, 12, 576, void)
+#define CM_APR_PRIVATE_DATA CM_FIELD_MLOC(struct cm_apr_msg, 84, 1184, void)
+CM_STRUCT(struct cm_apr_msg, 84 * 8 + 1184);
+
+/* Table 119 SIDR_REQ Message Contents */
+#define CM_SIDR_REQ_REQUESTID CM_FIELD32_LOC(struct cm_sidr_req_msg, 0, 32)
+#define CM_SIDR_REQ_PARTITION_KEY CM_FIELD16_LOC(struct cm_sidr_req_msg, 4, 16)
+#define CM_SIDR_REQ_SERVICEID CM_FIELD64_LOC(struct cm_sidr_req_msg, 8)
+#define CM_SIDR_REQ_PRIVATE_DATA \
+ CM_FIELD_MLOC(struct cm_sidr_req_msg, 16, 1728, void)
+CM_STRUCT(struct cm_sidr_req_msg, 16 * 8 + 1728);
+
+/* Table 120 SIDR_REP Message Contents */
+#define CM_SIDR_REP_REQUESTID CM_FIELD32_LOC(struct cm_sidr_rep_msg, 0, 32)
+#define CM_SIDR_REP_STATUS CM_FIELD8_LOC(struct cm_sidr_rep_msg, 4, 8)
+#define CM_SIDR_REP_ADDITIONAL_INFORMATION_LENGTH \
+ CM_FIELD8_LOC(struct cm_sidr_rep_msg, 5, 8)
+#define CM_SIDR_REP_VENDOR_ID_H CM_FIELD16_LOC(struct cm_sidr_rep_msg, 6, 16)
+#define CM_SIDR_REP_QPN CM_FIELD32_LOC(struct cm_sidr_rep_msg, 8, 24)
+#define CM_SIDR_REP_VENDOR_ID_L CM_FIELD8_LOC(struct cm_sidr_rep_msg, 11, 8)
+#define CM_SIDR_REP_SERVICEID CM_FIELD64_LOC(struct cm_sidr_rep_msg, 12)
+#define CM_SIDR_REP_Q_KEY CM_FIELD32_LOC(struct cm_sidr_rep_msg, 20, 32)
+#define CM_SIDR_REP_ADDITIONAL_INFORMATION \
+ CM_FIELD_MLOC(struct cm_sidr_rep_msg, 24, 576, void)
+#define CM_SIDR_REP_PRIVATE_DATA \
+ CM_FIELD_MLOC(struct cm_sidr_rep_msg, 96, 1088, void)
+CM_STRUCT(struct cm_sidr_rep_msg, 96 * 8 + 1088);
+
+#endif /* _IBTA_VOL1_C12_H_ */
diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h
new file mode 100644
index 0000000000..2b22f153ef
--- /dev/null
+++ b/include/rdma/iw_cm.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ */
+
+#ifndef IW_CM_H
+#define IW_CM_H
+
+#include <linux/in.h>
+#include <rdma/ib_cm.h>
+
+struct iw_cm_id;
+
+enum iw_cm_event_type {
+ IW_CM_EVENT_CONNECT_REQUEST = 1, /* connect request received */
+ IW_CM_EVENT_CONNECT_REPLY, /* reply from active connect request */
+ IW_CM_EVENT_ESTABLISHED, /* passive side accept successful */
+ IW_CM_EVENT_DISCONNECT, /* orderly shutdown */
+ IW_CM_EVENT_CLOSE /* close complete */
+};
+
+struct iw_cm_event {
+ enum iw_cm_event_type event;
+ int status;
+ struct sockaddr_storage local_addr;
+ struct sockaddr_storage remote_addr;
+ void *private_data;
+ void *provider_data;
+ u8 private_data_len;
+ u8 ord;
+ u8 ird;
+};
+
+/**
+ * iw_cm_handler - Function to be called by the IW CM when delivering events
+ * to the client.
+ *
+ * @cm_id: The IW CM identifier associated with the event.
+ * @event: Pointer to the event structure.
+ */
+typedef int (*iw_cm_handler)(struct iw_cm_id *cm_id,
+ struct iw_cm_event *event);
+
+/**
+ * iw_event_handler - Function called by the provider when delivering provider
+ * events to the IW CM. Returns either 0 indicating the event was processed
+ * or -errno if the event could not be processed.
+ *
+ * @cm_id: The IW CM identifier associated with the event.
+ * @event: Pointer to the event structure.
+ */
+typedef int (*iw_event_handler)(struct iw_cm_id *cm_id,
+ struct iw_cm_event *event);
+
+struct iw_cm_id {
+ iw_cm_handler cm_handler; /* client callback function */
+ void *context; /* client cb context */
+ struct ib_device *device;
+ struct sockaddr_storage local_addr; /* local addr */
+ struct sockaddr_storage remote_addr;
+ struct sockaddr_storage m_local_addr; /* nmapped local addr */
+ struct sockaddr_storage m_remote_addr; /* nmapped rem addr */
+ void *provider_data; /* provider private data */
+ iw_event_handler event_handler; /* cb for provider
+ events */
+ /* Used by provider to add and remove refs on IW cm_id */
+ void (*add_ref)(struct iw_cm_id *);
+ void (*rem_ref)(struct iw_cm_id *);
+ u8 tos;
+ bool tos_set:1;
+ bool mapped:1;
+ bool afonly:1;
+};
+
+struct iw_cm_conn_param {
+ const void *private_data;
+ u16 private_data_len;
+ u32 ord;
+ u32 ird;
+ u32 qpn;
+};
+
+enum iw_flags {
+
+ /*
+ * This flag allows the iwcm and iwpmd to still advertise
+ * mappings but the real and mapped port numbers are the
+ * same. Further, iwpmd will not bind any user socket to
+ * reserve the port. This is required for soft iwarp
+ * to play in the port mapped iwarp space.
+ */
+ IW_F_NO_PORT_MAP = (1 << 0),
+};
+
+/**
+ * iw_create_cm_id - Create an IW CM identifier.
+ *
+ * @device: The IB device on which to create the IW CM identier.
+ * @event_handler: User callback invoked to report events associated with the
+ * returned IW CM identifier.
+ * @context: User specified context associated with the id.
+ */
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+ iw_cm_handler cm_handler, void *context);
+
+/**
+ * iw_destroy_cm_id - Destroy an IW CM identifier.
+ *
+ * @cm_id: The previously created IW CM identifier to destroy.
+ *
+ * The client can assume that no events will be delivered for the CM ID after
+ * this function returns.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id);
+
+/**
+ * iw_cm_listen - Listen for incoming connection requests on the
+ * specified IW CM id.
+ *
+ * @cm_id: The IW CM identifier.
+ * @backlog: The maximum number of outstanding un-accepted inbound listen
+ * requests to queue.
+ *
+ * The source address and port number are specified in the IW CM identifier
+ * structure.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog);
+
+/**
+ * iw_cm_accept - Called to accept an incoming connect request.
+ *
+ * @cm_id: The IW CM identifier associated with the connection request.
+ * @iw_param: Pointer to a structure containing connection establishment
+ * parameters.
+ *
+ * The specified cm_id will have been provided in the event data for a
+ * CONNECT_REQUEST event. Subsequent events related to this connection will be
+ * delivered to the specified IW CM identifier prior and may occur prior to
+ * the return of this function. If this function returns a non-zero value, the
+ * client can assume that no events will be delivered to the specified IW CM
+ * identifier.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param);
+
+/**
+ * iw_cm_reject - Reject an incoming connection request.
+ *
+ * @cm_id: Connection identifier associated with the request.
+ * @private_daa: Pointer to data to deliver to the remote peer as part of the
+ * reject message.
+ * @private_data_len: The number of bytes in the private_data parameter.
+ *
+ * The client can assume that no events will be delivered to the specified IW
+ * CM identifier following the return of this function. The private_data
+ * buffer is available for reuse when this function returns.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id, const void *private_data,
+ u8 private_data_len);
+
+/**
+ * iw_cm_connect - Called to request a connection to a remote peer.
+ *
+ * @cm_id: The IW CM identifier for the connection.
+ * @iw_param: Pointer to a structure containing connection establishment
+ * parameters.
+ *
+ * Events may be delivered to the specified IW CM identifier prior to the
+ * return of this function. If this function returns a non-zero value, the
+ * client can assume that no events will be delivered to the specified IW CM
+ * identifier.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param);
+
+/**
+ * iw_cm_disconnect - Close the specified connection.
+ *
+ * @cm_id: The IW CM identifier to close.
+ * @abrupt: If 0, the connection will be closed gracefully, otherwise, the
+ * connection will be reset.
+ *
+ * The IW CM identifier is still active until the IW_CM_EVENT_CLOSE event is
+ * delivered.
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt);
+
+/**
+ * iw_cm_init_qp_attr - Called to initialize the attributes of the QP
+ * associated with a IW CM identifier.
+ *
+ * @cm_id: The IW CM identifier associated with the QP
+ * @qp_attr: Pointer to the QP attributes structure.
+ * @qp_attr_mask: Pointer to a bit vector specifying which QP attributes are
+ * valid.
+ */
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask);
+
+/**
+ * iwcm_reject_msg - return a pointer to a reject message string.
+ * @reason: Value returned in the REJECT event status field.
+ */
+const char *__attribute_const__ iwcm_reject_msg(int reason);
+
+#endif /* IW_CM_H */
diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h
new file mode 100644
index 0000000000..6dbc1a2359
--- /dev/null
+++ b/include/rdma/iw_portmap.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ */
+
+#ifndef _IW_PORTMAP_H
+#define _IW_PORTMAP_H
+
+#include <linux/socket.h>
+#include <linux/netlink.h>
+
+#define IWPM_ULIBNAME_SIZE 32
+#define IWPM_DEVNAME_SIZE 32
+#define IWPM_IFNAME_SIZE 16
+#define IWPM_IPADDR_SIZE 16
+
+enum {
+ IWPM_INVALID_NLMSG_ERR = 10,
+ IWPM_CREATE_MAPPING_ERR,
+ IWPM_DUPLICATE_MAPPING_ERR,
+ IWPM_UNKNOWN_MAPPING_ERR,
+ IWPM_CLIENT_DEV_INFO_ERR,
+ IWPM_USER_LIB_INFO_ERR,
+ IWPM_REMOTE_QUERY_REJECT
+};
+
+struct iwpm_dev_data {
+ char dev_name[IWPM_DEVNAME_SIZE];
+ char if_name[IWPM_IFNAME_SIZE];
+};
+
+struct iwpm_sa_data {
+ struct sockaddr_storage loc_addr;
+ struct sockaddr_storage mapped_loc_addr;
+ struct sockaddr_storage rem_addr;
+ struct sockaddr_storage mapped_rem_addr;
+ u32 flags;
+};
+
+int iwpm_init(u8);
+int iwpm_exit(u8);
+int iwpm_valid_pid(void);
+int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client);
+int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client);
+int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client);
+int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client);
+int iwpm_register_pid_cb(struct sk_buff *, struct netlink_callback *);
+int iwpm_add_mapping_cb(struct sk_buff *, struct netlink_callback *);
+int iwpm_add_and_query_mapping_cb(struct sk_buff *, struct netlink_callback *);
+int iwpm_remote_info_cb(struct sk_buff *, struct netlink_callback *);
+int iwpm_mapping_error_cb(struct sk_buff *, struct netlink_callback *);
+int iwpm_mapping_info_cb(struct sk_buff *, struct netlink_callback *);
+int iwpm_ack_mapping_info_cb(struct sk_buff *, struct netlink_callback *);
+int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
+ struct sockaddr_storage *mapped_rem_addr,
+ struct sockaddr_storage *remote_addr, u8 nl_client);
+int iwpm_create_mapinfo(struct sockaddr_storage *local_addr,
+ struct sockaddr_storage *mapped_addr, u8 nl_client,
+ u32 map_flags);
+int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr,
+ struct sockaddr_storage *mapped_addr);
+
+int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb);
+#endif /* _IW_PORTMAP_H */
diff --git a/include/rdma/lag.h b/include/rdma/lag.h
new file mode 100644
index 0000000000..7c06ec9b2e
--- /dev/null
+++ b/include/rdma/lag.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2020 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_LAG_H_
+#define _RDMA_LAG_H_
+
+#include <net/lag.h>
+
+struct ib_device;
+struct rdma_ah_attr;
+
+enum rdma_lag_flags {
+ RDMA_LAG_FLAGS_HASH_ALL_SLAVES = 1 << 0
+};
+
+void rdma_lag_put_ah_roce_slave(struct net_device *xmit_slave);
+struct net_device *rdma_lag_get_ah_roce_slave(struct ib_device *device,
+ struct rdma_ah_attr *ah_attr,
+ gfp_t flags);
+
+#endif /* _RDMA_LAG_H_ */
diff --git a/include/rdma/mr_pool.h b/include/rdma/mr_pool.h
new file mode 100644
index 0000000000..e77123bcb4
--- /dev/null
+++ b/include/rdma/mr_pool.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ */
+#ifndef _RDMA_MR_POOL_H
+#define _RDMA_MR_POOL_H 1
+
+#include <rdma/ib_verbs.h>
+
+struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list);
+void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr);
+
+int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr,
+ enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg);
+void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list);
+
+#endif /* _RDMA_MR_POOL_H */
diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h
new file mode 100644
index 0000000000..c6bfa2640b
--- /dev/null
+++ b/include/rdma/opa_addr.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright(c) 2017 Intel Corporation.
+ */
+
+#ifndef OPA_ADDR_H
+#define OPA_ADDR_H
+
+#include <rdma/opa_smi.h>
+
+#define OPA_SPECIAL_OUI (0x00066AULL)
+#define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x)))
+#define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \
+ ? 0 : x)
+#define OPA_GID_INDEX 0x1
+/**
+ * 0xF8 - 4 bits of multicast range and 1 bit for collective range
+ * Example: For 24 bit LID space,
+ * Multicast range: 0xF00000 to 0xF7FFFF
+ * Collective range: 0xF80000 to 0xFFFFFE
+ */
+#define OPA_MCAST_NR 0x4 /* Number of top bits set */
+#define OPA_COLLECTIVE_NR 0x1 /* Number of bits after MCAST_NR */
+
+/**
+ * ib_is_opa_gid: Returns true if the top 24 bits of the gid
+ * contains the OPA_STL_OUI identifier. This identifies that
+ * the provided gid is a special purpose GID meant to carry
+ * extended LID information.
+ *
+ * @gid: The Global identifier
+ */
+static inline bool ib_is_opa_gid(const union ib_gid *gid)
+{
+ return ((be64_to_cpu(gid->global.interface_id) >> 40) ==
+ OPA_SPECIAL_OUI);
+}
+
+/**
+ * opa_get_lid_from_gid: Returns the last 32 bits of the gid.
+ * OPA devices use one of the gids in the gid table to also
+ * store the lid.
+ *
+ * @gid: The Global identifier
+ */
+static inline u32 opa_get_lid_from_gid(const union ib_gid *gid)
+{
+ return be64_to_cpu(gid->global.interface_id) & 0xFFFFFFFF;
+}
+
+/**
+ * opa_is_extended_lid: Returns true if dlid or slid are
+ * extended.
+ *
+ * @dlid: The DLID
+ * @slid: The SLID
+ */
+static inline bool opa_is_extended_lid(__be32 dlid, __be32 slid)
+{
+ if ((be32_to_cpu(dlid) >=
+ be16_to_cpu(IB_MULTICAST_LID_BASE)) ||
+ (be32_to_cpu(slid) >=
+ be16_to_cpu(IB_MULTICAST_LID_BASE)))
+ return true;
+
+ return false;
+}
+
+/* Get multicast lid base */
+static inline u32 opa_get_mcast_base(u32 nr_top_bits)
+{
+ return (be32_to_cpu(OPA_LID_PERMISSIVE) << (32 - nr_top_bits));
+}
+
+/* Check for a valid unicast LID for non-SM traffic types */
+static inline bool rdma_is_valid_unicast_lid(struct rdma_ah_attr *attr)
+{
+ if (attr->type == RDMA_AH_ATTR_TYPE_IB) {
+ if (!rdma_ah_get_dlid(attr) ||
+ rdma_ah_get_dlid(attr) >=
+ be16_to_cpu(IB_MULTICAST_LID_BASE))
+ return false;
+ } else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) {
+ if (!rdma_ah_get_dlid(attr) ||
+ rdma_ah_get_dlid(attr) >=
+ opa_get_mcast_base(OPA_MCAST_NR))
+ return false;
+ }
+ return true;
+}
+#endif /* OPA_ADDR_H */
diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h
new file mode 100644
index 0000000000..73bcac90a0
--- /dev/null
+++ b/include/rdma/opa_port_info.h
@@ -0,0 +1,385 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2014-2020 Intel Corporation. All rights reserved.
+ */
+
+#ifndef OPA_PORT_INFO_H
+#define OPA_PORT_INFO_H
+
+#include <rdma/opa_smi.h>
+
+#define OPA_PORT_LINK_MODE_NOP 0 /* No change */
+#define OPA_PORT_LINK_MODE_OPA 4 /* Port mode is OPA */
+
+#define OPA_PORT_PACKET_FORMAT_NOP 0 /* No change */
+#define OPA_PORT_PACKET_FORMAT_8B 1 /* Format 8B */
+#define OPA_PORT_PACKET_FORMAT_9B 2 /* Format 9B */
+#define OPA_PORT_PACKET_FORMAT_10B 4 /* Format 10B */
+#define OPA_PORT_PACKET_FORMAT_16B 8 /* Format 16B */
+
+#define OPA_PORT_LTP_CRC_MODE_NONE 0 /* No change */
+#define OPA_PORT_LTP_CRC_MODE_14 1 /* 14-bit LTP CRC mode (optional) */
+#define OPA_PORT_LTP_CRC_MODE_16 2 /* 16-bit LTP CRC mode */
+#define OPA_PORT_LTP_CRC_MODE_48 4 /* 48-bit LTP CRC mode (optional) */
+#define OPA_PORT_LTP_CRC_MODE_PER_LANE 8 /* 12/16-bit per lane LTP CRC mode */
+
+/* Link Down / Neighbor Link Down Reason; indicated as follows: */
+#define OPA_LINKDOWN_REASON_NONE 0 /* No specified reason */
+#define OPA_LINKDOWN_REASON_RCV_ERROR_0 1
+#define OPA_LINKDOWN_REASON_BAD_PKT_LEN 2
+#define OPA_LINKDOWN_REASON_PKT_TOO_LONG 3
+#define OPA_LINKDOWN_REASON_PKT_TOO_SHORT 4
+#define OPA_LINKDOWN_REASON_BAD_SLID 5
+#define OPA_LINKDOWN_REASON_BAD_DLID 6
+#define OPA_LINKDOWN_REASON_BAD_L2 7
+#define OPA_LINKDOWN_REASON_BAD_SC 8
+#define OPA_LINKDOWN_REASON_RCV_ERROR_8 9
+#define OPA_LINKDOWN_REASON_BAD_MID_TAIL 10
+#define OPA_LINKDOWN_REASON_RCV_ERROR_10 11
+#define OPA_LINKDOWN_REASON_PREEMPT_ERROR 12
+#define OPA_LINKDOWN_REASON_PREEMPT_VL15 13
+#define OPA_LINKDOWN_REASON_BAD_VL_MARKER 14
+#define OPA_LINKDOWN_REASON_RCV_ERROR_14 15
+#define OPA_LINKDOWN_REASON_RCV_ERROR_15 16
+#define OPA_LINKDOWN_REASON_BAD_HEAD_DIST 17
+#define OPA_LINKDOWN_REASON_BAD_TAIL_DIST 18
+#define OPA_LINKDOWN_REASON_BAD_CTRL_DIST 19
+#define OPA_LINKDOWN_REASON_BAD_CREDIT_ACK 20
+#define OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER 21
+#define OPA_LINKDOWN_REASON_BAD_PREEMPT 22
+#define OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT 23
+#define OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT 24
+#define OPA_LINKDOWN_REASON_RCV_ERROR_24 25
+#define OPA_LINKDOWN_REASON_RCV_ERROR_25 26
+#define OPA_LINKDOWN_REASON_RCV_ERROR_26 27
+#define OPA_LINKDOWN_REASON_RCV_ERROR_27 28
+#define OPA_LINKDOWN_REASON_RCV_ERROR_28 29
+#define OPA_LINKDOWN_REASON_RCV_ERROR_29 30
+#define OPA_LINKDOWN_REASON_RCV_ERROR_30 31
+#define OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN 32
+#define OPA_LINKDOWN_REASON_UNKNOWN 33
+/* 34 -reserved */
+#define OPA_LINKDOWN_REASON_REBOOT 35
+#define OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN 36
+/* 37-38 reserved */
+#define OPA_LINKDOWN_REASON_FM_BOUNCE 39
+#define OPA_LINKDOWN_REASON_SPEED_POLICY 40
+#define OPA_LINKDOWN_REASON_WIDTH_POLICY 41
+/* 42-48 reserved */
+#define OPA_LINKDOWN_REASON_DISCONNECTED 49
+#define OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED 50
+#define OPA_LINKDOWN_REASON_NOT_INSTALLED 51
+#define OPA_LINKDOWN_REASON_CHASSIS_CONFIG 52
+/* 53 reserved */
+#define OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED 54
+/* 55 reserved */
+#define OPA_LINKDOWN_REASON_POWER_POLICY 56
+#define OPA_LINKDOWN_REASON_LINKSPEED_POLICY 57
+#define OPA_LINKDOWN_REASON_LINKWIDTH_POLICY 58
+/* 59 reserved */
+#define OPA_LINKDOWN_REASON_SWITCH_MGMT 60
+#define OPA_LINKDOWN_REASON_SMA_DISABLED 61
+/* 62 reserved */
+#define OPA_LINKDOWN_REASON_TRANSIENT 63
+/* 64-255 reserved */
+
+/* OPA Link Init reason; indicated as follows: */
+/* 3-7; 11-15 reserved; 8-15 cleared on Polling->LinkUp */
+#define OPA_LINKINIT_REASON_NOP 0
+#define OPA_LINKINIT_REASON_LINKUP (1 << 4)
+#define OPA_LINKINIT_REASON_FLAPPING (2 << 4)
+#define OPA_LINKINIT_REASON_CLEAR (8 << 4)
+#define OPA_LINKINIT_OUTSIDE_POLICY (8 << 4)
+#define OPA_LINKINIT_QUARANTINED (9 << 4)
+#define OPA_LINKINIT_INSUFIC_CAPABILITY (10 << 4)
+
+#define OPA_LINK_SPEED_NOP 0x0000 /* Reserved (1-5 Gbps) */
+#define OPA_LINK_SPEED_12_5G 0x0001 /* 12.5 Gbps */
+#define OPA_LINK_SPEED_25G 0x0002 /* 25.78125? Gbps (EDR) */
+
+#define OPA_LINK_WIDTH_1X 0x0001
+#define OPA_LINK_WIDTH_2X 0x0002
+#define OPA_LINK_WIDTH_3X 0x0004
+#define OPA_LINK_WIDTH_4X 0x0008
+
+#define OPA_CAP_MASK3_IsEthOnFabricSupported (1 << 13)
+#define OPA_CAP_MASK3_IsSnoopSupported (1 << 7)
+#define OPA_CAP_MASK3_IsAsyncSC2VLSupported (1 << 6)
+#define OPA_CAP_MASK3_IsAddrRangeConfigSupported (1 << 5)
+#define OPA_CAP_MASK3_IsPassThroughSupported (1 << 4)
+#define OPA_CAP_MASK3_IsSharedSpaceSupported (1 << 3)
+/* reserved (1 << 2) */
+#define OPA_CAP_MASK3_IsVLMarkerSupported (1 << 1)
+#define OPA_CAP_MASK3_IsVLrSupported (1 << 0)
+
+enum {
+ OPA_PORT_PHYS_CONF_DISCONNECTED = 0,
+ OPA_PORT_PHYS_CONF_STANDARD = 1,
+ OPA_PORT_PHYS_CONF_FIXED = 2,
+ OPA_PORT_PHYS_CONF_VARIABLE = 3,
+ OPA_PORT_PHYS_CONF_SI_PHOTO = 4
+};
+
+enum port_info_field_masks {
+ /* vl.cap */
+ OPA_PI_MASK_VL_CAP = 0x1F,
+ /* port_states.ledenable_offlinereason */
+ OPA_PI_MASK_OFFLINE_REASON = 0x0F,
+ OPA_PI_MASK_LED_ENABLE = 0x40,
+ /* port_states.unsleepstate_downdefstate */
+ OPA_PI_MASK_UNSLEEP_STATE = 0xF0,
+ OPA_PI_MASK_DOWNDEF_STATE = 0x0F,
+ /* port_states.portphysstate_portstate */
+ OPA_PI_MASK_PORT_PHYSICAL_STATE = 0xF0,
+ OPA_PI_MASK_PORT_STATE = 0x0F,
+ /* port_phys_conf */
+ OPA_PI_MASK_PORT_PHYSICAL_CONF = 0x0F,
+ /* collectivemask_multicastmask */
+ OPA_PI_MASK_COLLECT_MASK = 0x38,
+ OPA_PI_MASK_MULTICAST_MASK = 0x07,
+ /* mkeyprotect_lmc */
+ OPA_PI_MASK_MKEY_PROT_BIT = 0xC0,
+ OPA_PI_MASK_LMC = 0x0F,
+ /* smsl */
+ OPA_PI_MASK_SMSL = 0x1F,
+ /* partenforce_filterraw */
+ /* Filter Raw In/Out bits 1 and 2 were removed */
+ OPA_PI_MASK_LINKINIT_REASON = 0xF0,
+ OPA_PI_MASK_PARTITION_ENFORCE_IN = 0x08,
+ OPA_PI_MASK_PARTITION_ENFORCE_OUT = 0x04,
+ /* operational_vls */
+ OPA_PI_MASK_OPERATIONAL_VL = 0x1F,
+ /* sa_qp */
+ OPA_PI_MASK_SA_QP = 0x00FFFFFF,
+ /* sm_trap_qp */
+ OPA_PI_MASK_SM_TRAP_QP = 0x00FFFFFF,
+ /* localphy_overrun_errors */
+ OPA_PI_MASK_LOCAL_PHY_ERRORS = 0xF0,
+ OPA_PI_MASK_OVERRUN_ERRORS = 0x0F,
+ /* clientrereg_subnettimeout */
+ OPA_PI_MASK_CLIENT_REREGISTER = 0x80,
+ OPA_PI_MASK_SUBNET_TIMEOUT = 0x1F,
+ /* port_link_mode */
+ OPA_PI_MASK_PORT_LINK_SUPPORTED = (0x001F << 10),
+ OPA_PI_MASK_PORT_LINK_ENABLED = (0x001F << 5),
+ OPA_PI_MASK_PORT_LINK_ACTIVE = (0x001F << 0),
+ /* port_link_crc_mode */
+ OPA_PI_MASK_PORT_LINK_CRC_SUPPORTED = 0x0F00,
+ OPA_PI_MASK_PORT_LINK_CRC_ENABLED = 0x00F0,
+ OPA_PI_MASK_PORT_LINK_CRC_ACTIVE = 0x000F,
+ /* port_mode */
+ OPA_PI_MASK_PORT_MODE_SECURITY_CHECK = 0x0001,
+ OPA_PI_MASK_PORT_MODE_16B_TRAP_QUERY = 0x0002,
+ OPA_PI_MASK_PORT_MODE_PKEY_CONVERT = 0x0004,
+ OPA_PI_MASK_PORT_MODE_SC2SC_MAPPING = 0x0008,
+ OPA_PI_MASK_PORT_MODE_VL_MARKER = 0x0010,
+ OPA_PI_MASK_PORT_PASS_THROUGH = 0x0020,
+ OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE = 0x0040,
+ /* flit_control.interleave */
+ OPA_PI_MASK_INTERLEAVE_DIST_SUP = (0x0003 << 12),
+ OPA_PI_MASK_INTERLEAVE_DIST_ENABLE = (0x0003 << 10),
+ OPA_PI_MASK_INTERLEAVE_MAX_NEST_TX = (0x001F << 5),
+ OPA_PI_MASK_INTERLEAVE_MAX_NEST_RX = (0x001F << 0),
+
+ /* port_error_action */
+ OPA_PI_MASK_EX_BUFFER_OVERRUN = 0x80000000,
+ /* 7 bits reserved */
+ OPA_PI_MASK_FM_CFG_ERR_EXCEED_MULTICAST_LIMIT = 0x00800000,
+ OPA_PI_MASK_FM_CFG_BAD_CONTROL_FLIT = 0x00400000,
+ OPA_PI_MASK_FM_CFG_BAD_PREEMPT = 0x00200000,
+ OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER = 0x00100000,
+ OPA_PI_MASK_FM_CFG_BAD_CRDT_ACK = 0x00080000,
+ OPA_PI_MASK_FM_CFG_BAD_CTRL_DIST = 0x00040000,
+ OPA_PI_MASK_FM_CFG_BAD_TAIL_DIST = 0x00020000,
+ OPA_PI_MASK_FM_CFG_BAD_HEAD_DIST = 0x00010000,
+ /* 2 bits reserved */
+ OPA_PI_MASK_PORT_RCV_BAD_VL_MARKER = 0x00002000,
+ OPA_PI_MASK_PORT_RCV_PREEMPT_VL15 = 0x00001000,
+ OPA_PI_MASK_PORT_RCV_PREEMPT_ERROR = 0x00000800,
+ /* 1 bit reserved */
+ OPA_PI_MASK_PORT_RCV_BAD_MidTail = 0x00000200,
+ /* 1 bit reserved */
+ OPA_PI_MASK_PORT_RCV_BAD_SC = 0x00000080,
+ OPA_PI_MASK_PORT_RCV_BAD_L2 = 0x00000040,
+ OPA_PI_MASK_PORT_RCV_BAD_DLID = 0x00000020,
+ OPA_PI_MASK_PORT_RCV_BAD_SLID = 0x00000010,
+ OPA_PI_MASK_PORT_RCV_PKTLEN_TOOSHORT = 0x00000008,
+ OPA_PI_MASK_PORT_RCV_PKTLEN_TOOLONG = 0x00000004,
+ OPA_PI_MASK_PORT_RCV_BAD_PKTLEN = 0x00000002,
+ OPA_PI_MASK_PORT_RCV_BAD_LT = 0x00000001,
+
+ /* pass_through.res_drctl */
+ OPA_PI_MASK_PASS_THROUGH_DR_CONTROL = 0x01,
+
+ /* buffer_units */
+ OPA_PI_MASK_BUF_UNIT_VL15_INIT = (0x00000FFF << 11),
+ OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE = (0x0000001F << 6),
+ OPA_PI_MASK_BUF_UNIT_CREDIT_ACK = (0x00000003 << 3),
+ OPA_PI_MASK_BUF_UNIT_BUF_ALLOC = (0x00000003 << 0),
+
+ /* neigh_mtu.pvlx_to_mtu */
+ OPA_PI_MASK_NEIGH_MTU_PVL0 = 0xF0,
+ OPA_PI_MASK_NEIGH_MTU_PVL1 = 0x0F,
+
+ /* neigh_mtu.vlstall_hoq_life */
+ OPA_PI_MASK_VL_STALL = (0x03 << 5),
+ OPA_PI_MASK_HOQ_LIFE = (0x1F << 0),
+
+ /* port_neigh_mode */
+ OPA_PI_MASK_NEIGH_MGMT_ALLOWED = (0x01 << 3),
+ OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS = (0x01 << 2),
+ OPA_PI_MASK_NEIGH_NODE_TYPE = (0x03 << 0),
+
+ /* resptime_value */
+ OPA_PI_MASK_RESPONSE_TIME_VALUE = 0x1F,
+
+ /* mtucap */
+ OPA_PI_MASK_MTU_CAP = 0x0F,
+};
+
+struct opa_port_states {
+ u8 reserved;
+ u8 ledenable_offlinereason; /* 1 res, 1 bit, 6 bits */
+ u8 reserved2;
+ u8 portphysstate_portstate; /* 4 bits, 4 bits */
+};
+
+struct opa_port_state_info {
+ struct opa_port_states port_states;
+ __be16 link_width_downgrade_tx_active;
+ __be16 link_width_downgrade_rx_active;
+};
+
+struct opa_port_info {
+ __be32 lid;
+ __be32 flow_control_mask;
+
+ struct {
+ u8 res; /* was inittype */
+ u8 cap; /* 3 res, 5 bits */
+ __be16 high_limit;
+ __be16 preempt_limit;
+ u8 arb_high_cap;
+ u8 arb_low_cap;
+ } vl;
+
+ struct opa_port_states port_states;
+ u8 port_phys_conf; /* 4 res, 4 bits */
+ u8 collectivemask_multicastmask; /* 2 res, 3, 3 */
+ u8 mkeyprotect_lmc; /* 2 bits, 2 res, 4 bits */
+ u8 smsl; /* 3 res, 5 bits */
+
+ u8 partenforce_filterraw; /* bit fields */
+ u8 operational_vls; /* 3 res, 5 bits */
+ __be16 pkey_8b;
+ __be16 pkey_10b;
+ __be16 mkey_violations;
+
+ __be16 pkey_violations;
+ __be16 qkey_violations;
+ __be32 sm_trap_qp; /* 8 bits, 24 bits */
+
+ __be32 sa_qp; /* 8 bits, 24 bits */
+ u8 neigh_port_num;
+ u8 link_down_reason;
+ u8 neigh_link_down_reason;
+ u8 clientrereg_subnettimeout; /* 1 bit, 2 bits, 5 */
+
+ struct {
+ __be16 supported;
+ __be16 enabled;
+ __be16 active;
+ } link_speed;
+ struct {
+ __be16 supported;
+ __be16 enabled;
+ __be16 active;
+ } link_width;
+ struct {
+ __be16 supported;
+ __be16 enabled;
+ __be16 tx_active;
+ __be16 rx_active;
+ } link_width_downgrade;
+ __be16 port_link_mode; /* 1 res, 5 bits, 5 bits, 5 bits */
+ __be16 port_ltp_crc_mode; /* 4 res, 4 bits, 4 bits, 4 bits */
+
+ __be16 port_mode; /* 9 res, bit fields */
+ struct {
+ __be16 supported;
+ __be16 enabled;
+ } port_packet_format;
+ struct {
+ __be16 interleave; /* 2 res, 2,2,5,5 */
+ struct {
+ __be16 min_initial;
+ __be16 min_tail;
+ u8 large_pkt_limit;
+ u8 small_pkt_limit;
+ u8 max_small_pkt_limit;
+ u8 preemption_limit;
+ } preemption;
+ } flit_control;
+
+ __be32 reserved4;
+ __be32 port_error_action; /* bit field */
+
+ struct {
+ u8 egress_port;
+ u8 res_drctl; /* 7 res, 1 */
+ } pass_through;
+ __be16 mkey_lease_period;
+ __be32 buffer_units; /* 9 res, 12, 5, 3, 3 */
+
+ __be32 reserved5;
+ __be32 sm_lid;
+
+ __be64 mkey;
+
+ __be64 subnet_prefix;
+
+ struct {
+ u8 pvlx_to_mtu[OPA_MAX_VLS/2]; /* 4 bits, 4 bits */
+ } neigh_mtu;
+
+ struct {
+ u8 vlstall_hoqlife; /* 3 bits, 5 bits */
+ } xmit_q[OPA_MAX_VLS];
+
+ struct {
+ u8 addr[16];
+ } ipaddr_ipv6;
+
+ struct {
+ u8 addr[4];
+ } ipaddr_ipv4;
+
+ u32 reserved6;
+ u32 reserved7;
+ u32 reserved8;
+
+ __be64 neigh_node_guid;
+
+ __be32 ib_cap_mask;
+ __be16 reserved9; /* was ib_cap_mask2 */
+ __be16 opa_cap_mask;
+
+ __be32 reserved10; /* was link_roundtrip_latency */
+ __be16 overall_buffer_space;
+ __be16 reserved11; /* was max_credit_hint */
+
+ __be16 diag_code;
+ struct {
+ u8 buffer;
+ u8 wire;
+ } replay_depth;
+ u8 port_neigh_mode;
+ u8 mtucap; /* 4 res, 4 bits */
+
+ u8 resptimevalue; /* 3 res, 5 bits */
+ u8 local_port_num;
+ u8 reserved12;
+ u8 reserved13; /* was guid_cap */
+} __packed;
+
+#endif /* OPA_PORT_INFO_H */
diff --git a/include/rdma/opa_smi.h b/include/rdma/opa_smi.h
new file mode 100644
index 0000000000..a9f1b5700e
--- /dev/null
+++ b/include/rdma/opa_smi.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ */
+
+#ifndef OPA_SMI_H
+#define OPA_SMI_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#define OPA_SMP_LID_DATA_SIZE 2016
+#define OPA_SMP_DR_DATA_SIZE 1872
+#define OPA_SMP_MAX_PATH_HOPS 64
+
+#define OPA_MAX_VLS 32
+#define OPA_MAX_SLS 32
+#define OPA_MAX_SCS 32
+
+#define OPA_LID_PERMISSIVE cpu_to_be32(0xFFFFFFFF)
+
+struct opa_smp {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ __be16 status;
+ u8 hop_ptr;
+ u8 hop_cnt;
+ __be64 tid;
+ __be16 attr_id;
+ __be16 resv;
+ __be32 attr_mod;
+ __be64 mkey;
+ union {
+ struct {
+ uint8_t data[OPA_SMP_LID_DATA_SIZE];
+ } lid;
+ struct {
+ __be32 dr_slid;
+ __be32 dr_dlid;
+ u8 initial_path[OPA_SMP_MAX_PATH_HOPS];
+ u8 return_path[OPA_SMP_MAX_PATH_HOPS];
+ u8 reserved[8];
+ u8 data[OPA_SMP_DR_DATA_SIZE];
+ } dr;
+ } route;
+} __packed;
+
+
+/* Subnet management attributes */
+/* ... */
+#define OPA_ATTRIB_ID_NODE_DESCRIPTION cpu_to_be16(0x0010)
+#define OPA_ATTRIB_ID_NODE_INFO cpu_to_be16(0x0011)
+#define OPA_ATTRIB_ID_PORT_INFO cpu_to_be16(0x0015)
+#define OPA_ATTRIB_ID_PARTITION_TABLE cpu_to_be16(0x0016)
+#define OPA_ATTRIB_ID_SL_TO_SC_MAP cpu_to_be16(0x0017)
+#define OPA_ATTRIB_ID_VL_ARBITRATION cpu_to_be16(0x0018)
+#define OPA_ATTRIB_ID_SM_INFO cpu_to_be16(0x0020)
+#define OPA_ATTRIB_ID_CABLE_INFO cpu_to_be16(0x0032)
+#define OPA_ATTRIB_ID_AGGREGATE cpu_to_be16(0x0080)
+#define OPA_ATTRIB_ID_SC_TO_SL_MAP cpu_to_be16(0x0082)
+#define OPA_ATTRIB_ID_SC_TO_VLR_MAP cpu_to_be16(0x0083)
+#define OPA_ATTRIB_ID_SC_TO_VLT_MAP cpu_to_be16(0x0084)
+#define OPA_ATTRIB_ID_SC_TO_VLNT_MAP cpu_to_be16(0x0085)
+/* ... */
+#define OPA_ATTRIB_ID_PORT_STATE_INFO cpu_to_be16(0x0087)
+/* ... */
+#define OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE cpu_to_be16(0x008A)
+/* ... */
+
+struct opa_node_description {
+ u8 data[64];
+} __packed;
+
+struct opa_node_info {
+ u8 base_version;
+ u8 class_version;
+ u8 node_type;
+ u8 num_ports;
+ __be32 reserved;
+ __be64 system_image_guid;
+ __be64 node_guid;
+ __be64 port_guid;
+ __be16 partition_cap;
+ __be16 device_id;
+ __be32 revision;
+ u8 local_port_num;
+ u8 vendor_id[3]; /* network byte order */
+} __packed;
+
+#define OPA_PARTITION_TABLE_BLK_SIZE 32
+
+static inline u8
+opa_get_smp_direction(struct opa_smp *smp)
+{
+ return ib_get_smp_direction((struct ib_smp *)smp);
+}
+
+static inline u8 *opa_get_smp_data(struct opa_smp *smp)
+{
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ return smp->route.dr.data;
+
+ return smp->route.lid.data;
+}
+
+static inline size_t opa_get_smp_data_size(struct opa_smp *smp)
+{
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ return sizeof(smp->route.dr.data);
+
+ return sizeof(smp->route.lid.data);
+}
+
+static inline size_t opa_get_smp_header_size(struct opa_smp *smp)
+{
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ return sizeof(*smp) - sizeof(smp->route.dr.data);
+
+ return sizeof(*smp) - sizeof(smp->route.lid.data);
+}
+
+#endif /* OPA_SMI_H */
diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h
new file mode 100644
index 0000000000..d297f08400
--- /dev/null
+++ b/include/rdma/opa_vnic.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright(c) 2017 - 2020 Intel Corporation.
+ */
+
+#ifndef _OPA_VNIC_H
+#define _OPA_VNIC_H
+
+/*
+ * This file contains Intel Omni-Path (OPA) Virtual Network Interface
+ * Controller (VNIC) specific declarations.
+ */
+
+#include <rdma/ib_verbs.h>
+
+/* 16 header bytes + 2 reserved bytes */
+#define OPA_VNIC_L2_HDR_LEN (16 + 2)
+
+#define OPA_VNIC_L4_HDR_LEN 2
+
+#define OPA_VNIC_HDR_LEN (OPA_VNIC_L2_HDR_LEN + \
+ OPA_VNIC_L4_HDR_LEN)
+
+#define OPA_VNIC_L4_ETHR 0x78
+
+#define OPA_VNIC_ICRC_LEN 4
+#define OPA_VNIC_TAIL_LEN 1
+#define OPA_VNIC_ICRC_TAIL_LEN (OPA_VNIC_ICRC_LEN + OPA_VNIC_TAIL_LEN)
+
+#define OPA_VNIC_SKB_MDATA_LEN 4
+#define OPA_VNIC_SKB_MDATA_ENCAP_ERR 0x1
+
+/* opa vnic rdma netdev's private data structure */
+struct opa_vnic_rdma_netdev {
+ struct rdma_netdev rn; /* keep this first */
+ /* followed by device private data */
+ char *dev_priv[];
+};
+
+static inline void *opa_vnic_priv(const struct net_device *dev)
+{
+ struct rdma_netdev *rn = netdev_priv(dev);
+
+ return rn->clnt_priv;
+}
+
+static inline void *opa_vnic_dev_priv(const struct net_device *dev)
+{
+ struct opa_vnic_rdma_netdev *oparn = netdev_priv(dev);
+
+ return oparn->dev_priv;
+}
+
+/* opa_vnic skb meta data structure */
+struct opa_vnic_skb_mdata {
+ u8 vl;
+ u8 entropy;
+ u8 flags;
+ u8 rsvd;
+} __packed;
+
+/* OPA VNIC group statistics */
+struct opa_vnic_grp_stats {
+ u64 unicast;
+ u64 mcastbcast;
+ u64 untagged;
+ u64 vlan;
+ u64 s_64;
+ u64 s_65_127;
+ u64 s_128_255;
+ u64 s_256_511;
+ u64 s_512_1023;
+ u64 s_1024_1518;
+ u64 s_1519_max;
+};
+
+struct opa_vnic_stats {
+ /* standard netdev statistics */
+ struct rtnl_link_stats64 netstats;
+
+ /* OPA VNIC statistics */
+ struct opa_vnic_grp_stats tx_grp;
+ struct opa_vnic_grp_stats rx_grp;
+ u64 tx_dlid_zero;
+ u64 tx_drop_state;
+ u64 rx_drop_state;
+ u64 rx_runt;
+ u64 rx_oversize;
+};
+
+static inline bool rdma_cap_opa_vnic(struct ib_device *device)
+{
+ return !!(device->attrs.kernel_cap_flags & IBK_RDMA_NETDEV_OPA);
+}
+
+#endif /* _OPA_VNIC_H */
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
new file mode 100644
index 0000000000..8a8ab2f793
--- /dev/null
+++ b/include/rdma/rdma_cm.h
@@ -0,0 +1,393 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ */
+
+#ifndef RDMA_CM_H
+#define RDMA_CM_H
+
+#include <linux/socket.h>
+#include <linux/in6.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_sa.h>
+#include <uapi/rdma/rdma_user_cm.h>
+
+/*
+ * Upon receiving a device removal event, users must destroy the associated
+ * RDMA identifier and release all resources allocated with the device.
+ */
+enum rdma_cm_event_type {
+ RDMA_CM_EVENT_ADDR_RESOLVED,
+ RDMA_CM_EVENT_ADDR_ERROR,
+ RDMA_CM_EVENT_ROUTE_RESOLVED,
+ RDMA_CM_EVENT_ROUTE_ERROR,
+ RDMA_CM_EVENT_CONNECT_REQUEST,
+ RDMA_CM_EVENT_CONNECT_RESPONSE,
+ RDMA_CM_EVENT_CONNECT_ERROR,
+ RDMA_CM_EVENT_UNREACHABLE,
+ RDMA_CM_EVENT_REJECTED,
+ RDMA_CM_EVENT_ESTABLISHED,
+ RDMA_CM_EVENT_DISCONNECTED,
+ RDMA_CM_EVENT_DEVICE_REMOVAL,
+ RDMA_CM_EVENT_MULTICAST_JOIN,
+ RDMA_CM_EVENT_MULTICAST_ERROR,
+ RDMA_CM_EVENT_ADDR_CHANGE,
+ RDMA_CM_EVENT_TIMEWAIT_EXIT
+};
+
+const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event);
+
+#define RDMA_IB_IP_PS_MASK 0xFFFFFFFFFFFF0000ULL
+#define RDMA_IB_IP_PS_TCP 0x0000000001060000ULL
+#define RDMA_IB_IP_PS_UDP 0x0000000001110000ULL
+#define RDMA_IB_IP_PS_IB 0x00000000013F0000ULL
+
+struct rdma_addr {
+ struct sockaddr_storage src_addr;
+ struct sockaddr_storage dst_addr;
+ struct rdma_dev_addr dev_addr;
+};
+
+struct rdma_route {
+ struct rdma_addr addr;
+ struct sa_path_rec *path_rec;
+
+ /* Optional path records of primary path */
+ struct sa_path_rec *path_rec_inbound;
+ struct sa_path_rec *path_rec_outbound;
+
+ /*
+ * 0 - No primary nor alternate path is available
+ * 1 - Only primary path is available
+ * 2 - Both primary and alternate path are available
+ */
+ int num_pri_alt_paths;
+};
+
+struct rdma_conn_param {
+ const void *private_data;
+ u8 private_data_len;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 flow_control;
+ u8 retry_count; /* ignored when accepting */
+ u8 rnr_retry_count;
+ /* Fields below ignored if a QP is created on the rdma_cm_id. */
+ u8 srq;
+ u32 qp_num;
+ u32 qkey;
+};
+
+struct rdma_ud_param {
+ const void *private_data;
+ u8 private_data_len;
+ struct rdma_ah_attr ah_attr;
+ u32 qp_num;
+ u32 qkey;
+};
+
+struct rdma_cm_event {
+ enum rdma_cm_event_type event;
+ int status;
+ union {
+ struct rdma_conn_param conn;
+ struct rdma_ud_param ud;
+ } param;
+ struct rdma_ucm_ece ece;
+};
+
+struct rdma_cm_id;
+
+/**
+ * rdma_cm_event_handler - Callback used to report user events.
+ *
+ * Notes: Users may not call rdma_destroy_id from this callback to destroy
+ * the passed in id, or a corresponding listen id. Returning a
+ * non-zero value from the callback will destroy the passed in id.
+ */
+typedef int (*rdma_cm_event_handler)(struct rdma_cm_id *id,
+ struct rdma_cm_event *event);
+
+struct rdma_cm_id {
+ struct ib_device *device;
+ void *context;
+ struct ib_qp *qp;
+ rdma_cm_event_handler event_handler;
+ struct rdma_route route;
+ enum rdma_ucm_port_space ps;
+ enum ib_qp_type qp_type;
+ u32 port_num;
+ struct work_struct net_work;
+};
+
+struct rdma_cm_id *
+__rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler,
+ void *context, enum rdma_ucm_port_space ps,
+ enum ib_qp_type qp_type, const char *caller);
+struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler,
+ void *context,
+ enum rdma_ucm_port_space ps,
+ enum ib_qp_type qp_type);
+
+/**
+ * rdma_create_id - Create an RDMA identifier.
+ *
+ * @net: The network namespace in which to create the new id.
+ * @event_handler: User callback invoked to report events associated with the
+ * returned rdma_id.
+ * @context: User specified context associated with the id.
+ * @ps: RDMA port space.
+ * @qp_type: type of queue pair associated with the id.
+ *
+ * Returns a new rdma_cm_id. The id holds a reference on the network
+ * namespace until it is destroyed.
+ *
+ * The event handler callback serializes on the id's mutex and is
+ * allowed to sleep.
+ */
+#define rdma_create_id(net, event_handler, context, ps, qp_type) \
+ __rdma_create_kernel_id(net, event_handler, context, ps, qp_type, \
+ KBUILD_MODNAME)
+
+/**
+ * rdma_destroy_id - Destroys an RDMA identifier.
+ *
+ * @id: RDMA identifier.
+ *
+ * Note: calling this function has the effect of canceling in-flight
+ * asynchronous operations associated with the id.
+ */
+void rdma_destroy_id(struct rdma_cm_id *id);
+
+/**
+ * rdma_bind_addr - Bind an RDMA identifier to a source address and
+ * associated RDMA device, if needed.
+ *
+ * @id: RDMA identifier.
+ * @addr: Local address information. Wildcard values are permitted.
+ *
+ * This associates a source address with the RDMA identifier before calling
+ * rdma_listen. If a specific local address is given, the RDMA identifier will
+ * be bound to a local RDMA device.
+ */
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_resolve_addr - Resolve destination and optional source addresses
+ * from IP addresses to an RDMA address. If successful, the specified
+ * rdma_cm_id will be bound to a local device.
+ *
+ * @id: RDMA identifier.
+ * @src_addr: Source address information. This parameter may be NULL.
+ * @dst_addr: Destination address information.
+ * @timeout_ms: Time to wait for resolution to complete.
+ */
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ const struct sockaddr *dst_addr,
+ unsigned long timeout_ms);
+
+/**
+ * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier
+ * into route information needed to establish a connection.
+ *
+ * This is called on the client side of a connection.
+ * Users must have first called rdma_resolve_addr to resolve a dst_addr
+ * into an RDMA address before calling this routine.
+ */
+int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms);
+
+/**
+ * rdma_create_qp - Allocate a QP and associate it with the specified RDMA
+ * identifier.
+ *
+ * QPs allocated to an rdma_cm_id will automatically be transitioned by the CMA
+ * through their states.
+ */
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr);
+
+/**
+ * rdma_destroy_qp - Deallocate the QP associated with the specified RDMA
+ * identifier.
+ *
+ * Users must destroy any QP associated with an RDMA identifier before
+ * destroying the RDMA ID.
+ */
+void rdma_destroy_qp(struct rdma_cm_id *id);
+
+/**
+ * rdma_init_qp_attr - Initializes the QP attributes for use in transitioning
+ * to a specified QP state.
+ * @id: Communication identifier associated with the QP attributes to
+ * initialize.
+ * @qp_attr: On input, specifies the desired QP state. On output, the
+ * mandatory and desired optional attributes will be set in order to
+ * modify the QP to the specified state.
+ * @qp_attr_mask: The QP attribute mask that may be used to transition the
+ * QP to the specified state.
+ *
+ * Users must set the @qp_attr->qp_state to the desired QP state. This call
+ * will set all required attributes for the given transition, along with
+ * known optional attributes. Users may override the attributes returned from
+ * this call before calling ib_modify_qp.
+ *
+ * Users that wish to have their QP automatically transitioned through its
+ * states can associate a QP with the rdma_cm_id by calling rdma_create_qp().
+ */
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask);
+
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
+int rdma_connect_locked(struct rdma_cm_id *id,
+ struct rdma_conn_param *conn_param);
+
+int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param,
+ struct rdma_ucm_ece *ece);
+
+/**
+ * rdma_listen - This function is called by the passive side to
+ * listen for incoming connection requests.
+ *
+ * Users must have bound the rdma_cm_id to a local address by calling
+ * rdma_bind_addr before calling this routine.
+ */
+int rdma_listen(struct rdma_cm_id *id, int backlog);
+
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param);
+
+void rdma_lock_handler(struct rdma_cm_id *id);
+void rdma_unlock_handler(struct rdma_cm_id *id);
+int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param,
+ struct rdma_ucm_ece *ece);
+
+/**
+ * rdma_notify - Notifies the RDMA CM of an asynchronous event that has
+ * occurred on the connection.
+ * @id: Connection identifier to transition to established.
+ * @event: Asynchronous event.
+ *
+ * This routine should be invoked by users to notify the CM of relevant
+ * communication events. Events that should be reported to the CM and
+ * when to report them are:
+ *
+ * IB_EVENT_COMM_EST - Used when a message is received on a connected
+ * QP before an RTU has been received.
+ */
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event);
+
+/**
+ * rdma_reject - Called to reject a connection request or response.
+ */
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+ u8 private_data_len, u8 reason);
+
+/**
+ * rdma_disconnect - This function disconnects the associated QP and
+ * transitions it into the error state.
+ */
+int rdma_disconnect(struct rdma_cm_id *id);
+
+/**
+ * rdma_join_multicast - Join the multicast group specified by the given
+ * address.
+ * @id: Communication identifier associated with the request.
+ * @addr: Multicast address identifying the group to join.
+ * @join_state: Multicast JoinState bitmap requested by port.
+ * Bitmap is based on IB_SA_MCMEMBER_REC_JOIN_STATE bits.
+ * @context: User-defined context associated with the join request, returned
+ * to the user through the private_data pointer in multicast events.
+ */
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+ u8 join_state, void *context);
+
+/**
+ * rdma_leave_multicast - Leave the multicast group specified by the given
+ * address.
+ */
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_set_service_type - Set the type of service associated with a
+ * connection identifier.
+ * @id: Communication identifier to associated with service type.
+ * @tos: Type of service.
+ *
+ * The type of service is interpretted as a differentiated service
+ * field (RFC 2474). The service type should be specified before
+ * performing route resolution, as existing communication on the
+ * connection identifier may be unaffected. The type of service
+ * requested may not be supported by the network to all destinations.
+ */
+void rdma_set_service_type(struct rdma_cm_id *id, int tos);
+
+/**
+ * rdma_set_reuseaddr - Allow the reuse of local addresses when binding
+ * the rdma_cm_id.
+ * @id: Communication identifier to configure.
+ * @reuse: Value indicating if the bound address is reusable.
+ *
+ * Reuse must be set before an address is bound to the id.
+ */
+int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse);
+
+/**
+ * rdma_set_afonly - Specify that listens are restricted to the
+ * bound address family only.
+ * @id: Communication identifer to configure.
+ * @afonly: Value indicating if listens are restricted.
+ *
+ * Must be set before identifier is in the listening state.
+ */
+int rdma_set_afonly(struct rdma_cm_id *id, int afonly);
+
+int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout);
+
+int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer);
+ /**
+ * rdma_get_service_id - Return the IB service ID for a specified address.
+ * @id: Communication identifier associated with the address.
+ * @addr: Address for the service ID.
+ */
+__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr);
+
+/**
+ * rdma_reject_msg - return a pointer to a reject message string.
+ * @id: Communication identifier that received the REJECT event.
+ * @reason: Value returned in the REJECT event status field.
+ */
+const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id,
+ int reason);
+/**
+ * rdma_consumer_reject_data - return the consumer reject private data and
+ * length, if any.
+ * @id: Communication identifier that received the REJECT event.
+ * @ev: RDMA CM reject event.
+ * @data_len: Pointer to the resulting length of the consumer data.
+ */
+const void *rdma_consumer_reject_data(struct rdma_cm_id *id,
+ struct rdma_cm_event *ev, u8 *data_len);
+
+/**
+ * rdma_read_gids - Return the SGID and DGID used for establishing
+ * connection. This can be used after rdma_resolve_addr()
+ * on client side. This can be use on new connection
+ * on server side. This is applicable to IB, RoCE, iWarp.
+ * If cm_id is not bound yet to the RDMA device, it doesn't
+ * copy and SGID or DGID to the given pointers.
+ * @id: Communication identifier whose GIDs are queried.
+ * @sgid: Pointer to SGID where SGID will be returned. It is optional.
+ * @dgid: Pointer to DGID where DGID will be returned. It is optional.
+ * Note: This API should not be used by any new ULPs or new code.
+ * Instead, users interested in querying GIDs should refer to path record
+ * of the rdma_cm_id to query the GIDs.
+ * This API is provided for compatibility for existing users.
+ */
+
+void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid,
+ union ib_gid *dgid);
+
+struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *cm_id);
+struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res);
+
+#endif /* RDMA_CM_H */
diff --git a/include/rdma/rdma_cm_ib.h b/include/rdma/rdma_cm_ib.h
new file mode 100644
index 0000000000..8354e7de78
--- /dev/null
+++ b/include/rdma/rdma_cm_ib.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ */
+
+#ifndef RDMA_CM_IB_H
+#define RDMA_CM_IB_H
+
+#include <rdma/rdma_cm.h>
+
+/**
+ * rdma_set_ib_path - Manually sets the path record used to establish a
+ * connection.
+ * @id: Connection identifier associated with the request.
+ * @path_rec: Reference to the path record
+ *
+ * This call permits a user to specify routing information for rdma_cm_id's
+ * bound to InfiniBand devices. It is called on the client side of a
+ * connection and replaces the call to rdma_resolve_route.
+ */
+int rdma_set_ib_path(struct rdma_cm_id *id,
+ struct sa_path_rec *path_rec);
+
+/* Global qkey for UDP QPs and multicast groups. */
+#define RDMA_UDP_QKEY 0x01234567
+
+#endif /* RDMA_CM_IB_H */
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
new file mode 100644
index 0000000000..45d5481a78
--- /dev/null
+++ b/include/rdma/rdma_counter.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_COUNTER_H_
+#define _RDMA_COUNTER_H_
+
+#include <linux/mutex.h>
+#include <linux/pid_namespace.h>
+
+#include <rdma/restrack.h>
+#include <rdma/rdma_netlink.h>
+
+struct ib_device;
+struct ib_qp;
+
+struct auto_mode_param {
+ int qp_type;
+};
+
+struct rdma_counter_mode {
+ enum rdma_nl_counter_mode mode;
+ enum rdma_nl_counter_mask mask;
+ struct auto_mode_param param;
+};
+
+struct rdma_port_counter {
+ struct rdma_counter_mode mode;
+ struct rdma_hw_stats *hstats;
+ unsigned int num_counters;
+ struct mutex lock;
+};
+
+struct rdma_counter {
+ struct rdma_restrack_entry res;
+ struct ib_device *device;
+ uint32_t id;
+ struct kref kref;
+ struct rdma_counter_mode mode;
+ struct mutex lock;
+ struct rdma_hw_stats *stats;
+ u32 port;
+};
+
+void rdma_counter_init(struct ib_device *dev);
+void rdma_counter_release(struct ib_device *dev);
+int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port,
+ enum rdma_nl_counter_mask mask,
+ struct netlink_ext_ack *extack);
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port);
+int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
+
+int rdma_counter_query_stats(struct rdma_counter *counter);
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index);
+int rdma_counter_bind_qpn(struct ib_device *dev, u32 port,
+ u32 qp_num, u32 counter_id);
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port,
+ u32 qp_num, u32 *counter_id);
+int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port,
+ u32 qp_num, u32 counter_id);
+int rdma_counter_get_mode(struct ib_device *dev, u32 port,
+ enum rdma_nl_counter_mode *mode,
+ enum rdma_nl_counter_mask *mask);
+
+int rdma_counter_modify(struct ib_device *dev, u32 port,
+ unsigned int index, bool enable);
+#endif /* _RDMA_COUNTER_H_ */
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
new file mode 100644
index 0000000000..c2a79aeee1
--- /dev/null
+++ b/include/rdma/rdma_netlink.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _RDMA_NETLINK_H
+#define _RDMA_NETLINK_H
+
+#include <linux/netlink.h>
+#include <uapi/rdma/rdma_netlink.h>
+
+enum {
+ RDMA_NLDEV_ATTR_EMPTY_STRING = 1,
+ RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16,
+ RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE = 32,
+};
+
+struct rdma_nl_cbs {
+ int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack);
+ int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb);
+ u8 flags;
+};
+
+enum rdma_nl_flags {
+ /* Require CAP_NET_ADMIN */
+ RDMA_NL_ADMIN_PERM = 1 << 0,
+};
+
+/* Define this module as providing netlink services for NETLINK_RDMA, with
+ * index _index. Since the client indexes were setup in a uapi header as an
+ * enum and we do no want to change that, the user must supply the expanded
+ * constant as well and the compiler checks they are the same.
+ */
+#define MODULE_ALIAS_RDMA_NETLINK(_index, _val) \
+ static inline void __maybe_unused __chk_##_index(void) \
+ { \
+ BUILD_BUG_ON(_index != _val); \
+ } \
+ MODULE_ALIAS("rdma-netlink-subsys-" __stringify(_val))
+
+/**
+ * Register client in RDMA netlink.
+ * @index: Index of the added client
+ * @cb_table: A table for op->callback
+ */
+void rdma_nl_register(unsigned int index,
+ const struct rdma_nl_cbs cb_table[]);
+
+/**
+ * Remove a client from IB netlink.
+ * @index: Index of the removed IB client.
+ */
+void rdma_nl_unregister(unsigned int index);
+
+/**
+ * Put a new message in a supplied skb.
+ * @skb: The netlink skb.
+ * @nlh: Pointer to put the header of the new netlink message.
+ * @seq: The message sequence number.
+ * @len: The requested message length to allocate.
+ * @client: Calling IB netlink client.
+ * @op: message content op.
+ * Returns the allocated buffer on success and NULL on failure.
+ */
+void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
+ int len, int client, int op, int flags);
+/**
+ * Put a new attribute in a supplied skb.
+ * @skb: The netlink skb.
+ * @nlh: Header of the netlink message to append the attribute to.
+ * @len: The length of the attribute data.
+ * @data: The attribute data to put.
+ * @type: The attribute type.
+ * Returns the 0 and a negative error code on failure.
+ */
+int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
+ int len, void *data, int type);
+
+/**
+ * Send the supplied skb to a specific userspace PID.
+ * @net: Net namespace in which to send the skb
+ * @skb: The netlink skb
+ * @pid: Userspace netlink process ID
+ * Returns 0 on success or a negative error code.
+ */
+int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid);
+
+/**
+ * Send, with wait/1 retry, the supplied skb to a specific userspace PID.
+ * @net: Net namespace in which to send the skb
+ * @skb: The netlink skb
+ * @pid: Userspace netlink process ID
+ * Returns 0 on success or a negative error code.
+ */
+int rdma_nl_unicast_wait(struct net *net, struct sk_buff *skb, __u32 pid);
+
+/**
+ * Send the supplied skb to a netlink group.
+ * @net: Net namespace in which to send the skb
+ * @skb: The netlink skb
+ * @group: Netlink group ID
+ * @flags: allocation flags
+ * Returns 0 on success or a negative error code.
+ */
+int rdma_nl_multicast(struct net *net, struct sk_buff *skb,
+ unsigned int group, gfp_t flags);
+
+/**
+ * Check if there are any listeners to the netlink group
+ * @group: the netlink group ID
+ * Returns true on success or false if no listeners.
+ */
+bool rdma_nl_chk_listeners(unsigned int group);
+
+struct rdma_link_ops {
+ struct list_head list;
+ const char *type;
+ int (*newlink)(const char *ibdev_name, struct net_device *ndev);
+};
+
+void rdma_link_register(struct rdma_link_ops *ops);
+void rdma_link_unregister(struct rdma_link_ops *ops);
+
+#define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type)
+#define MODULE_ALIAS_RDMA_CLIENT(type) MODULE_ALIAS("rdma-client-" type)
+
+#endif /* _RDMA_NETLINK_H */
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
new file mode 100644
index 0000000000..c429d6ddb1
--- /dev/null
+++ b/include/rdma/rdma_vt.h
@@ -0,0 +1,532 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright(c) 2016 - 2019 Intel Corporation.
+ */
+
+#ifndef DEF_RDMA_VT_H
+#define DEF_RDMA_VT_H
+
+/*
+ * Structure that low level drivers will populate in order to register with the
+ * rdmavt layer.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/rdmavt_mr.h>
+
+#define RVT_MAX_PKEY_VALUES 16
+
+#define RVT_MAX_TRAP_LEN 100 /* Limit pending trap list */
+#define RVT_MAX_TRAP_LISTS 5 /*((IB_NOTICE_TYPE_INFO & 0x0F) + 1)*/
+#define RVT_TRAP_TIMEOUT 4096 /* 4.096 usec */
+
+struct trap_list {
+ u32 list_len;
+ struct list_head list;
+};
+
+struct rvt_qp;
+struct rvt_qpn_table;
+struct rvt_ibport {
+ struct rvt_qp __rcu *qp[2];
+ struct ib_mad_agent *send_agent; /* agent for SMI (traps) */
+ struct rb_root mcast_tree;
+ spinlock_t lock; /* protect changes in this struct */
+
+ /* non-zero when timer is set */
+ unsigned long mkey_lease_timeout;
+ unsigned long trap_timeout;
+ __be64 gid_prefix; /* in network order */
+ __be64 mkey;
+ u64 tid;
+ u32 port_cap_flags;
+ u16 port_cap3_flags;
+ u32 pma_sample_start;
+ u32 pma_sample_interval;
+ __be16 pma_counter_select[5];
+ u16 pma_tag;
+ u16 mkey_lease_period;
+ u32 sm_lid;
+ u8 sm_sl;
+ u8 mkeyprot;
+ u8 subnet_timeout;
+ u8 vl_high_limit;
+
+ /*
+ * Driver is expected to keep these up to date. These
+ * counters are informational only and not required to be
+ * completely accurate.
+ */
+ u64 n_rc_resends;
+ u64 n_seq_naks;
+ u64 n_rdma_seq;
+ u64 n_rnr_naks;
+ u64 n_other_naks;
+ u64 n_loop_pkts;
+ u64 n_pkt_drops;
+ u64 n_vl15_dropped;
+ u64 n_rc_timeouts;
+ u64 n_dmawait;
+ u64 n_unaligned;
+ u64 n_rc_dupreq;
+ u64 n_rc_seqnak;
+ u64 n_rc_crwaits;
+ u16 pkey_violations;
+ u16 qkey_violations;
+ u16 mkey_violations;
+
+ /* Hot-path per CPU counters to avoid cacheline trading to update */
+ u64 z_rc_acks;
+ u64 z_rc_qacks;
+ u64 z_rc_delayed_comp;
+ u64 __percpu *rc_acks;
+ u64 __percpu *rc_qacks;
+ u64 __percpu *rc_delayed_comp;
+
+ void *priv; /* driver private data */
+
+ /*
+ * The pkey table is allocated and maintained by the driver. Drivers
+ * need to have access to this before registering with rdmav. However
+ * rdmavt will need access to it so drivers need to provide this during
+ * the attach port API call.
+ */
+ u16 *pkey_table;
+
+ struct rvt_ah *sm_ah;
+
+ /*
+ * Keep a list of traps that have not been repressed. They will be
+ * resent based on trap_timer.
+ */
+ struct trap_list trap_lists[RVT_MAX_TRAP_LISTS];
+ struct timer_list trap_timer;
+};
+
+#define RVT_CQN_MAX 16 /* maximum length of cq name */
+
+#define RVT_SGE_COPY_MEMCPY 0
+#define RVT_SGE_COPY_CACHELESS 1
+#define RVT_SGE_COPY_ADAPTIVE 2
+
+/*
+ * Things that are driver specific, module parameters in hfi1 and qib
+ */
+struct rvt_driver_params {
+ struct ib_device_attr props;
+
+ /*
+ * Anything driver specific that is not covered by props
+ * For instance special module parameters. Goes here.
+ */
+ unsigned int lkey_table_size;
+ unsigned int qp_table_size;
+ unsigned int sge_copy_mode;
+ unsigned int wss_threshold;
+ unsigned int wss_clean_period;
+ int qpn_start;
+ int qpn_inc;
+ int qpn_res_start;
+ int qpn_res_end;
+ int nports;
+ int npkeys;
+ int node;
+ int psn_mask;
+ int psn_shift;
+ int psn_modify_mask;
+ u32 core_cap_flags;
+ u32 max_mad_size;
+ u8 qos_shift;
+ u8 max_rdma_atomic;
+ u8 extra_rdma_atomic;
+ u8 reserved_operations;
+};
+
+/* User context */
+struct rvt_ucontext {
+ struct ib_ucontext ibucontext;
+};
+
+/* Protection domain */
+struct rvt_pd {
+ struct ib_pd ibpd;
+ bool user;
+};
+
+/* Address handle */
+struct rvt_ah {
+ struct ib_ah ibah;
+ struct rdma_ah_attr attr;
+ u8 vl;
+ u8 log_pmtu;
+};
+
+/*
+ * This structure is used by rvt_mmap() to validate an offset
+ * when an mmap() request is made. The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct rvt_mmap_info {
+ struct list_head pending_mmaps;
+ struct ib_ucontext *context;
+ void *obj;
+ __u64 offset;
+ struct kref ref;
+ u32 size;
+};
+
+/* memory working set size */
+struct rvt_wss {
+ unsigned long *entries;
+ atomic_t total_count;
+ atomic_t clean_counter;
+ atomic_t clean_entry;
+
+ int threshold;
+ int num_entries;
+ long pages_mask;
+ unsigned int clean_period;
+};
+
+struct rvt_dev_info;
+struct rvt_swqe;
+struct rvt_driver_provided {
+ /*
+ * Which functions are required depends on which verbs rdmavt is
+ * providing and which verbs the driver is overriding. See
+ * check_support() for details.
+ */
+
+ /* hot path calldowns in a single cacheline */
+
+ /*
+ * Give the driver a notice that there is send work to do. It is up to
+ * the driver to generally push the packets out, this just queues the
+ * work with the driver. There are two variants here. The no_lock
+ * version requires the s_lock not to be held. The other assumes the
+ * s_lock is held.
+ */
+ bool (*schedule_send)(struct rvt_qp *qp);
+ bool (*schedule_send_no_lock)(struct rvt_qp *qp);
+
+ /*
+ * Driver specific work request setup and checking.
+ * This function is allowed to perform any setup, checks, or
+ * adjustments required to the SWQE in order to be usable by
+ * underlying protocols. This includes private data structure
+ * allocations.
+ */
+ int (*setup_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ bool *call_send);
+
+ /*
+ * Sometimes rdmavt needs to kick the driver's send progress. That is
+ * done by this call back.
+ */
+ void (*do_send)(struct rvt_qp *qp);
+
+ /*
+ * Returns a pointer to the underlying hardware's PCI device. This is
+ * used to display information as to what hardware is being referenced
+ * in an output message
+ */
+ struct pci_dev * (*get_pci_dev)(struct rvt_dev_info *rdi);
+
+ /*
+ * Allocate a private queue pair data structure for driver specific
+ * information which is opaque to rdmavt. Errors are returned via
+ * ERR_PTR(err). The driver is free to return NULL or a valid
+ * pointer.
+ */
+ void * (*qp_priv_alloc)(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+
+ /*
+ * Init a structure allocated with qp_priv_alloc(). This should be
+ * called after all qp fields have been initialized in rdmavt.
+ */
+ int (*qp_priv_init)(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ struct ib_qp_init_attr *init_attr);
+
+ /*
+ * Free the driver's private qp structure.
+ */
+ void (*qp_priv_free)(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+
+ /*
+ * Inform the driver the particular qp in question has been reset so
+ * that it can clean up anything it needs to.
+ */
+ void (*notify_qp_reset)(struct rvt_qp *qp);
+
+ /*
+ * Get a path mtu from the driver based on qp attributes.
+ */
+ int (*get_pmtu_from_attr)(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ struct ib_qp_attr *attr);
+
+ /*
+ * Notify driver that it needs to flush any outstanding IO requests that
+ * are waiting on a qp.
+ */
+ void (*flush_qp_waiters)(struct rvt_qp *qp);
+
+ /*
+ * Notify driver to stop its queue of sending packets. Nothing else
+ * should be posted to the queue pair after this has been called.
+ */
+ void (*stop_send_queue)(struct rvt_qp *qp);
+
+ /*
+ * Have the driver drain any in progress operations
+ */
+ void (*quiesce_qp)(struct rvt_qp *qp);
+
+ /*
+ * Inform the driver a qp has went to error state.
+ */
+ void (*notify_error_qp)(struct rvt_qp *qp);
+
+ /*
+ * Get an MTU for a qp.
+ */
+ u32 (*mtu_from_qp)(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ u32 pmtu);
+ /*
+ * Convert an mtu to a path mtu
+ */
+ int (*mtu_to_path_mtu)(u32 mtu);
+
+ /*
+ * Get the guid of a port in big endian byte order
+ */
+ int (*get_guid_be)(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
+ int guid_index, __be64 *guid);
+
+ /*
+ * Query driver for the state of the port.
+ */
+ int (*query_port_state)(struct rvt_dev_info *rdi, u32 port_num,
+ struct ib_port_attr *props);
+
+ /*
+ * Tell driver to shutdown a port
+ */
+ int (*shut_down_port)(struct rvt_dev_info *rdi, u32 port_num);
+
+ /* Tell driver to send a trap for changed port capabilities */
+ void (*cap_mask_chg)(struct rvt_dev_info *rdi, u32 port_num);
+
+ /*
+ * The following functions can be safely ignored completely. Any use of
+ * these is checked for NULL before blindly calling. Rdmavt should also
+ * be functional if drivers omit these.
+ */
+
+ /* Called to inform the driver that all qps should now be freed. */
+ unsigned (*free_all_qps)(struct rvt_dev_info *rdi);
+
+ /* Driver specific AH validation */
+ int (*check_ah)(struct ib_device *, struct rdma_ah_attr *);
+
+ /* Inform the driver a new AH has been created */
+ void (*notify_new_ah)(struct ib_device *, struct rdma_ah_attr *,
+ struct rvt_ah *);
+
+ /* Let the driver pick the next queue pair number*/
+ int (*alloc_qpn)(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
+ enum ib_qp_type type, u32 port_num);
+
+ /* Determine if its safe or allowed to modify the qp */
+ int (*check_modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+
+ /* Driver specific QP modification/notification-of */
+ void (*modify_qp)(struct rvt_qp *qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+
+ /* Notify driver a mad agent has been created */
+ void (*notify_create_mad_agent)(struct rvt_dev_info *rdi, int port_idx);
+
+ /* Notify driver a mad agent has been removed */
+ void (*notify_free_mad_agent)(struct rvt_dev_info *rdi, int port_idx);
+
+ /* Notify driver to restart rc */
+ void (*notify_restart_rc)(struct rvt_qp *qp, u32 psn, int wait);
+
+ /* Get and return CPU to pin CQ processing thread */
+ int (*comp_vect_cpu_lookup)(struct rvt_dev_info *rdi, int comp_vect);
+};
+
+struct rvt_dev_info {
+ struct ib_device ibdev; /* Keep this first. Nothing above here */
+
+ /*
+ * Prior to calling for registration the driver will be responsible for
+ * allocating space for this structure.
+ *
+ * The driver will also be responsible for filling in certain members of
+ * dparms.props. The driver needs to fill in dparms exactly as it would
+ * want values reported to a ULP. This will be returned to the caller
+ * in rdmavt's device. The driver should also therefore refrain from
+ * modifying this directly after registration with rdmavt.
+ */
+
+ /* Driver specific properties */
+ struct rvt_driver_params dparms;
+
+ /* post send table */
+ const struct rvt_operation_params *post_parms;
+
+ /* opcode translation table */
+ const enum ib_wc_opcode *wc_opcode;
+
+ /* Driver specific helper functions */
+ struct rvt_driver_provided driver_f;
+
+ struct rvt_mregion __rcu *dma_mr;
+ struct rvt_lkey_table lkey_table;
+
+ /* Internal use */
+ int n_pds_allocated;
+ spinlock_t n_pds_lock; /* Protect pd allocated count */
+
+ int n_ahs_allocated;
+ spinlock_t n_ahs_lock; /* Protect ah allocated count */
+
+ u32 n_srqs_allocated;
+ spinlock_t n_srqs_lock; /* Protect srqs allocated count */
+
+ int flags;
+ struct rvt_ibport **ports;
+
+ /* QP */
+ struct rvt_qp_ibdev *qp_dev;
+ u32 n_qps_allocated; /* number of QPs allocated for device */
+ u32 n_rc_qps; /* number of RC QPs allocated for device */
+ u32 busy_jiffies; /* timeout scaling based on RC QP count */
+ spinlock_t n_qps_lock; /* protect qps, rc qps and busy jiffy counts */
+
+ /* memory maps */
+ struct list_head pending_mmaps;
+ spinlock_t mmap_offset_lock; /* protect mmap_offset */
+ u32 mmap_offset;
+ spinlock_t pending_lock; /* protect pending mmap list */
+
+ /* CQ */
+ u32 n_cqs_allocated; /* number of CQs allocated for device */
+ spinlock_t n_cqs_lock; /* protect count of in use cqs */
+
+ /* Multicast */
+ u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+ spinlock_t n_mcast_grps_lock;
+
+ /* Memory Working Set Size */
+ struct rvt_wss *wss;
+};
+
+/**
+ * rvt_set_ibdev_name - Craft an IB device name from client info
+ * @rdi: pointer to the client rvt_dev_info structure
+ * @name: client specific name
+ * @unit: client specific unit number.
+ */
+static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi,
+ const char *fmt, const char *name,
+ const int unit)
+{
+ /*
+ * FIXME: rvt and its users want to touch the ibdev before
+ * registration and have things like the name work. We don't have the
+ * infrastructure in the core to support this directly today, hack it
+ * to work by setting the name manually here.
+ */
+ dev_set_name(&rdi->ibdev.dev, fmt, name, unit);
+ strscpy(rdi->ibdev.name, dev_name(&rdi->ibdev.dev), IB_DEVICE_NAME_MAX);
+}
+
+/**
+ * rvt_get_ibdev_name - return the IB name
+ * @rdi: rdmavt device
+ *
+ * Return the registered name of the device.
+ */
+static inline const char *rvt_get_ibdev_name(const struct rvt_dev_info *rdi)
+{
+ return dev_name(&rdi->ibdev.dev);
+}
+
+static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct rvt_pd, ibpd);
+}
+
+static inline struct rvt_ah *ibah_to_rvtah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct rvt_ah, ibah);
+}
+
+static inline struct rvt_dev_info *ib_to_rvt(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct rvt_dev_info, ibdev);
+}
+
+static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi)
+{
+ /*
+ * All ports have same number of pkeys.
+ */
+ return rdi->dparms.npkeys;
+}
+
+/*
+ * Return the max atomic suitable for determining
+ * the size of the ack ring buffer in a QP.
+ */
+static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi)
+{
+ return rdi->dparms.max_rdma_atomic +
+ rdi->dparms.extra_rdma_atomic + 1;
+}
+
+static inline unsigned int rvt_size_atomic(struct rvt_dev_info *rdi)
+{
+ return rdi->dparms.max_rdma_atomic +
+ rdi->dparms.extra_rdma_atomic;
+}
+
+/*
+ * Return the indexed PKEY from the port PKEY table.
+ */
+static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi,
+ int port_index,
+ unsigned index)
+{
+ if (index >= rvt_get_npkeys(rdi))
+ return 0;
+ else
+ return rdi->ports[port_index]->pkey_table[index];
+}
+
+struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
+void rvt_dealloc_device(struct rvt_dev_info *rdi);
+int rvt_register_device(struct rvt_dev_info *rvd);
+void rvt_unregister_device(struct rvt_dev_info *rvd);
+int rvt_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr);
+int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port,
+ int port_index, u16 *pkey_table);
+int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
+ int access);
+int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey);
+int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
+ u32 len, u64 vaddr, u32 rkey, int acc);
+int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
+ struct rvt_sge *isge, struct rvt_sge *last_sge,
+ struct ib_sge *sge, int acc);
+struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid,
+ u16 lid);
+
+#endif /* DEF_RDMA_VT_H */
diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h
new file mode 100644
index 0000000000..1fe2bb5a63
--- /dev/null
+++ b/include/rdma/rdmavt_cq.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright(c) 2016 - 2018 Intel Corporation.
+ */
+
+#ifndef DEF_RDMAVT_INCCQ_H
+#define DEF_RDMAVT_INCCQ_H
+
+#include <linux/kthread.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define RVT_CQ_NONE (IB_CQ_NEXT_COMP + 1)
+
+/*
+ * Define read macro that apply smp_load_acquire memory barrier
+ * when reading indice of circular buffer that mmaped to user space.
+ */
+#define RDMA_READ_UAPI_ATOMIC(member) smp_load_acquire(&(member).val)
+
+/*
+ * Define write macro that uses smp_store_release memory barrier
+ * when writing indice of circular buffer that mmaped to user space.
+ */
+#define RDMA_WRITE_UAPI_ATOMIC(member, x) smp_store_release(&(member).val, x)
+#include <rdma/rvt-abi.h>
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct rvt_k_cq_wc {
+ u32 head; /* index of next entry to fill */
+ u32 tail; /* index of next ib_poll_cq() entry */
+ struct ib_wc kqueue[];
+};
+
+/*
+ * The completion queue structure.
+ */
+struct rvt_cq {
+ struct ib_cq ibcq;
+ struct work_struct comptask;
+ spinlock_t lock; /* protect changes in this struct */
+ u8 notify;
+ u8 triggered;
+ u8 cq_full;
+ int comp_vector_cpu;
+ struct rvt_dev_info *rdi;
+ struct rvt_cq_wc *queue;
+ struct rvt_mmap_info *ip;
+ struct rvt_k_cq_wc *kqueue;
+};
+
+static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct rvt_cq, ibcq);
+}
+
+bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited);
+
+#endif /* DEF_RDMAVT_INCCQH */
diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h
new file mode 100644
index 0000000000..c3367e9833
--- /dev/null
+++ b/include/rdma/rdmavt_mr.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#ifndef DEF_RDMAVT_INCMR_H
+#define DEF_RDMAVT_INCMR_H
+
+/*
+ * For Memory Regions. This stuff should probably be moved into rdmavt/mr.h once
+ * drivers no longer need access to the MR directly.
+ */
+#include <linux/percpu-refcount.h>
+
+/*
+ * A segment is a linear region of low physical memory.
+ * Used by the verbs layer.
+ */
+struct rvt_seg {
+ void *vaddr;
+ size_t length;
+};
+
+/* The number of rvt_segs that fit in a page. */
+#define RVT_SEGSZ (PAGE_SIZE / sizeof(struct rvt_seg))
+
+struct rvt_segarray {
+ struct rvt_seg segs[RVT_SEGSZ];
+};
+
+struct rvt_mregion {
+ struct ib_pd *pd; /* shares refcnt of ibmr.pd */
+ u64 user_base; /* User's address for this region */
+ u64 iova; /* IB start address of this region */
+ size_t length;
+ u32 lkey;
+ u32 offset; /* offset (bytes) to start of region */
+ int access_flags;
+ u32 max_segs; /* number of rvt_segs in all the arrays */
+ u32 mapsz; /* size of the map array */
+ atomic_t lkey_invalid; /* true if current lkey is invalid */
+ u8 page_shift; /* 0 - non unform/non powerof2 sizes */
+ u8 lkey_published; /* in global table */
+ struct percpu_ref refcount;
+ struct completion comp; /* complete when refcount goes to zero */
+ struct rvt_segarray *map[]; /* the segments */
+};
+
+#define RVT_MAX_LKEY_TABLE_BITS 23
+
+struct rvt_lkey_table {
+ /* read mostly fields */
+ u32 max; /* size of the table */
+ u32 shift; /* lkey/rkey shift */
+ struct rvt_mregion __rcu **table;
+ /* writeable fields */
+ /* protect changes in this struct */
+ spinlock_t lock ____cacheline_aligned_in_smp;
+ u32 next; /* next unused index (speeds search) */
+ u32 gen; /* generation count */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct rvt_sge {
+ struct rvt_mregion *mr;
+ void *vaddr; /* kernel virtual address of segment */
+ u32 sge_length; /* length of the SGE */
+ u32 length; /* remaining length of the segment */
+ u16 m; /* current index: mr->map[m] */
+ u16 n; /* current index: mr->map[m]->segs[n] */
+};
+
+struct rvt_sge_state {
+ struct rvt_sge *sg_list; /* next SGE to be used if any */
+ struct rvt_sge sge; /* progress state for the current SGE */
+ u32 total_len;
+ u8 num_sge;
+};
+
+static inline void rvt_put_mr(struct rvt_mregion *mr)
+{
+ percpu_ref_put(&mr->refcount);
+}
+
+static inline void rvt_get_mr(struct rvt_mregion *mr)
+{
+ percpu_ref_get(&mr->refcount);
+}
+
+static inline void rvt_put_ss(struct rvt_sge_state *ss)
+{
+ while (ss->num_sge) {
+ rvt_put_mr(ss->sge.mr);
+ if (--ss->num_sge)
+ ss->sge = *ss->sg_list++;
+ }
+}
+
+static inline u32 rvt_get_sge_length(struct rvt_sge *sge, u32 length)
+{
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+
+ return len;
+}
+
+static inline void rvt_update_sge(struct rvt_sge_state *ss, u32 length,
+ bool release)
+{
+ struct rvt_sge *sge = &ss->sge;
+
+ sge->vaddr += length;
+ sge->length -= length;
+ sge->sge_length -= length;
+ if (sge->sge_length == 0) {
+ if (release)
+ rvt_put_mr(sge->mr);
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= RVT_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ return;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+}
+
+static inline void rvt_skip_sge(struct rvt_sge_state *ss, u32 length,
+ bool release)
+{
+ struct rvt_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = rvt_get_sge_length(sge, length);
+
+ WARN_ON_ONCE(len == 0);
+ rvt_update_sge(ss, len, release);
+ length -= len;
+ }
+}
+
+bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey);
+bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey);
+
+#endif /* DEF_RDMAVT_INCMRH */
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
new file mode 100644
index 0000000000..2e58d5e6ac
--- /dev/null
+++ b/include/rdma/rdmavt_qp.h
@@ -0,0 +1,1003 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright(c) 2016 - 2020 Intel Corporation.
+ */
+
+#ifndef DEF_RDMAVT_INCQP_H
+#define DEF_RDMAVT_INCQP_H
+
+#include <rdma/rdma_vt.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdmavt_cq.h>
+#include <rdma/rvt-abi.h>
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define RVT_R_WRID_VALID 0
+#define RVT_R_REWIND_SGE 1
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define RVT_R_REUSE_SGE 0x01
+#define RVT_R_RDMAR_SEQ 0x02
+#define RVT_R_RSP_NAK 0x04
+#define RVT_R_RSP_SEND 0x08
+#define RVT_R_COMM_EST 0x10
+
+/*
+ * If a packet's QP[23:16] bits match this value, then it is
+ * a PSM packet and the hardware will expect a KDETH header
+ * following the BTH.
+ */
+#define RVT_KDETH_QP_PREFIX 0x80
+#define RVT_KDETH_QP_SUFFIX 0xffff
+#define RVT_KDETH_QP_PREFIX_MASK 0x00ff0000
+#define RVT_KDETH_QP_PREFIX_SHIFT 16
+#define RVT_KDETH_QP_BASE (u32)(RVT_KDETH_QP_PREFIX << \
+ RVT_KDETH_QP_PREFIX_SHIFT)
+#define RVT_KDETH_QP_MAX (u32)(RVT_KDETH_QP_BASE + RVT_KDETH_QP_SUFFIX)
+
+/*
+ * If a packet's LNH == BTH and DEST QPN[23:16] in the BTH match this
+ * prefix value, then it is an AIP packet with a DETH containing the entropy
+ * value in byte 4 following the BTH.
+ */
+#define RVT_AIP_QP_PREFIX 0x81
+#define RVT_AIP_QP_SUFFIX 0xffff
+#define RVT_AIP_QP_PREFIX_MASK 0x00ff0000
+#define RVT_AIP_QP_PREFIX_SHIFT 16
+#define RVT_AIP_QP_BASE (u32)(RVT_AIP_QP_PREFIX << \
+ RVT_AIP_QP_PREFIX_SHIFT)
+#define RVT_AIP_QPN_MAX BIT(RVT_AIP_QP_PREFIX_SHIFT)
+#define RVT_AIP_QP_MAX (u32)(RVT_AIP_QP_BASE + RVT_AIP_QPN_MAX - 1)
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * RVT_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled
+ * RVT_S_BUSY - send tasklet is processing the QP
+ * RVT_S_TIMER - the RC retry timer is active
+ * RVT_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics
+ * RVT_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs
+ * before processing the next SWQE
+ * RVT_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete
+ * before processing the next SWQE
+ * RVT_S_WAIT_RNR - waiting for RNR timeout
+ * RVT_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * RVT_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ * next send completion entry not via send DMA
+ * RVT_S_WAIT_PIO - waiting for a send buffer to be available
+ * RVT_S_WAIT_TX - waiting for a struct verbs_txreq to be available
+ * RVT_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
+ * RVT_S_WAIT_KMEM - waiting for kernel memory to be available
+ * RVT_S_WAIT_PSN - waiting for a packet to exit the send DMA queue
+ * RVT_S_WAIT_ACK - waiting for an ACK packet before sending more requests
+ * RVT_S_SEND_ONE - send one packet, request ACK, then wait for ACK
+ * RVT_S_ECN - a BECN was queued to the send engine
+ * RVT_S_MAX_BIT_MASK - The max bit that can be used by rdmavt
+ */
+#define RVT_S_SIGNAL_REQ_WR 0x0001
+#define RVT_S_BUSY 0x0002
+#define RVT_S_TIMER 0x0004
+#define RVT_S_RESP_PENDING 0x0008
+#define RVT_S_ACK_PENDING 0x0010
+#define RVT_S_WAIT_FENCE 0x0020
+#define RVT_S_WAIT_RDMAR 0x0040
+#define RVT_S_WAIT_RNR 0x0080
+#define RVT_S_WAIT_SSN_CREDIT 0x0100
+#define RVT_S_WAIT_DMA 0x0200
+#define RVT_S_WAIT_PIO 0x0400
+#define RVT_S_WAIT_TX 0x0800
+#define RVT_S_WAIT_DMA_DESC 0x1000
+#define RVT_S_WAIT_KMEM 0x2000
+#define RVT_S_WAIT_PSN 0x4000
+#define RVT_S_WAIT_ACK 0x8000
+#define RVT_S_SEND_ONE 0x10000
+#define RVT_S_UNLIMITED_CREDIT 0x20000
+#define RVT_S_ECN 0x40000
+#define RVT_S_MAX_BIT_MASK 0x800000
+
+/*
+ * Drivers should use s_flags starting with bit 31 down to the bit next to
+ * RVT_S_MAX_BIT_MASK
+ */
+
+/*
+ * Wait flags that would prevent any packet type from being sent.
+ */
+#define RVT_S_ANY_WAIT_IO \
+ (RVT_S_WAIT_PIO | RVT_S_WAIT_TX | \
+ RVT_S_WAIT_DMA_DESC | RVT_S_WAIT_KMEM)
+
+/*
+ * Wait flags that would prevent send work requests from making progress.
+ */
+#define RVT_S_ANY_WAIT_SEND (RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | \
+ RVT_S_WAIT_RNR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA | \
+ RVT_S_WAIT_PSN | RVT_S_WAIT_ACK)
+
+#define RVT_S_ANY_WAIT (RVT_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+
+/* Number of bits to pay attention to in the opcode for checking qp type */
+#define RVT_OPCODE_QP_MASK 0xE0
+
+/* Flags for checking QP state (see ib_rvt_state_ops[]) */
+#define RVT_POST_SEND_OK 0x01
+#define RVT_POST_RECV_OK 0x02
+#define RVT_PROCESS_RECV_OK 0x04
+#define RVT_PROCESS_SEND_OK 0x08
+#define RVT_PROCESS_NEXT_SEND_OK 0x10
+#define RVT_FLUSH_SEND 0x20
+#define RVT_FLUSH_RECV 0x40
+#define RVT_PROCESS_OR_FLUSH_SEND \
+ (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND)
+#define RVT_SEND_OR_FLUSH_OR_RECV_OK \
+ (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND | RVT_PROCESS_RECV_OK)
+
+/*
+ * Internal send flags
+ */
+#define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START
+#define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1)
+
+/**
+ * rvt_ud_wr - IB UD work plus AH cache
+ * @wr: valid IB work request
+ * @attr: pointer to an allocated AH attribute
+ *
+ * Special case the UD WR so we can keep track of the AH attributes.
+ *
+ * NOTE: This data structure is stricly ordered wr then attr. I.e the attr
+ * MUST come after wr. The ib_ud_wr is sized and copied in rvt_post_one_wr.
+ * The copy assumes that wr is first.
+ */
+struct rvt_ud_wr {
+ struct ib_ud_wr wr;
+ struct rdma_ah_attr *attr;
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct rvt_swqe {
+ union {
+ struct ib_send_wr wr; /* don't use wr.sg_list */
+ struct rvt_ud_wr ud_wr;
+ struct ib_reg_wr reg_wr;
+ struct ib_rdma_wr rdma_wr;
+ struct ib_atomic_wr atomic_wr;
+ };
+ u32 psn; /* first packet sequence number */
+ u32 lpsn; /* last packet sequence number */
+ u32 ssn; /* send sequence number */
+ u32 length; /* total length of data in sg_list */
+ void *priv; /* driver dependent field */
+ struct rvt_sge sg_list[];
+};
+
+/**
+ * struct rvt_krwq - kernel struct receive work request
+ * @p_lock: lock to protect producer of the kernel buffer
+ * @head: index of next entry to fill
+ * @c_lock:lock to protect consumer of the kernel buffer
+ * @tail: index of next entry to pull
+ * @count: count is aproximate of total receive enteries posted
+ * @rvt_rwqe: struct of receive work request queue entry
+ *
+ * This structure is used to contain the head pointer,
+ * tail pointer and receive work queue entries for kernel
+ * mode user.
+ */
+struct rvt_krwq {
+ spinlock_t p_lock; /* protect producer */
+ u32 head; /* new work requests posted to the head */
+
+ /* protect consumer */
+ spinlock_t c_lock ____cacheline_aligned_in_smp;
+ u32 tail; /* receives pull requests from here. */
+ u32 count; /* approx count of receive entries posted */
+ struct rvt_rwqe *curr_wq;
+ struct rvt_rwqe wq[];
+};
+
+/*
+ * rvt_get_swqe_ah - Return the pointer to the struct rvt_ah
+ * @swqe: valid Send WQE
+ *
+ */
+static inline struct rvt_ah *rvt_get_swqe_ah(struct rvt_swqe *swqe)
+{
+ return ibah_to_rvtah(swqe->ud_wr.wr.ah);
+}
+
+/**
+ * rvt_get_swqe_ah_attr - Return the cached ah attribute information
+ * @swqe: valid Send WQE
+ *
+ */
+static inline struct rdma_ah_attr *rvt_get_swqe_ah_attr(struct rvt_swqe *swqe)
+{
+ return swqe->ud_wr.attr;
+}
+
+/**
+ * rvt_get_swqe_remote_qpn - Access the remote QPN value
+ * @swqe: valid Send WQE
+ *
+ */
+static inline u32 rvt_get_swqe_remote_qpn(struct rvt_swqe *swqe)
+{
+ return swqe->ud_wr.wr.remote_qpn;
+}
+
+/**
+ * rvt_get_swqe_remote_qkey - Acces the remote qkey value
+ * @swqe: valid Send WQE
+ *
+ */
+static inline u32 rvt_get_swqe_remote_qkey(struct rvt_swqe *swqe)
+{
+ return swqe->ud_wr.wr.remote_qkey;
+}
+
+/**
+ * rvt_get_swqe_pkey_index - Access the pkey index
+ * @swqe: valid Send WQE
+ *
+ */
+static inline u16 rvt_get_swqe_pkey_index(struct rvt_swqe *swqe)
+{
+ return swqe->ud_wr.wr.pkey_index;
+}
+
+struct rvt_rq {
+ struct rvt_rwq *wq;
+ struct rvt_krwq *kwq;
+ u32 size; /* size of RWQE array */
+ u8 max_sge;
+ /* protect changes in this struct */
+ spinlock_t lock ____cacheline_aligned_in_smp;
+};
+
+/**
+ * rvt_get_rq_count - count numbers of request work queue entries
+ * in circular buffer
+ * @rq: data structure for request queue entry
+ * @head: head indices of the circular buffer
+ * @tail: tail indices of the circular buffer
+ *
+ * Return - total number of entries in the Receive Queue
+ */
+
+static inline u32 rvt_get_rq_count(struct rvt_rq *rq, u32 head, u32 tail)
+{
+ u32 count = head - tail;
+
+ if ((s32)count < 0)
+ count += rq->size;
+ return count;
+}
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct rvt_ack_entry {
+ struct rvt_sge rdma_sge;
+ u64 atomic_data;
+ u32 psn;
+ u32 lpsn;
+ u8 opcode;
+ u8 sent;
+ void *priv;
+};
+
+#define RC_QP_SCALING_INTERVAL 5
+
+#define RVT_OPERATION_PRIV 0x00000001
+#define RVT_OPERATION_ATOMIC 0x00000002
+#define RVT_OPERATION_ATOMIC_SGE 0x00000004
+#define RVT_OPERATION_LOCAL 0x00000008
+#define RVT_OPERATION_USE_RESERVE 0x00000010
+#define RVT_OPERATION_IGN_RNR_CNT 0x00000020
+
+#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1)
+
+/**
+ * rvt_operation_params - op table entry
+ * @length - the length to copy into the swqe entry
+ * @qpt_support - a bit mask indicating QP type support
+ * @flags - RVT_OPERATION flags (see above)
+ *
+ * This supports table driven post send so that
+ * the driver can have differing an potentially
+ * different sets of operations.
+ *
+ **/
+
+struct rvt_operation_params {
+ size_t length;
+ u32 qpt_support;
+ u32 flags;
+};
+
+/*
+ * Common variables are protected by both r_rq.lock and s_lock in that order
+ * which only happens in modify_qp() or changing the QP 'state'.
+ */
+struct rvt_qp {
+ struct ib_qp ibqp;
+ void *priv; /* Driver private data */
+ /* read mostly fields above and below */
+ struct rdma_ah_attr remote_ah_attr;
+ struct rdma_ah_attr alt_ah_attr;
+ struct rvt_qp __rcu *next; /* link list for QPN hash table */
+ struct rvt_swqe *s_wq; /* send work queue */
+ struct rvt_mmap_info *ip;
+
+ unsigned long timeout_jiffies; /* computed from timeout */
+
+ int srate_mbps; /* s_srate (below) converted to Mbit/s */
+ pid_t pid; /* pid for user mode QPs */
+ u32 remote_qpn;
+ u32 qkey; /* QKEY for this QP (for UD or RD) */
+ u32 s_size; /* send work queue size */
+
+ u16 pmtu; /* decoded from path_mtu */
+ u8 log_pmtu; /* shift for pmtu */
+ u8 state; /* QP state */
+ u8 allowed_ops; /* high order bits of allowed opcodes */
+ u8 qp_access_flags;
+ u8 alt_timeout; /* Alternate path timeout for this QP */
+ u8 timeout; /* Timeout for this QP */
+ u8 s_srate;
+ u8 s_mig_state;
+ u8 port_num;
+ u8 s_pkey_index; /* PKEY index to use */
+ u8 s_alt_pkey_index; /* Alternate path PKEY index to use */
+ u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */
+ u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */
+ u8 s_retry_cnt; /* number of times to retry */
+ u8 s_rnr_retry_cnt;
+ u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */
+ u8 s_max_sge; /* size of s_wq->sg_list */
+ u8 s_draining;
+
+ /* start of read/write fields */
+ atomic_t refcount ____cacheline_aligned_in_smp;
+ wait_queue_head_t wait;
+
+ struct rvt_ack_entry *s_ack_queue;
+ struct rvt_sge_state s_rdma_read_sge;
+
+ spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */
+ u32 r_psn; /* expected rcv packet sequence number */
+ unsigned long r_aflags;
+ u64 r_wr_id; /* ID for current receive WQE */
+ u32 r_ack_psn; /* PSN for next ACK or atomic ACK */
+ u32 r_len; /* total length of r_sge */
+ u32 r_rcv_len; /* receive data len processed */
+ u32 r_msn; /* message sequence number */
+
+ u8 r_state; /* opcode of last packet received */
+ u8 r_flags;
+ u8 r_head_ack_queue; /* index into s_ack_queue[] */
+ u8 r_adefered; /* defered ack count */
+
+ struct list_head rspwait; /* link for waiting to respond */
+
+ struct rvt_sge_state r_sge; /* current receive data */
+ struct rvt_rq r_rq; /* receive work queue */
+
+ /* post send line */
+ spinlock_t s_hlock ____cacheline_aligned_in_smp;
+ u32 s_head; /* new entries added here */
+ u32 s_next_psn; /* PSN for next request */
+ u32 s_avail; /* number of entries avail */
+ u32 s_ssn; /* SSN of tail entry */
+ atomic_t s_reserved_used; /* reserved entries in use */
+
+ spinlock_t s_lock ____cacheline_aligned_in_smp;
+ u32 s_flags;
+ struct rvt_sge_state *s_cur_sge;
+ struct rvt_swqe *s_wqe;
+ struct rvt_sge_state s_sge; /* current send request data */
+ struct rvt_mregion *s_rdma_mr;
+ u32 s_len; /* total length of s_sge */
+ u32 s_rdma_read_len; /* total length of s_rdma_read_sge */
+ u32 s_last_psn; /* last response PSN processed */
+ u32 s_sending_psn; /* lowest PSN that is being sent */
+ u32 s_sending_hpsn; /* highest PSN that is being sent */
+ u32 s_psn; /* current packet sequence number */
+ u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */
+ u32 s_ack_psn; /* PSN for acking sends and RDMA writes */
+ u32 s_tail; /* next entry to process */
+ u32 s_cur; /* current work queue entry */
+ u32 s_acked; /* last un-ACK'ed entry */
+ u32 s_last; /* last completed entry */
+ u32 s_lsn; /* limit sequence number (credit) */
+ u32 s_ahgpsn; /* set to the psn in the copy of the header */
+ u16 s_cur_size; /* size of send packet in bytes */
+ u16 s_rdma_ack_cnt;
+ u8 s_hdrwords; /* size of s_hdr in 32 bit words */
+ s8 s_ahgidx;
+ u8 s_state; /* opcode of last packet sent */
+ u8 s_ack_state; /* opcode of packet to ACK */
+ u8 s_nak_state; /* non-zero if NAK is pending */
+ u8 r_nak_state; /* non-zero if NAK is pending */
+ u8 s_retry; /* requester retry counter */
+ u8 s_rnr_retry; /* requester RNR retry counter */
+ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */
+ u8 s_tail_ack_queue; /* index into s_ack_queue[] */
+ u8 s_acked_ack_queue; /* index into s_ack_queue[] */
+
+ struct rvt_sge_state s_ack_rdma_sge;
+ struct timer_list s_timer;
+ struct hrtimer s_rnr_timer;
+
+ atomic_t local_ops_pending; /* number of fast_reg/local_inv reqs */
+
+ /*
+ * This sge list MUST be last. Do not add anything below here.
+ */
+ struct rvt_sge *r_sg_list /* verified SGEs */
+ ____cacheline_aligned_in_smp;
+};
+
+struct rvt_srq {
+ struct ib_srq ibsrq;
+ struct rvt_rq rq;
+ struct rvt_mmap_info *ip;
+ /* send signal when number of RWQEs < limit */
+ u32 limit;
+};
+
+static inline struct rvt_srq *ibsrq_to_rvtsrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct rvt_srq, ibsrq);
+}
+
+static inline struct rvt_qp *ibqp_to_rvtqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct rvt_qp, ibqp);
+}
+
+#define RVT_QPN_MAX BIT(24)
+#define RVT_QPNMAP_ENTRIES (RVT_QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+#define RVT_BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
+#define RVT_BITS_PER_PAGE_MASK (RVT_BITS_PER_PAGE - 1)
+#define RVT_QPN_MASK IB_QPN_MASK
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct rvt_qpn_map {
+ void *page;
+};
+
+struct rvt_qpn_table {
+ spinlock_t lock; /* protect changes to the qp table */
+ unsigned flags; /* flags for QP0/1 allocated for each port */
+ u32 last; /* last QP number allocated */
+ u32 nmaps; /* size of the map table */
+ u16 limit;
+ u8 incr;
+ /* bit map of free QP numbers other than 0/1 */
+ struct rvt_qpn_map map[RVT_QPNMAP_ENTRIES];
+};
+
+struct rvt_qp_ibdev {
+ u32 qp_table_size;
+ u32 qp_table_bits;
+ struct rvt_qp __rcu **qp_table;
+ spinlock_t qpt_lock; /* qptable lock */
+ struct rvt_qpn_table qpn_table;
+};
+
+/*
+ * There is one struct rvt_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct rvt_mcast_qp.
+ */
+struct rvt_mcast_qp {
+ struct list_head list;
+ struct rvt_qp *qp;
+};
+
+struct rvt_mcast_addr {
+ union ib_gid mgid;
+ u16 lid;
+};
+
+struct rvt_mcast {
+ struct rb_node rb_node;
+ struct rvt_mcast_addr mcast_addr;
+ struct list_head qp_list;
+ wait_queue_head_t wait;
+ atomic_t refcount;
+ int n_attached;
+};
+
+/*
+ * Since struct rvt_swqe is not a fixed size, we can't simply index into
+ * struct rvt_qp.s_wq. This function does the array index computation.
+ */
+static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp,
+ unsigned n)
+{
+ return (struct rvt_swqe *)((char *)qp->s_wq +
+ (sizeof(struct rvt_swqe) +
+ qp->s_max_sge *
+ sizeof(struct rvt_sge)) * n);
+}
+
+/*
+ * Since struct rvt_rwqe is not a fixed size, we can't simply index into
+ * struct rvt_rwq.wq. This function does the array index computation.
+ */
+static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n)
+{
+ return (struct rvt_rwqe *)
+ ((char *)rq->kwq->curr_wq +
+ (sizeof(struct rvt_rwqe) +
+ rq->max_sge * sizeof(struct ib_sge)) * n);
+}
+
+/**
+ * rvt_is_user_qp - return if this is user mode QP
+ * @qp - the target QP
+ */
+static inline bool rvt_is_user_qp(struct rvt_qp *qp)
+{
+ return !!qp->pid;
+}
+
+/**
+ * rvt_get_qp - get a QP reference
+ * @qp - the QP to hold
+ */
+static inline void rvt_get_qp(struct rvt_qp *qp)
+{
+ atomic_inc(&qp->refcount);
+}
+
+/**
+ * rvt_put_qp - release a QP reference
+ * @qp - the QP to release
+ */
+static inline void rvt_put_qp(struct rvt_qp *qp)
+{
+ if (qp && atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+/**
+ * rvt_put_swqe - drop mr refs held by swqe
+ * @wqe - the send wqe
+ *
+ * This drops any mr references held by the swqe
+ */
+static inline void rvt_put_swqe(struct rvt_swqe *wqe)
+{
+ int i;
+
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct rvt_sge *sge = &wqe->sg_list[i];
+
+ rvt_put_mr(sge->mr);
+ }
+}
+
+/**
+ * rvt_qp_wqe_reserve - reserve operation
+ * @qp - the rvt qp
+ * @wqe - the send wqe
+ *
+ * This routine used in post send to record
+ * a wqe relative reserved operation use.
+ */
+static inline void rvt_qp_wqe_reserve(
+ struct rvt_qp *qp,
+ struct rvt_swqe *wqe)
+{
+ atomic_inc(&qp->s_reserved_used);
+}
+
+/**
+ * rvt_qp_wqe_unreserve - clean reserved operation
+ * @qp - the rvt qp
+ * @flags - send wqe flags
+ *
+ * This decrements the reserve use count.
+ *
+ * This call MUST precede the change to
+ * s_last to insure that post send sees a stable
+ * s_avail.
+ *
+ * An smp_mp__after_atomic() is used to insure
+ * the compiler does not juggle the order of the s_last
+ * ring index and the decrementing of s_reserved_used.
+ */
+static inline void rvt_qp_wqe_unreserve(struct rvt_qp *qp, int flags)
+{
+ if (unlikely(flags & RVT_SEND_RESERVE_USED)) {
+ atomic_dec(&qp->s_reserved_used);
+ /* insure no compiler re-order up to s_last change */
+ smp_mb__after_atomic();
+ }
+}
+
+extern const enum ib_wc_opcode ib_rvt_wc_opcode[];
+
+/*
+ * Compare the lower 24 bits of the msn values.
+ * Returns an integer <, ==, or > than zero.
+ */
+static inline int rvt_cmp_msn(u32 a, u32 b)
+{
+ return (((int)a) - ((int)b)) << 8;
+}
+
+__be32 rvt_compute_aeth(struct rvt_qp *qp);
+
+void rvt_get_credit(struct rvt_qp *qp, u32 aeth);
+
+u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len);
+
+/**
+ * rvt_div_round_up_mtu - round up divide
+ * @qp - the qp pair
+ * @len - the length
+ *
+ * Perform a shift based mtu round up divide
+ */
+static inline u32 rvt_div_round_up_mtu(struct rvt_qp *qp, u32 len)
+{
+ return (len + qp->pmtu - 1) >> qp->log_pmtu;
+}
+
+/**
+ * @qp - the qp pair
+ * @len - the length
+ *
+ * Perform a shift based mtu divide
+ */
+static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len)
+{
+ return len >> qp->log_pmtu;
+}
+
+/**
+ * rvt_timeout_to_jiffies - Convert a ULP timeout input into jiffies
+ * @timeout - timeout input(0 - 31).
+ *
+ * Return a timeout value in jiffies.
+ */
+static inline unsigned long rvt_timeout_to_jiffies(u8 timeout)
+{
+ if (timeout > 31)
+ timeout = 31;
+
+ return usecs_to_jiffies(1U << timeout) * 4096UL / 1000UL;
+}
+
+/**
+ * rvt_lookup_qpn - return the QP with the given QPN
+ * @ibp: the ibport
+ * @qpn: the QP number to look up
+ *
+ * The caller must hold the rcu_read_lock(), and keep the lock until
+ * the returned qp is no longer in use.
+ */
+static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
+ struct rvt_ibport *rvp,
+ u32 qpn) __must_hold(RCU)
+{
+ struct rvt_qp *qp = NULL;
+
+ if (unlikely(qpn <= 1)) {
+ qp = rcu_dereference(rvp->qp[qpn]);
+ } else {
+ u32 n = hash_32(qpn, rdi->qp_dev->qp_table_bits);
+
+ for (qp = rcu_dereference(rdi->qp_dev->qp_table[n]); qp;
+ qp = rcu_dereference(qp->next))
+ if (qp->ibqp.qp_num == qpn)
+ break;
+ }
+ return qp;
+}
+
+/**
+ * rvt_mod_retry_timer - mod a retry timer
+ * @qp - the QP
+ * @shift - timeout shift to wait for multiple packets
+ * Modify a potentially already running retry timer
+ */
+static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ lockdep_assert_held(&qp->s_lock);
+ qp->s_flags |= RVT_S_TIMER;
+ /* 4.096 usec. * (1 << qp->timeout) */
+ mod_timer(&qp->s_timer, jiffies + rdi->busy_jiffies +
+ (qp->timeout_jiffies << shift));
+}
+
+static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
+{
+ return rvt_mod_retry_timer_ext(qp, 0);
+}
+
+/**
+ * rvt_put_qp_swqe - drop refs held by swqe
+ * @qp: the send qp
+ * @wqe: the send wqe
+ *
+ * This drops any references held by the swqe
+ */
+static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ rvt_put_swqe(wqe);
+ if (qp->allowed_ops == IB_OPCODE_UD)
+ rdma_destroy_ah_attr(wqe->ud_wr.attr);
+}
+
+/**
+ * rvt_qp_sqwe_incr - increment ring index
+ * @qp: the qp
+ * @val: the starting value
+ *
+ * Return: the new value wrapping as appropriate
+ */
+static inline u32
+rvt_qp_swqe_incr(struct rvt_qp *qp, u32 val)
+{
+ if (++val >= qp->s_size)
+ val = 0;
+ return val;
+}
+
+int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err);
+
+/**
+ * rvt_recv_cq - add a new entry to completion queue
+ * by receive queue
+ * @qp: receive queue
+ * @wc: work completion entry to add
+ * @solicited: true if @entry is solicited
+ *
+ * This is wrapper function for rvt_enter_cq function call by
+ * receive queue. If rvt_cq_enter return false, it means cq is
+ * full and the qp is put into error state.
+ */
+static inline void rvt_recv_cq(struct rvt_qp *qp, struct ib_wc *wc,
+ bool solicited)
+{
+ struct rvt_cq *cq = ibcq_to_rvtcq(qp->ibqp.recv_cq);
+
+ if (unlikely(!rvt_cq_enter(cq, wc, solicited)))
+ rvt_error_qp(qp, IB_WC_LOC_QP_OP_ERR);
+}
+
+/**
+ * rvt_send_cq - add a new entry to completion queue
+ * by send queue
+ * @qp: send queue
+ * @wc: work completion entry to add
+ * @solicited: true if @entry is solicited
+ *
+ * This is wrapper function for rvt_enter_cq function call by
+ * send queue. If rvt_cq_enter return false, it means cq is
+ * full and the qp is put into error state.
+ */
+static inline void rvt_send_cq(struct rvt_qp *qp, struct ib_wc *wc,
+ bool solicited)
+{
+ struct rvt_cq *cq = ibcq_to_rvtcq(qp->ibqp.send_cq);
+
+ if (unlikely(!rvt_cq_enter(cq, wc, solicited)))
+ rvt_error_qp(qp, IB_WC_LOC_QP_OP_ERR);
+}
+
+/**
+ * rvt_qp_complete_swqe - insert send completion
+ * @qp - the qp
+ * @wqe - the send wqe
+ * @opcode - wc operation (driver dependent)
+ * @status - completion status
+ *
+ * Update the s_last information, and then insert a send
+ * completion into the completion
+ * queue if the qp indicates it should be done.
+ *
+ * See IBTA 10.7.3.1 for info on completion
+ * control.
+ *
+ * Return: new last
+ */
+static inline u32
+rvt_qp_complete_swqe(struct rvt_qp *qp,
+ struct rvt_swqe *wqe,
+ enum ib_wc_opcode opcode,
+ enum ib_wc_status status)
+{
+ bool need_completion;
+ u64 wr_id;
+ u32 byte_len, last;
+ int flags = wqe->wr.send_flags;
+
+ rvt_qp_wqe_unreserve(qp, flags);
+ rvt_put_qp_swqe(qp, wqe);
+
+ need_completion =
+ !(flags & RVT_SEND_RESERVE_USED) &&
+ (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
+ (flags & IB_SEND_SIGNALED) ||
+ status != IB_WC_SUCCESS);
+ if (need_completion) {
+ wr_id = wqe->wr.wr_id;
+ byte_len = wqe->length;
+ /* above fields required before writing s_last */
+ }
+ last = rvt_qp_swqe_incr(qp, qp->s_last);
+ /* see rvt_qp_is_avail() */
+ smp_store_release(&qp->s_last, last);
+ if (need_completion) {
+ struct ib_wc w = {
+ .wr_id = wr_id,
+ .status = status,
+ .opcode = opcode,
+ .qp = &qp->ibqp,
+ .byte_len = byte_len,
+ };
+ rvt_send_cq(qp, &w, status != IB_WC_SUCCESS);
+ }
+ return last;
+}
+
+extern const int ib_rvt_state_ops[];
+
+struct rvt_dev_info;
+int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only);
+void rvt_comm_est(struct rvt_qp *qp);
+void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err);
+unsigned long rvt_rnr_tbl_to_usec(u32 index);
+enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t);
+void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth);
+void rvt_del_timers_sync(struct rvt_qp *qp);
+void rvt_stop_rc_timers(struct rvt_qp *qp);
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift);
+static inline void rvt_add_retry_timer(struct rvt_qp *qp)
+{
+ rvt_add_retry_timer_ext(qp, 0);
+}
+
+void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
+ void *data, u32 length,
+ bool release, bool copy_last);
+void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ enum ib_wc_status status);
+void rvt_ruc_loopback(struct rvt_qp *qp);
+
+/**
+ * struct rvt_qp_iter - the iterator for QPs
+ * @qp - the current QP
+ *
+ * This structure defines the current iterator
+ * state for sequenced access to all QPs relative
+ * to an rvt_dev_info.
+ */
+struct rvt_qp_iter {
+ struct rvt_qp *qp;
+ /* private: backpointer */
+ struct rvt_dev_info *rdi;
+ /* private: callback routine */
+ void (*cb)(struct rvt_qp *qp, u64 v);
+ /* private: for arg to callback routine */
+ u64 v;
+ /* private: number of SMI,GSI QPs for device */
+ int specials;
+ /* private: current iterator index */
+ int n;
+};
+
+/**
+ * ib_cq_tail - Return tail index of cq buffer
+ * @send_cq - The cq for send
+ *
+ * This is called in qp_iter_print to get tail
+ * of cq buffer.
+ */
+static inline u32 ib_cq_tail(struct ib_cq *send_cq)
+{
+ struct rvt_cq *cq = ibcq_to_rvtcq(send_cq);
+
+ return ibcq_to_rvtcq(send_cq)->ip ?
+ RDMA_READ_UAPI_ATOMIC(cq->queue->tail) :
+ ibcq_to_rvtcq(send_cq)->kqueue->tail;
+}
+
+/**
+ * ib_cq_head - Return head index of cq buffer
+ * @send_cq - The cq for send
+ *
+ * This is called in qp_iter_print to get head
+ * of cq buffer.
+ */
+static inline u32 ib_cq_head(struct ib_cq *send_cq)
+{
+ struct rvt_cq *cq = ibcq_to_rvtcq(send_cq);
+
+ return ibcq_to_rvtcq(send_cq)->ip ?
+ RDMA_READ_UAPI_ATOMIC(cq->queue->head) :
+ ibcq_to_rvtcq(send_cq)->kqueue->head;
+}
+
+/**
+ * rvt_free_rq - free memory allocated for rvt_rq struct
+ * @rvt_rq: request queue data structure
+ *
+ * This function should only be called if the rvt_mmap_info()
+ * has not succeeded.
+ */
+static inline void rvt_free_rq(struct rvt_rq *rq)
+{
+ kvfree(rq->kwq);
+ rq->kwq = NULL;
+ vfree(rq->wq);
+ rq->wq = NULL;
+}
+
+/**
+ * rvt_to_iport - Get the ibport pointer
+ * @qp: the qp pointer
+ *
+ * This function returns the ibport pointer from the qp pointer.
+ */
+static inline struct rvt_ibport *rvt_to_iport(struct rvt_qp *qp)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+
+ return rdi->ports[qp->port_num - 1];
+}
+
+/**
+ * rvt_rc_credit_avail - Check if there are enough RC credits for the request
+ * @qp: the qp
+ * @wqe: the request
+ *
+ * This function returns false when there are not enough credits for the given
+ * request and true otherwise.
+ */
+static inline bool rvt_rc_credit_avail(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ lockdep_assert_held(&qp->s_lock);
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
+ rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
+ struct rvt_ibport *rvp = rvt_to_iport(qp);
+
+ qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+ rvp->n_rc_crwaits++;
+ return false;
+ }
+ return true;
+}
+
+struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi,
+ u64 v,
+ void (*cb)(struct rvt_qp *qp, u64 v));
+int rvt_qp_iter_next(struct rvt_qp_iter *iter);
+void rvt_qp_iter(struct rvt_dev_info *rdi,
+ u64 v,
+ void (*cb)(struct rvt_qp *qp, u64 v));
+void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey);
+#endif /* DEF_RDMAVT_INCQP_H */
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
new file mode 100644
index 0000000000..8b7c46daeb
--- /dev/null
+++ b/include/rdma/restrack.h
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_RESTRACK_H_
+#define _RDMA_RESTRACK_H_
+
+#include <linux/typecheck.h>
+#include <linux/sched.h>
+#include <linux/kref.h>
+#include <linux/completion.h>
+#include <linux/sched/task.h>
+#include <uapi/rdma/rdma_netlink.h>
+#include <linux/xarray.h>
+
+struct ib_device;
+struct sk_buff;
+
+/**
+ * enum rdma_restrack_type - HW objects to track
+ */
+enum rdma_restrack_type {
+ /**
+ * @RDMA_RESTRACK_PD: Protection domain (PD)
+ */
+ RDMA_RESTRACK_PD,
+ /**
+ * @RDMA_RESTRACK_CQ: Completion queue (CQ)
+ */
+ RDMA_RESTRACK_CQ,
+ /**
+ * @RDMA_RESTRACK_QP: Queue pair (QP)
+ */
+ RDMA_RESTRACK_QP,
+ /**
+ * @RDMA_RESTRACK_CM_ID: Connection Manager ID (CM_ID)
+ */
+ RDMA_RESTRACK_CM_ID,
+ /**
+ * @RDMA_RESTRACK_MR: Memory Region (MR)
+ */
+ RDMA_RESTRACK_MR,
+ /**
+ * @RDMA_RESTRACK_CTX: Verbs contexts (CTX)
+ */
+ RDMA_RESTRACK_CTX,
+ /**
+ * @RDMA_RESTRACK_COUNTER: Statistic Counter
+ */
+ RDMA_RESTRACK_COUNTER,
+ /**
+ * @RDMA_RESTRACK_SRQ: Shared receive queue (SRQ)
+ */
+ RDMA_RESTRACK_SRQ,
+ /**
+ * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations
+ */
+ RDMA_RESTRACK_MAX
+};
+
+/**
+ * struct rdma_restrack_entry - metadata per-entry
+ */
+struct rdma_restrack_entry {
+ /**
+ * @valid: validity indicator
+ *
+ * The entries are filled during rdma_restrack_add,
+ * can be attempted to be free during rdma_restrack_del.
+ *
+ * As an example for that, see mlx5 QPs with type MLX5_IB_QPT_HW_GSI
+ */
+ bool valid;
+ /**
+ * @no_track: don't add this entry to restrack DB
+ *
+ * This field is used to mark an entry that doesn't need to be added to
+ * internal restrack DB and presented later to the users at the nldev
+ * query stage.
+ */
+ u8 no_track : 1;
+ /*
+ * @kref: Protect destroy of the resource
+ */
+ struct kref kref;
+ /*
+ * @comp: Signal that all consumers of resource are completed their work
+ */
+ struct completion comp;
+ /**
+ * @task: owner of resource tracking entity
+ *
+ * There are two types of entities: created by user and created
+ * by kernel.
+ *
+ * This is relevant for the entities created by users.
+ * For the entities created by kernel, this pointer will be NULL.
+ */
+ struct task_struct *task;
+ /**
+ * @kern_name: name of owner for the kernel created entities.
+ */
+ const char *kern_name;
+ /**
+ * @type: various objects in restrack database
+ */
+ enum rdma_restrack_type type;
+ /**
+ * @user: user resource
+ */
+ bool user;
+ /**
+ * @id: ID to expose to users
+ */
+ u32 id;
+};
+
+int rdma_restrack_count(struct ib_device *dev,
+ enum rdma_restrack_type type);
+/**
+ * rdma_is_kernel_res() - check the owner of resource
+ * @res: resource entry
+ */
+static inline bool rdma_is_kernel_res(const struct rdma_restrack_entry *res)
+{
+ return !res->user;
+}
+
+/**
+ * rdma_restrack_get() - grab to protect resource from release
+ * @res: resource entry
+ */
+int __must_check rdma_restrack_get(struct rdma_restrack_entry *res);
+
+/**
+ * rdma_restrack_put() - release resource
+ * @res: resource entry
+ */
+int rdma_restrack_put(struct rdma_restrack_entry *res);
+
+/*
+ * Helper functions for rdma drivers when filling out
+ * nldev driver attributes.
+ */
+int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value);
+int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name,
+ u32 value);
+int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value);
+int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name,
+ u64 value);
+int rdma_nl_put_driver_string(struct sk_buff *msg, const char *name,
+ const char *str);
+int rdma_nl_stat_hwcounter_entry(struct sk_buff *msg, const char *name,
+ u64 value);
+
+struct rdma_restrack_entry *rdma_restrack_get_byid(struct ib_device *dev,
+ enum rdma_restrack_type type,
+ u32 id);
+
+/**
+ * rdma_restrack_no_track() - don't add resource to the DB
+ * @res: resource entry
+ *
+ * Every user of this API should be cross examined.
+ * Probably you don't need to use this function.
+ */
+static inline void rdma_restrack_no_track(struct rdma_restrack_entry *res)
+{
+ res->no_track = true;
+}
+static inline bool rdma_restrack_is_tracked(struct rdma_restrack_entry *res)
+{
+ return !res->no_track;
+}
+#endif /* _RDMA_RESTRACK_H_ */
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
new file mode 100644
index 0000000000..d606cac482
--- /dev/null
+++ b/include/rdma/rw.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ */
+#ifndef _RDMA_RW_H
+#define _RDMA_RW_H
+
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/mr_pool.h>
+
+struct rdma_rw_ctx {
+ /* number of RDMA READ/WRITE WRs (not counting MR WRs) */
+ u32 nr_ops;
+
+ /* tag for the union below: */
+ u8 type;
+
+ union {
+ /* for mapping a single SGE: */
+ struct {
+ struct ib_sge sge;
+ struct ib_rdma_wr wr;
+ } single;
+
+ /* for mapping of multiple SGEs: */
+ struct {
+ struct ib_sge *sges;
+ struct ib_rdma_wr *wrs;
+ } map;
+
+ /* for registering multiple WRs: */
+ struct rdma_rw_reg_ctx {
+ struct ib_sge sge;
+ struct ib_rdma_wr wr;
+ struct ib_reg_wr reg_wr;
+ struct ib_send_wr inv_wr;
+ struct ib_mr *mr;
+ } *reg;
+ };
+};
+
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
+ struct scatterlist *sg, u32 sg_cnt, u32 sg_offset,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir);
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, struct scatterlist *sg, u32 sg_cnt,
+ enum dma_data_direction dir);
+
+int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, struct scatterlist *sg, u32 sg_cnt,
+ struct scatterlist *prot_sg, u32 prot_sg_cnt,
+ struct ib_sig_attrs *sig_attrs, u64 remote_addr, u32 rkey,
+ enum dma_data_direction dir);
+void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, struct scatterlist *sg, u32 sg_cnt,
+ struct scatterlist *prot_sg, u32 prot_sg_cnt,
+ enum dma_data_direction dir);
+
+struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u32 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr);
+int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u32 port_num,
+ struct ib_cqe *cqe, struct ib_send_wr *chain_wr);
+
+unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num,
+ unsigned int maxpages);
+void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr);
+int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr);
+void rdma_rw_cleanup_mrs(struct ib_qp *qp);
+
+#endif /* _RDMA_RW_H */
diff --git a/include/rdma/signature.h b/include/rdma/signature.h
new file mode 100644
index 0000000000..d16b0fcc83
--- /dev/null
+++ b/include/rdma/signature.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) */
+/*
+ * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_SIGNATURE_H_
+#define _RDMA_SIGNATURE_H_
+
+#include <linux/types.h>
+
+enum ib_signature_prot_cap {
+ IB_PROT_T10DIF_TYPE_1 = 1,
+ IB_PROT_T10DIF_TYPE_2 = 1 << 1,
+ IB_PROT_T10DIF_TYPE_3 = 1 << 2,
+};
+
+enum ib_signature_guard_cap {
+ IB_GUARD_T10DIF_CRC = 1,
+ IB_GUARD_T10DIF_CSUM = 1 << 1,
+};
+
+/**
+ * enum ib_signature_type - Signature types
+ * @IB_SIG_TYPE_NONE: Unprotected.
+ * @IB_SIG_TYPE_T10_DIF: Type T10-DIF
+ */
+enum ib_signature_type {
+ IB_SIG_TYPE_NONE,
+ IB_SIG_TYPE_T10_DIF,
+};
+
+/**
+ * enum ib_t10_dif_bg_type - Signature T10-DIF block-guard types
+ * @IB_T10DIF_CRC: Corresponds to T10-PI mandated CRC checksum rules.
+ * @IB_T10DIF_CSUM: Corresponds to IP checksum rules.
+ */
+enum ib_t10_dif_bg_type {
+ IB_T10DIF_CRC,
+ IB_T10DIF_CSUM,
+};
+
+/**
+ * struct ib_t10_dif_domain - Parameters specific for T10-DIF
+ * domain.
+ * @bg_type: T10-DIF block guard type (CRC|CSUM)
+ * @pi_interval: protection information interval.
+ * @bg: seed of guard computation.
+ * @app_tag: application tag of guard block
+ * @ref_tag: initial guard block reference tag.
+ * @ref_remap: Indicate wethear the reftag increments each block
+ * @app_escape: Indicate to skip block check if apptag=0xffff
+ * @ref_escape: Indicate to skip block check if reftag=0xffffffff
+ * @apptag_check_mask: check bitmask of application tag.
+ */
+struct ib_t10_dif_domain {
+ enum ib_t10_dif_bg_type bg_type;
+ u16 pi_interval;
+ u16 bg;
+ u16 app_tag;
+ u32 ref_tag;
+ bool ref_remap;
+ bool app_escape;
+ bool ref_escape;
+ u16 apptag_check_mask;
+};
+
+/**
+ * struct ib_sig_domain - Parameters for signature domain
+ * @sig_type: specific signauture type
+ * @sig: union of all signature domain attributes that may
+ * be used to set domain layout.
+ */
+struct ib_sig_domain {
+ enum ib_signature_type sig_type;
+ union {
+ struct ib_t10_dif_domain dif;
+ } sig;
+};
+
+/**
+ * struct ib_sig_attrs - Parameters for signature handover operation
+ * @check_mask: bitmask for signature byte check (8 bytes)
+ * @mem: memory domain layout descriptor.
+ * @wire: wire domain layout descriptor.
+ * @meta_length: metadata length
+ */
+struct ib_sig_attrs {
+ u8 check_mask;
+ struct ib_sig_domain mem;
+ struct ib_sig_domain wire;
+ int meta_length;
+};
+
+enum ib_sig_err_type {
+ IB_SIG_BAD_GUARD,
+ IB_SIG_BAD_REFTAG,
+ IB_SIG_BAD_APPTAG,
+};
+
+/*
+ * Signature check masks (8 bytes in total) according to the T10-PI standard:
+ * -------- -------- ------------
+ * | GUARD | APPTAG | REFTAG |
+ * | 2B | 2B | 4B |
+ * -------- -------- ------------
+ */
+enum {
+ IB_SIG_CHECK_GUARD = 0xc0,
+ IB_SIG_CHECK_APPTAG = 0x30,
+ IB_SIG_CHECK_REFTAG = 0x0f,
+};
+
+/*
+ * struct ib_sig_err - signature error descriptor
+ */
+struct ib_sig_err {
+ enum ib_sig_err_type err_type;
+ u32 expected;
+ u32 actual;
+ u64 sig_err_offset;
+ u32 key;
+};
+
+#endif /* _RDMA_SIGNATURE_H_ */
diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h
new file mode 100644
index 0000000000..08fe47c7ad
--- /dev/null
+++ b/include/rdma/tid_rdma_defs.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef TID_RDMA_DEFS_H
+#define TID_RDMA_DEFS_H
+
+#include <rdma/ib_pack.h>
+
+struct tid_rdma_read_req {
+ __le32 kdeth0;
+ __le32 kdeth1;
+ struct ib_reth reth;
+ __be32 tid_flow_psn;
+ __be32 tid_flow_qp;
+ __be32 verbs_qp;
+};
+
+struct tid_rdma_read_resp {
+ __le32 kdeth0;
+ __le32 kdeth1;
+ __be32 aeth;
+ __be32 reserved[4];
+ __be32 verbs_psn;
+ __be32 verbs_qp;
+};
+
+struct tid_rdma_write_req {
+ __le32 kdeth0;
+ __le32 kdeth1;
+ struct ib_reth reth;
+ __be32 reserved[2];
+ __be32 verbs_qp;
+};
+
+struct tid_rdma_write_resp {
+ __le32 kdeth0;
+ __le32 kdeth1;
+ __be32 aeth;
+ __be32 reserved[3];
+ __be32 tid_flow_psn;
+ __be32 tid_flow_qp;
+ __be32 verbs_qp;
+};
+
+struct tid_rdma_write_data {
+ __le32 kdeth0;
+ __le32 kdeth1;
+ __be32 reserved[6];
+ __be32 verbs_qp;
+};
+
+struct tid_rdma_resync {
+ __le32 kdeth0;
+ __le32 kdeth1;
+ __be32 reserved[6];
+ __be32 verbs_qp;
+};
+
+struct tid_rdma_ack {
+ __le32 kdeth0;
+ __le32 kdeth1;
+ __be32 aeth;
+ __be32 reserved[2];
+ __be32 tid_flow_psn;
+ __be32 verbs_psn;
+ __be32 tid_flow_qp;
+ __be32 verbs_qp;
+};
+
+/*
+ * TID RDMA Opcodes
+ */
+#define IB_OPCODE_TID_RDMA 0xe0
+enum {
+ IB_OPCODE_WRITE_REQ = 0x0,
+ IB_OPCODE_WRITE_RESP = 0x1,
+ IB_OPCODE_WRITE_DATA = 0x2,
+ IB_OPCODE_WRITE_DATA_LAST = 0x3,
+ IB_OPCODE_READ_REQ = 0x4,
+ IB_OPCODE_READ_RESP = 0x5,
+ IB_OPCODE_RESYNC = 0x6,
+ IB_OPCODE_ACK = 0x7,
+
+ IB_OPCODE(TID_RDMA, WRITE_REQ),
+ IB_OPCODE(TID_RDMA, WRITE_RESP),
+ IB_OPCODE(TID_RDMA, WRITE_DATA),
+ IB_OPCODE(TID_RDMA, WRITE_DATA_LAST),
+ IB_OPCODE(TID_RDMA, READ_REQ),
+ IB_OPCODE(TID_RDMA, READ_RESP),
+ IB_OPCODE(TID_RDMA, RESYNC),
+ IB_OPCODE(TID_RDMA, ACK),
+};
+
+#define TID_OP(x) IB_OPCODE_TID_RDMA_##x
+
+/*
+ * Define TID RDMA specific WR opcodes. The ib_wr_opcode
+ * enum already provides some reserved values for use by
+ * low level drivers. Two of those are used but renamed
+ * to be more descriptive.
+ */
+#define IB_WR_TID_RDMA_WRITE IB_WR_RESERVED1
+#define IB_WR_TID_RDMA_READ IB_WR_RESERVED2
+
+#endif /* TID_RDMA_DEFS_H */
diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h
new file mode 100644
index 0000000000..06287de69c
--- /dev/null
+++ b/include/rdma/uverbs_ioctl.h
@@ -0,0 +1,1017 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _UVERBS_IOCTL_
+#define _UVERBS_IOCTL_
+
+#include <rdma/uverbs_types.h>
+#include <linux/uaccess.h>
+#include <rdma/rdma_user_ioctl.h>
+#include <rdma/ib_user_ioctl_verbs.h>
+#include <rdma/ib_user_ioctl_cmds.h>
+
+/*
+ * =======================================
+ * Verbs action specifications
+ * =======================================
+ */
+
+enum uverbs_attr_type {
+ UVERBS_ATTR_TYPE_NA,
+ UVERBS_ATTR_TYPE_PTR_IN,
+ UVERBS_ATTR_TYPE_PTR_OUT,
+ UVERBS_ATTR_TYPE_IDR,
+ UVERBS_ATTR_TYPE_FD,
+ UVERBS_ATTR_TYPE_RAW_FD,
+ UVERBS_ATTR_TYPE_ENUM_IN,
+ UVERBS_ATTR_TYPE_IDRS_ARRAY,
+};
+
+enum uverbs_obj_access {
+ UVERBS_ACCESS_READ,
+ UVERBS_ACCESS_WRITE,
+ UVERBS_ACCESS_NEW,
+ UVERBS_ACCESS_DESTROY
+};
+
+/* Specification of a single attribute inside the ioctl message */
+/* good size 16 */
+struct uverbs_attr_spec {
+ u8 type;
+
+ /*
+ * Support extending attributes by length. Allow the user to provide
+ * more bytes than ptr.len, but check that everything after is zero'd
+ * by the user.
+ */
+ u8 zero_trailing:1;
+ /*
+ * Valid only for PTR_IN. Allocate and copy the data inside
+ * the parser
+ */
+ u8 alloc_and_copy:1;
+ u8 mandatory:1;
+ /* True if this is from UVERBS_ATTR_UHW */
+ u8 is_udata:1;
+
+ union {
+ struct {
+ /* Current known size to kernel */
+ u16 len;
+ /* User isn't allowed to provide something < min_len */
+ u16 min_len;
+ } ptr;
+
+ struct {
+ /*
+ * higher bits mean the namespace and lower bits mean
+ * the type id within the namespace.
+ */
+ u16 obj_type;
+ u8 access;
+ } obj;
+
+ struct {
+ u8 num_elems;
+ } enum_def;
+ } u;
+
+ /* This weird split lets us remove some padding */
+ union {
+ struct {
+ /*
+ * The enum attribute can select one of the attributes
+ * contained in the ids array. Currently only PTR_IN
+ * attributes are supported in the ids array.
+ */
+ const struct uverbs_attr_spec *ids;
+ } enum_def;
+
+ struct {
+ /*
+ * higher bits mean the namespace and lower bits mean
+ * the type id within the namespace.
+ */
+ u16 obj_type;
+ u16 min_len;
+ u16 max_len;
+ u8 access;
+ } objs_arr;
+ } u2;
+};
+
+/*
+ * Information about the API is loaded into a radix tree. For IOCTL we start
+ * with a tuple of:
+ * object_id, attr_id, method_id
+ *
+ * Which is a 48 bit value, with most of the bits guaranteed to be zero. Based
+ * on the current kernel support this is compressed into 16 bit key for the
+ * radix tree. Since this compression is entirely internal to the kernel the
+ * below limits can be revised if the kernel gains additional data.
+ *
+ * With 64 leafs per node this is a 3 level radix tree.
+ *
+ * The tree encodes multiple types, and uses a scheme where OBJ_ID,0,0 returns
+ * the object slot, and OBJ_ID,METH_ID,0 and returns the method slot.
+ *
+ * This also encodes the tables for the write() and write() extended commands
+ * using the coding
+ * OBJ_ID,UVERBS_API_METHOD_IS_WRITE,command #
+ * OBJ_ID,UVERBS_API_METHOD_IS_WRITE_EX,command_ex #
+ * ie the WRITE path is treated as a special method type in the ioctl
+ * framework.
+ */
+enum uapi_radix_data {
+ UVERBS_API_NS_FLAG = 1U << UVERBS_ID_NS_SHIFT,
+
+ UVERBS_API_ATTR_KEY_BITS = 6,
+ UVERBS_API_ATTR_KEY_MASK = GENMASK(UVERBS_API_ATTR_KEY_BITS - 1, 0),
+ UVERBS_API_ATTR_BKEY_LEN = (1 << UVERBS_API_ATTR_KEY_BITS) - 1,
+ UVERBS_API_WRITE_KEY_NUM = 1 << UVERBS_API_ATTR_KEY_BITS,
+
+ UVERBS_API_METHOD_KEY_BITS = 5,
+ UVERBS_API_METHOD_KEY_SHIFT = UVERBS_API_ATTR_KEY_BITS,
+ UVERBS_API_METHOD_KEY_NUM_CORE = 22,
+ UVERBS_API_METHOD_IS_WRITE = 30 << UVERBS_API_METHOD_KEY_SHIFT,
+ UVERBS_API_METHOD_IS_WRITE_EX = 31 << UVERBS_API_METHOD_KEY_SHIFT,
+ UVERBS_API_METHOD_KEY_NUM_DRIVER =
+ (UVERBS_API_METHOD_IS_WRITE >> UVERBS_API_METHOD_KEY_SHIFT) -
+ UVERBS_API_METHOD_KEY_NUM_CORE,
+ UVERBS_API_METHOD_KEY_MASK = GENMASK(
+ UVERBS_API_METHOD_KEY_BITS + UVERBS_API_METHOD_KEY_SHIFT - 1,
+ UVERBS_API_METHOD_KEY_SHIFT),
+
+ UVERBS_API_OBJ_KEY_BITS = 5,
+ UVERBS_API_OBJ_KEY_SHIFT =
+ UVERBS_API_METHOD_KEY_BITS + UVERBS_API_METHOD_KEY_SHIFT,
+ UVERBS_API_OBJ_KEY_NUM_CORE = 20,
+ UVERBS_API_OBJ_KEY_NUM_DRIVER =
+ (1 << UVERBS_API_OBJ_KEY_BITS) - UVERBS_API_OBJ_KEY_NUM_CORE,
+ UVERBS_API_OBJ_KEY_MASK = GENMASK(31, UVERBS_API_OBJ_KEY_SHIFT),
+
+ /* This id guaranteed to not exist in the radix tree */
+ UVERBS_API_KEY_ERR = 0xFFFFFFFF,
+};
+
+static inline __attribute_const__ u32 uapi_key_obj(u32 id)
+{
+ if (id & UVERBS_API_NS_FLAG) {
+ id &= ~UVERBS_API_NS_FLAG;
+ if (id >= UVERBS_API_OBJ_KEY_NUM_DRIVER)
+ return UVERBS_API_KEY_ERR;
+ id = id + UVERBS_API_OBJ_KEY_NUM_CORE;
+ } else {
+ if (id >= UVERBS_API_OBJ_KEY_NUM_CORE)
+ return UVERBS_API_KEY_ERR;
+ }
+
+ return id << UVERBS_API_OBJ_KEY_SHIFT;
+}
+
+static inline __attribute_const__ bool uapi_key_is_object(u32 key)
+{
+ return (key & ~UVERBS_API_OBJ_KEY_MASK) == 0;
+}
+
+static inline __attribute_const__ u32 uapi_key_ioctl_method(u32 id)
+{
+ if (id & UVERBS_API_NS_FLAG) {
+ id &= ~UVERBS_API_NS_FLAG;
+ if (id >= UVERBS_API_METHOD_KEY_NUM_DRIVER)
+ return UVERBS_API_KEY_ERR;
+ id = id + UVERBS_API_METHOD_KEY_NUM_CORE;
+ } else {
+ id++;
+ if (id >= UVERBS_API_METHOD_KEY_NUM_CORE)
+ return UVERBS_API_KEY_ERR;
+ }
+
+ return id << UVERBS_API_METHOD_KEY_SHIFT;
+}
+
+static inline __attribute_const__ u32 uapi_key_write_method(u32 id)
+{
+ if (id >= UVERBS_API_WRITE_KEY_NUM)
+ return UVERBS_API_KEY_ERR;
+ return UVERBS_API_METHOD_IS_WRITE | id;
+}
+
+static inline __attribute_const__ u32 uapi_key_write_ex_method(u32 id)
+{
+ if (id >= UVERBS_API_WRITE_KEY_NUM)
+ return UVERBS_API_KEY_ERR;
+ return UVERBS_API_METHOD_IS_WRITE_EX | id;
+}
+
+static inline __attribute_const__ u32
+uapi_key_attr_to_ioctl_method(u32 attr_key)
+{
+ return attr_key &
+ (UVERBS_API_OBJ_KEY_MASK | UVERBS_API_METHOD_KEY_MASK);
+}
+
+static inline __attribute_const__ bool uapi_key_is_ioctl_method(u32 key)
+{
+ unsigned int method = key & UVERBS_API_METHOD_KEY_MASK;
+
+ return method != 0 && method < UVERBS_API_METHOD_IS_WRITE &&
+ (key & UVERBS_API_ATTR_KEY_MASK) == 0;
+}
+
+static inline __attribute_const__ bool uapi_key_is_write_method(u32 key)
+{
+ return (key & UVERBS_API_METHOD_KEY_MASK) == UVERBS_API_METHOD_IS_WRITE;
+}
+
+static inline __attribute_const__ bool uapi_key_is_write_ex_method(u32 key)
+{
+ return (key & UVERBS_API_METHOD_KEY_MASK) ==
+ UVERBS_API_METHOD_IS_WRITE_EX;
+}
+
+static inline __attribute_const__ u32 uapi_key_attrs_start(u32 ioctl_method_key)
+{
+ /* 0 is the method slot itself */
+ return ioctl_method_key + 1;
+}
+
+static inline __attribute_const__ u32 uapi_key_attr(u32 id)
+{
+ /*
+ * The attr is designed to fit in the typical single radix tree node
+ * of 64 entries. Since allmost all methods have driver attributes we
+ * organize things so that the driver and core attributes interleave to
+ * reduce the length of the attributes array in typical cases.
+ */
+ if (id & UVERBS_API_NS_FLAG) {
+ id &= ~UVERBS_API_NS_FLAG;
+ id++;
+ if (id >= 1 << (UVERBS_API_ATTR_KEY_BITS - 1))
+ return UVERBS_API_KEY_ERR;
+ id = (id << 1) | 0;
+ } else {
+ if (id >= 1 << (UVERBS_API_ATTR_KEY_BITS - 1))
+ return UVERBS_API_KEY_ERR;
+ id = (id << 1) | 1;
+ }
+
+ return id;
+}
+
+/* Only true for ioctl methods */
+static inline __attribute_const__ bool uapi_key_is_attr(u32 key)
+{
+ unsigned int method = key & UVERBS_API_METHOD_KEY_MASK;
+
+ return method != 0 && method < UVERBS_API_METHOD_IS_WRITE &&
+ (key & UVERBS_API_ATTR_KEY_MASK) != 0;
+}
+
+/*
+ * This returns a value in the range [0 to UVERBS_API_ATTR_BKEY_LEN),
+ * basically it undoes the reservation of 0 in the ID numbering. attr_key
+ * must already be masked with UVERBS_API_ATTR_KEY_MASK, or be the output of
+ * uapi_key_attr().
+ */
+static inline __attribute_const__ u32 uapi_bkey_attr(u32 attr_key)
+{
+ return attr_key - 1;
+}
+
+static inline __attribute_const__ u32 uapi_bkey_to_key_attr(u32 attr_bkey)
+{
+ return attr_bkey + 1;
+}
+
+/*
+ * =======================================
+ * Verbs definitions
+ * =======================================
+ */
+
+struct uverbs_attr_def {
+ u16 id;
+ struct uverbs_attr_spec attr;
+};
+
+struct uverbs_method_def {
+ u16 id;
+ /* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */
+ u32 flags;
+ size_t num_attrs;
+ const struct uverbs_attr_def * const (*attrs)[];
+ int (*handler)(struct uverbs_attr_bundle *attrs);
+};
+
+struct uverbs_object_def {
+ u16 id;
+ const struct uverbs_obj_type *type_attrs;
+ size_t num_methods;
+ const struct uverbs_method_def * const (*methods)[];
+};
+
+enum uapi_definition_kind {
+ UAPI_DEF_END = 0,
+ UAPI_DEF_OBJECT_START,
+ UAPI_DEF_WRITE,
+ UAPI_DEF_CHAIN_OBJ_TREE,
+ UAPI_DEF_CHAIN,
+ UAPI_DEF_IS_SUPPORTED_FUNC,
+ UAPI_DEF_IS_SUPPORTED_DEV_FN,
+};
+
+enum uapi_definition_scope {
+ UAPI_SCOPE_OBJECT = 1,
+ UAPI_SCOPE_METHOD = 2,
+};
+
+struct uapi_definition {
+ u8 kind;
+ u8 scope;
+ union {
+ struct {
+ u16 object_id;
+ } object_start;
+ struct {
+ u16 command_num;
+ u8 is_ex:1;
+ u8 has_udata:1;
+ u8 has_resp:1;
+ u8 req_size;
+ u8 resp_size;
+ } write;
+ };
+
+ union {
+ bool (*func_is_supported)(struct ib_device *device);
+ int (*func_write)(struct uverbs_attr_bundle *attrs);
+ const struct uapi_definition *chain;
+ const struct uverbs_object_def *chain_obj_tree;
+ size_t needs_fn_offset;
+ };
+};
+
+/* Define things connected to object_id */
+#define DECLARE_UVERBS_OBJECT(_object_id, ...) \
+ { \
+ .kind = UAPI_DEF_OBJECT_START, \
+ .object_start = { .object_id = _object_id }, \
+ }, \
+ ##__VA_ARGS__
+
+/* Use in a var_args of DECLARE_UVERBS_OBJECT */
+#define DECLARE_UVERBS_WRITE(_command_num, _func, _cmd_desc, ...) \
+ { \
+ .kind = UAPI_DEF_WRITE, \
+ .scope = UAPI_SCOPE_OBJECT, \
+ .write = { .is_ex = 0, .command_num = _command_num }, \
+ .func_write = _func, \
+ _cmd_desc, \
+ }, \
+ ##__VA_ARGS__
+
+/* Use in a var_args of DECLARE_UVERBS_OBJECT */
+#define DECLARE_UVERBS_WRITE_EX(_command_num, _func, _cmd_desc, ...) \
+ { \
+ .kind = UAPI_DEF_WRITE, \
+ .scope = UAPI_SCOPE_OBJECT, \
+ .write = { .is_ex = 1, .command_num = _command_num }, \
+ .func_write = _func, \
+ _cmd_desc, \
+ }, \
+ ##__VA_ARGS__
+
+/*
+ * Object is only supported if the function pointer named ibdev_fn in struct
+ * ib_device is not NULL.
+ */
+#define UAPI_DEF_OBJ_NEEDS_FN(ibdev_fn) \
+ { \
+ .kind = UAPI_DEF_IS_SUPPORTED_DEV_FN, \
+ .scope = UAPI_SCOPE_OBJECT, \
+ .needs_fn_offset = \
+ offsetof(struct ib_device_ops, ibdev_fn) + \
+ BUILD_BUG_ON_ZERO(sizeof_field(struct ib_device_ops, \
+ ibdev_fn) != \
+ sizeof(void *)), \
+ }
+
+/*
+ * Method is only supported if the function pointer named ibdev_fn in struct
+ * ib_device is not NULL.
+ */
+#define UAPI_DEF_METHOD_NEEDS_FN(ibdev_fn) \
+ { \
+ .kind = UAPI_DEF_IS_SUPPORTED_DEV_FN, \
+ .scope = UAPI_SCOPE_METHOD, \
+ .needs_fn_offset = \
+ offsetof(struct ib_device_ops, ibdev_fn) + \
+ BUILD_BUG_ON_ZERO(sizeof_field(struct ib_device_ops, \
+ ibdev_fn) != \
+ sizeof(void *)), \
+ }
+
+/* Call a function to determine if the entire object is supported or not */
+#define UAPI_DEF_IS_OBJ_SUPPORTED(_func) \
+ { \
+ .kind = UAPI_DEF_IS_SUPPORTED_FUNC, \
+ .scope = UAPI_SCOPE_OBJECT, .func_is_supported = _func, \
+ }
+
+/* Include another struct uapi_definition in this one */
+#define UAPI_DEF_CHAIN(_def_var) \
+ { \
+ .kind = UAPI_DEF_CHAIN, .chain = _def_var, \
+ }
+
+/* Temporary until the tree base description is replaced */
+#define UAPI_DEF_CHAIN_OBJ_TREE(_object_enum, _object_ptr, ...) \
+ { \
+ .kind = UAPI_DEF_CHAIN_OBJ_TREE, \
+ .object_start = { .object_id = _object_enum }, \
+ .chain_obj_tree = _object_ptr, \
+ }, \
+ ##__VA_ARGS__
+#define UAPI_DEF_CHAIN_OBJ_TREE_NAMED(_object_enum, ...) \
+ UAPI_DEF_CHAIN_OBJ_TREE(_object_enum, \
+ PTR_IF(IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS), \
+ &UVERBS_OBJECT(_object_enum)), \
+ ##__VA_ARGS__)
+
+/*
+ * =======================================
+ * Attribute Specifications
+ * =======================================
+ */
+
+#define UVERBS_ATTR_SIZE(_min_len, _len) \
+ .u.ptr.min_len = _min_len, .u.ptr.len = _len
+
+#define UVERBS_ATTR_NO_DATA() UVERBS_ATTR_SIZE(0, 0)
+
+/*
+ * Specifies a uapi structure that cannot be extended. The user must always
+ * supply the whole structure and nothing more. The structure must be declared
+ * in a header under include/uapi/rdma.
+ */
+#define UVERBS_ATTR_TYPE(_type) \
+ .u.ptr.min_len = sizeof(_type), .u.ptr.len = sizeof(_type)
+/*
+ * Specifies a uapi structure where the user must provide at least up to
+ * member 'last'. Anything after last and up until the end of the structure
+ * can be non-zero, anything longer than the end of the structure must be
+ * zero. The structure must be declared in a header under include/uapi/rdma.
+ */
+#define UVERBS_ATTR_STRUCT(_type, _last) \
+ .zero_trailing = 1, \
+ UVERBS_ATTR_SIZE(offsetofend(_type, _last), sizeof(_type))
+/*
+ * Specifies at least min_len bytes must be passed in, but the amount can be
+ * larger, up to the protocol maximum size. No check for zeroing is done.
+ */
+#define UVERBS_ATTR_MIN_SIZE(_min_len) UVERBS_ATTR_SIZE(_min_len, USHRT_MAX)
+
+/* Must be used in the '...' of any UVERBS_ATTR */
+#define UA_ALLOC_AND_COPY .alloc_and_copy = 1
+#define UA_MANDATORY .mandatory = 1
+#define UA_OPTIONAL .mandatory = 0
+
+/*
+ * min_len must be bigger than 0 and _max_len must be smaller than 4095. Only
+ * READ\WRITE accesses are supported.
+ */
+#define UVERBS_ATTR_IDRS_ARR(_attr_id, _idr_type, _access, _min_len, _max_len, \
+ ...) \
+ (&(const struct uverbs_attr_def){ \
+ .id = (_attr_id) + \
+ BUILD_BUG_ON_ZERO((_min_len) == 0 || \
+ (_max_len) > \
+ PAGE_SIZE / sizeof(void *) || \
+ (_min_len) > (_max_len) || \
+ (_access) == UVERBS_ACCESS_NEW || \
+ (_access) == UVERBS_ACCESS_DESTROY), \
+ .attr = { .type = UVERBS_ATTR_TYPE_IDRS_ARRAY, \
+ .u2.objs_arr.obj_type = _idr_type, \
+ .u2.objs_arr.access = _access, \
+ .u2.objs_arr.min_len = _min_len, \
+ .u2.objs_arr.max_len = _max_len, \
+ __VA_ARGS__ } })
+
+/*
+ * Only for use with UVERBS_ATTR_IDR, allows any uobject type to be accepted,
+ * the user must validate the type of the uobject instead.
+ */
+#define UVERBS_IDR_ANY_OBJECT 0xFFFF
+
+#define UVERBS_ATTR_IDR(_attr_id, _idr_type, _access, ...) \
+ (&(const struct uverbs_attr_def){ \
+ .id = _attr_id, \
+ .attr = { .type = UVERBS_ATTR_TYPE_IDR, \
+ .u.obj.obj_type = _idr_type, \
+ .u.obj.access = _access, \
+ __VA_ARGS__ } })
+
+#define UVERBS_ATTR_FD(_attr_id, _fd_type, _access, ...) \
+ (&(const struct uverbs_attr_def){ \
+ .id = (_attr_id) + \
+ BUILD_BUG_ON_ZERO((_access) != UVERBS_ACCESS_NEW && \
+ (_access) != UVERBS_ACCESS_READ), \
+ .attr = { .type = UVERBS_ATTR_TYPE_FD, \
+ .u.obj.obj_type = _fd_type, \
+ .u.obj.access = _access, \
+ __VA_ARGS__ } })
+
+#define UVERBS_ATTR_RAW_FD(_attr_id, ...) \
+ (&(const struct uverbs_attr_def){ \
+ .id = (_attr_id), \
+ .attr = { .type = UVERBS_ATTR_TYPE_RAW_FD, __VA_ARGS__ } })
+
+#define UVERBS_ATTR_PTR_IN(_attr_id, _type, ...) \
+ (&(const struct uverbs_attr_def){ \
+ .id = _attr_id, \
+ .attr = { .type = UVERBS_ATTR_TYPE_PTR_IN, \
+ _type, \
+ __VA_ARGS__ } })
+
+#define UVERBS_ATTR_PTR_OUT(_attr_id, _type, ...) \
+ (&(const struct uverbs_attr_def){ \
+ .id = _attr_id, \
+ .attr = { .type = UVERBS_ATTR_TYPE_PTR_OUT, \
+ _type, \
+ __VA_ARGS__ } })
+
+/* _enum_arry should be a 'static const union uverbs_attr_spec[]' */
+#define UVERBS_ATTR_ENUM_IN(_attr_id, _enum_arr, ...) \
+ (&(const struct uverbs_attr_def){ \
+ .id = _attr_id, \
+ .attr = { .type = UVERBS_ATTR_TYPE_ENUM_IN, \
+ .u2.enum_def.ids = _enum_arr, \
+ .u.enum_def.num_elems = ARRAY_SIZE(_enum_arr), \
+ __VA_ARGS__ }, \
+ })
+
+/* An input value that is a member in the enum _enum_type. */
+#define UVERBS_ATTR_CONST_IN(_attr_id, _enum_type, ...) \
+ UVERBS_ATTR_PTR_IN( \
+ _attr_id, \
+ UVERBS_ATTR_SIZE( \
+ sizeof(u64) + BUILD_BUG_ON_ZERO(!sizeof(_enum_type)), \
+ sizeof(u64)), \
+ __VA_ARGS__)
+
+/*
+ * An input value that is a bitwise combination of values of _enum_type.
+ * This permits the flag value to be passed as either a u32 or u64, it must
+ * be retrieved via uverbs_get_flag().
+ */
+#define UVERBS_ATTR_FLAGS_IN(_attr_id, _enum_type, ...) \
+ UVERBS_ATTR_PTR_IN( \
+ _attr_id, \
+ UVERBS_ATTR_SIZE(sizeof(u32) + BUILD_BUG_ON_ZERO( \
+ !sizeof(_enum_type *)), \
+ sizeof(u64)), \
+ __VA_ARGS__)
+
+/*
+ * This spec is used in order to pass information to the hardware driver in a
+ * legacy way. Every verb that could get driver specific data should get this
+ * spec.
+ */
+#define UVERBS_ATTR_UHW() \
+ UVERBS_ATTR_PTR_IN(UVERBS_ATTR_UHW_IN, \
+ UVERBS_ATTR_MIN_SIZE(0), \
+ UA_OPTIONAL, \
+ .is_udata = 1), \
+ UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_UHW_OUT, \
+ UVERBS_ATTR_MIN_SIZE(0), \
+ UA_OPTIONAL, \
+ .is_udata = 1)
+
+/* =================================================
+ * Parsing infrastructure
+ * =================================================
+ */
+
+
+struct uverbs_ptr_attr {
+ /*
+ * If UVERBS_ATTR_SPEC_F_ALLOC_AND_COPY is set then the 'ptr' is
+ * used.
+ */
+ union {
+ void *ptr;
+ u64 data;
+ };
+ u16 len;
+ u16 uattr_idx;
+ u8 enum_id;
+};
+
+struct uverbs_obj_attr {
+ struct ib_uobject *uobject;
+ const struct uverbs_api_attr *attr_elm;
+};
+
+struct uverbs_objs_arr_attr {
+ struct ib_uobject **uobjects;
+ u16 len;
+};
+
+struct uverbs_attr {
+ union {
+ struct uverbs_ptr_attr ptr_attr;
+ struct uverbs_obj_attr obj_attr;
+ struct uverbs_objs_arr_attr objs_arr_attr;
+ };
+};
+
+struct uverbs_attr_bundle {
+ struct ib_udata driver_udata;
+ struct ib_udata ucore;
+ struct ib_uverbs_file *ufile;
+ struct ib_ucontext *context;
+ struct ib_uobject *uobject;
+ DECLARE_BITMAP(attr_present, UVERBS_API_ATTR_BKEY_LEN);
+ struct uverbs_attr attrs[];
+};
+
+static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_bundle,
+ unsigned int idx)
+{
+ return test_bit(uapi_bkey_attr(uapi_key_attr(idx)),
+ attrs_bundle->attr_present);
+}
+
+/**
+ * rdma_udata_to_drv_context - Helper macro to get the driver's context out of
+ * ib_udata which is embedded in uverbs_attr_bundle.
+ *
+ * If udata is not NULL this cannot fail. Otherwise a NULL udata will result
+ * in a NULL ucontext pointer, as a safety precaution. Callers should be using
+ * 'udata' to determine if the driver call is in user or kernel mode, not
+ * 'ucontext'.
+ *
+ */
+static inline struct uverbs_attr_bundle *
+rdma_udata_to_uverbs_attr_bundle(struct ib_udata *udata)
+{
+ return container_of(udata, struct uverbs_attr_bundle, driver_udata);
+}
+
+#define rdma_udata_to_drv_context(udata, drv_dev_struct, member) \
+ (udata ? container_of(rdma_udata_to_uverbs_attr_bundle(udata)->context, \
+ drv_dev_struct, member) : (drv_dev_struct *)NULL)
+
+#define IS_UVERBS_COPY_ERR(_ret) ((_ret) && (_ret) != -ENOENT)
+
+static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle,
+ u16 idx)
+{
+ if (!uverbs_attr_is_valid(attrs_bundle, idx))
+ return ERR_PTR(-ENOENT);
+
+ return &attrs_bundle->attrs[uapi_bkey_attr(uapi_key_attr(idx))];
+}
+
+static inline int uverbs_attr_get_enum_id(const struct uverbs_attr_bundle *attrs_bundle,
+ u16 idx)
+{
+ const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx);
+
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ return attr->ptr_attr.enum_id;
+}
+
+static inline void *uverbs_attr_get_obj(const struct uverbs_attr_bundle *attrs_bundle,
+ u16 idx)
+{
+ const struct uverbs_attr *attr;
+
+ attr = uverbs_attr_get(attrs_bundle, idx);
+ if (IS_ERR(attr))
+ return ERR_CAST(attr);
+
+ return attr->obj_attr.uobject->object;
+}
+
+static inline struct ib_uobject *uverbs_attr_get_uobject(const struct uverbs_attr_bundle *attrs_bundle,
+ u16 idx)
+{
+ const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx);
+
+ if (IS_ERR(attr))
+ return ERR_CAST(attr);
+
+ return attr->obj_attr.uobject;
+}
+
+static inline int
+uverbs_attr_get_len(const struct uverbs_attr_bundle *attrs_bundle, u16 idx)
+{
+ const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx);
+
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ return attr->ptr_attr.len;
+}
+
+void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *attrs_bundle,
+ u16 idx);
+
+/*
+ * uverbs_attr_ptr_get_array_size() - Get array size pointer by a ptr
+ * attribute.
+ * @attrs: The attribute bundle
+ * @idx: The ID of the attribute
+ * @elem_size: The size of the element in the array
+ */
+static inline int
+uverbs_attr_ptr_get_array_size(struct uverbs_attr_bundle *attrs, u16 idx,
+ size_t elem_size)
+{
+ int size = uverbs_attr_get_len(attrs, idx);
+
+ if (size < 0)
+ return size;
+
+ if (size % elem_size)
+ return -EINVAL;
+
+ return size / elem_size;
+}
+
+/**
+ * uverbs_attr_get_uobjs_arr() - Provides array's properties for attribute for
+ * UVERBS_ATTR_TYPE_IDRS_ARRAY.
+ * @arr: Returned pointer to array of pointers for uobjects or NULL if
+ * the attribute isn't provided.
+ *
+ * Return: The array length or 0 if no attribute was provided.
+ */
+static inline int uverbs_attr_get_uobjs_arr(
+ const struct uverbs_attr_bundle *attrs_bundle, u16 attr_idx,
+ struct ib_uobject ***arr)
+{
+ const struct uverbs_attr *attr =
+ uverbs_attr_get(attrs_bundle, attr_idx);
+
+ if (IS_ERR(attr)) {
+ *arr = NULL;
+ return 0;
+ }
+
+ *arr = attr->objs_arr_attr.uobjects;
+
+ return attr->objs_arr_attr.len;
+}
+
+static inline bool uverbs_attr_ptr_is_inline(const struct uverbs_attr *attr)
+{
+ return attr->ptr_attr.len <= sizeof(attr->ptr_attr.data);
+}
+
+static inline void *uverbs_attr_get_alloced_ptr(
+ const struct uverbs_attr_bundle *attrs_bundle, u16 idx)
+{
+ const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx);
+
+ if (IS_ERR(attr))
+ return (void *)attr;
+
+ return uverbs_attr_ptr_is_inline(attr) ? (void *)&attr->ptr_attr.data :
+ attr->ptr_attr.ptr;
+}
+
+static inline int _uverbs_copy_from(void *to,
+ const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx,
+ size_t size)
+{
+ const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx);
+
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ /*
+ * Validation ensures attr->ptr_attr.len >= size. If the caller is
+ * using UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO then it must call
+ * uverbs_copy_from_or_zero.
+ */
+ if (unlikely(size < attr->ptr_attr.len))
+ return -EINVAL;
+
+ if (uverbs_attr_ptr_is_inline(attr))
+ memcpy(to, &attr->ptr_attr.data, attr->ptr_attr.len);
+ else if (copy_from_user(to, u64_to_user_ptr(attr->ptr_attr.data),
+ attr->ptr_attr.len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static inline int _uverbs_copy_from_or_zero(void *to,
+ const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx,
+ size_t size)
+{
+ const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx);
+ size_t min_size;
+
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ min_size = min_t(size_t, size, attr->ptr_attr.len);
+
+ if (uverbs_attr_ptr_is_inline(attr))
+ memcpy(to, &attr->ptr_attr.data, min_size);
+ else if (copy_from_user(to, u64_to_user_ptr(attr->ptr_attr.data),
+ min_size))
+ return -EFAULT;
+
+ if (size > min_size)
+ memset(to + min_size, 0, size - min_size);
+
+ return 0;
+}
+
+#define uverbs_copy_from(to, attrs_bundle, idx) \
+ _uverbs_copy_from(to, attrs_bundle, idx, sizeof(*to))
+
+#define uverbs_copy_from_or_zero(to, attrs_bundle, idx) \
+ _uverbs_copy_from_or_zero(to, attrs_bundle, idx, sizeof(*to))
+
+static inline struct ib_ucontext *
+ib_uverbs_get_ucontext(const struct uverbs_attr_bundle *attrs)
+{
+ return ib_uverbs_get_ucontext_file(attrs->ufile);
+}
+
+#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
+int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 allowed_bits);
+int uverbs_get_flags32(u32 *to, const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 allowed_bits);
+int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle, size_t idx,
+ const void *from, size_t size);
+__malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size,
+ gfp_t flags);
+
+static inline __malloc void *uverbs_alloc(struct uverbs_attr_bundle *bundle,
+ size_t size)
+{
+ return _uverbs_alloc(bundle, size, GFP_KERNEL);
+}
+
+static inline __malloc void *uverbs_zalloc(struct uverbs_attr_bundle *bundle,
+ size_t size)
+{
+ return _uverbs_alloc(bundle, size, GFP_KERNEL | __GFP_ZERO);
+}
+
+static inline __malloc void *uverbs_kcalloc(struct uverbs_attr_bundle *bundle,
+ size_t n, size_t size)
+{
+ size_t bytes;
+
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return ERR_PTR(-EOVERFLOW);
+ return uverbs_zalloc(bundle, bytes);
+}
+
+int _uverbs_get_const_signed(s64 *to,
+ const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, s64 lower_bound, u64 upper_bound,
+ s64 *def_val);
+int _uverbs_get_const_unsigned(u64 *to,
+ const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 upper_bound, u64 *def_val);
+int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle,
+ size_t idx, const void *from, size_t size);
+#else
+static inline int
+uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 allowed_bits)
+{
+ return -EINVAL;
+}
+static inline int
+uverbs_get_flags32(u32 *to, const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 allowed_bits)
+{
+ return -EINVAL;
+}
+static inline int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, const void *from, size_t size)
+{
+ return -EINVAL;
+}
+static inline __malloc void *uverbs_alloc(struct uverbs_attr_bundle *bundle,
+ size_t size)
+{
+ return ERR_PTR(-EINVAL);
+}
+static inline __malloc void *uverbs_zalloc(struct uverbs_attr_bundle *bundle,
+ size_t size)
+{
+ return ERR_PTR(-EINVAL);
+}
+static inline int
+_uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, s64 lower_bound, u64 upper_bound,
+ s64 *def_val)
+{
+ return -EINVAL;
+}
+static inline int
+uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle,
+ size_t idx, const void *from, size_t size)
+{
+ return -EINVAL;
+}
+static inline int
+_uverbs_get_const_signed(s64 *to,
+ const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, s64 lower_bound, u64 upper_bound,
+ s64 *def_val)
+{
+ return -EINVAL;
+}
+static inline int
+_uverbs_get_const_unsigned(u64 *to,
+ const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx, u64 upper_bound, u64 *def_val)
+{
+ return -EINVAL;
+}
+#endif
+
+#define uverbs_get_const_signed(_to, _attrs_bundle, _idx) \
+ ({ \
+ s64 _val; \
+ int _ret = \
+ _uverbs_get_const_signed(&_val, _attrs_bundle, _idx, \
+ type_min(typeof(*(_to))), \
+ type_max(typeof(*(_to))), NULL); \
+ (*(_to)) = _val; \
+ _ret; \
+ })
+
+#define uverbs_get_const_unsigned(_to, _attrs_bundle, _idx) \
+ ({ \
+ u64 _val; \
+ int _ret = \
+ _uverbs_get_const_unsigned(&_val, _attrs_bundle, _idx, \
+ type_max(typeof(*(_to))), NULL); \
+ (*(_to)) = _val; \
+ _ret; \
+ })
+
+#define uverbs_get_const_default_signed(_to, _attrs_bundle, _idx, _default) \
+ ({ \
+ s64 _val; \
+ s64 _def_val = _default; \
+ int _ret = \
+ _uverbs_get_const_signed(&_val, _attrs_bundle, _idx, \
+ type_min(typeof(*(_to))), \
+ type_max(typeof(*(_to))), &_def_val); \
+ (*(_to)) = _val; \
+ _ret; \
+ })
+
+#define uverbs_get_const_default_unsigned(_to, _attrs_bundle, _idx, _default) \
+ ({ \
+ u64 _val; \
+ u64 _def_val = _default; \
+ int _ret = \
+ _uverbs_get_const_unsigned(&_val, _attrs_bundle, _idx, \
+ type_max(typeof(*(_to))), &_def_val); \
+ (*(_to)) = _val; \
+ _ret; \
+ })
+
+#define uverbs_get_const(_to, _attrs_bundle, _idx) \
+ (is_signed_type(typeof(*(_to))) ? \
+ uverbs_get_const_signed(_to, _attrs_bundle, _idx) : \
+ uverbs_get_const_unsigned(_to, _attrs_bundle, _idx)) \
+
+#define uverbs_get_const_default(_to, _attrs_bundle, _idx, _default) \
+ (is_signed_type(typeof(*(_to))) ? \
+ uverbs_get_const_default_signed(_to, _attrs_bundle, _idx, \
+ _default) : \
+ uverbs_get_const_default_unsigned(_to, _attrs_bundle, _idx, \
+ _default))
+
+static inline int
+uverbs_get_raw_fd(int *to, const struct uverbs_attr_bundle *attrs_bundle,
+ size_t idx)
+{
+ return uverbs_get_const_signed(to, attrs_bundle, idx);
+}
+
+#endif
diff --git a/include/rdma/uverbs_named_ioctl.h b/include/rdma/uverbs_named_ioctl.h
new file mode 100644
index 0000000000..ee7873f872
--- /dev/null
+++ b/include/rdma/uverbs_named_ioctl.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _UVERBS_NAMED_IOCTL_
+#define _UVERBS_NAMED_IOCTL_
+
+#include <rdma/uverbs_ioctl.h>
+
+#ifndef UVERBS_MODULE_NAME
+#error "Please #define UVERBS_MODULE_NAME before including rdma/uverbs_named_ioctl.h"
+#endif
+
+#define _UVERBS_PASTE(x, y) x ## y
+#define _UVERBS_NAME(x, y) _UVERBS_PASTE(x, y)
+#define UVERBS_METHOD(id) _UVERBS_NAME(UVERBS_MODULE_NAME, _method_##id)
+#define UVERBS_HANDLER(id) _UVERBS_NAME(UVERBS_MODULE_NAME, _handler_##id)
+#define UVERBS_OBJECT(id) _UVERBS_NAME(UVERBS_MODULE_NAME, _object_##id)
+
+/* These are static so they do not need to be qualified */
+#define UVERBS_METHOD_ATTRS(method_id) _method_attrs_##method_id
+#define UVERBS_OBJECT_METHODS(object_id) _UVERBS_NAME(_object_methods_##object_id, __LINE__)
+
+#define DECLARE_UVERBS_NAMED_METHOD(_method_id, ...) \
+ static const struct uverbs_attr_def *const UVERBS_METHOD_ATTRS( \
+ _method_id)[] = { __VA_ARGS__ }; \
+ static const struct uverbs_method_def UVERBS_METHOD(_method_id) = { \
+ .id = _method_id, \
+ .handler = UVERBS_HANDLER(_method_id), \
+ .num_attrs = ARRAY_SIZE(UVERBS_METHOD_ATTRS(_method_id)), \
+ .attrs = &UVERBS_METHOD_ATTRS(_method_id), \
+ }
+
+/* Create a standard destroy method using the default handler. The handle_attr
+ * argument must be the attribute specifying the handle to destroy, the
+ * default handler does not support any other attributes.
+ */
+#define DECLARE_UVERBS_NAMED_METHOD_DESTROY(_method_id, _handle_attr) \
+ static const struct uverbs_attr_def *const UVERBS_METHOD_ATTRS( \
+ _method_id)[] = { _handle_attr }; \
+ static const struct uverbs_method_def UVERBS_METHOD(_method_id) = { \
+ .id = _method_id, \
+ .handler = uverbs_destroy_def_handler, \
+ .num_attrs = ARRAY_SIZE(UVERBS_METHOD_ATTRS(_method_id)), \
+ .attrs = &UVERBS_METHOD_ATTRS(_method_id), \
+ }
+
+#define DECLARE_UVERBS_NAMED_OBJECT(_object_id, _type_attrs, ...) \
+ static const struct uverbs_method_def *const UVERBS_OBJECT_METHODS( \
+ _object_id)[] = { __VA_ARGS__ }; \
+ static const struct uverbs_object_def UVERBS_OBJECT(_object_id) = { \
+ .id = _object_id, \
+ .type_attrs = &_type_attrs, \
+ .num_methods = ARRAY_SIZE(UVERBS_OBJECT_METHODS(_object_id)), \
+ .methods = &UVERBS_OBJECT_METHODS(_object_id) \
+ }
+
+/*
+ * Declare global methods. These still have a unique object_id because we
+ * identify all uapi methods with a (object,method) tuple. However, they have
+ * no type pointer.
+ */
+#define DECLARE_UVERBS_GLOBAL_METHODS(_object_id, ...) \
+ static const struct uverbs_method_def *const UVERBS_OBJECT_METHODS( \
+ _object_id)[] = { __VA_ARGS__ }; \
+ static const struct uverbs_object_def UVERBS_OBJECT(_object_id) = { \
+ .id = _object_id, \
+ .num_methods = ARRAY_SIZE(UVERBS_OBJECT_METHODS(_object_id)), \
+ .methods = &UVERBS_OBJECT_METHODS(_object_id) \
+ }
+
+/* Used by drivers to declare a complete parsing tree for new methods
+ */
+#define ADD_UVERBS_METHODS(_name, _object_id, ...) \
+ static const struct uverbs_method_def *const UVERBS_OBJECT_METHODS( \
+ _object_id)[] = { __VA_ARGS__ }; \
+ static const struct uverbs_object_def _name = { \
+ .id = _object_id, \
+ .num_methods = ARRAY_SIZE(UVERBS_OBJECT_METHODS(_object_id)), \
+ .methods = &UVERBS_OBJECT_METHODS(_object_id) \
+ };
+
+/* Used by drivers to declare a complete parsing tree for a single method that
+ * differs only in having additional driver specific attributes.
+ */
+#define ADD_UVERBS_ATTRIBUTES_SIMPLE(_name, _object_id, _method_id, ...) \
+ static const struct uverbs_attr_def *const UVERBS_METHOD_ATTRS( \
+ _method_id)[] = { __VA_ARGS__ }; \
+ static const struct uverbs_method_def UVERBS_METHOD(_method_id) = { \
+ .id = _method_id, \
+ .num_attrs = ARRAY_SIZE(UVERBS_METHOD_ATTRS(_method_id)), \
+ .attrs = &UVERBS_METHOD_ATTRS(_method_id), \
+ }; \
+ ADD_UVERBS_METHODS(_name, _object_id, &UVERBS_METHOD(_method_id))
+
+#endif
diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h
new file mode 100644
index 0000000000..fe05121169
--- /dev/null
+++ b/include/rdma/uverbs_std_types.h
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _UVERBS_STD_TYPES__
+#define _UVERBS_STD_TYPES__
+
+#include <rdma/uverbs_types.h>
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/ib_user_ioctl_verbs.h>
+
+/* Returns _id, or causes a compile error if _id is not a u32.
+ *
+ * The uobj APIs should only be used with the write based uAPI to access
+ * object IDs. The write API must use a u32 for the object handle, which is
+ * checked by this macro.
+ */
+#define _uobj_check_id(_id) ((_id) * typecheck(u32, _id))
+
+#define uobj_get_type(_attrs, _object) \
+ uapi_get_object((_attrs)->ufile->device->uapi, _object)
+
+#define uobj_get_read(_type, _id, _attrs) \
+ rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \
+ _uobj_check_id(_id), UVERBS_LOOKUP_READ, \
+ _attrs)
+
+#define ufd_get_read(_type, _fdnum, _attrs) \
+ rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \
+ (_fdnum)*typecheck(s32, _fdnum), \
+ UVERBS_LOOKUP_READ, _attrs)
+
+static inline void *_uobj_get_obj_read(struct ib_uobject *uobj)
+{
+ if (IS_ERR(uobj))
+ return NULL;
+ return uobj->object;
+}
+#define uobj_get_obj_read(_object, _type, _id, _attrs) \
+ ((struct ib_##_object *)_uobj_get_obj_read( \
+ uobj_get_read(_type, _id, _attrs)))
+
+#define uobj_get_write(_type, _id, _attrs) \
+ rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \
+ _uobj_check_id(_id), UVERBS_LOOKUP_WRITE, \
+ _attrs)
+
+int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id,
+ struct uverbs_attr_bundle *attrs);
+#define uobj_perform_destroy(_type, _id, _attrs) \
+ __uobj_perform_destroy(uobj_get_type(_attrs, _type), \
+ _uobj_check_id(_id), _attrs)
+
+struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
+ u32 id, struct uverbs_attr_bundle *attrs);
+
+#define uobj_get_destroy(_type, _id, _attrs) \
+ __uobj_get_destroy(uobj_get_type(_attrs, _type), _uobj_check_id(_id), \
+ _attrs)
+
+static inline void uobj_put_destroy(struct ib_uobject *uobj)
+{
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY);
+}
+
+static inline void uobj_put_read(struct ib_uobject *uobj)
+{
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);
+}
+
+#define uobj_put_obj_read(_obj) \
+ uobj_put_read((_obj)->uobject)
+
+static inline void uobj_put_write(struct ib_uobject *uobj)
+{
+ rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE);
+}
+
+static inline void uobj_alloc_abort(struct ib_uobject *uobj,
+ struct uverbs_attr_bundle *attrs)
+{
+ rdma_alloc_abort_uobject(uobj, attrs, false);
+}
+
+static inline void uobj_finalize_uobj_create(struct ib_uobject *uobj,
+ struct uverbs_attr_bundle *attrs)
+{
+ /*
+ * Tell the core code that the write() handler has completed
+ * initializing the object and that the core should commit or
+ * abort this object based upon the return code from the write()
+ * method. Similar to what uverbs_finalize_uobj_create() does for
+ * ioctl()
+ */
+ WARN_ON(attrs->uobject);
+ attrs->uobject = uobj;
+}
+
+static inline struct ib_uobject *
+__uobj_alloc(const struct uverbs_api_object *obj,
+ struct uverbs_attr_bundle *attrs, struct ib_device **ib_dev)
+{
+ struct ib_uobject *uobj = rdma_alloc_begin_uobject(obj, attrs);
+
+ if (!IS_ERR(uobj))
+ *ib_dev = attrs->context->device;
+ return uobj;
+}
+
+#define uobj_alloc(_type, _attrs, _ib_dev) \
+ __uobj_alloc(uobj_get_type(_attrs, _type), _attrs, _ib_dev)
+
+static inline void uverbs_flow_action_fill_action(struct ib_flow_action *action,
+ struct ib_uobject *uobj,
+ struct ib_device *ib_dev,
+ enum ib_flow_action_type type)
+{
+ atomic_set(&action->usecnt, 0);
+ action->device = ib_dev;
+ action->type = type;
+ action->uobject = uobj;
+ uobj->object = action;
+}
+
+struct ib_uflow_resources {
+ size_t max;
+ size_t num;
+ size_t collection_num;
+ size_t counters_num;
+ struct ib_counters **counters;
+ struct ib_flow_action **collection;
+};
+
+struct ib_uflow_object {
+ struct ib_uobject uobject;
+ struct ib_uflow_resources *resources;
+};
+
+struct ib_uflow_resources *flow_resources_alloc(size_t num_specs);
+void flow_resources_add(struct ib_uflow_resources *uflow_res,
+ enum ib_flow_spec_type type,
+ void *ibobj);
+void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res);
+
+static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow,
+ struct ib_qp *qp, struct ib_device *device,
+ struct ib_uflow_resources *uflow_res)
+{
+ struct ib_uflow_object *uflow;
+
+ uobj->object = ibflow;
+ ibflow->uobject = uobj;
+
+ if (qp) {
+ atomic_inc(&qp->usecnt);
+ ibflow->qp = qp;
+ }
+
+ ibflow->device = device;
+ uflow = container_of(uobj, typeof(*uflow), uobject);
+ uflow->resources = uflow_res;
+}
+
+struct uverbs_api_object {
+ const struct uverbs_obj_type *type_attrs;
+ const struct uverbs_obj_type_class *type_class;
+ u8 disabled:1;
+ u32 id;
+};
+
+static inline u32 uobj_get_object_id(struct ib_uobject *uobj)
+{
+ return uobj->uapi_object->id;
+}
+
+#endif
+
diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h
new file mode 100644
index 0000000000..ccd11631c1
--- /dev/null
+++ b/include/rdma/uverbs_types.h
@@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _UVERBS_TYPES_
+#define _UVERBS_TYPES_
+
+#include <linux/kernel.h>
+#include <rdma/ib_verbs.h>
+
+struct uverbs_obj_type;
+struct uverbs_api_object;
+
+enum rdma_lookup_mode {
+ UVERBS_LOOKUP_READ,
+ UVERBS_LOOKUP_WRITE,
+ /*
+ * Destroy is like LOOKUP_WRITE, except that the uobject is not
+ * locked. uobj_destroy is used to convert a LOOKUP_DESTROY lock into
+ * a LOOKUP_WRITE lock.
+ */
+ UVERBS_LOOKUP_DESTROY,
+};
+
+/*
+ * The following sequences are valid:
+ * Success flow:
+ * alloc_begin
+ * alloc_commit
+ * [..]
+ * Access flow:
+ * lookup_get(exclusive=false) & uverbs_try_lock_object
+ * lookup_put(exclusive=false) via rdma_lookup_put_uobject
+ * Destruction flow:
+ * lookup_get(exclusive=true) & uverbs_try_lock_object
+ * remove_commit
+ * remove_handle (optional)
+ * lookup_put(exclusive=true) via rdma_lookup_put_uobject
+ *
+ * Allocate Error flow #1
+ * alloc_begin
+ * alloc_abort
+ * Allocate Error flow #2
+ * alloc_begin
+ * remove_commit
+ * alloc_abort
+ * Allocate Error flow #3
+ * alloc_begin
+ * alloc_commit (fails)
+ * remove_commit
+ * alloc_abort
+ *
+ * In all cases the caller must hold the ufile kref until alloc_commit or
+ * alloc_abort returns.
+ */
+struct uverbs_obj_type_class {
+ struct ib_uobject *(*alloc_begin)(const struct uverbs_api_object *obj,
+ struct uverbs_attr_bundle *attrs);
+ /* This consumes the kref on uobj */
+ void (*alloc_commit)(struct ib_uobject *uobj);
+ /* This does not consume the kref on uobj */
+ void (*alloc_abort)(struct ib_uobject *uobj);
+
+ struct ib_uobject *(*lookup_get)(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile, s64 id,
+ enum rdma_lookup_mode mode);
+ void (*lookup_put)(struct ib_uobject *uobj, enum rdma_lookup_mode mode);
+ /* This does not consume the kref on uobj */
+ int __must_check (*destroy_hw)(struct ib_uobject *uobj,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs);
+ void (*remove_handle)(struct ib_uobject *uobj);
+ void (*swap_uobjects)(struct ib_uobject *obj_old,
+ struct ib_uobject *obj_new);
+};
+
+struct uverbs_obj_type {
+ const struct uverbs_obj_type_class * const type_class;
+ size_t obj_size;
+};
+
+/*
+ * Objects type classes which support a detach state (object is still alive but
+ * it's not attached to any context need to make sure:
+ * (a) no call through to a driver after a detach is called
+ * (b) detach isn't called concurrently with context_cleanup
+ */
+
+struct uverbs_obj_idr_type {
+ /*
+ * In idr based objects, uverbs_obj_type_class points to a generic
+ * idr operations. In order to specialize the underlying types (e.g. CQ,
+ * QPs, etc.), we add destroy_object specific callbacks.
+ */
+ struct uverbs_obj_type type;
+
+ /* Free driver resources from the uobject, make the driver uncallable,
+ * and move the uobject to the detached state. If the object was
+ * destroyed by the user's request, a failure should leave the uobject
+ * completely unchanged.
+ */
+ int __must_check (*destroy_object)(struct ib_uobject *uobj,
+ enum rdma_remove_reason why,
+ struct uverbs_attr_bundle *attrs);
+};
+
+struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
+ struct ib_uverbs_file *ufile, s64 id,
+ enum rdma_lookup_mode mode,
+ struct uverbs_attr_bundle *attrs);
+void rdma_lookup_put_uobject(struct ib_uobject *uobj,
+ enum rdma_lookup_mode mode);
+struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
+ struct uverbs_attr_bundle *attrs);
+void rdma_alloc_abort_uobject(struct ib_uobject *uobj,
+ struct uverbs_attr_bundle *attrs,
+ bool hw_obj_valid);
+void rdma_alloc_commit_uobject(struct ib_uobject *uobj,
+ struct uverbs_attr_bundle *attrs);
+void rdma_assign_uobject(struct ib_uobject *to_uobj,
+ struct ib_uobject *new_uobj,
+ struct uverbs_attr_bundle *attrs);
+
+/*
+ * uverbs_uobject_get is called in order to increase the reference count on
+ * an uobject. This is useful when a handler wants to keep the uobject's memory
+ * alive, regardless if this uobject is still alive in the context's objects
+ * repository. Objects are put via uverbs_uobject_put.
+ */
+static inline void uverbs_uobject_get(struct ib_uobject *uobject)
+{
+ kref_get(&uobject->ref);
+}
+void uverbs_uobject_put(struct ib_uobject *uobject);
+
+struct uverbs_obj_fd_type {
+ /*
+ * In fd based objects, uverbs_obj_type_ops points to generic
+ * fd operations. In order to specialize the underlying types (e.g.
+ * completion_channel), we use fops, name and flags for fd creation.
+ * destroy_object is called when the uobject is to be destroyed,
+ * because the driver is removed or the FD is closed.
+ */
+ struct uverbs_obj_type type;
+ void (*destroy_object)(struct ib_uobject *uobj,
+ enum rdma_remove_reason why);
+ const struct file_operations *fops;
+ const char *name;
+ int flags;
+};
+
+extern const struct uverbs_obj_type_class uverbs_idr_class;
+extern const struct uverbs_obj_type_class uverbs_fd_class;
+int uverbs_uobject_fd_release(struct inode *inode, struct file *filp);
+
+#define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \
+ sizeof(char))
+#define UVERBS_TYPE_ALLOC_FD(_obj_size, _destroy_object, _fops, _name, _flags) \
+ ((&((const struct uverbs_obj_fd_type) \
+ {.type = { \
+ .type_class = &uverbs_fd_class, \
+ .obj_size = (_obj_size) + \
+ UVERBS_BUILD_BUG_ON((_obj_size) < \
+ sizeof(struct ib_uobject)), \
+ }, \
+ .destroy_object = _destroy_object, \
+ .fops = _fops, \
+ .name = _name, \
+ .flags = _flags}))->type)
+#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _destroy_object) \
+ ((&((const struct uverbs_obj_idr_type) \
+ {.type = { \
+ .type_class = &uverbs_idr_class, \
+ .obj_size = (_size) + \
+ UVERBS_BUILD_BUG_ON((_size) < \
+ sizeof(struct ib_uobject)) \
+ }, \
+ .destroy_object = _destroy_object,}))->type)
+#define UVERBS_TYPE_ALLOC_IDR(_destroy_object) \
+ UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject), \
+ _destroy_object)
+
+#endif