diff options
Diffstat (limited to 'drivers/infiniband/sw/rxe')
34 files changed, 13908 insertions, 0 deletions
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig new file mode 100644 index 0000000000..06b8dc5093 --- /dev/null +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0-only +config RDMA_RXE + tristate "Software RDMA over Ethernet (RoCE) driver" + depends on INET && PCI && INFINIBAND + depends on INFINIBAND_VIRT_DMA + select NET_UDP_TUNNEL + select CRYPTO + select CRYPTO_CRC32 + help + This driver implements the InfiniBand RDMA transport over + the Linux network stack. It enables a system with a + standard Ethernet adapter to interoperate with a RoCE + adapter or with another system running the RXE driver. + Documentation on InfiniBand and RoCE can be downloaded at + www.infinibandta.org and www.openfabrics.org. (See also + siw which is a similar software driver for iWARP.) + + The driver is split into two layers, one interfaces with the + Linux RDMA stack and implements a kernel or user space + verbs API. The user space verbs API requires a support + library named librxe which is loaded by the generic user + space verbs API, libibverbs. The other layer interfaces + with the Linux network stack at layer 3. + + To configure and work with soft-RoCE driver please use the + following wiki page under "configure Soft-RoCE (RXE)" section: + + https://github.com/linux-rdma/rdma-core/blob/master/Documentation/rxe.md diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile new file mode 100644 index 0000000000..5395a581f4 --- /dev/null +++ b/drivers/infiniband/sw/rxe/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_RDMA_RXE) += rdma_rxe.o + +rdma_rxe-y := \ + rxe.o \ + rxe_comp.o \ + rxe_req.o \ + rxe_resp.o \ + rxe_recv.o \ + rxe_pool.o \ + rxe_queue.o \ + rxe_verbs.o \ + rxe_av.o \ + rxe_srq.o \ + rxe_qp.o \ + rxe_cq.o \ + rxe_mr.o \ + rxe_mw.o \ + rxe_opcode.o \ + rxe_mmap.o \ + rxe_icrc.o \ + rxe_mcast.o \ + rxe_task.o \ + rxe_net.o \ + rxe_hw_counters.o diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c new file mode 100644 index 0000000000..54c723a6ed --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <rdma/rdma_netlink.h> +#include <net/addrconf.h> +#include "rxe.h" +#include "rxe_loc.h" + +MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); +MODULE_DESCRIPTION("Soft RDMA transport"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* free resources for a rxe device all objects created for this device must + * have been destroyed + */ +void rxe_dealloc(struct ib_device *ib_dev) +{ + struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); + + rxe_pool_cleanup(&rxe->uc_pool); + rxe_pool_cleanup(&rxe->pd_pool); + rxe_pool_cleanup(&rxe->ah_pool); + rxe_pool_cleanup(&rxe->srq_pool); + rxe_pool_cleanup(&rxe->qp_pool); + rxe_pool_cleanup(&rxe->cq_pool); + rxe_pool_cleanup(&rxe->mr_pool); + rxe_pool_cleanup(&rxe->mw_pool); + + WARN_ON(!RB_EMPTY_ROOT(&rxe->mcg_tree)); + + if (rxe->tfm) + crypto_free_shash(rxe->tfm); +} + +/* initialize rxe device parameters */ +static void rxe_init_device_param(struct rxe_dev *rxe) +{ + rxe->max_inline_data = RXE_MAX_INLINE_DATA; + + rxe->attr.vendor_id = RXE_VENDOR_ID; + rxe->attr.max_mr_size = RXE_MAX_MR_SIZE; + rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP; + rxe->attr.max_qp = RXE_MAX_QP; + rxe->attr.max_qp_wr = RXE_MAX_QP_WR; + rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS; + rxe->attr.kernel_cap_flags = IBK_ALLOW_USER_UNREG; + rxe->attr.max_send_sge = RXE_MAX_SGE; + rxe->attr.max_recv_sge = RXE_MAX_SGE; + rxe->attr.max_sge_rd = RXE_MAX_SGE_RD; + rxe->attr.max_cq = RXE_MAX_CQ; + rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1; + rxe->attr.max_mr = RXE_MAX_MR; + rxe->attr.max_mw = RXE_MAX_MW; + rxe->attr.max_pd = RXE_MAX_PD; + rxe->attr.max_qp_rd_atom = RXE_MAX_QP_RD_ATOM; + rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; + rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; + rxe->attr.atomic_cap = IB_ATOMIC_HCA; + rxe->attr.max_mcast_grp = RXE_MAX_MCAST_GRP; + rxe->attr.max_mcast_qp_attach = RXE_MAX_MCAST_QP_ATTACH; + rxe->attr.max_total_mcast_qp_attach = RXE_MAX_TOT_MCAST_QP_ATTACH; + rxe->attr.max_ah = RXE_MAX_AH; + rxe->attr.max_srq = RXE_MAX_SRQ; + rxe->attr.max_srq_wr = RXE_MAX_SRQ_WR; + rxe->attr.max_srq_sge = RXE_MAX_SRQ_SGE; + rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; + rxe->attr.max_pkeys = RXE_MAX_PKEYS; + rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, + rxe->ndev->dev_addr); + + rxe->max_ucontext = RXE_MAX_UCONTEXT; +} + +/* initialize port attributes */ +static void rxe_init_port_param(struct rxe_port *port) +{ + port->attr.state = IB_PORT_DOWN; + port->attr.max_mtu = IB_MTU_4096; + port->attr.active_mtu = IB_MTU_256; + port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; + port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; + port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; + port->attr.bad_pkey_cntr = RXE_PORT_BAD_PKEY_CNTR; + port->attr.qkey_viol_cntr = RXE_PORT_QKEY_VIOL_CNTR; + port->attr.pkey_tbl_len = RXE_PORT_PKEY_TBL_LEN; + port->attr.lid = RXE_PORT_LID; + port->attr.sm_lid = RXE_PORT_SM_LID; + port->attr.lmc = RXE_PORT_LMC; + port->attr.max_vl_num = RXE_PORT_MAX_VL_NUM; + port->attr.sm_sl = RXE_PORT_SM_SL; + port->attr.subnet_timeout = RXE_PORT_SUBNET_TIMEOUT; + port->attr.init_type_reply = RXE_PORT_INIT_TYPE_REPLY; + port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; + port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; + port->attr.phys_state = RXE_PORT_PHYS_STATE; + port->mtu_cap = ib_mtu_enum_to_int(IB_MTU_256); + port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); +} + +/* initialize port state, note IB convention that HCA ports are always + * numbered from 1 + */ +static void rxe_init_ports(struct rxe_dev *rxe) +{ + struct rxe_port *port = &rxe->port; + + rxe_init_port_param(port); + addrconf_addr_eui48((unsigned char *)&port->port_guid, + rxe->ndev->dev_addr); + spin_lock_init(&port->port_lock); +} + +/* init pools of managed objects */ +static void rxe_init_pools(struct rxe_dev *rxe) +{ + rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC); + rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD); + rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH); + rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ); + rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP); + rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ); + rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR); + rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW); +} + +/* initialize rxe device state */ +static void rxe_init(struct rxe_dev *rxe) +{ + /* init default device parameters */ + rxe_init_device_param(rxe); + + rxe_init_ports(rxe); + rxe_init_pools(rxe); + + /* init pending mmap list */ + spin_lock_init(&rxe->mmap_offset_lock); + spin_lock_init(&rxe->pending_lock); + INIT_LIST_HEAD(&rxe->pending_mmaps); + + /* init multicast support */ + spin_lock_init(&rxe->mcg_lock); + rxe->mcg_tree = RB_ROOT; + + mutex_init(&rxe->usdev_lock); +} + +void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) +{ + struct rxe_port *port = &rxe->port; + enum ib_mtu mtu; + + mtu = eth_mtu_int_to_enum(ndev_mtu); + + /* Make sure that new MTU in range */ + mtu = mtu ? min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256; + + port->attr.active_mtu = mtu; + port->mtu_cap = ib_mtu_enum_to_int(mtu); + + rxe_info_dev(rxe, "Set mtu to %d", port->mtu_cap); +} + +/* called by ifc layer to create new rxe device. + * The caller should allocate memory for rxe by calling ib_alloc_device. + */ +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) +{ + rxe_init(rxe); + rxe_set_mtu(rxe, mtu); + + return rxe_register_device(rxe, ibdev_name); +} + +static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) +{ + struct rxe_dev *rxe; + int err = 0; + + if (is_vlan_dev(ndev)) { + rxe_err("rxe creation allowed on top of a real device only"); + err = -EPERM; + goto err; + } + + rxe = rxe_get_dev_from_net(ndev); + if (rxe) { + ib_device_put(&rxe->ib_dev); + rxe_err_dev(rxe, "already configured on %s", ndev->name); + err = -EEXIST; + goto err; + } + + err = rxe_net_add(ibdev_name, ndev); + if (err) { + rxe_err("failed to add %s\n", ndev->name); + goto err; + } +err: + return err; +} + +static struct rdma_link_ops rxe_link_ops = { + .type = "rxe", + .newlink = rxe_newlink, +}; + +static int __init rxe_module_init(void) +{ + int err; + + err = rxe_alloc_wq(); + if (err) + return err; + + err = rxe_net_init(); + if (err) { + rxe_destroy_wq(); + return err; + } + + rdma_link_register(&rxe_link_ops); + pr_info("loaded\n"); + return 0; +} + +static void __exit rxe_module_exit(void) +{ + rdma_link_unregister(&rxe_link_ops); + ib_unregister_driver(RDMA_DRIVER_RXE); + rxe_net_exit(); + rxe_destroy_wq(); + + pr_info("unloaded\n"); +} + +late_initcall(rxe_module_init); +module_exit(rxe_module_exit); + +MODULE_ALIAS_RDMA_LINK("rxe"); diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h new file mode 100644 index 0000000000..d33dd6cf83 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -0,0 +1,161 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_H +#define RXE_H + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/skbuff.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_pack.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_addr.h> +#include <crypto/hash.h> + +#include "rxe_net.h" +#include "rxe_opcode.h" +#include "rxe_hdr.h" +#include "rxe_param.h" +#include "rxe_verbs.h" +#include "rxe_loc.h" + +/* + * Version 1 and Version 2 are identical on 64 bit machines, but on 32 bit + * machines Version 2 has a different struct layout. + */ +#define RXE_UVERBS_ABI_VERSION 2 + +#define RXE_ROCE_V2_SPORT (0xc000) + +#define rxe_dbg(fmt, ...) pr_debug("%s: " fmt "\n", __func__, ##__VA_ARGS__) +#define rxe_dbg_dev(rxe, fmt, ...) ibdev_dbg(&(rxe)->ib_dev, \ + "%s: " fmt, __func__, ##__VA_ARGS__) +#define rxe_dbg_uc(uc, fmt, ...) ibdev_dbg((uc)->ibuc.device, \ + "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_pd(pd, fmt, ...) ibdev_dbg((pd)->ibpd.device, \ + "pd#%d %s: " fmt, (pd)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_ah(ah, fmt, ...) ibdev_dbg((ah)->ibah.device, \ + "ah#%d %s: " fmt, (ah)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_srq(srq, fmt, ...) ibdev_dbg((srq)->ibsrq.device, \ + "srq#%d %s: " fmt, (srq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_qp(qp, fmt, ...) ibdev_dbg((qp)->ibqp.device, \ + "qp#%d %s: " fmt, (qp)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_cq(cq, fmt, ...) ibdev_dbg((cq)->ibcq.device, \ + "cq#%d %s: " fmt, (cq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_mr(mr, fmt, ...) ibdev_dbg((mr)->ibmr.device, \ + "mr#%d %s: " fmt, (mr)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_dbg_mw(mw, fmt, ...) ibdev_dbg((mw)->ibmw.device, \ + "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) + +#define rxe_err(fmt, ...) pr_err_ratelimited("%s: " fmt "\n", __func__, \ + ##__VA_ARGS__) +#define rxe_err_dev(rxe, fmt, ...) ibdev_err_ratelimited(&(rxe)->ib_dev, \ + "%s: " fmt, __func__, ##__VA_ARGS__) +#define rxe_err_uc(uc, fmt, ...) ibdev_err_ratelimited((uc)->ibuc.device, \ + "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_pd(pd, fmt, ...) ibdev_err_ratelimited((pd)->ibpd.device, \ + "pd#%d %s: " fmt, (pd)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_ah(ah, fmt, ...) ibdev_err_ratelimited((ah)->ibah.device, \ + "ah#%d %s: " fmt, (ah)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_srq(srq, fmt, ...) ibdev_err_ratelimited((srq)->ibsrq.device, \ + "srq#%d %s: " fmt, (srq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_qp(qp, fmt, ...) ibdev_err_ratelimited((qp)->ibqp.device, \ + "qp#%d %s: " fmt, (qp)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_cq(cq, fmt, ...) ibdev_err_ratelimited((cq)->ibcq.device, \ + "cq#%d %s: " fmt, (cq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_mr(mr, fmt, ...) ibdev_err_ratelimited((mr)->ibmr.device, \ + "mr#%d %s: " fmt, (mr)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_err_mw(mw, fmt, ...) ibdev_err_ratelimited((mw)->ibmw.device, \ + "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) + +#define rxe_info(fmt, ...) pr_info_ratelimited("%s: " fmt "\n", __func__, \ + ##__VA_ARGS__) +#define rxe_info_dev(rxe, fmt, ...) ibdev_info_ratelimited(&(rxe)->ib_dev, \ + "%s: " fmt, __func__, ##__VA_ARGS__) +#define rxe_info_uc(uc, fmt, ...) ibdev_info_ratelimited((uc)->ibuc.device, \ + "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_pd(pd, fmt, ...) ibdev_info_ratelimited((pd)->ibpd.device, \ + "pd#%d %s: " fmt, (pd)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_ah(ah, fmt, ...) ibdev_info_ratelimited((ah)->ibah.device, \ + "ah#%d %s: " fmt, (ah)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_srq(srq, fmt, ...) ibdev_info_ratelimited((srq)->ibsrq.device, \ + "srq#%d %s: " fmt, (srq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_qp(qp, fmt, ...) ibdev_info_ratelimited((qp)->ibqp.device, \ + "qp#%d %s: " fmt, (qp)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_cq(cq, fmt, ...) ibdev_info_ratelimited((cq)->ibcq.device, \ + "cq#%d %s: " fmt, (cq)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_mr(mr, fmt, ...) ibdev_info_ratelimited((mr)->ibmr.device, \ + "mr#%d %s: " fmt, (mr)->elem.index, __func__, ##__VA_ARGS__) +#define rxe_info_mw(mw, fmt, ...) ibdev_info_ratelimited((mw)->ibmw.device, \ + "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) + +/* responder states */ +enum resp_states { + RESPST_NONE, + RESPST_GET_REQ, + RESPST_CHK_PSN, + RESPST_CHK_OP_SEQ, + RESPST_CHK_OP_VALID, + RESPST_CHK_RESOURCE, + RESPST_CHK_LENGTH, + RESPST_CHK_RKEY, + RESPST_EXECUTE, + RESPST_READ_REPLY, + RESPST_ATOMIC_REPLY, + RESPST_ATOMIC_WRITE_REPLY, + RESPST_PROCESS_FLUSH, + RESPST_COMPLETE, + RESPST_ACKNOWLEDGE, + RESPST_CLEANUP, + RESPST_DUPLICATE_REQUEST, + RESPST_ERR_MALFORMED_WQE, + RESPST_ERR_UNSUPPORTED_OPCODE, + RESPST_ERR_MISALIGNED_ATOMIC, + RESPST_ERR_PSN_OUT_OF_SEQ, + RESPST_ERR_MISSING_OPCODE_FIRST, + RESPST_ERR_MISSING_OPCODE_LAST_C, + RESPST_ERR_MISSING_OPCODE_LAST_D1E, + RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, + RESPST_ERR_RNR, + RESPST_ERR_RKEY_VIOLATION, + RESPST_ERR_INVALIDATE_RKEY, + RESPST_ERR_LENGTH, + RESPST_ERR_CQ_OVERFLOW, + RESPST_ERROR, + RESPST_DONE, + RESPST_EXIT, +}; + +void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); + +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); + +void rxe_rcv(struct sk_buff *skb); + +/* The caller must do a matching ib_device_put(&dev->ib_dev) */ +static inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev) +{ + struct ib_device *ibdev = + ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE); + + if (!ibdev) + return NULL; + return container_of(ibdev, struct rxe_dev, ib_dev); +} + +void rxe_port_up(struct rxe_dev *rxe); +void rxe_port_down(struct rxe_dev *rxe); +void rxe_set_port_state(struct rxe_dev *rxe); + +#endif /* RXE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c new file mode 100644 index 0000000000..889d7adbd4 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av) +{ + rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr); + rxe_av_fill_ip_info(av, attr); + memcpy(av->dmac, attr->roce.dmac, ETH_ALEN); +} + +static int chk_attr(void *obj, struct rdma_ah_attr *attr, bool obj_is_ah) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(attr); + struct rxe_port *port; + struct rxe_dev *rxe; + struct rxe_qp *qp; + struct rxe_ah *ah; + int type; + + if (obj_is_ah) { + ah = obj; + rxe = to_rdev(ah->ibah.device); + } else { + qp = obj; + rxe = to_rdev(qp->ibqp.device); + } + + port = &rxe->port; + + if (rdma_ah_get_ah_flags(attr) & IB_AH_GRH) { + if (grh->sgid_index > port->attr.gid_tbl_len) { + if (obj_is_ah) + rxe_dbg_ah(ah, "invalid sgid index = %d\n", + grh->sgid_index); + else + rxe_dbg_qp(qp, "invalid sgid index = %d\n", + grh->sgid_index); + return -EINVAL; + } + + type = rdma_gid_attr_network_type(grh->sgid_attr); + if (type < RDMA_NETWORK_IPV4 || + type > RDMA_NETWORK_IPV6) { + if (obj_is_ah) + rxe_dbg_ah(ah, "invalid network type for rdma_rxe = %d\n", + type); + else + rxe_dbg_qp(qp, "invalid network type for rdma_rxe = %d\n", + type); + return -EINVAL; + } + } + + return 0; +} + +int rxe_av_chk_attr(struct rxe_qp *qp, struct rdma_ah_attr *attr) +{ + return chk_attr(qp, attr, false); +} + +int rxe_ah_chk_attr(struct rxe_ah *ah, struct rdma_ah_attr *attr) +{ + return chk_attr(ah, attr, true); +} + +void rxe_av_from_attr(u8 port_num, struct rxe_av *av, + struct rdma_ah_attr *attr) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(attr); + + memset(av, 0, sizeof(*av)); + memcpy(av->grh.dgid.raw, grh->dgid.raw, sizeof(grh->dgid.raw)); + av->grh.flow_label = grh->flow_label; + av->grh.sgid_index = grh->sgid_index; + av->grh.hop_limit = grh->hop_limit; + av->grh.traffic_class = grh->traffic_class; + av->port_num = port_num; +} + +void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + attr->type = RDMA_AH_ATTR_TYPE_ROCE; + + memcpy(grh->dgid.raw, av->grh.dgid.raw, sizeof(av->grh.dgid.raw)); + grh->flow_label = av->grh.flow_label; + grh->sgid_index = av->grh.sgid_index; + grh->hop_limit = av->grh.hop_limit; + grh->traffic_class = av->grh.traffic_class; + + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + rdma_ah_set_port_num(attr, av->port_num); +} + +void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr) +{ + const struct ib_gid_attr *sgid_attr = attr->grh.sgid_attr; + int ibtype; + int type; + + rdma_gid2ip((struct sockaddr *)&av->sgid_addr, &sgid_attr->gid); + rdma_gid2ip((struct sockaddr *)&av->dgid_addr, + &rdma_ah_read_grh(attr)->dgid); + + ibtype = rdma_gid_attr_network_type(sgid_attr); + + switch (ibtype) { + case RDMA_NETWORK_IPV4: + type = RXE_NETWORK_TYPE_IPV4; + break; + case RDMA_NETWORK_IPV6: + type = RXE_NETWORK_TYPE_IPV6; + break; + default: + /* not reached - checked in rxe_av_chk_attr */ + type = 0; + break; + } + + av->network_type = type; +} + +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp) +{ + struct rxe_ah *ah; + u32 ah_num; + + if (ahp) + *ahp = NULL; + + if (!pkt || !pkt->qp) + return NULL; + + if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC) + return &pkt->qp->pri_av; + + if (!pkt->wqe) + return NULL; + + ah_num = pkt->wqe->wr.wr.ud.ah_num; + if (ah_num) { + /* only new user provider or kernel client */ + ah = rxe_pool_get_index(&pkt->rxe->ah_pool, ah_num); + if (!ah) { + rxe_dbg_qp(pkt->qp, "Unable to find AH matching ah_num\n"); + return NULL; + } + + if (rxe_ah_pd(ah) != pkt->qp->pd) { + rxe_dbg_qp(pkt->qp, "PDs don't match for AH and QP\n"); + rxe_put(ah); + return NULL; + } + + if (ahp) + *ahp = ah; + else + rxe_put(ah); + + return &ah->av; + } + + /* only old user provider for UD sends*/ + return &pkt->wqe->wr.wr.ud.av; +} diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c new file mode 100644 index 0000000000..d0bdc2d8ad --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -0,0 +1,851 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_task.h" + +enum comp_state { + COMPST_GET_ACK, + COMPST_GET_WQE, + COMPST_COMP_WQE, + COMPST_COMP_ACK, + COMPST_CHECK_PSN, + COMPST_CHECK_ACK, + COMPST_READ, + COMPST_ATOMIC, + COMPST_WRITE_SEND, + COMPST_UPDATE_COMP, + COMPST_ERROR_RETRY, + COMPST_RNR_RETRY, + COMPST_ERROR, + COMPST_EXIT, /* We have an issue, and we want to rerun the completer */ + COMPST_DONE, /* The completer finished successflly */ +}; + +static char *comp_state_name[] = { + [COMPST_GET_ACK] = "GET ACK", + [COMPST_GET_WQE] = "GET WQE", + [COMPST_COMP_WQE] = "COMP WQE", + [COMPST_COMP_ACK] = "COMP ACK", + [COMPST_CHECK_PSN] = "CHECK PSN", + [COMPST_CHECK_ACK] = "CHECK ACK", + [COMPST_READ] = "READ", + [COMPST_ATOMIC] = "ATOMIC", + [COMPST_WRITE_SEND] = "WRITE/SEND", + [COMPST_UPDATE_COMP] = "UPDATE COMP", + [COMPST_ERROR_RETRY] = "ERROR RETRY", + [COMPST_RNR_RETRY] = "RNR RETRY", + [COMPST_ERROR] = "ERROR", + [COMPST_EXIT] = "EXIT", + [COMPST_DONE] = "DONE", +}; + +static unsigned long rnrnak_usec[32] = { + [IB_RNR_TIMER_655_36] = 655360, + [IB_RNR_TIMER_000_01] = 10, + [IB_RNR_TIMER_000_02] = 20, + [IB_RNR_TIMER_000_03] = 30, + [IB_RNR_TIMER_000_04] = 40, + [IB_RNR_TIMER_000_06] = 60, + [IB_RNR_TIMER_000_08] = 80, + [IB_RNR_TIMER_000_12] = 120, + [IB_RNR_TIMER_000_16] = 160, + [IB_RNR_TIMER_000_24] = 240, + [IB_RNR_TIMER_000_32] = 320, + [IB_RNR_TIMER_000_48] = 480, + [IB_RNR_TIMER_000_64] = 640, + [IB_RNR_TIMER_000_96] = 960, + [IB_RNR_TIMER_001_28] = 1280, + [IB_RNR_TIMER_001_92] = 1920, + [IB_RNR_TIMER_002_56] = 2560, + [IB_RNR_TIMER_003_84] = 3840, + [IB_RNR_TIMER_005_12] = 5120, + [IB_RNR_TIMER_007_68] = 7680, + [IB_RNR_TIMER_010_24] = 10240, + [IB_RNR_TIMER_015_36] = 15360, + [IB_RNR_TIMER_020_48] = 20480, + [IB_RNR_TIMER_030_72] = 30720, + [IB_RNR_TIMER_040_96] = 40960, + [IB_RNR_TIMER_061_44] = 61410, + [IB_RNR_TIMER_081_92] = 81920, + [IB_RNR_TIMER_122_88] = 122880, + [IB_RNR_TIMER_163_84] = 163840, + [IB_RNR_TIMER_245_76] = 245760, + [IB_RNR_TIMER_327_68] = 327680, + [IB_RNR_TIMER_491_52] = 491520, +}; + +static inline unsigned long rnrnak_jiffies(u8 timeout) +{ + return max_t(unsigned long, + usecs_to_jiffies(rnrnak_usec[timeout]), 1); +} + +static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: return IB_WC_RDMA_WRITE; + case IB_WR_RDMA_WRITE_WITH_IMM: return IB_WC_RDMA_WRITE; + case IB_WR_SEND: return IB_WC_SEND; + case IB_WR_SEND_WITH_IMM: return IB_WC_SEND; + case IB_WR_RDMA_READ: return IB_WC_RDMA_READ; + case IB_WR_ATOMIC_CMP_AND_SWP: return IB_WC_COMP_SWAP; + case IB_WR_ATOMIC_FETCH_AND_ADD: return IB_WC_FETCH_ADD; + case IB_WR_LSO: return IB_WC_LSO; + case IB_WR_SEND_WITH_INV: return IB_WC_SEND; + case IB_WR_RDMA_READ_WITH_INV: return IB_WC_RDMA_READ; + case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV; + case IB_WR_REG_MR: return IB_WC_REG_MR; + case IB_WR_BIND_MW: return IB_WC_BIND_MW; + case IB_WR_ATOMIC_WRITE: return IB_WC_ATOMIC_WRITE; + case IB_WR_FLUSH: return IB_WC_FLUSH; + + default: + return 0xff; + } +} + +void retransmit_timer(struct timer_list *t) +{ + struct rxe_qp *qp = from_timer(qp, t, retrans_timer); + unsigned long flags; + + rxe_dbg_qp(qp, "retransmit timer fired\n"); + + spin_lock_irqsave(&qp->state_lock, flags); + if (qp->valid) { + qp->comp.timeout = 1; + rxe_sched_task(&qp->comp.task); + } + spin_unlock_irqrestore(&qp->state_lock, flags); +} + +void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) +{ + int must_sched; + + skb_queue_tail(&qp->resp_pkts, skb); + + must_sched = skb_queue_len(&qp->resp_pkts) > 1; + if (must_sched != 0) + rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED); + + if (must_sched) + rxe_sched_task(&qp->comp.task); + else + rxe_run_task(&qp->comp.task); +} + +static inline enum comp_state get_wqe(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe **wqe_p) +{ + struct rxe_send_wqe *wqe; + + /* we come here whether or not we found a response packet to see if + * there are any posted WQEs + */ + wqe = queue_head(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT); + *wqe_p = wqe; + + /* no WQE or requester has not started it yet */ + if (!wqe || wqe->state == wqe_state_posted) + return pkt ? COMPST_DONE : COMPST_EXIT; + + /* WQE does not require an ack */ + if (wqe->state == wqe_state_done) + return COMPST_COMP_WQE; + + /* WQE caused an error */ + if (wqe->state == wqe_state_error) + return COMPST_ERROR; + + /* we have a WQE, if we also have an ack check its PSN */ + return pkt ? COMPST_CHECK_PSN : COMPST_EXIT; +} + +static inline void reset_retry_counters(struct rxe_qp *qp) +{ + qp->comp.retry_cnt = qp->attr.retry_cnt; + qp->comp.rnr_retry = qp->attr.rnr_retry; + qp->comp.started_retry = 0; +} + +static inline enum comp_state check_psn(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + s32 diff; + + /* check to see if response is past the oldest WQE. if it is, complete + * send/write or error read/atomic + */ + diff = psn_compare(pkt->psn, wqe->last_psn); + if (diff > 0) { + if (wqe->state == wqe_state_pending) { + if (wqe->mask & WR_ATOMIC_OR_READ_MASK) + return COMPST_ERROR_RETRY; + + reset_retry_counters(qp); + return COMPST_COMP_WQE; + } else { + return COMPST_DONE; + } + } + + /* compare response packet to expected response */ + diff = psn_compare(pkt->psn, qp->comp.psn); + if (diff < 0) { + /* response is most likely a retried packet if it matches an + * uncompleted WQE go complete it else ignore it + */ + if (pkt->psn == wqe->last_psn) + return COMPST_COMP_ACK; + else if (pkt->opcode == IB_OPCODE_RC_ACKNOWLEDGE && + (qp->comp.opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST || + qp->comp.opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE)) + return COMPST_CHECK_ACK; + else + return COMPST_DONE; + } else if ((diff > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK)) { + return COMPST_DONE; + } else { + return COMPST_CHECK_ACK; + } +} + +static inline enum comp_state check_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + unsigned int mask = pkt->mask; + u8 syn; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + /* Check the sequence only */ + switch (qp->comp.opcode) { + case -1: + /* Will catch all *_ONLY cases. */ + if (!(mask & RXE_START_MASK)) + return COMPST_ERROR; + + break; + + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + /* Check NAK code to handle a remote error */ + if (pkt->opcode == IB_OPCODE_RC_ACKNOWLEDGE) + break; + + if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && + pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { + /* read retries of partial data may restart from + * read response first or response only. + */ + if ((pkt->psn == wqe->first_psn && + pkt->opcode == + IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) || + (wqe->first_psn == wqe->last_psn && + pkt->opcode == + IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY)) + break; + + return COMPST_ERROR; + } + break; + default: + WARN_ON_ONCE(1); + } + + /* Check operation validity. */ + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY: + syn = aeth_syn(pkt); + + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return COMPST_ERROR; + + if (wqe->wr.opcode == IB_WR_ATOMIC_WRITE) + return COMPST_WRITE_SEND; + + fallthrough; + /* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH) + */ + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (wqe->wr.opcode != IB_WR_RDMA_READ && + wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV && + wqe->wr.opcode != IB_WR_FLUSH) { + wqe->status = IB_WC_FATAL_ERR; + return COMPST_ERROR; + } + reset_retry_counters(qp); + return COMPST_READ; + + case IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE: + syn = aeth_syn(pkt); + + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return COMPST_ERROR; + + if (wqe->wr.opcode != IB_WR_ATOMIC_CMP_AND_SWP && + wqe->wr.opcode != IB_WR_ATOMIC_FETCH_AND_ADD) + return COMPST_ERROR; + reset_retry_counters(qp); + return COMPST_ATOMIC; + + case IB_OPCODE_RC_ACKNOWLEDGE: + syn = aeth_syn(pkt); + switch (syn & AETH_TYPE_MASK) { + case AETH_ACK: + reset_retry_counters(qp); + return COMPST_WRITE_SEND; + + case AETH_RNR_NAK: + rxe_counter_inc(rxe, RXE_CNT_RCV_RNR); + return COMPST_RNR_RETRY; + + case AETH_NAK: + switch (syn) { + case AETH_NAK_PSN_SEQ_ERROR: + /* a nak implicitly acks all packets with psns + * before + */ + if (psn_compare(pkt->psn, qp->comp.psn) > 0) { + rxe_counter_inc(rxe, + RXE_CNT_RCV_SEQ_ERR); + qp->comp.psn = pkt->psn; + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_sched_task(&qp->req.task); + } + } + return COMPST_ERROR_RETRY; + + case AETH_NAK_INVALID_REQ: + wqe->status = IB_WC_REM_INV_REQ_ERR; + return COMPST_ERROR; + + case AETH_NAK_REM_ACC_ERR: + wqe->status = IB_WC_REM_ACCESS_ERR; + return COMPST_ERROR; + + case AETH_NAK_REM_OP_ERR: + wqe->status = IB_WC_REM_OP_ERR; + return COMPST_ERROR; + + default: + rxe_dbg_qp(qp, "unexpected nak %x\n", syn); + wqe->status = IB_WC_REM_OP_ERR; + return COMPST_ERROR; + } + + default: + return COMPST_ERROR; + } + break; + + default: + rxe_dbg_qp(qp, "unexpected opcode\n"); + } + + return COMPST_ERROR; +} + +static inline enum comp_state do_read(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + int ret; + + ret = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, + &wqe->dma, payload_addr(pkt), + payload_size(pkt), RXE_TO_MR_OBJ); + if (ret) { + wqe->status = IB_WC_LOC_PROT_ERR; + return COMPST_ERROR; + } + + if (wqe->dma.resid == 0 && (pkt->mask & RXE_END_MASK)) + return COMPST_COMP_ACK; + + return COMPST_UPDATE_COMP; +} + +static inline enum comp_state do_atomic(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + int ret; + + u64 atomic_orig = atmack_orig(pkt); + + ret = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, + &wqe->dma, &atomic_orig, + sizeof(u64), RXE_TO_MR_OBJ); + if (ret) { + wqe->status = IB_WC_LOC_PROT_ERR; + return COMPST_ERROR; + } + + return COMPST_COMP_ACK; +} + +static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_cqe *cqe) +{ + struct ib_wc *wc = &cqe->ibwc; + struct ib_uverbs_wc *uwc = &cqe->uibwc; + + memset(cqe, 0, sizeof(*cqe)); + + if (!qp->is_user) { + wc->wr_id = wqe->wr.wr_id; + wc->status = wqe->status; + wc->qp = &qp->ibqp; + } else { + uwc->wr_id = wqe->wr.wr_id; + uwc->status = wqe->status; + uwc->qp_num = qp->ibqp.qp_num; + } + + if (wqe->status == IB_WC_SUCCESS) { + if (!qp->is_user) { + wc->opcode = wr_to_wc_opcode(wqe->wr.opcode); + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + wc->wc_flags = IB_WC_WITH_IMM; + wc->byte_len = wqe->dma.length; + } else { + uwc->opcode = wr_to_wc_opcode(wqe->wr.opcode); + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + uwc->wc_flags = IB_WC_WITH_IMM; + uwc->byte_len = wqe->dma.length; + } + } else { + if (wqe->status != IB_WC_WR_FLUSH_ERR) + rxe_err_qp(qp, "non-flush error status = %d", + wqe->status); + } +} + +/* + * IBA Spec. Section 10.7.3.1 SIGNALED COMPLETIONS + * ---------8<---------8<------------- + * ...Note that if a completion error occurs, a Work Completion + * will always be generated, even if the signaling + * indicator requests an Unsignaled Completion. + * ---------8<---------8<------------- + */ +static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_cqe cqe; + bool post; + + /* do we need to post a completion */ + post = ((qp->sq_sig_type == IB_SIGNAL_ALL_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + wqe->status != IB_WC_SUCCESS); + + if (post) + make_send_cqe(qp, wqe, &cqe); + + queue_advance_consumer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT); + + if (post) + rxe_cq_post(qp->scq, &cqe, 0); + + if (wqe->wr.opcode == IB_WR_SEND || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_INV) + rxe_counter_inc(rxe, RXE_CNT_RDMA_SEND); + + /* + * we completed something so let req run again + * if it is trying to fence + */ + if (qp->req.wait_fence) { + qp->req.wait_fence = 0; + rxe_sched_task(&qp->req.task); + } +} + +static void comp_check_sq_drain_done(struct rxe_qp *qp) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + if (unlikely(qp_state(qp) == IB_QPS_SQD)) { + if (qp->attr.sq_draining && qp->comp.psn == qp->req.psn) { + qp->attr.sq_draining = 0; + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_SQ_DRAINED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + return; + } + } + spin_unlock_irqrestore(&qp->state_lock, flags); +} + +static inline enum comp_state complete_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + if (wqe->has_rd_atomic) { + wqe->has_rd_atomic = 0; + atomic_inc(&qp->req.rd_atomic); + if (qp->req.need_rd_atomic) { + qp->comp.timeout_retry = 0; + qp->req.need_rd_atomic = 0; + rxe_sched_task(&qp->req.task); + } + } + + comp_check_sq_drain_done(qp); + + do_complete(qp, wqe); + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + return COMPST_UPDATE_COMP; + else + return COMPST_DONE; +} + +static inline enum comp_state complete_wqe(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + if (pkt && wqe->state == wqe_state_pending) { + if (psn_compare(wqe->last_psn, qp->comp.psn) >= 0) { + qp->comp.psn = (wqe->last_psn + 1) & BTH_PSN_MASK; + qp->comp.opcode = -1; + } + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_sched_task(&qp->req.task); + } + } + + do_complete(qp, wqe); + + return COMPST_GET_WQE; +} + +/* drain incoming response packet queue */ +static void drain_resp_pkts(struct rxe_qp *qp) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&qp->resp_pkts))) { + rxe_put(qp); + kfree_skb(skb); + ib_device_put(qp->ibqp.device); + } +} + +/* complete send wqe with flush error */ +static int flush_send_wqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + struct rxe_cqe cqe = {}; + struct ib_wc *wc = &cqe.ibwc; + struct ib_uverbs_wc *uwc = &cqe.uibwc; + int err; + + if (qp->is_user) { + uwc->wr_id = wqe->wr.wr_id; + uwc->status = IB_WC_WR_FLUSH_ERR; + uwc->qp_num = qp->ibqp.qp_num; + } else { + wc->wr_id = wqe->wr.wr_id; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->qp = &qp->ibqp; + } + + err = rxe_cq_post(qp->scq, &cqe, 0); + if (err) + rxe_dbg_cq(qp->scq, "post cq failed, err = %d", err); + + return err; +} + +/* drain and optionally complete the send queue + * if unable to complete a wqe, i.e. cq is full, stop + * completing and flush the remaining wqes + */ +static void flush_send_queue(struct rxe_qp *qp, bool notify) +{ + struct rxe_send_wqe *wqe; + struct rxe_queue *q = qp->sq.queue; + int err; + + /* send queue never got created. nothing to do. */ + if (!qp->sq.queue) + return; + + while ((wqe = queue_head(q, q->type))) { + if (notify) { + err = flush_send_wqe(qp, wqe); + if (err) + notify = 0; + } + queue_advance_consumer(q, q->type); + } +} + +static void free_pkt(struct rxe_pkt_info *pkt) +{ + struct sk_buff *skb = PKT_TO_SKB(pkt); + struct rxe_qp *qp = pkt->qp; + struct ib_device *dev = qp->ibqp.device; + + kfree_skb(skb); + rxe_put(qp); + ib_device_put(dev); +} + +/* reset the retry timer if + * - QP is type RC + * - there is a packet sent by the requester that + * might be acked (we still might get spurious + * timeouts but try to keep them as few as possible) + * - the timeout parameter is set + * - the QP is alive + */ +static void reset_retry_timer(struct rxe_qp *qp) +{ + unsigned long flags; + + if (qp_type(qp) == IB_QPT_RC && qp->qp_timeout_jiffies) { + spin_lock_irqsave(&qp->state_lock, flags); + if (qp_state(qp) >= IB_QPS_RTS && + psn_compare(qp->req.psn, qp->comp.psn) > 0) + mod_timer(&qp->retrans_timer, + jiffies + qp->qp_timeout_jiffies); + spin_unlock_irqrestore(&qp->state_lock, flags); + } +} + +int rxe_completer(struct rxe_qp *qp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_send_wqe *wqe = NULL; + struct sk_buff *skb = NULL; + struct rxe_pkt_info *pkt = NULL; + enum comp_state state; + int ret; + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + if (!qp->valid || qp_state(qp) == IB_QPS_ERR || + qp_state(qp) == IB_QPS_RESET) { + bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); + + drain_resp_pkts(qp); + flush_send_queue(qp, notify); + spin_unlock_irqrestore(&qp->state_lock, flags); + goto exit; + } + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->comp.timeout) { + qp->comp.timeout_retry = 1; + qp->comp.timeout = 0; + } else { + qp->comp.timeout_retry = 0; + } + + if (qp->req.need_retry) + goto exit; + + state = COMPST_GET_ACK; + + while (1) { + rxe_dbg_qp(qp, "state = %s\n", comp_state_name[state]); + switch (state) { + case COMPST_GET_ACK: + skb = skb_dequeue(&qp->resp_pkts); + if (skb) { + pkt = SKB_TO_PKT(skb); + qp->comp.timeout_retry = 0; + } + state = COMPST_GET_WQE; + break; + + case COMPST_GET_WQE: + state = get_wqe(qp, pkt, &wqe); + break; + + case COMPST_CHECK_PSN: + state = check_psn(qp, pkt, wqe); + break; + + case COMPST_CHECK_ACK: + state = check_ack(qp, pkt, wqe); + break; + + case COMPST_READ: + state = do_read(qp, pkt, wqe); + break; + + case COMPST_ATOMIC: + state = do_atomic(qp, pkt, wqe); + break; + + case COMPST_WRITE_SEND: + if (wqe->state == wqe_state_pending && + wqe->last_psn == pkt->psn) + state = COMPST_COMP_ACK; + else + state = COMPST_UPDATE_COMP; + break; + + case COMPST_COMP_ACK: + state = complete_ack(qp, pkt, wqe); + break; + + case COMPST_COMP_WQE: + state = complete_wqe(qp, pkt, wqe); + break; + + case COMPST_UPDATE_COMP: + if (pkt->mask & RXE_END_MASK) + qp->comp.opcode = -1; + else + qp->comp.opcode = pkt->opcode; + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_sched_task(&qp->req.task); + } + + state = COMPST_DONE; + break; + + case COMPST_DONE: + goto done; + + case COMPST_EXIT: + if (qp->comp.timeout_retry && wqe) { + state = COMPST_ERROR_RETRY; + break; + } + + reset_retry_timer(qp); + goto exit; + + case COMPST_ERROR_RETRY: + /* we come here if the retry timer fired and we did + * not receive a response packet. try to retry the send + * queue if that makes sense and the limits have not + * been exceeded. remember that some timeouts are + * spurious since we do not reset the timer but kick + * it down the road or let it expire + */ + + /* there is nothing to retry in this case */ + if (!wqe || (wqe->state == wqe_state_posted)) + goto exit; + + /* if we've started a retry, don't start another + * retry sequence, unless this is a timeout. + */ + if (qp->comp.started_retry && + !qp->comp.timeout_retry) + goto done; + + if (qp->comp.retry_cnt > 0) { + if (qp->comp.retry_cnt != 7) + qp->comp.retry_cnt--; + + /* no point in retrying if we have already + * seen the last ack that the requester could + * have caused + */ + if (psn_compare(qp->req.psn, + qp->comp.psn) > 0) { + /* tell the requester to retry the + * send queue next time around + */ + rxe_counter_inc(rxe, + RXE_CNT_COMP_RETRY); + qp->req.need_retry = 1; + qp->comp.started_retry = 1; + rxe_sched_task(&qp->req.task); + } + goto done; + + } else { + rxe_counter_inc(rxe, RXE_CNT_RETRY_EXCEEDED); + wqe->status = IB_WC_RETRY_EXC_ERR; + state = COMPST_ERROR; + } + break; + + case COMPST_RNR_RETRY: + /* we come here if we received an RNR NAK */ + if (qp->comp.rnr_retry > 0) { + if (qp->comp.rnr_retry != 7) + qp->comp.rnr_retry--; + + /* don't start a retry flow until the + * rnr timer has fired + */ + qp->req.wait_for_rnr_timer = 1; + rxe_dbg_qp(qp, "set rnr nak timer\n"); + // TODO who protects from destroy_qp?? + mod_timer(&qp->rnr_nak_timer, + jiffies + rnrnak_jiffies(aeth_syn(pkt) + & ~AETH_TYPE_MASK)); + goto exit; + } else { + rxe_counter_inc(rxe, + RXE_CNT_RNR_RETRY_EXCEEDED); + wqe->status = IB_WC_RNR_RETRY_EXC_ERR; + state = COMPST_ERROR; + } + break; + + case COMPST_ERROR: + WARN_ON_ONCE(wqe->status == IB_WC_SUCCESS); + do_complete(qp, wqe); + rxe_qp_error(qp); + goto exit; + } + } + + /* A non-zero return value will cause rxe_do_task to + * exit its loop and end the work item. A zero return + * will continue looping and return to rxe_completer + */ +done: + ret = 0; + goto out; +exit: + ret = -EAGAIN; +out: + if (pkt) + free_pkt(pkt); + return ret; +} diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c new file mode 100644 index 0000000000..d5486cbb3f --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ +#include <linux/vmalloc.h> +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, + int cqe, int comp_vector) +{ + int count; + + if (cqe <= 0) { + rxe_dbg_dev(rxe, "cqe(%d) <= 0\n", cqe); + goto err1; + } + + if (cqe > rxe->attr.max_cqe) { + rxe_dbg_dev(rxe, "cqe(%d) > max_cqe(%d)\n", + cqe, rxe->attr.max_cqe); + goto err1; + } + + if (cq) { + count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT); + if (cqe < count) { + rxe_dbg_cq(cq, "cqe(%d) < current # elements in queue (%d)", + cqe, count); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, + int comp_vector, struct ib_udata *udata, + struct rxe_create_cq_resp __user *uresp) +{ + int err; + enum queue_type type; + + type = QUEUE_TYPE_TO_CLIENT; + cq->queue = rxe_queue_init(rxe, &cqe, + sizeof(struct rxe_cqe), type); + if (!cq->queue) { + rxe_dbg_dev(rxe, "unable to create cq\n"); + return -ENOMEM; + } + + err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, + cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); + if (err) { + vfree(cq->queue->buf); + kfree(cq->queue); + return err; + } + + cq->is_user = uresp; + + spin_lock_init(&cq->cq_lock); + cq->ibcq.cqe = cqe; + return 0; +} + +int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, + struct rxe_resize_cq_resp __user *uresp, + struct ib_udata *udata) +{ + int err; + + err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe, + sizeof(struct rxe_cqe), udata, + uresp ? &uresp->mi : NULL, NULL, &cq->cq_lock); + if (!err) + cq->ibcq.cqe = cqe; + + return err; +} + +/* caller holds reference to cq */ +int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) +{ + struct ib_event ev; + int full; + void *addr; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + + full = queue_full(cq->queue, QUEUE_TYPE_TO_CLIENT); + if (unlikely(full)) { + rxe_err_cq(cq, "queue full"); + spin_unlock_irqrestore(&cq->cq_lock, flags); + if (cq->ibcq.event_handler) { + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + + return -EBUSY; + } + + addr = queue_producer_addr(cq->queue, QUEUE_TYPE_TO_CLIENT); + memcpy(addr, cqe, sizeof(*cqe)); + + queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT); + + if ((cq->notify & IB_CQ_NEXT_COMP) || + (cq->notify & IB_CQ_SOLICITED && solicited)) { + cq->notify = 0; + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + } + + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return 0; +} + +void rxe_cq_cleanup(struct rxe_pool_elem *elem) +{ + struct rxe_cq *cq = container_of(elem, typeof(*cq), elem); + + if (cq->queue) + rxe_queue_cleanup(cq->queue); +} diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h new file mode 100644 index 0000000000..46f82b27fc --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -0,0 +1,977 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_HDR_H +#define RXE_HDR_H + +/* extracted information about a packet carried in an sk_buff struct fits in + * the skbuff cb array. Must be at most 48 bytes. stored in control block of + * sk_buff for received packets. + */ +struct rxe_pkt_info { + struct rxe_dev *rxe; /* device that owns packet */ + struct rxe_qp *qp; /* qp that owns packet */ + struct rxe_send_wqe *wqe; /* send wqe */ + u8 *hdr; /* points to bth */ + u32 mask; /* useful info about pkt */ + u32 psn; /* bth psn of packet */ + u16 pkey_index; /* partition of pkt */ + u16 paylen; /* length of bth - icrc */ + u8 port_num; /* port pkt received on */ + u8 opcode; /* bth opcode of packet */ +}; + +/* Macros should be used only for received skb */ +static inline struct rxe_pkt_info *SKB_TO_PKT(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct rxe_pkt_info) > sizeof(skb->cb)); + return (void *)skb->cb; +} + +static inline struct sk_buff *PKT_TO_SKB(struct rxe_pkt_info *pkt) +{ + return container_of((void *)pkt, struct sk_buff, cb); +} + +/* + * IBA header types and methods + * + * Some of these are for reference and completeness only since + * rxe does not currently support RD transport + * most of this could be moved into IB core. ib_pack.h has + * part of this but is incomplete + * + * Header specific routines to insert/extract values to/from headers + * the routines that are named __hhh_(set_)fff() take a pointer to a + * hhh header and get(set) the fff field. The routines named + * hhh_(set_)fff take a packet info struct and find the + * header and field based on the opcode in the packet. + * Conversion to/from network byte order from cpu order is also done. + */ + +#define RXE_ICRC_SIZE (4) +#define RXE_MAX_HDR_LENGTH (80) + +/****************************************************************************** + * Base Transport Header + ******************************************************************************/ +struct rxe_bth { + u8 opcode; + u8 flags; + __be16 pkey; + __be32 qpn; + __be32 apsn; +}; + +#define BTH_TVER (0) +#define BTH_DEF_PKEY (0xffff) + +#define BTH_SE_MASK (0x80) +#define BTH_MIG_MASK (0x40) +#define BTH_PAD_MASK (0x30) +#define BTH_TVER_MASK (0x0f) +#define BTH_FECN_MASK (0x80000000) +#define BTH_BECN_MASK (0x40000000) +#define BTH_RESV6A_MASK (0x3f000000) +#define BTH_QPN_MASK (0x00ffffff) +#define BTH_ACK_MASK (0x80000000) +#define BTH_RESV7_MASK (0x7f000000) +#define BTH_PSN_MASK (0x00ffffff) + +static inline u8 __bth_opcode(void *arg) +{ + struct rxe_bth *bth = arg; + + return bth->opcode; +} + +static inline void __bth_set_opcode(void *arg, u8 opcode) +{ + struct rxe_bth *bth = arg; + + bth->opcode = opcode; +} + +static inline u8 __bth_se(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (BTH_SE_MASK & bth->flags); +} + +static inline void __bth_set_se(void *arg, int se) +{ + struct rxe_bth *bth = arg; + + if (se) + bth->flags |= BTH_SE_MASK; + else + bth->flags &= ~BTH_SE_MASK; +} + +static inline u8 __bth_mig(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (BTH_MIG_MASK & bth->flags); +} + +static inline void __bth_set_mig(void *arg, u8 mig) +{ + struct rxe_bth *bth = arg; + + if (mig) + bth->flags |= BTH_MIG_MASK; + else + bth->flags &= ~BTH_MIG_MASK; +} + +static inline u8 __bth_pad(void *arg) +{ + struct rxe_bth *bth = arg; + + return (BTH_PAD_MASK & bth->flags) >> 4; +} + +static inline void __bth_set_pad(void *arg, u8 pad) +{ + struct rxe_bth *bth = arg; + + bth->flags = (BTH_PAD_MASK & (pad << 4)) | + (~BTH_PAD_MASK & bth->flags); +} + +static inline u8 __bth_tver(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_TVER_MASK & bth->flags; +} + +static inline void __bth_set_tver(void *arg, u8 tver) +{ + struct rxe_bth *bth = arg; + + bth->flags = (BTH_TVER_MASK & tver) | + (~BTH_TVER_MASK & bth->flags); +} + +static inline u16 __bth_pkey(void *arg) +{ + struct rxe_bth *bth = arg; + + return be16_to_cpu(bth->pkey); +} + +static inline void __bth_set_pkey(void *arg, u16 pkey) +{ + struct rxe_bth *bth = arg; + + bth->pkey = cpu_to_be16(pkey); +} + +static inline u32 __bth_qpn(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_QPN_MASK & be32_to_cpu(bth->qpn); +} + +static inline void __bth_set_qpn(void *arg, u32 qpn) +{ + struct rxe_bth *bth = arg; + u32 resvqpn = be32_to_cpu(bth->qpn); + + bth->qpn = cpu_to_be32((BTH_QPN_MASK & qpn) | + (~BTH_QPN_MASK & resvqpn)); +} + +static inline int __bth_fecn(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_FECN_MASK) & bth->qpn); +} + +static inline void __bth_set_fecn(void *arg, int fecn) +{ + struct rxe_bth *bth = arg; + + if (fecn) + bth->qpn |= cpu_to_be32(BTH_FECN_MASK); + else + bth->qpn &= ~cpu_to_be32(BTH_FECN_MASK); +} + +static inline int __bth_becn(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_BECN_MASK) & bth->qpn); +} + +static inline void __bth_set_becn(void *arg, int becn) +{ + struct rxe_bth *bth = arg; + + if (becn) + bth->qpn |= cpu_to_be32(BTH_BECN_MASK); + else + bth->qpn &= ~cpu_to_be32(BTH_BECN_MASK); +} + +static inline u8 __bth_resv6a(void *arg) +{ + struct rxe_bth *bth = arg; + + return (BTH_RESV6A_MASK & be32_to_cpu(bth->qpn)) >> 24; +} + +static inline void __bth_set_resv6a(void *arg) +{ + struct rxe_bth *bth = arg; + + bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK); +} + +static inline int __bth_ack(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_ACK_MASK) & bth->apsn); +} + +static inline void __bth_set_ack(void *arg, int ack) +{ + struct rxe_bth *bth = arg; + + if (ack) + bth->apsn |= cpu_to_be32(BTH_ACK_MASK); + else + bth->apsn &= ~cpu_to_be32(BTH_ACK_MASK); +} + +static inline void __bth_set_resv7(void *arg) +{ + struct rxe_bth *bth = arg; + + bth->apsn &= ~cpu_to_be32(BTH_RESV7_MASK); +} + +static inline u32 __bth_psn(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_PSN_MASK & be32_to_cpu(bth->apsn); +} + +static inline void __bth_set_psn(void *arg, u32 psn) +{ + struct rxe_bth *bth = arg; + u32 apsn = be32_to_cpu(bth->apsn); + + bth->apsn = cpu_to_be32((BTH_PSN_MASK & psn) | + (~BTH_PSN_MASK & apsn)); +} + +static inline u8 bth_opcode(struct rxe_pkt_info *pkt) +{ + return __bth_opcode(pkt->hdr); +} + +static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode) +{ + __bth_set_opcode(pkt->hdr, opcode); +} + +static inline u8 bth_se(struct rxe_pkt_info *pkt) +{ + return __bth_se(pkt->hdr); +} + +static inline void bth_set_se(struct rxe_pkt_info *pkt, int se) +{ + __bth_set_se(pkt->hdr, se); +} + +static inline u8 bth_mig(struct rxe_pkt_info *pkt) +{ + return __bth_mig(pkt->hdr); +} + +static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig) +{ + __bth_set_mig(pkt->hdr, mig); +} + +static inline u8 bth_pad(struct rxe_pkt_info *pkt) +{ + return __bth_pad(pkt->hdr); +} + +static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad) +{ + __bth_set_pad(pkt->hdr, pad); +} + +static inline u8 bth_tver(struct rxe_pkt_info *pkt) +{ + return __bth_tver(pkt->hdr); +} + +static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver) +{ + __bth_set_tver(pkt->hdr, tver); +} + +static inline u16 bth_pkey(struct rxe_pkt_info *pkt) +{ + return __bth_pkey(pkt->hdr); +} + +static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey) +{ + __bth_set_pkey(pkt->hdr, pkey); +} + +static inline u32 bth_qpn(struct rxe_pkt_info *pkt) +{ + return __bth_qpn(pkt->hdr); +} + +static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn) +{ + __bth_set_qpn(pkt->hdr, qpn); +} + +static inline int bth_fecn(struct rxe_pkt_info *pkt) +{ + return __bth_fecn(pkt->hdr); +} + +static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn) +{ + __bth_set_fecn(pkt->hdr, fecn); +} + +static inline int bth_becn(struct rxe_pkt_info *pkt) +{ + return __bth_becn(pkt->hdr); +} + +static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn) +{ + __bth_set_becn(pkt->hdr, becn); +} + +static inline u8 bth_resv6a(struct rxe_pkt_info *pkt) +{ + return __bth_resv6a(pkt->hdr); +} + +static inline void bth_set_resv6a(struct rxe_pkt_info *pkt) +{ + __bth_set_resv6a(pkt->hdr); +} + +static inline int bth_ack(struct rxe_pkt_info *pkt) +{ + return __bth_ack(pkt->hdr); +} + +static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack) +{ + __bth_set_ack(pkt->hdr, ack); +} + +static inline void bth_set_resv7(struct rxe_pkt_info *pkt) +{ + __bth_set_resv7(pkt->hdr); +} + +static inline u32 bth_psn(struct rxe_pkt_info *pkt) +{ + return __bth_psn(pkt->hdr); +} + +static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn) +{ + __bth_set_psn(pkt->hdr, psn); +} + +static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se, + int mig, int pad, u16 pkey, u32 qpn, int ack_req, + u32 psn) +{ + struct rxe_bth *bth = (struct rxe_bth *)(pkt->hdr); + + bth->opcode = opcode; + bth->flags = (pad << 4) & BTH_PAD_MASK; + if (se) + bth->flags |= BTH_SE_MASK; + if (mig) + bth->flags |= BTH_MIG_MASK; + bth->pkey = cpu_to_be16(pkey); + bth->qpn = cpu_to_be32(qpn & BTH_QPN_MASK); + psn &= BTH_PSN_MASK; + if (ack_req) + psn |= BTH_ACK_MASK; + bth->apsn = cpu_to_be32(psn); +} + +/****************************************************************************** + * Reliable Datagram Extended Transport Header + ******************************************************************************/ +struct rxe_rdeth { + __be32 een; +}; + +#define RDETH_EEN_MASK (0x00ffffff) + +static inline u8 __rdeth_een(void *arg) +{ + struct rxe_rdeth *rdeth = arg; + + return RDETH_EEN_MASK & be32_to_cpu(rdeth->een); +} + +static inline void __rdeth_set_een(void *arg, u32 een) +{ + struct rxe_rdeth *rdeth = arg; + + rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een); +} + +static inline u8 rdeth_een(struct rxe_pkt_info *pkt) +{ + return __rdeth_een(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_RDETH]); +} + +static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een) +{ + __rdeth_set_een(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_RDETH], een); +} + +/****************************************************************************** + * Datagram Extended Transport Header + ******************************************************************************/ +struct rxe_deth { + __be32 qkey; + __be32 sqp; +}; + +#define GSI_QKEY (0x80010000) +#define DETH_SQP_MASK (0x00ffffff) + +static inline u32 __deth_qkey(void *arg) +{ + struct rxe_deth *deth = arg; + + return be32_to_cpu(deth->qkey); +} + +static inline void __deth_set_qkey(void *arg, u32 qkey) +{ + struct rxe_deth *deth = arg; + + deth->qkey = cpu_to_be32(qkey); +} + +static inline u32 __deth_sqp(void *arg) +{ + struct rxe_deth *deth = arg; + + return DETH_SQP_MASK & be32_to_cpu(deth->sqp); +} + +static inline void __deth_set_sqp(void *arg, u32 sqp) +{ + struct rxe_deth *deth = arg; + + deth->sqp = cpu_to_be32(DETH_SQP_MASK & sqp); +} + +static inline u32 deth_qkey(struct rxe_pkt_info *pkt) +{ + return __deth_qkey(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_DETH]); +} + +static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey) +{ + __deth_set_qkey(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_DETH], qkey); +} + +static inline u32 deth_sqp(struct rxe_pkt_info *pkt) +{ + return __deth_sqp(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_DETH]); +} + +static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp) +{ + __deth_set_sqp(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_DETH], sqp); +} + +/****************************************************************************** + * RDMA Extended Transport Header + ******************************************************************************/ +struct rxe_reth { + __be64 va; + __be32 rkey; + __be32 len; +}; + +static inline u64 __reth_va(void *arg) +{ + struct rxe_reth *reth = arg; + + return be64_to_cpu(reth->va); +} + +static inline void __reth_set_va(void *arg, u64 va) +{ + struct rxe_reth *reth = arg; + + reth->va = cpu_to_be64(va); +} + +static inline u32 __reth_rkey(void *arg) +{ + struct rxe_reth *reth = arg; + + return be32_to_cpu(reth->rkey); +} + +static inline void __reth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_reth *reth = arg; + + reth->rkey = cpu_to_be32(rkey); +} + +static inline u32 __reth_len(void *arg) +{ + struct rxe_reth *reth = arg; + + return be32_to_cpu(reth->len); +} + +static inline void __reth_set_len(void *arg, u32 len) +{ + struct rxe_reth *reth = arg; + + reth->len = cpu_to_be32(len); +} + +static inline u64 reth_va(struct rxe_pkt_info *pkt) +{ + return __reth_va(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va) +{ + __reth_set_va(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_RETH], va); +} + +static inline u32 reth_rkey(struct rxe_pkt_info *pkt) +{ + return __reth_rkey(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __reth_set_rkey(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_RETH], rkey); +} + +static inline u32 reth_len(struct rxe_pkt_info *pkt) +{ + return __reth_len(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len) +{ + __reth_set_len(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_RETH], len); +} + +/****************************************************************************** + * FLUSH Extended Transport Header + ******************************************************************************/ + +struct rxe_feth { + __be32 bits; +}; + +#define FETH_PLT_MASK (0x0000000f) /* bits 3-0 */ +#define FETH_SEL_MASK (0x00000030) /* bits 5-4 */ +#define FETH_SEL_SHIFT (4U) + +static inline u32 __feth_plt(void *arg) +{ + struct rxe_feth *feth = arg; + + return be32_to_cpu(feth->bits) & FETH_PLT_MASK; +} + +static inline u32 __feth_sel(void *arg) +{ + struct rxe_feth *feth = arg; + + return (be32_to_cpu(feth->bits) & FETH_SEL_MASK) >> FETH_SEL_SHIFT; +} + +static inline u32 feth_plt(struct rxe_pkt_info *pkt) +{ + return __feth_plt(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); +} + +static inline u32 feth_sel(struct rxe_pkt_info *pkt) +{ + return __feth_sel(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); +} + +static inline void feth_init(struct rxe_pkt_info *pkt, u8 type, u8 level) +{ + struct rxe_feth *feth = (struct rxe_feth *) + (pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]); + u32 bits = ((level << FETH_SEL_SHIFT) & FETH_SEL_MASK) | + (type & FETH_PLT_MASK); + + feth->bits = cpu_to_be32(bits); +} + +/****************************************************************************** + * Atomic Extended Transport Header + ******************************************************************************/ +struct rxe_atmeth { + __be64 va; + __be32 rkey; + __be64 swap_add; + __be64 comp; +} __packed; + +static inline u64 __atmeth_va(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->va); +} + +static inline void __atmeth_set_va(void *arg, u64 va) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->va = cpu_to_be64(va); +} + +static inline u32 __atmeth_rkey(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be32_to_cpu(atmeth->rkey); +} + +static inline void __atmeth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->rkey = cpu_to_be32(rkey); +} + +static inline u64 __atmeth_swap_add(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->swap_add); +} + +static inline void __atmeth_set_swap_add(void *arg, u64 swap_add) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->swap_add = cpu_to_be64(swap_add); +} + +static inline u64 __atmeth_comp(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->comp); +} + +static inline void __atmeth_set_comp(void *arg, u64 comp) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->comp = cpu_to_be64(comp); +} + +static inline u64 atmeth_va(struct rxe_pkt_info *pkt) +{ + return __atmeth_va(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va) +{ + __atmeth_set_va(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], va); +} + +static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt) +{ + return __atmeth_rkey(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __atmeth_set_rkey(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], rkey); +} + +static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt) +{ + return __atmeth_swap_add(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add) +{ + __atmeth_set_swap_add(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], swap_add); +} + +static inline u64 atmeth_comp(struct rxe_pkt_info *pkt) +{ + return __atmeth_comp(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp) +{ + __atmeth_set_comp(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], comp); +} + +/****************************************************************************** + * Ack Extended Transport Header + ******************************************************************************/ +struct rxe_aeth { + __be32 smsn; +}; + +#define AETH_SYN_MASK (0xff000000) +#define AETH_MSN_MASK (0x00ffffff) + +enum aeth_syndrome { + AETH_TYPE_MASK = 0xe0, + AETH_ACK = 0x00, + AETH_RNR_NAK = 0x20, + AETH_RSVD = 0x40, + AETH_NAK = 0x60, + AETH_ACK_UNLIMITED = 0x1f, + AETH_NAK_PSN_SEQ_ERROR = 0x60, + AETH_NAK_INVALID_REQ = 0x61, + AETH_NAK_REM_ACC_ERR = 0x62, + AETH_NAK_REM_OP_ERR = 0x63, +}; + +static inline u8 __aeth_syn(void *arg) +{ + struct rxe_aeth *aeth = arg; + + return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24; +} + +static inline void __aeth_set_syn(void *arg, u8 syn) +{ + struct rxe_aeth *aeth = arg; + u32 smsn = be32_to_cpu(aeth->smsn); + + aeth->smsn = cpu_to_be32((AETH_SYN_MASK & (syn << 24)) | + (~AETH_SYN_MASK & smsn)); +} + +static inline u32 __aeth_msn(void *arg) +{ + struct rxe_aeth *aeth = arg; + + return AETH_MSN_MASK & be32_to_cpu(aeth->smsn); +} + +static inline void __aeth_set_msn(void *arg, u32 msn) +{ + struct rxe_aeth *aeth = arg; + u32 smsn = be32_to_cpu(aeth->smsn); + + aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) | + (~AETH_MSN_MASK & smsn)); +} + +static inline u8 aeth_syn(struct rxe_pkt_info *pkt) +{ + return __aeth_syn(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_AETH]); +} + +static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn) +{ + __aeth_set_syn(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_AETH], syn); +} + +static inline u32 aeth_msn(struct rxe_pkt_info *pkt) +{ + return __aeth_msn(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_AETH]); +} + +static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn) +{ + __aeth_set_msn(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_AETH], msn); +} + +/****************************************************************************** + * Atomic Ack Extended Transport Header + ******************************************************************************/ +struct rxe_atmack { + __be64 orig; +}; + +static inline u64 __atmack_orig(void *arg) +{ + struct rxe_atmack *atmack = arg; + + return be64_to_cpu(atmack->orig); +} + +static inline void __atmack_set_orig(void *arg, u64 orig) +{ + struct rxe_atmack *atmack = arg; + + atmack->orig = cpu_to_be64(orig); +} + +static inline u64 atmack_orig(struct rxe_pkt_info *pkt) +{ + return __atmack_orig(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_ATMACK]); +} + +static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig) +{ + __atmack_set_orig(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig); +} + +/****************************************************************************** + * Immediate Extended Transport Header + ******************************************************************************/ +struct rxe_immdt { + __be32 imm; +}; + +static inline __be32 __immdt_imm(void *arg) +{ + struct rxe_immdt *immdt = arg; + + return immdt->imm; +} + +static inline void __immdt_set_imm(void *arg, __be32 imm) +{ + struct rxe_immdt *immdt = arg; + + immdt->imm = imm; +} + +static inline __be32 immdt_imm(struct rxe_pkt_info *pkt) +{ + return __immdt_imm(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_IMMDT]); +} + +static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm) +{ + __immdt_set_imm(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm); +} + +/****************************************************************************** + * Invalidate Extended Transport Header + ******************************************************************************/ +struct rxe_ieth { + __be32 rkey; +}; + +static inline u32 __ieth_rkey(void *arg) +{ + struct rxe_ieth *ieth = arg; + + return be32_to_cpu(ieth->rkey); +} + +static inline void __ieth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_ieth *ieth = arg; + + ieth->rkey = cpu_to_be32(rkey); +} + +static inline u32 ieth_rkey(struct rxe_pkt_info *pkt) +{ + return __ieth_rkey(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_IETH]); +} + +static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __ieth_set_rkey(pkt->hdr + + rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey); +} + +enum rxe_hdr_length { + RXE_BTH_BYTES = sizeof(struct rxe_bth), + RXE_DETH_BYTES = sizeof(struct rxe_deth), + RXE_IMMDT_BYTES = sizeof(struct rxe_immdt), + RXE_RETH_BYTES = sizeof(struct rxe_reth), + RXE_AETH_BYTES = sizeof(struct rxe_aeth), + RXE_ATMACK_BYTES = sizeof(struct rxe_atmack), + RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth), + RXE_IETH_BYTES = sizeof(struct rxe_ieth), + RXE_RDETH_BYTES = sizeof(struct rxe_rdeth), + RXE_FETH_BYTES = sizeof(struct rxe_feth), +}; + +static inline size_t header_size(struct rxe_pkt_info *pkt) +{ + return rxe_opcode[pkt->opcode].length; +} + +static inline void *payload_addr(struct rxe_pkt_info *pkt) +{ + return pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]; +} + +static inline size_t payload_size(struct rxe_pkt_info *pkt) +{ + return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD] + - bth_pad(pkt) - RXE_ICRC_SIZE; +} + +#endif /* RXE_HDR_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c new file mode 100644 index 0000000000..a012522b57 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + */ + +#include "rxe.h" +#include "rxe_hw_counters.h" + +static const struct rdma_stat_desc rxe_counter_descs[] = { + [RXE_CNT_SENT_PKTS].name = "sent_pkts", + [RXE_CNT_RCVD_PKTS].name = "rcvd_pkts", + [RXE_CNT_DUP_REQ].name = "duplicate_request", + [RXE_CNT_OUT_OF_SEQ_REQ].name = "out_of_seq_request", + [RXE_CNT_RCV_RNR].name = "rcvd_rnr_err", + [RXE_CNT_SND_RNR].name = "send_rnr_err", + [RXE_CNT_RCV_SEQ_ERR].name = "rcvd_seq_err", + [RXE_CNT_COMPLETER_SCHED].name = "ack_deferred", + [RXE_CNT_RETRY_EXCEEDED].name = "retry_exceeded_err", + [RXE_CNT_RNR_RETRY_EXCEEDED].name = "retry_rnr_exceeded_err", + [RXE_CNT_COMP_RETRY].name = "completer_retry_err", + [RXE_CNT_SEND_ERR].name = "send_err", + [RXE_CNT_LINK_DOWNED].name = "link_downed", + [RXE_CNT_RDMA_SEND].name = "rdma_sends", + [RXE_CNT_RDMA_RECV].name = "rdma_recvs", +}; + +int rxe_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u32 port, int index) +{ + struct rxe_dev *dev = to_rdev(ibdev); + unsigned int cnt; + + if (!port || !stats) + return -EINVAL; + + for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_descs); cnt++) + stats->value[cnt] = atomic64_read(&dev->stats_counters[cnt]); + + return ARRAY_SIZE(rxe_counter_descs); +} + +struct rdma_hw_stats *rxe_ib_alloc_hw_port_stats(struct ib_device *ibdev, + u32 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(rxe_counter_descs) != RXE_NUM_OF_COUNTERS); + + return rdma_alloc_hw_stats_struct(rxe_counter_descs, + ARRAY_SIZE(rxe_counter_descs), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.h b/drivers/infiniband/sw/rxe/rxe_hw_counters.h new file mode 100644 index 0000000000..71f4d4fa9d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + */ + +#ifndef RXE_HW_COUNTERS_H +#define RXE_HW_COUNTERS_H + +/* + * when adding counters to enum also add + * them to rxe_counter_name[] vector. + */ +enum rxe_counters { + RXE_CNT_SENT_PKTS, + RXE_CNT_RCVD_PKTS, + RXE_CNT_DUP_REQ, + RXE_CNT_OUT_OF_SEQ_REQ, + RXE_CNT_RCV_RNR, + RXE_CNT_SND_RNR, + RXE_CNT_RCV_SEQ_ERR, + RXE_CNT_COMPLETER_SCHED, + RXE_CNT_RETRY_EXCEEDED, + RXE_CNT_RNR_RETRY_EXCEEDED, + RXE_CNT_COMP_RETRY, + RXE_CNT_SEND_ERR, + RXE_CNT_LINK_DOWNED, + RXE_CNT_RDMA_SEND, + RXE_CNT_RDMA_RECV, + RXE_NUM_OF_COUNTERS +}; + +struct rdma_hw_stats *rxe_ib_alloc_hw_port_stats(struct ib_device *ibdev, + u32 port_num); +int rxe_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u32 port, int index); +#endif /* RXE_HW_COUNTERS_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c new file mode 100644 index 0000000000..fdf5f08cd8 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/crc32.h> + +#include "rxe.h" +#include "rxe_loc.h" + +/** + * rxe_icrc_init() - Initialize crypto function for computing crc32 + * @rxe: rdma_rxe device object + * + * Return: 0 on success else an error + */ +int rxe_icrc_init(struct rxe_dev *rxe) +{ + struct crypto_shash *tfm; + + tfm = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(tfm)) { + rxe_dbg_dev(rxe, "failed to init crc32 algorithm err: %ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + + rxe->tfm = tfm; + + return 0; +} + +/** + * rxe_crc32() - Compute cumulative crc32 for a contiguous segment + * @rxe: rdma_rxe device object + * @crc: starting crc32 value from previous segments + * @next: starting address of current segment + * @len: length of current segment + * + * Return: the cumulative crc32 checksum + */ +static __be32 rxe_crc32(struct rxe_dev *rxe, __be32 crc, void *next, size_t len) +{ + __be32 icrc; + int err; + + SHASH_DESC_ON_STACK(shash, rxe->tfm); + + shash->tfm = rxe->tfm; + *(__be32 *)shash_desc_ctx(shash) = crc; + err = crypto_shash_update(shash, next, len); + if (unlikely(err)) { + rxe_dbg_dev(rxe, "failed crc calculation, err: %d\n", err); + return (__force __be32)crc32_le((__force u32)crc, next, len); + } + + icrc = *(__be32 *)shash_desc_ctx(shash); + barrier_data(shash_desc_ctx(shash)); + + return icrc; +} + +/** + * rxe_icrc_hdr() - Compute the partial ICRC for the network and transport + * headers of a packet. + * @skb: packet buffer + * @pkt: packet information + * + * Return: the partial ICRC + */ +static __be32 rxe_icrc_hdr(struct sk_buff *skb, struct rxe_pkt_info *pkt) +{ + unsigned int bth_offset = 0; + struct iphdr *ip4h = NULL; + struct ipv6hdr *ip6h = NULL; + struct udphdr *udph; + struct rxe_bth *bth; + __be32 crc; + int length; + int hdr_size = sizeof(struct udphdr) + + (skb->protocol == htons(ETH_P_IP) ? + sizeof(struct iphdr) : sizeof(struct ipv6hdr)); + /* pseudo header buffer size is calculate using ipv6 header size since + * it is bigger than ipv4 + */ + u8 pshdr[sizeof(struct udphdr) + + sizeof(struct ipv6hdr) + + RXE_BTH_BYTES]; + + /* This seed is the result of computing a CRC with a seed of + * 0xfffffff and 8 bytes of 0xff representing a masked LRH. + */ + crc = (__force __be32)0xdebb20e3; + + if (skb->protocol == htons(ETH_P_IP)) { /* IPv4 */ + memcpy(pshdr, ip_hdr(skb), hdr_size); + ip4h = (struct iphdr *)pshdr; + udph = (struct udphdr *)(ip4h + 1); + + ip4h->ttl = 0xff; + ip4h->check = CSUM_MANGLED_0; + ip4h->tos = 0xff; + } else { /* IPv6 */ + memcpy(pshdr, ipv6_hdr(skb), hdr_size); + ip6h = (struct ipv6hdr *)pshdr; + udph = (struct udphdr *)(ip6h + 1); + + memset(ip6h->flow_lbl, 0xff, sizeof(ip6h->flow_lbl)); + ip6h->priority = 0xf; + ip6h->hop_limit = 0xff; + } + udph->check = CSUM_MANGLED_0; + + bth_offset += hdr_size; + + memcpy(&pshdr[bth_offset], pkt->hdr, RXE_BTH_BYTES); + bth = (struct rxe_bth *)&pshdr[bth_offset]; + + /* exclude bth.resv8a */ + bth->qpn |= cpu_to_be32(~BTH_QPN_MASK); + + length = hdr_size + RXE_BTH_BYTES; + crc = rxe_crc32(pkt->rxe, crc, pshdr, length); + + /* And finish to compute the CRC on the remainder of the headers. */ + crc = rxe_crc32(pkt->rxe, crc, pkt->hdr + RXE_BTH_BYTES, + rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES); + return crc; +} + +/** + * rxe_icrc_check() - Compute ICRC for a packet and compare to the ICRC + * delivered in the packet. + * @skb: packet buffer + * @pkt: packet information + * + * Return: 0 if the values match else an error + */ +int rxe_icrc_check(struct sk_buff *skb, struct rxe_pkt_info *pkt) +{ + __be32 *icrcp; + __be32 pkt_icrc; + __be32 icrc; + + icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE); + pkt_icrc = *icrcp; + + icrc = rxe_icrc_hdr(skb, pkt); + icrc = rxe_crc32(pkt->rxe, icrc, (u8 *)payload_addr(pkt), + payload_size(pkt) + bth_pad(pkt)); + icrc = ~icrc; + + if (unlikely(icrc != pkt_icrc)) + return -EINVAL; + + return 0; +} + +/** + * rxe_icrc_generate() - compute ICRC for a packet. + * @skb: packet buffer + * @pkt: packet information + */ +void rxe_icrc_generate(struct sk_buff *skb, struct rxe_pkt_info *pkt) +{ + __be32 *icrcp; + __be32 icrc; + + icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE); + icrc = rxe_icrc_hdr(skb, pkt); + icrc = rxe_crc32(pkt->rxe, icrc, (u8 *)payload_addr(pkt), + payload_size(pkt) + bth_pad(pkt)); + *icrcp = ~icrc; +} diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h new file mode 100644 index 0000000000..4d2a8ef52c --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -0,0 +1,183 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_LOC_H +#define RXE_LOC_H + +/* rxe_av.c */ +void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av); +int rxe_av_chk_attr(struct rxe_qp *qp, struct rdma_ah_attr *attr); +int rxe_ah_chk_attr(struct rxe_ah *ah, struct rdma_ah_attr *attr); +void rxe_av_from_attr(u8 port_num, struct rxe_av *av, + struct rdma_ah_attr *attr); +void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr); +void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr); +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp); + +/* rxe_cq.c */ +int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, + int cqe, int comp_vector); + +int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, + int comp_vector, struct ib_udata *udata, + struct rxe_create_cq_resp __user *uresp); + +int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, + struct rxe_resize_cq_resp __user *uresp, + struct ib_udata *udata); + +int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited); + +void rxe_cq_cleanup(struct rxe_pool_elem *elem); + +/* rxe_mcast.c */ +struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid); +int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid); +int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid); +void rxe_cleanup_mcg(struct kref *kref); + +/* rxe_mmap.c */ +struct rxe_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + struct kref ref; + void *obj; + + struct mminfo info; +}; + +void rxe_mmap_release(struct kref *ref); + +struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev, u32 size, + struct ib_udata *udata, void *obj); + +int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +/* rxe_mr.c */ +u8 rxe_get_next_key(u32 last_key); +void rxe_mr_init_dma(int access, struct rxe_mr *mr); +int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, + int access, struct rxe_mr *mr); +int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr); +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length); +int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir); +int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma, + void *addr, int length, enum rxe_mr_copy_dir dir); +int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset); +int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); +struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, + enum rxe_mr_lookup_type type); +int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length); +int advance_dma_data(struct rxe_dma_info *dma, unsigned int length); +int rxe_invalidate_mr(struct rxe_qp *qp, u32 key); +int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe); +void rxe_mr_cleanup(struct rxe_pool_elem *elem); + +/* rxe_mw.c */ +int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata); +int rxe_dealloc_mw(struct ib_mw *ibmw); +int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe); +int rxe_invalidate_mw(struct rxe_qp *qp, u32 rkey); +struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey); +void rxe_mw_cleanup(struct rxe_pool_elem *elem); + +/* rxe_net.c */ +struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, + int paylen, struct rxe_pkt_info *pkt); +int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt, + struct sk_buff *skb); +int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + struct sk_buff *skb); +const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num); + +/* rxe_qp.c */ +int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init); +int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, + struct ib_qp_init_attr *init, + struct rxe_create_qp_resp __user *uresp, + struct ib_pd *ibpd, struct ib_udata *udata); +int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init); +int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_attr *attr, int mask); +int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata); +int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask); +void rxe_qp_error(struct rxe_qp *qp); +int rxe_qp_chk_destroy(struct rxe_qp *qp); +void rxe_qp_cleanup(struct rxe_pool_elem *elem); + +static inline int qp_num(struct rxe_qp *qp) +{ + return qp->ibqp.qp_num; +} + +static inline enum ib_qp_type qp_type(struct rxe_qp *qp) +{ + return qp->ibqp.qp_type; +} + +static inline enum ib_qp_state qp_state(struct rxe_qp *qp) +{ + return qp->attr.qp_state; +} + +static inline int qp_mtu(struct rxe_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) + return qp->attr.path_mtu; + else + return IB_MTU_4096; +} + +void free_rd_atomic_resource(struct resp_res *res); + +static inline void rxe_advance_resp_resource(struct rxe_qp *qp) +{ + qp->resp.res_head++; + if (unlikely(qp->resp.res_head == qp->attr.max_dest_rd_atomic)) + qp->resp.res_head = 0; +} + +void retransmit_timer(struct timer_list *t); +void rnr_nak_timer(struct timer_list *t); + +/* rxe_srq.c */ +int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init); +int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_init_attr *init, struct ib_udata *udata, + struct rxe_create_srq_resp __user *uresp); +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask); +int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, + struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata); +void rxe_srq_cleanup(struct rxe_pool_elem *elem); + +void rxe_dealloc(struct ib_device *ib_dev); + +int rxe_completer(struct rxe_qp *qp); +int rxe_requester(struct rxe_qp *qp); +int rxe_responder(struct rxe_qp *qp); + +/* rxe_icrc.c */ +int rxe_icrc_init(struct rxe_dev *rxe); +int rxe_icrc_check(struct sk_buff *skb, struct rxe_pkt_info *pkt); +void rxe_icrc_generate(struct sk_buff *skb, struct rxe_pkt_info *pkt); + +void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb); + +void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb); + +static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) +{ + return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type]; +} + +#endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c new file mode 100644 index 0000000000..86cc2e18a7 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -0,0 +1,479 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2022 Hewlett Packard Enterprise, Inc. All rights reserved. + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +/* + * rxe_mcast.c implements driver support for multicast transport. + * It is based on two data structures struct rxe_mcg ('mcg') and + * struct rxe_mca ('mca'). An mcg is allocated each time a qp is + * attached to a new mgid for the first time. These are indexed by + * a red-black tree using the mgid. This data structure is searched + * for the mcg when a multicast packet is received and when another + * qp is attached to the same mgid. It is cleaned up when the last qp + * is detached from the mcg. Each time a qp is attached to an mcg an + * mca is created. It holds a pointer to the qp and is added to a list + * of qp's that are attached to the mcg. The qp_list is used to replicate + * mcast packets in the rxe receive path. + */ + +#include "rxe.h" + +/** + * rxe_mcast_add - add multicast address to rxe device + * @rxe: rxe device object + * @mgid: multicast address as a gid + * + * Returns 0 on success else an error + */ +static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) +{ + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + + return dev_mc_add(rxe->ndev, ll_addr); +} + +/** + * rxe_mcast_del - delete multicast address from rxe device + * @rxe: rxe device object + * @mgid: multicast address as a gid + * + * Returns 0 on success else an error + */ +static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) +{ + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + + return dev_mc_del(rxe->ndev, ll_addr); +} + +/** + * __rxe_insert_mcg - insert an mcg into red-black tree (rxe->mcg_tree) + * @mcg: mcg object with an embedded red-black tree node + * + * Context: caller must hold a reference to mcg and rxe->mcg_lock and + * is responsible to avoid adding the same mcg twice to the tree. + */ +static void __rxe_insert_mcg(struct rxe_mcg *mcg) +{ + struct rb_root *tree = &mcg->rxe->mcg_tree; + struct rb_node **link = &tree->rb_node; + struct rb_node *node = NULL; + struct rxe_mcg *tmp; + int cmp; + + while (*link) { + node = *link; + tmp = rb_entry(node, struct rxe_mcg, node); + + cmp = memcmp(&tmp->mgid, &mcg->mgid, sizeof(mcg->mgid)); + if (cmp > 0) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&mcg->node, node, link); + rb_insert_color(&mcg->node, tree); +} + +/** + * __rxe_remove_mcg - remove an mcg from red-black tree holding lock + * @mcg: mcast group object with an embedded red-black tree node + * + * Context: caller must hold a reference to mcg and rxe->mcg_lock + */ +static void __rxe_remove_mcg(struct rxe_mcg *mcg) +{ + rb_erase(&mcg->node, &mcg->rxe->mcg_tree); +} + +/** + * __rxe_lookup_mcg - lookup mcg in rxe->mcg_tree while holding lock + * @rxe: rxe device object + * @mgid: multicast IP address + * + * Context: caller must hold rxe->mcg_lock + * Returns: mcg on success and takes a ref to mcg else NULL + */ +static struct rxe_mcg *__rxe_lookup_mcg(struct rxe_dev *rxe, + union ib_gid *mgid) +{ + struct rb_root *tree = &rxe->mcg_tree; + struct rxe_mcg *mcg; + struct rb_node *node; + int cmp; + + node = tree->rb_node; + + while (node) { + mcg = rb_entry(node, struct rxe_mcg, node); + + cmp = memcmp(&mcg->mgid, mgid, sizeof(*mgid)); + + if (cmp > 0) + node = node->rb_left; + else if (cmp < 0) + node = node->rb_right; + else + break; + } + + if (node) { + kref_get(&mcg->ref_cnt); + return mcg; + } + + return NULL; +} + +/** + * rxe_lookup_mcg - lookup up mcg in red-back tree + * @rxe: rxe device object + * @mgid: multicast IP address + * + * Returns: mcg if found else NULL + */ +struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid) +{ + struct rxe_mcg *mcg; + + spin_lock_bh(&rxe->mcg_lock); + mcg = __rxe_lookup_mcg(rxe, mgid); + spin_unlock_bh(&rxe->mcg_lock); + + return mcg; +} + +/** + * __rxe_init_mcg - initialize a new mcg + * @rxe: rxe device + * @mgid: multicast address as a gid + * @mcg: new mcg object + * + * Context: caller should hold rxe->mcg lock + */ +static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, + struct rxe_mcg *mcg) +{ + kref_init(&mcg->ref_cnt); + memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid)); + INIT_LIST_HEAD(&mcg->qp_list); + mcg->rxe = rxe; + + /* caller holds a ref on mcg but that will be + * dropped when mcg goes out of scope. We need to take a ref + * on the pointer that will be saved in the red-black tree + * by __rxe_insert_mcg and used to lookup mcg from mgid later. + * Inserting mcg makes it visible to outside so this should + * be done last after the object is ready. + */ + kref_get(&mcg->ref_cnt); + __rxe_insert_mcg(mcg); +} + +/** + * rxe_get_mcg - lookup or allocate a mcg + * @rxe: rxe device object + * @mgid: multicast IP address as a gid + * + * Returns: mcg on success else ERR_PTR(error) + */ +static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) +{ + struct rxe_mcg *mcg, *tmp; + int err; + + if (rxe->attr.max_mcast_grp == 0) + return ERR_PTR(-EINVAL); + + /* check to see if mcg already exists */ + mcg = rxe_lookup_mcg(rxe, mgid); + if (mcg) + return mcg; + + /* check to see if we have reached limit */ + if (atomic_inc_return(&rxe->mcg_num) > rxe->attr.max_mcast_grp) { + err = -ENOMEM; + goto err_dec; + } + + /* speculative alloc of new mcg */ + mcg = kzalloc(sizeof(*mcg), GFP_KERNEL); + if (!mcg) { + err = -ENOMEM; + goto err_dec; + } + + spin_lock_bh(&rxe->mcg_lock); + /* re-check to see if someone else just added it */ + tmp = __rxe_lookup_mcg(rxe, mgid); + if (tmp) { + spin_unlock_bh(&rxe->mcg_lock); + atomic_dec(&rxe->mcg_num); + kfree(mcg); + return tmp; + } + + __rxe_init_mcg(rxe, mgid, mcg); + spin_unlock_bh(&rxe->mcg_lock); + + /* add mcast address outside of lock */ + err = rxe_mcast_add(rxe, mgid); + if (!err) + return mcg; + + kfree(mcg); +err_dec: + atomic_dec(&rxe->mcg_num); + return ERR_PTR(err); +} + +/** + * rxe_cleanup_mcg - cleanup mcg for kref_put + * @kref: struct kref embnedded in mcg + */ +void rxe_cleanup_mcg(struct kref *kref) +{ + struct rxe_mcg *mcg = container_of(kref, typeof(*mcg), ref_cnt); + + kfree(mcg); +} + +/** + * __rxe_destroy_mcg - destroy mcg object holding rxe->mcg_lock + * @mcg: the mcg object + * + * Context: caller is holding rxe->mcg_lock + * no qp's are attached to mcg + */ +static void __rxe_destroy_mcg(struct rxe_mcg *mcg) +{ + struct rxe_dev *rxe = mcg->rxe; + + /* remove mcg from red-black tree then drop ref */ + __rxe_remove_mcg(mcg); + kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); + + atomic_dec(&rxe->mcg_num); +} + +/** + * rxe_destroy_mcg - destroy mcg object + * @mcg: the mcg object + * + * Context: no qp's are attached to mcg + */ +static void rxe_destroy_mcg(struct rxe_mcg *mcg) +{ + /* delete mcast address outside of lock */ + rxe_mcast_del(mcg->rxe, &mcg->mgid); + + spin_lock_bh(&mcg->rxe->mcg_lock); + __rxe_destroy_mcg(mcg); + spin_unlock_bh(&mcg->rxe->mcg_lock); +} + +/** + * __rxe_init_mca - initialize a new mca holding lock + * @qp: qp object + * @mcg: mcg object + * @mca: empty space for new mca + * + * Context: caller must hold references on qp and mcg, rxe->mcg_lock + * and pass memory for new mca + * + * Returns: 0 on success else an error + */ +static int __rxe_init_mca(struct rxe_qp *qp, struct rxe_mcg *mcg, + struct rxe_mca *mca) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int n; + + n = atomic_inc_return(&rxe->mcg_attach); + if (n > rxe->attr.max_total_mcast_qp_attach) { + atomic_dec(&rxe->mcg_attach); + return -ENOMEM; + } + + n = atomic_inc_return(&mcg->qp_num); + if (n > rxe->attr.max_mcast_qp_attach) { + atomic_dec(&mcg->qp_num); + atomic_dec(&rxe->mcg_attach); + return -ENOMEM; + } + + atomic_inc(&qp->mcg_num); + + rxe_get(qp); + mca->qp = qp; + + list_add_tail(&mca->qp_list, &mcg->qp_list); + + return 0; +} + +/** + * rxe_attach_mcg - attach qp to mcg if not already attached + * @qp: qp object + * @mcg: mcg object + * + * Context: caller must hold reference on qp and mcg. + * Returns: 0 on success else an error + */ +static int rxe_attach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp) +{ + struct rxe_dev *rxe = mcg->rxe; + struct rxe_mca *mca, *tmp; + int err; + + /* check to see if the qp is already a member of the group */ + spin_lock_bh(&rxe->mcg_lock); + list_for_each_entry(mca, &mcg->qp_list, qp_list) { + if (mca->qp == qp) { + spin_unlock_bh(&rxe->mcg_lock); + return 0; + } + } + spin_unlock_bh(&rxe->mcg_lock); + + /* speculative alloc new mca without using GFP_ATOMIC */ + mca = kzalloc(sizeof(*mca), GFP_KERNEL); + if (!mca) + return -ENOMEM; + + spin_lock_bh(&rxe->mcg_lock); + /* re-check to see if someone else just attached qp */ + list_for_each_entry(tmp, &mcg->qp_list, qp_list) { + if (tmp->qp == qp) { + kfree(mca); + err = 0; + goto out; + } + } + + err = __rxe_init_mca(qp, mcg, mca); + if (err) + kfree(mca); +out: + spin_unlock_bh(&rxe->mcg_lock); + return err; +} + +/** + * __rxe_cleanup_mca - cleanup mca object holding lock + * @mca: mca object + * @mcg: mcg object + * + * Context: caller must hold a reference to mcg and rxe->mcg_lock + */ +static void __rxe_cleanup_mca(struct rxe_mca *mca, struct rxe_mcg *mcg) +{ + list_del(&mca->qp_list); + + atomic_dec(&mcg->qp_num); + atomic_dec(&mcg->rxe->mcg_attach); + atomic_dec(&mca->qp->mcg_num); + rxe_put(mca->qp); + + kfree(mca); +} + +/** + * rxe_detach_mcg - detach qp from mcg + * @mcg: mcg object + * @qp: qp object + * + * Returns: 0 on success else an error if qp is not attached. + */ +static int rxe_detach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp) +{ + struct rxe_dev *rxe = mcg->rxe; + struct rxe_mca *mca, *tmp; + + spin_lock_bh(&rxe->mcg_lock); + list_for_each_entry_safe(mca, tmp, &mcg->qp_list, qp_list) { + if (mca->qp == qp) { + __rxe_cleanup_mca(mca, mcg); + + /* if the number of qp's attached to the + * mcast group falls to zero go ahead and + * tear it down. This will not free the + * object since we are still holding a ref + * from the caller + */ + if (atomic_read(&mcg->qp_num) <= 0) + __rxe_destroy_mcg(mcg); + + spin_unlock_bh(&rxe->mcg_lock); + return 0; + } + } + + /* we didn't find the qp on the list */ + spin_unlock_bh(&rxe->mcg_lock); + return -EINVAL; +} + +/** + * rxe_attach_mcast - attach qp to multicast group (see IBA-11.3.1) + * @ibqp: (IB) qp object + * @mgid: multicast IP address + * @mlid: multicast LID, ignored for RoCEv2 (see IBA-A17.5.6) + * + * Returns: 0 on success else an errno + */ +int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_mcg *mcg; + + /* takes a ref on mcg if successful */ + mcg = rxe_get_mcg(rxe, mgid); + if (IS_ERR(mcg)) + return PTR_ERR(mcg); + + err = rxe_attach_mcg(mcg, qp); + + /* if we failed to attach the first qp to mcg tear it down */ + if (atomic_read(&mcg->qp_num) == 0) + rxe_destroy_mcg(mcg); + + kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); + + return err; +} + +/** + * rxe_detach_mcast - detach qp from multicast group (see IBA-11.3.2) + * @ibqp: address of (IB) qp object + * @mgid: multicast IP address + * @mlid: multicast LID, ignored for RoCEv2 (see IBA-A17.5.6) + * + * Returns: 0 on success else an errno + */ +int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_mcg *mcg; + int err; + + mcg = rxe_lookup_mcg(rxe, mgid); + if (!mcg) + return -EINVAL; + + err = rxe_detach_mcg(mcg, qp); + kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); + + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c new file mode 100644 index 0000000000..6b7f2bd698 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mmap.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/errno.h> +#include <rdma/uverbs_ioctl.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +void rxe_mmap_release(struct kref *ref) +{ + struct rxe_mmap_info *ip = container_of(ref, + struct rxe_mmap_info, ref); + struct rxe_dev *rxe = to_rdev(ip->context->device); + + spin_lock_bh(&rxe->pending_lock); + + if (!list_empty(&ip->pending_mmaps)) + list_del(&ip->pending_mmaps); + + spin_unlock_bh(&rxe->pending_lock); + + vfree(ip->obj); /* buf */ + kfree(ip); +} + +/* + * open and close keep track of how many times the memory region is mapped, + * to avoid releasing it. + */ +static void rxe_vma_open(struct vm_area_struct *vma) +{ + struct rxe_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void rxe_vma_close(struct vm_area_struct *vma) +{ + struct rxe_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, rxe_mmap_release); +} + +static const struct vm_operations_struct rxe_vm_ops = { + .open = rxe_vma_open, + .close = rxe_vma_close, +}; + +/** + * rxe_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct rxe_dev *rxe = to_rdev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct rxe_mmap_info *ip, *pp; + int ret; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_bh(&rxe->pending_lock); + list_for_each_entry_safe(ip, pp, &rxe->pending_mmaps, pending_mmaps) { + if (context != ip->context || (__u64)offset != ip->info.offset) + continue; + + /* Don't allow a mmap larger than the object. */ + if (size > ip->info.size) { + rxe_dbg_dev(rxe, "mmap region is larger than the object!\n"); + spin_unlock_bh(&rxe->pending_lock); + ret = -EINVAL; + goto done; + } + + goto found_it; + } + rxe_dbg_dev(rxe, "unable to find pending mmap info\n"); + spin_unlock_bh(&rxe->pending_lock); + ret = -EINVAL; + goto done; + +found_it: + list_del_init(&ip->pending_mmaps); + spin_unlock_bh(&rxe->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) { + rxe_dbg_dev(rxe, "err %d from remap_vmalloc_range\n", ret); + goto done; + } + + vma->vm_ops = &rxe_vm_ops; + vma->vm_private_data = ip; + rxe_vma_open(vma); +done: + return ret; +} + +/* + * Allocate information for rxe_mmap + */ +struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe, u32 size, + struct ib_udata *udata, void *obj) +{ + struct rxe_mmap_info *ip; + + if (!udata) + return ERR_PTR(-EINVAL); + + ip = kmalloc(sizeof(*ip), GFP_KERNEL); + if (!ip) + return ERR_PTR(-ENOMEM); + + size = PAGE_ALIGN(size); + + spin_lock_bh(&rxe->mmap_offset_lock); + + if (rxe->mmap_offset == 0) + rxe->mmap_offset = ALIGN(PAGE_SIZE, SHMLBA); + + ip->info.offset = rxe->mmap_offset; + rxe->mmap_offset += ALIGN(size, SHMLBA); + + spin_unlock_bh(&rxe->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->info.size = size; + ip->context = + container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + ip->obj = obj; + kref_init(&ip->ref); + + return ip; +} diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c new file mode 100644 index 0000000000..f54042e9ae --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -0,0 +1,731 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/libnvdimm.h> + +#include "rxe.h" +#include "rxe_loc.h" + +/* Return a random 8 bit key value that is + * different than the last_key. Set last_key to -1 + * if this is the first key for an MR or MW + */ +u8 rxe_get_next_key(u32 last_key) +{ + u8 key; + + do { + get_random_bytes(&key, 1); + } while (key == last_key); + + return key; +} + +int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) +{ + switch (mr->ibmr.type) { + case IB_MR_TYPE_DMA: + return 0; + + case IB_MR_TYPE_USER: + case IB_MR_TYPE_MEM_REG: + if (iova < mr->ibmr.iova || + iova + length > mr->ibmr.iova + mr->ibmr.length) { + rxe_dbg_mr(mr, "iova/length out of range"); + return -EINVAL; + } + return 0; + + default: + rxe_dbg_mr(mr, "mr type not supported\n"); + return -EINVAL; + } +} + +static void rxe_mr_init(int access, struct rxe_mr *mr) +{ + u32 key = mr->elem.index << 8 | rxe_get_next_key(-1); + + /* set ibmr->l/rkey and also copy into private l/rkey + * for user MRs these will always be the same + * for cases where caller 'owns' the key portion + * they may be different until REG_MR WQE is executed. + */ + mr->lkey = mr->ibmr.lkey = key; + mr->rkey = mr->ibmr.rkey = key; + + mr->access = access; + mr->ibmr.page_size = PAGE_SIZE; + mr->page_mask = PAGE_MASK; + mr->page_shift = PAGE_SHIFT; + mr->state = RXE_MR_STATE_INVALID; +} + +void rxe_mr_init_dma(int access, struct rxe_mr *mr) +{ + rxe_mr_init(access, mr); + + mr->state = RXE_MR_STATE_VALID; + mr->ibmr.type = IB_MR_TYPE_DMA; +} + +static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova) +{ + return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift); +} + +static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova) +{ + return iova & (mr_page_size(mr) - 1); +} + +static bool is_pmem_page(struct page *pg) +{ + unsigned long paddr = page_to_phys(pg); + + return REGION_INTERSECTS == + region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM, + IORES_DESC_PERSISTENT_MEMORY); +} + +static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt) +{ + XA_STATE(xas, &mr->page_list, 0); + struct sg_page_iter sg_iter; + struct page *page; + bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); + + __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0); + if (!__sg_page_iter_next(&sg_iter)) + return 0; + + do { + xas_lock(&xas); + while (true) { + page = sg_page_iter_page(&sg_iter); + + if (persistent && !is_pmem_page(page)) { + rxe_dbg_mr(mr, "Page can't be persistent\n"); + xas_set_err(&xas, -EINVAL); + break; + } + + xas_store(&xas, page); + if (xas_error(&xas)) + break; + xas_next(&xas); + if (!__sg_page_iter_next(&sg_iter)) + break; + } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + return xas_error(&xas); +} + +int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, + int access, struct rxe_mr *mr) +{ + struct ib_umem *umem; + int err; + + rxe_mr_init(access, mr); + + xa_init(&mr->page_list); + + umem = ib_umem_get(&rxe->ib_dev, start, length, access); + if (IS_ERR(umem)) { + rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", + (int)PTR_ERR(umem)); + return PTR_ERR(umem); + } + + err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt); + if (err) { + ib_umem_release(umem); + return err; + } + + mr->umem = umem; + mr->ibmr.type = IB_MR_TYPE_USER; + mr->state = RXE_MR_STATE_VALID; + + return 0; +} + +static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) +{ + XA_STATE(xas, &mr->page_list, 0); + int i = 0; + int err; + + xa_init(&mr->page_list); + + do { + xas_lock(&xas); + while (i != num_buf) { + xas_store(&xas, XA_ZERO_ENTRY); + if (xas_error(&xas)) + break; + xas_next(&xas); + i++; + } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + err = xas_error(&xas); + if (err) + return err; + + mr->num_buf = num_buf; + + return 0; +} + +int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) +{ + int err; + + /* always allow remote access for FMRs */ + rxe_mr_init(RXE_ACCESS_REMOTE, mr); + + err = rxe_mr_alloc(mr, max_pages); + if (err) + goto err1; + + mr->state = RXE_MR_STATE_FREE; + mr->ibmr.type = IB_MR_TYPE_MEM_REG; + + return 0; + +err1: + return err; +} + +static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr) +{ + struct rxe_mr *mr = to_rmr(ibmr); + struct page *page = ib_virt_dma_to_page(dma_addr); + bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); + int err; + + if (persistent && !is_pmem_page(page)) { + rxe_dbg_mr(mr, "Page cannot be persistent\n"); + return -EINVAL; + } + + if (unlikely(mr->nbuf == mr->num_buf)) + return -ENOMEM; + + err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL)); + if (err) + return err; + + mr->nbuf++; + return 0; +} + +int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl, + int sg_nents, unsigned int *sg_offset) +{ + struct rxe_mr *mr = to_rmr(ibmr); + unsigned int page_size = mr_page_size(mr); + + mr->nbuf = 0; + mr->page_shift = ilog2(page_size); + mr->page_mask = ~((u64)page_size - 1); + mr->page_offset = mr->ibmr.iova & (page_size - 1); + + return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page); +} + +static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir) +{ + unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); + unsigned long index = rxe_mr_iova_to_index(mr, iova); + unsigned int bytes; + struct page *page; + void *va; + + while (length) { + page = xa_load(&mr->page_list, index); + if (!page) + return -EFAULT; + + bytes = min_t(unsigned int, length, + mr_page_size(mr) - page_offset); + va = kmap_local_page(page); + if (dir == RXE_FROM_MR_OBJ) + memcpy(addr, va + page_offset, bytes); + else + memcpy(va + page_offset, addr, bytes); + kunmap_local(va); + + page_offset = 0; + addr += bytes; + length -= bytes; + index++; + } + + return 0; +} + +static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 dma_addr, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir) +{ + unsigned int page_offset = dma_addr & (PAGE_SIZE - 1); + unsigned int bytes; + struct page *page; + u8 *va; + + while (length) { + page = ib_virt_dma_to_page(dma_addr); + bytes = min_t(unsigned int, length, + PAGE_SIZE - page_offset); + va = kmap_local_page(page); + + if (dir == RXE_TO_MR_OBJ) + memcpy(va + page_offset, addr, bytes); + else + memcpy(addr, va + page_offset, bytes); + + kunmap_local(va); + page_offset = 0; + dma_addr += bytes; + addr += bytes; + length -= bytes; + } +} + +int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir) +{ + int err; + + if (length == 0) + return 0; + + if (WARN_ON(!mr)) + return -EINVAL; + + if (mr->ibmr.type == IB_MR_TYPE_DMA) { + rxe_mr_copy_dma(mr, iova, addr, length, dir); + return 0; + } + + err = mr_check_range(mr, iova, length); + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range"); + return err; + } + + return rxe_mr_copy_xarray(mr, iova, addr, length, dir); +} + +/* copy data in or out of a wqe, i.e. sg list + * under the control of a dma descriptor + */ +int copy_data( + struct rxe_pd *pd, + int access, + struct rxe_dma_info *dma, + void *addr, + int length, + enum rxe_mr_copy_dir dir) +{ + int bytes; + struct rxe_sge *sge = &dma->sge[dma->cur_sge]; + int offset = dma->sge_offset; + int resid = dma->resid; + struct rxe_mr *mr = NULL; + u64 iova; + int err; + + if (length == 0) + return 0; + + if (length > resid) { + err = -EINVAL; + goto err2; + } + + if (sge->length && (offset < sge->length)) { + mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL); + if (!mr) { + err = -EINVAL; + goto err1; + } + } + + while (length > 0) { + bytes = length; + + if (offset >= sge->length) { + if (mr) { + rxe_put(mr); + mr = NULL; + } + sge++; + dma->cur_sge++; + offset = 0; + + if (dma->cur_sge >= dma->num_sge) { + err = -ENOSPC; + goto err2; + } + + if (sge->length) { + mr = lookup_mr(pd, access, sge->lkey, + RXE_LOOKUP_LOCAL); + if (!mr) { + err = -EINVAL; + goto err1; + } + } else { + continue; + } + } + + if (bytes > sge->length - offset) + bytes = sge->length - offset; + + if (bytes > 0) { + iova = sge->addr + offset; + err = rxe_mr_copy(mr, iova, addr, bytes, dir); + if (err) + goto err2; + + offset += bytes; + resid -= bytes; + length -= bytes; + addr += bytes; + } + } + + dma->sge_offset = offset; + dma->resid = resid; + + if (mr) + rxe_put(mr); + + return 0; + +err2: + if (mr) + rxe_put(mr); +err1: + return err; +} + +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) +{ + unsigned int page_offset; + unsigned long index; + struct page *page; + unsigned int bytes; + int err; + u8 *va; + + /* mr must be valid even if length is zero */ + if (WARN_ON(!mr)) + return -EINVAL; + + if (length == 0) + return 0; + + if (mr->ibmr.type == IB_MR_TYPE_DMA) + return -EFAULT; + + err = mr_check_range(mr, iova, length); + if (err) + return err; + + while (length > 0) { + index = rxe_mr_iova_to_index(mr, iova); + page = xa_load(&mr->page_list, index); + page_offset = rxe_mr_iova_to_page_offset(mr, iova); + if (!page) + return -EFAULT; + bytes = min_t(unsigned int, length, + mr_page_size(mr) - page_offset); + + va = kmap_local_page(page); + arch_wb_cache_pmem(va + page_offset, bytes); + kunmap_local(va); + + length -= bytes; + iova += bytes; + page_offset = 0; + } + + return 0; +} + +/* Guarantee atomicity of atomic operations at the machine level. */ +static DEFINE_SPINLOCK(atomic_ops_lock); + +int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) +{ + unsigned int page_offset; + struct page *page; + u64 value; + u64 *va; + + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state"); + return RESPST_ERR_RKEY_VIOLATION; + } + + if (mr->ibmr.type == IB_MR_TYPE_DMA) { + page_offset = iova & (PAGE_SIZE - 1); + page = ib_virt_dma_to_page(iova); + } else { + unsigned long index; + int err; + + err = mr_check_range(mr, iova, sizeof(value)); + if (err) { + rxe_dbg_mr(mr, "iova out of range"); + return RESPST_ERR_RKEY_VIOLATION; + } + page_offset = rxe_mr_iova_to_page_offset(mr, iova); + index = rxe_mr_iova_to_index(mr, iova); + page = xa_load(&mr->page_list, index); + if (!page) + return RESPST_ERR_RKEY_VIOLATION; + } + + if (unlikely(page_offset & 0x7)) { + rxe_dbg_mr(mr, "iova not aligned"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + + spin_lock_bh(&atomic_ops_lock); + value = *orig_val = va[page_offset >> 3]; + + if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { + if (value == compare) + va[page_offset >> 3] = swap_add; + } else { + value += swap_add; + va[page_offset >> 3] = value; + } + spin_unlock_bh(&atomic_ops_lock); + + kunmap_local(va); + + return 0; +} + +#if defined CONFIG_64BIT +/* only implemented or called for 64 bit architectures */ +int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + unsigned int page_offset; + struct page *page; + u64 *va; + + /* See IBA oA19-28 */ + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state"); + return RESPST_ERR_RKEY_VIOLATION; + } + + if (mr->ibmr.type == IB_MR_TYPE_DMA) { + page_offset = iova & (PAGE_SIZE - 1); + page = ib_virt_dma_to_page(iova); + } else { + unsigned long index; + int err; + + /* See IBA oA19-28 */ + err = mr_check_range(mr, iova, sizeof(value)); + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range"); + return RESPST_ERR_RKEY_VIOLATION; + } + page_offset = rxe_mr_iova_to_page_offset(mr, iova); + index = rxe_mr_iova_to_index(mr, iova); + page = xa_load(&mr->page_list, index); + if (!page) + return RESPST_ERR_RKEY_VIOLATION; + } + + /* See IBA A19.4.2 */ + if (unlikely(page_offset & 0x7)) { + rxe_dbg_mr(mr, "misaligned address"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + + /* Do atomic write after all prior operations have completed */ + smp_store_release(&va[page_offset >> 3], value); + + kunmap_local(va); + + return 0; +} +#else +int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} +#endif + +int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) +{ + struct rxe_sge *sge = &dma->sge[dma->cur_sge]; + int offset = dma->sge_offset; + int resid = dma->resid; + + while (length) { + unsigned int bytes; + + if (offset >= sge->length) { + sge++; + dma->cur_sge++; + offset = 0; + if (dma->cur_sge >= dma->num_sge) + return -ENOSPC; + } + + bytes = length; + + if (bytes > sge->length - offset) + bytes = sge->length - offset; + + offset += bytes; + resid -= bytes; + length -= bytes; + } + + dma->sge_offset = offset; + dma->resid = resid; + + return 0; +} + +struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, + enum rxe_mr_lookup_type type) +{ + struct rxe_mr *mr; + struct rxe_dev *rxe = to_rdev(pd->ibpd.device); + int index = key >> 8; + + mr = rxe_pool_get_index(&rxe->mr_pool, index); + if (!mr) + return NULL; + + if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) || + (type == RXE_LOOKUP_REMOTE && mr->rkey != key) || + mr_pd(mr) != pd || ((access & mr->access) != access) || + mr->state != RXE_MR_STATE_VALID)) { + rxe_put(mr); + mr = NULL; + } + + return mr; +} + +int rxe_invalidate_mr(struct rxe_qp *qp, u32 key) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_mr *mr; + int remote; + int ret; + + mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8); + if (!mr) { + rxe_dbg_qp(qp, "No MR for key %#x\n", key); + ret = -EINVAL; + goto err; + } + + remote = mr->access & RXE_ACCESS_REMOTE; + if (remote ? (key != mr->rkey) : (key != mr->lkey)) { + rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n", + key, (remote ? mr->rkey : mr->lkey)); + ret = -EINVAL; + goto err_drop_ref; + } + + if (atomic_read(&mr->num_mw) > 0) { + rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n"); + ret = -EINVAL; + goto err_drop_ref; + } + + if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) { + rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type); + ret = -EINVAL; + goto err_drop_ref; + } + + mr->state = RXE_MR_STATE_FREE; + ret = 0; + +err_drop_ref: + rxe_put(mr); +err: + return ret; +} + +/* user can (re)register fast MR by executing a REG_MR WQE. + * user is expected to hold a reference on the ib mr until the + * WQE completes. + * Once a fast MR is created this is the only way to change the + * private keys. It is the responsibility of the user to maintain + * the ib mr keys in sync with rxe mr keys. + */ +int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr); + u32 key = wqe->wr.wr.reg.key; + u32 access = wqe->wr.wr.reg.access; + + /* user can only register MR in free state */ + if (unlikely(mr->state != RXE_MR_STATE_FREE)) { + rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey); + return -EINVAL; + } + + /* user can only register mr with qp in same protection domain */ + if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) { + rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n"); + return -EINVAL; + } + + /* user is only allowed to change key portion of l/rkey */ + if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) { + rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n", + key, mr->lkey); + return -EINVAL; + } + + mr->access = access; + mr->lkey = key; + mr->rkey = key; + mr->ibmr.iova = wqe->wr.wr.reg.mr->iova; + mr->state = RXE_MR_STATE_VALID; + + return 0; +} + +void rxe_mr_cleanup(struct rxe_pool_elem *elem) +{ + struct rxe_mr *mr = container_of(elem, typeof(*mr), elem); + + rxe_put(mr_pd(mr)); + ib_umem_release(mr->umem); + + if (mr->ibmr.type != IB_MR_TYPE_DMA) + xa_destroy(&mr->page_list); +} diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c new file mode 100644 index 0000000000..d9312b5c9d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020 Hewlett Packard Enterprise, Inc. All rights reserved. + */ + +/* + * The rdma_rxe driver supports type 1 or type 2B memory windows. + * Type 1 MWs are created by ibv_alloc_mw() verbs calls and bound by + * ibv_bind_mw() calls. Type 2 MWs are also created by ibv_alloc_mw() + * but bound by bind_mw work requests. The ibv_bind_mw() call is converted + * by libibverbs to a bind_mw work request. + */ + +#include "rxe.h" + +int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) +{ + struct rxe_mw *mw = to_rmw(ibmw); + struct rxe_pd *pd = to_rpd(ibmw->pd); + struct rxe_dev *rxe = to_rdev(ibmw->device); + int ret; + + rxe_get(pd); + + ret = rxe_add_to_pool(&rxe->mw_pool, mw); + if (ret) { + rxe_put(pd); + return ret; + } + + mw->rkey = ibmw->rkey = (mw->elem.index << 8) | rxe_get_next_key(-1); + mw->state = (mw->ibmw.type == IB_MW_TYPE_2) ? + RXE_MW_STATE_FREE : RXE_MW_STATE_VALID; + spin_lock_init(&mw->lock); + + rxe_finalize(mw); + + return 0; +} + +int rxe_dealloc_mw(struct ib_mw *ibmw) +{ + struct rxe_mw *mw = to_rmw(ibmw); + + rxe_cleanup(mw); + + return 0; +} + +static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_mw *mw, struct rxe_mr *mr, int access) +{ + if (mw->ibmw.type == IB_MW_TYPE_1) { + if (unlikely(mw->state != RXE_MW_STATE_VALID)) { + rxe_dbg_mw(mw, + "attempt to bind a type 1 MW not in the valid state\n"); + return -EINVAL; + } + + /* o10-36.2.2 */ + if (unlikely((access & IB_ZERO_BASED))) { + rxe_dbg_mw(mw, "attempt to bind a zero based type 1 MW\n"); + return -EINVAL; + } + } + + if (mw->ibmw.type == IB_MW_TYPE_2) { + /* o10-37.2.30 */ + if (unlikely(mw->state != RXE_MW_STATE_FREE)) { + rxe_dbg_mw(mw, + "attempt to bind a type 2 MW not in the free state\n"); + return -EINVAL; + } + + /* C10-72 */ + if (unlikely(qp->pd != to_rpd(mw->ibmw.pd))) { + rxe_dbg_mw(mw, + "attempt to bind type 2 MW with qp with different PD\n"); + return -EINVAL; + } + + /* o10-37.2.40 */ + if (unlikely(!mr || wqe->wr.wr.mw.length == 0)) { + rxe_dbg_mw(mw, + "attempt to invalidate type 2 MW by binding with NULL or zero length MR\n"); + return -EINVAL; + } + } + + /* remaining checks only apply to a nonzero MR */ + if (!mr) + return 0; + + if (unlikely(mr->access & IB_ZERO_BASED)) { + rxe_dbg_mw(mw, "attempt to bind MW to zero based MR\n"); + return -EINVAL; + } + + /* C10-73 */ + if (unlikely(!(mr->access & IB_ACCESS_MW_BIND))) { + rxe_dbg_mw(mw, + "attempt to bind an MW to an MR without bind access\n"); + return -EINVAL; + } + + /* C10-74 */ + if (unlikely((access & + (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) && + !(mr->access & IB_ACCESS_LOCAL_WRITE))) { + rxe_dbg_mw(mw, + "attempt to bind an Writable MW to an MR without local write access\n"); + return -EINVAL; + } + + /* C10-75 */ + if (access & IB_ZERO_BASED) { + if (unlikely(wqe->wr.wr.mw.length > mr->ibmr.length)) { + rxe_dbg_mw(mw, + "attempt to bind a ZB MW outside of the MR\n"); + return -EINVAL; + } + } else { + if (unlikely((wqe->wr.wr.mw.addr < mr->ibmr.iova) || + ((wqe->wr.wr.mw.addr + wqe->wr.wr.mw.length) > + (mr->ibmr.iova + mr->ibmr.length)))) { + rxe_dbg_mw(mw, + "attempt to bind a VA MW outside of the MR\n"); + return -EINVAL; + } + } + + return 0; +} + +static void rxe_do_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_mw *mw, struct rxe_mr *mr, int access) +{ + u32 key = wqe->wr.wr.mw.rkey & 0xff; + + mw->rkey = (mw->rkey & ~0xff) | key; + mw->access = access; + mw->state = RXE_MW_STATE_VALID; + mw->addr = wqe->wr.wr.mw.addr; + mw->length = wqe->wr.wr.mw.length; + + if (mw->mr) { + rxe_put(mw->mr); + atomic_dec(&mw->mr->num_mw); + mw->mr = NULL; + } + + if (mw->length) { + mw->mr = mr; + atomic_inc(&mr->num_mw); + rxe_get(mr); + } + + if (mw->ibmw.type == IB_MW_TYPE_2) { + rxe_get(qp); + mw->qp = qp; + } +} + +int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + int ret; + struct rxe_mw *mw; + struct rxe_mr *mr; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + u32 mw_rkey = wqe->wr.wr.mw.mw_rkey; + u32 mr_lkey = wqe->wr.wr.mw.mr_lkey; + int access = wqe->wr.wr.mw.access; + + mw = rxe_pool_get_index(&rxe->mw_pool, mw_rkey >> 8); + if (unlikely(!mw)) { + ret = -EINVAL; + goto err; + } + + if (unlikely(mw->rkey != mw_rkey)) { + ret = -EINVAL; + goto err_drop_mw; + } + + if (likely(wqe->wr.wr.mw.length)) { + mr = rxe_pool_get_index(&rxe->mr_pool, mr_lkey >> 8); + if (unlikely(!mr)) { + ret = -EINVAL; + goto err_drop_mw; + } + + if (unlikely(mr->lkey != mr_lkey)) { + ret = -EINVAL; + goto err_drop_mr; + } + } else { + mr = NULL; + } + + if (access & ~RXE_ACCESS_SUPPORTED_MW) { + rxe_err_mw(mw, "access %#x not supported", access); + ret = -EOPNOTSUPP; + goto err_drop_mr; + } + + spin_lock_bh(&mw->lock); + + ret = rxe_check_bind_mw(qp, wqe, mw, mr, access); + if (ret) + goto err_unlock; + + rxe_do_bind_mw(qp, wqe, mw, mr, access); +err_unlock: + spin_unlock_bh(&mw->lock); +err_drop_mr: + if (mr) + rxe_put(mr); +err_drop_mw: + rxe_put(mw); +err: + return ret; +} + +static int rxe_check_invalidate_mw(struct rxe_qp *qp, struct rxe_mw *mw) +{ + if (unlikely(mw->state == RXE_MW_STATE_INVALID)) + return -EINVAL; + + /* o10-37.2.26 */ + if (unlikely(mw->ibmw.type == IB_MW_TYPE_1)) + return -EINVAL; + + return 0; +} + +static void rxe_do_invalidate_mw(struct rxe_mw *mw) +{ + struct rxe_qp *qp; + struct rxe_mr *mr; + + /* valid type 2 MW will always have a QP pointer */ + qp = mw->qp; + mw->qp = NULL; + rxe_put(qp); + + /* valid type 2 MW will always have an MR pointer */ + mr = mw->mr; + mw->mr = NULL; + atomic_dec(&mr->num_mw); + rxe_put(mr); + + mw->access = 0; + mw->addr = 0; + mw->length = 0; + mw->state = RXE_MW_STATE_FREE; +} + +int rxe_invalidate_mw(struct rxe_qp *qp, u32 rkey) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_mw *mw; + int ret; + + mw = rxe_pool_get_index(&rxe->mw_pool, rkey >> 8); + if (!mw) { + ret = -EINVAL; + goto err; + } + + if (rkey != mw->rkey) { + ret = -EINVAL; + goto err_drop_ref; + } + + spin_lock_bh(&mw->lock); + + ret = rxe_check_invalidate_mw(qp, mw); + if (ret) + goto err_unlock; + + rxe_do_invalidate_mw(mw); +err_unlock: + spin_unlock_bh(&mw->lock); +err_drop_ref: + rxe_put(mw); +err: + return ret; +} + +struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_pd *pd = to_rpd(qp->ibqp.pd); + struct rxe_mw *mw; + int index = rkey >> 8; + + mw = rxe_pool_get_index(&rxe->mw_pool, index); + if (!mw) + return NULL; + + if (unlikely((mw->rkey != rkey) || rxe_mw_pd(mw) != pd || + (mw->ibmw.type == IB_MW_TYPE_2 && mw->qp != qp) || + (mw->length == 0) || ((access & mw->access) != access) || + mw->state != RXE_MW_STATE_VALID)) { + rxe_put(mw); + return NULL; + } + + return mw; +} + +void rxe_mw_cleanup(struct rxe_pool_elem *elem) +{ + struct rxe_mw *mw = container_of(elem, typeof(*mw), elem); + struct rxe_pd *pd = to_rpd(mw->ibmw.pd); + + rxe_put(pd); + + if (mw->mr) { + struct rxe_mr *mr = mw->mr; + + mw->mr = NULL; + atomic_dec(&mr->num_mw); + rxe_put(mr); + } + + if (mw->qp) { + struct rxe_qp *qp = mw->qp; + + mw->qp = NULL; + rxe_put(qp); + } + + mw->access = 0; + mw->addr = 0; + mw->length = 0; + mw->state = RXE_MW_STATE_INVALID; +} diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c new file mode 100644 index 0000000000..cd59666158 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -0,0 +1,696 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <linux/if_vlan.h> +#include <net/udp_tunnel.h> +#include <net/sch_generic.h> +#include <linux/netfilter.h> +#include <rdma/ib_addr.h> + +#include "rxe.h" +#include "rxe_net.h" +#include "rxe_loc.h" + +static struct rxe_recv_sockets recv_sockets; + +static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, + struct net_device *ndev, + struct in_addr *saddr, + struct in_addr *daddr) +{ + struct rtable *rt; + struct flowi4 fl = { { 0 } }; + + memset(&fl, 0, sizeof(fl)); + fl.flowi4_oif = ndev->ifindex; + memcpy(&fl.saddr, saddr, sizeof(*saddr)); + memcpy(&fl.daddr, daddr, sizeof(*daddr)); + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(&init_net, &fl); + if (IS_ERR(rt)) { + rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr); + return NULL; + } + + return &rt->dst; +} + +#if IS_ENABLED(CONFIG_IPV6) +static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, + struct net_device *ndev, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + struct dst_entry *ndst; + struct flowi6 fl6 = { { 0 } }; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = ndev->ifindex; + memcpy(&fl6.saddr, saddr, sizeof(*saddr)); + memcpy(&fl6.daddr, daddr, sizeof(*daddr)); + fl6.flowi6_proto = IPPROTO_UDP; + + ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk), + recv_sockets.sk6->sk, &fl6, + NULL); + if (IS_ERR(ndst)) { + rxe_dbg_qp(qp, "no route to %pI6\n", daddr); + return NULL; + } + + if (unlikely(ndst->error)) { + rxe_dbg_qp(qp, "no route to %pI6\n", daddr); + goto put; + } + + return ndst; +put: + dst_release(ndst); + return NULL; +} + +#else + +static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, + struct net_device *ndev, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + return NULL; +} + +#endif + +static struct dst_entry *rxe_find_route(struct net_device *ndev, + struct rxe_qp *qp, + struct rxe_av *av) +{ + struct dst_entry *dst = NULL; + + if (qp_type(qp) == IB_QPT_RC) + dst = sk_dst_get(qp->sk->sk); + + if (!dst || !dst_check(dst, qp->dst_cookie)) { + if (dst) + dst_release(dst); + + if (av->network_type == RXE_NETWORK_TYPE_IPV4) { + struct in_addr *saddr; + struct in_addr *daddr; + + saddr = &av->sgid_addr._sockaddr_in.sin_addr; + daddr = &av->dgid_addr._sockaddr_in.sin_addr; + dst = rxe_find_route4(qp, ndev, saddr, daddr); + } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) { + struct in6_addr *saddr6; + struct in6_addr *daddr6; + + saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; + daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; + dst = rxe_find_route6(qp, ndev, saddr6, daddr6); +#if IS_ENABLED(CONFIG_IPV6) + if (dst) + qp->dst_cookie = + rt6_get_cookie((struct rt6_info *)dst); +#endif + } + + if (dst && (qp_type(qp) == IB_QPT_RC)) { + dst_hold(dst); + sk_dst_set(qp->sk->sk, dst); + } + } + return dst; +} + +static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct udphdr *udph; + struct rxe_dev *rxe; + struct net_device *ndev = skb->dev; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + /* takes a reference on rxe->ib_dev + * drop when skb is freed + */ + rxe = rxe_get_dev_from_net(ndev); + if (!rxe && is_vlan_dev(ndev)) + rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev)); + if (!rxe) + goto drop; + + if (skb_linearize(skb)) { + ib_device_put(&rxe->ib_dev); + goto drop; + } + + udph = udp_hdr(skb); + pkt->rxe = rxe; + pkt->port_num = 1; + pkt->hdr = (u8 *)(udph + 1); + pkt->mask = RXE_GRH_MASK; + pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); + + /* remove udp header */ + skb_pull(skb, sizeof(struct udphdr)); + + rxe_rcv(skb); + + return 0; +drop: + kfree_skb(skb); + + return 0; +} + +static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, + bool ipv6) +{ + int err; + struct socket *sock; + struct udp_port_cfg udp_cfg = { }; + struct udp_tunnel_sock_cfg tnl_cfg = { }; + + if (ipv6) { + udp_cfg.family = AF_INET6; + udp_cfg.ipv6_v6only = 1; + } else { + udp_cfg.family = AF_INET; + } + + udp_cfg.local_udp_port = port; + + /* Create UDP socket */ + err = udp_sock_create(net, &udp_cfg, &sock); + if (err < 0) + return ERR_PTR(err); + + tnl_cfg.encap_type = 1; + tnl_cfg.encap_rcv = rxe_udp_encap_recv; + + /* Setup UDP tunnel */ + setup_udp_tunnel_sock(net, sock, &tnl_cfg); + + return sock; +} + +static void rxe_release_udp_tunnel(struct socket *sk) +{ + if (sk) + udp_tunnel_sock_release(sk); +} + +static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port, + __be16 dst_port) +{ + struct udphdr *udph; + + __skb_push(skb, sizeof(*udph)); + skb_reset_transport_header(skb); + udph = udp_hdr(skb); + + udph->dest = dst_port; + udph->source = src_port; + udph->len = htons(skb->len); + udph->check = 0; +} + +static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb, + __be32 saddr, __be32 daddr, __u8 proto, + __u8 tos, __u8 ttl, __be16 df, bool xnet) +{ + struct iphdr *iph; + + skb_scrub_packet(skb, xnet); + + skb_clear_hash(skb); + skb_dst_set(skb, dst_clone(dst)); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + + iph->version = IPVERSION; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->tot_len = htons(skb->len); + iph->frag_off = df; + iph->protocol = proto; + iph->tos = tos; + iph->daddr = daddr; + iph->saddr = saddr; + iph->ttl = ttl; + __ip_select_ident(dev_net(dst->dev), iph, + skb_shinfo(skb)->gso_segs ?: 1); +} + +static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, + struct in6_addr *saddr, struct in6_addr *daddr, + __u8 proto, __u8 prio, __u8 ttl) +{ + struct ipv6hdr *ip6h; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED + | IPSKB_REROUTED); + skb_dst_set(skb, dst_clone(dst)); + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = proto; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); +} + +static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt, + struct sk_buff *skb) +{ + struct rxe_qp *qp = pkt->qp; + struct dst_entry *dst; + bool xnet = false; + __be16 df = htons(IP_DF); + struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; + struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; + + dst = rxe_find_route(skb->dev, qp, av); + if (!dst) { + rxe_dbg_qp(qp, "Host not reachable\n"); + return -EHOSTUNREACH; + } + + prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), + cpu_to_be16(ROCE_V2_UDP_DPORT)); + + prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, + av->grh.traffic_class, av->grh.hop_limit, df, xnet); + + dst_release(dst); + return 0; +} + +static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt, + struct sk_buff *skb) +{ + struct rxe_qp *qp = pkt->qp; + struct dst_entry *dst; + struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; + struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; + + dst = rxe_find_route(skb->dev, qp, av); + if (!dst) { + rxe_dbg_qp(qp, "Host not reachable\n"); + return -EHOSTUNREACH; + } + + prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), + cpu_to_be16(ROCE_V2_UDP_DPORT)); + + prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, + av->grh.traffic_class, + av->grh.hop_limit); + + dst_release(dst); + return 0; +} + +int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt, + struct sk_buff *skb) +{ + int err = 0; + + if (skb->protocol == htons(ETH_P_IP)) + err = prepare4(av, pkt, skb); + else if (skb->protocol == htons(ETH_P_IPV6)) + err = prepare6(av, pkt, skb); + + if (ether_addr_equal(skb->dev->dev_addr, av->dmac)) + pkt->mask |= RXE_LOOPBACK_MASK; + + return err; +} + +static void rxe_skb_tx_dtor(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct rxe_qp *qp = sk->sk_user_data; + int skb_out = atomic_dec_return(&qp->skb_out); + + if (unlikely(qp->need_req_skb && + skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) + rxe_sched_task(&qp->req.task); + + rxe_put(qp); +} + +static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) +{ + int err; + + skb->destructor = rxe_skb_tx_dtor; + skb->sk = pkt->qp->sk->sk; + + rxe_get(pkt->qp); + atomic_inc(&pkt->qp->skb_out); + + if (skb->protocol == htons(ETH_P_IP)) { + err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); + } else { + rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n", + skb->protocol); + atomic_dec(&pkt->qp->skb_out); + rxe_put(pkt->qp); + kfree_skb(skb); + return -EINVAL; + } + + if (unlikely(net_xmit_eval(err))) { + rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err); + return -EAGAIN; + } + + return 0; +} + +/* fix up a send packet to match the packets + * received from UDP before looping them back + */ +static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) +{ + memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); + + if (skb->protocol == htons(ETH_P_IP)) + skb_pull(skb, sizeof(struct iphdr)); + else + skb_pull(skb, sizeof(struct ipv6hdr)); + + if (WARN_ON(!ib_device_try_get(&pkt->rxe->ib_dev))) { + kfree_skb(skb); + return -EIO; + } + + /* remove udp header */ + skb_pull(skb, sizeof(struct udphdr)); + + rxe_rcv(skb); + + return 0; +} + +int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + struct sk_buff *skb) +{ + int err; + int is_request = pkt->mask & RXE_REQ_MASK; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + if ((is_request && (qp_state(qp) < IB_QPS_RTS)) || + (!is_request && (qp_state(qp) < IB_QPS_RTR))) { + spin_unlock_irqrestore(&qp->state_lock, flags); + rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); + goto drop; + } + spin_unlock_irqrestore(&qp->state_lock, flags); + + rxe_icrc_generate(skb, pkt); + + if (pkt->mask & RXE_LOOPBACK_MASK) + err = rxe_loopback(skb, pkt); + else + err = rxe_send(skb, pkt); + if (err) { + rxe_counter_inc(rxe, RXE_CNT_SEND_ERR); + return err; + } + + if ((qp_type(qp) != IB_QPT_RC) && + (pkt->mask & RXE_END_MASK)) { + pkt->wqe->state = wqe_state_done; + rxe_sched_task(&qp->comp.task); + } + + rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); + goto done; + +drop: + kfree_skb(skb); + err = 0; +done: + return err; +} + +struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, + int paylen, struct rxe_pkt_info *pkt) +{ + unsigned int hdr_len; + struct sk_buff *skb = NULL; + struct net_device *ndev; + const struct ib_gid_attr *attr; + const int port_num = 1; + + attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index); + if (IS_ERR(attr)) + return NULL; + + if (av->network_type == RXE_NETWORK_TYPE_IPV4) + hdr_len = ETH_HLEN + sizeof(struct udphdr) + + sizeof(struct iphdr); + else + hdr_len = ETH_HLEN + sizeof(struct udphdr) + + sizeof(struct ipv6hdr); + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(attr); + if (IS_ERR(ndev)) { + rcu_read_unlock(); + goto out; + } + skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev), + GFP_ATOMIC); + + if (unlikely(!skb)) { + rcu_read_unlock(); + goto out; + } + + skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev)); + + /* FIXME: hold reference to this netdev until life of this skb. */ + skb->dev = ndev; + rcu_read_unlock(); + + if (av->network_type == RXE_NETWORK_TYPE_IPV4) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); + + pkt->rxe = rxe; + pkt->port_num = port_num; + pkt->hdr = skb_put(skb, paylen); + pkt->mask |= RXE_GRH_MASK; + +out: + rdma_put_gid_attr(attr); + return skb; +} + +/* + * this is required by rxe_cfg to match rxe devices in + * /sys/class/infiniband up with their underlying ethernet devices + */ +const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) +{ + return rxe->ndev->name; +} + +int rxe_net_add(const char *ibdev_name, struct net_device *ndev) +{ + int err; + struct rxe_dev *rxe = NULL; + + rxe = ib_alloc_device(rxe_dev, ib_dev); + if (!rxe) + return -ENOMEM; + + rxe->ndev = ndev; + + err = rxe_add(rxe, ndev->mtu, ibdev_name); + if (err) { + ib_dealloc_device(&rxe->ib_dev); + return err; + } + + return 0; +} + +static void rxe_port_event(struct rxe_dev *rxe, + enum ib_event_type event) +{ + struct ib_event ev; + + ev.device = &rxe->ib_dev; + ev.element.port_num = 1; + ev.event = event; + + ib_dispatch_event(&ev); +} + +/* Caller must hold net_info_lock */ +void rxe_port_up(struct rxe_dev *rxe) +{ + struct rxe_port *port; + + port = &rxe->port; + port->attr.state = IB_PORT_ACTIVE; + + rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); + dev_info(&rxe->ib_dev.dev, "set active\n"); +} + +/* Caller must hold net_info_lock */ +void rxe_port_down(struct rxe_dev *rxe) +{ + struct rxe_port *port; + + port = &rxe->port; + port->attr.state = IB_PORT_DOWN; + + rxe_port_event(rxe, IB_EVENT_PORT_ERR); + rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); + dev_info(&rxe->ib_dev.dev, "set down\n"); +} + +void rxe_set_port_state(struct rxe_dev *rxe) +{ + if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) + rxe_port_up(rxe); + else + rxe_port_down(rxe); +} + +static int rxe_notify(struct notifier_block *not_blk, + unsigned long event, + void *arg) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(arg); + struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); + + if (!rxe) + return NOTIFY_OK; + + switch (event) { + case NETDEV_UNREGISTER: + ib_unregister_device_queued(&rxe->ib_dev); + break; + case NETDEV_UP: + rxe_port_up(rxe); + break; + case NETDEV_DOWN: + rxe_port_down(rxe); + break; + case NETDEV_CHANGEMTU: + rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); + rxe_set_mtu(rxe, ndev->mtu); + break; + case NETDEV_CHANGE: + rxe_set_port_state(rxe); + break; + case NETDEV_REBOOT: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + default: + rxe_dbg_dev(rxe, "ignoring netdev event = %ld for %s\n", + event, ndev->name); + break; + } + + ib_device_put(&rxe->ib_dev); + return NOTIFY_OK; +} + +static struct notifier_block rxe_net_notifier = { + .notifier_call = rxe_notify, +}; + +static int rxe_net_ipv4_init(void) +{ + recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, + htons(ROCE_V2_UDP_DPORT), false); + if (IS_ERR(recv_sockets.sk4)) { + recv_sockets.sk4 = NULL; + pr_err("Failed to create IPv4 UDP tunnel\n"); + return -1; + } + + return 0; +} + +static int rxe_net_ipv6_init(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + + recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, + htons(ROCE_V2_UDP_DPORT), true); + if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) { + recv_sockets.sk6 = NULL; + pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n"); + return 0; + } + + if (IS_ERR(recv_sockets.sk6)) { + recv_sockets.sk6 = NULL; + pr_err("Failed to create IPv6 UDP tunnel\n"); + return -1; + } +#endif + return 0; +} + +void rxe_net_exit(void) +{ + rxe_release_udp_tunnel(recv_sockets.sk6); + rxe_release_udp_tunnel(recv_sockets.sk4); + unregister_netdevice_notifier(&rxe_net_notifier); +} + +int rxe_net_init(void) +{ + int err; + + recv_sockets.sk6 = NULL; + + err = rxe_net_ipv4_init(); + if (err) + return err; + err = rxe_net_ipv6_init(); + if (err) + goto err_out; + err = register_netdevice_notifier(&rxe_net_notifier); + if (err) { + pr_err("Failed to register netdev notifier\n"); + goto err_out; + } + return 0; +err_out: + rxe_net_exit(); + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h new file mode 100644 index 0000000000..45d80d00f8 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_NET_H +#define RXE_NET_H + +#include <net/sock.h> +#include <net/if_inet6.h> +#include <linux/module.h> + +struct rxe_recv_sockets { + struct socket *sk4; + struct socket *sk6; +}; + +int rxe_net_add(const char *ibdev_name, struct net_device *ndev); + +int rxe_net_init(void); +void rxe_net_exit(void); + +#endif /* RXE_NET_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c new file mode 100644 index 0000000000..5c0d5c6ffd --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c @@ -0,0 +1,975 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <rdma/ib_pack.h> +#include "rxe_opcode.h" +#include "rxe_hdr.h" + +/* useful information about work request opcodes and pkt opcodes in + * table form + */ +struct rxe_wr_opcode_info rxe_wr_opcode_info[] = { + [IB_WR_RDMA_WRITE] = { + .name = "IB_WR_RDMA_WRITE", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK, + }, + }, + [IB_WR_RDMA_WRITE_WITH_IMM] = { + .name = "IB_WR_RDMA_WRITE_WITH_IMM", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK, + }, + }, + [IB_WR_SEND] = { + .name = "IB_WR_SEND", + .mask = { + [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_SEND_WITH_IMM] = { + .name = "IB_WR_SEND_WITH_IMM", + .mask = { + [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_RDMA_READ] = { + .name = "IB_WR_RDMA_READ", + .mask = { + [IB_QPT_RC] = WR_READ_MASK, + }, + }, + [IB_WR_ATOMIC_CMP_AND_SWP] = { + .name = "IB_WR_ATOMIC_CMP_AND_SWP", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_MASK, + }, + }, + [IB_WR_ATOMIC_FETCH_AND_ADD] = { + .name = "IB_WR_ATOMIC_FETCH_AND_ADD", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_MASK, + }, + }, + [IB_WR_LSO] = { + .name = "IB_WR_LSO", + .mask = { + /* not supported */ + }, + }, + [IB_WR_SEND_WITH_INV] = { + .name = "IB_WR_SEND_WITH_INV", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_RDMA_READ_WITH_INV] = { + .name = "IB_WR_RDMA_READ_WITH_INV", + .mask = { + [IB_QPT_RC] = WR_READ_MASK, + }, + }, + [IB_WR_LOCAL_INV] = { + .name = "IB_WR_LOCAL_INV", + .mask = { + [IB_QPT_RC] = WR_LOCAL_OP_MASK, + }, + }, + [IB_WR_REG_MR] = { + .name = "IB_WR_REG_MR", + .mask = { + [IB_QPT_RC] = WR_LOCAL_OP_MASK, + }, + }, + [IB_WR_BIND_MW] = { + .name = "IB_WR_BIND_MW", + .mask = { + [IB_QPT_RC] = WR_LOCAL_OP_MASK, + [IB_QPT_UC] = WR_LOCAL_OP_MASK, + }, + }, + [IB_WR_FLUSH] = { + .name = "IB_WR_FLUSH", + .mask = { + [IB_QPT_RC] = WR_FLUSH_MASK, + }, + }, + [IB_WR_ATOMIC_WRITE] = { + .name = "IB_WR_ATOMIC_WRITE", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_WRITE_MASK, + }, + }, +}; + +struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { + [IB_OPCODE_RC_SEND_FIRST] = { + .name = "IB_OPCODE_RC_SEND_FIRST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK | + RXE_SEND_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_MIDDLE] = { + .name = "IB_OPCODE_RC_SEND_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK | + RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST] = { + .name = "IB_OPCODE_RC_SEND_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK | + RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY] = { + .name = "IB_OPCODE_RC_SEND_ONLY", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK | + RXE_RWR_MASK | RXE_SEND_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_FIRST", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK | + RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_WRITE_MASK | RXE_START_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | + RXE_REQ_MASK | RXE_WRITE_MASK | + RXE_COMP_MASK | RXE_RWR_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = { + .name = "IB_OPCODE_RC_RDMA_READ_REQUEST", + .mask = RXE_RETH_MASK | RXE_REQ_MASK | RXE_READ_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK | + RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_ACK_MASK | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RC_ACKNOWLEDGE", + .mask = RXE_AETH_MASK | RXE_ACK_MASK | RXE_START_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE", + .mask = RXE_AETH_MASK | RXE_ATMACK_MASK | RXE_ACK_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_ATMACK] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMACK_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_COMPARE_SWAP] = { + .name = "IB_OPCODE_RC_COMPARE_SWAP", + .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_ATMETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES, + } + }, + [IB_OPCODE_RC_FETCH_ADD] = { + .name = "IB_OPCODE_RC_FETCH_ADD", + .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_ATMETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = { + .name = "IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE", + .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IETH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = { + .name = "IB_OPCODE_RC_SEND_ONLY_INV", + .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK | + RXE_END_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_IETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IETH_BYTES, + } + }, + [IB_OPCODE_RC_FLUSH] = { + .name = "IB_OPCODE_RC_FLUSH", + .mask = RXE_FETH_MASK | RXE_RETH_MASK | RXE_FLUSH_MASK | + RXE_START_MASK | RXE_END_MASK | RXE_REQ_MASK, + .length = RXE_BTH_BYTES + RXE_FETH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_FETH] = RXE_BTH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + RXE_FETH_BYTES, + } + }, + [IB_OPCODE_RC_ATOMIC_WRITE] = { + .name = "IB_OPCODE_RC_ATOMIC_WRITE", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_ATOMIC_WRITE_MASK | RXE_START_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + RXE_RETH_BYTES, + } + }, + + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = { + .name = "IB_OPCODE_UC_SEND_FIRST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK | + RXE_SEND_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_MIDDLE] = { + .name = "IB_OPCODE_UC_SEND_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK | + RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_LAST] = { + .name = "IB_OPCODE_UC_SEND_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK | + RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_SEND_ONLY] = { + .name = "IB_OPCODE_UC_SEND_ONLY", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK | + RXE_RWR_MASK | RXE_SEND_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_FIRST", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK | + RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_WRITE_MASK | RXE_START_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | + RXE_REQ_MASK | RXE_WRITE_MASK | + RXE_COMP_MASK | RXE_RWR_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + + /* RD */ + [IB_OPCODE_RD_SEND_FIRST] = { + .name = "IB_OPCODE_RD_SEND_FIRST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK | + RXE_REQ_MASK | RXE_RWR_MASK | RXE_SEND_MASK | + RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_MIDDLE] = { + .name = "IB_OPCODE_RD_SEND_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK | + RXE_REQ_MASK | RXE_SEND_MASK | + RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_LAST] = { + .name = "IB_OPCODE_RD_SEND_LAST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK | + RXE_REQ_MASK | RXE_COMP_MASK | RXE_SEND_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK | + RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_COMP_MASK | RXE_SEND_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_SEND_ONLY] = { + .name = "IB_OPCODE_RD_SEND_ONLY", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK | + RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK | + RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK | + RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_FIRST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK | + RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK | + RXE_REQ_MASK | RXE_WRITE_MASK | + RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_LAST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK | + RXE_REQ_MASK | RXE_WRITE_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK | + RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK | + RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_WRITE_MASK | RXE_START_MASK | + RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK | + RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | + RXE_REQ_MASK | RXE_WRITE_MASK | + RXE_COMP_MASK | RXE_RWR_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES + + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_REQUEST] = { + .name = "IB_OPCODE_RD_RDMA_READ_REQUEST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK | + RXE_REQ_MASK | RXE_READ_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | + RXE_PAYLOAD_MASK | RXE_ACK_MASK | + RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK | + RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK | + RXE_ACK_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK | + RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RD_ACKNOWLEDGE", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ACK_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ATMACK_MASK | + RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMACK] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_COMPARE_SWAP] = { + .name = "RD_COMPARE_SWAP", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK | + RXE_REQ_MASK | RXE_ATOMIC_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES + + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_FETCH_ADD] = { + .name = "IB_OPCODE_RD_FETCH_ADD", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK | + RXE_REQ_MASK | RXE_ATOMIC_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES + + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + } + }, + + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = { + .name = "IB_OPCODE_UD_SEND_ONLY", + .mask = RXE_DETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK | + RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK | + RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_DETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_DETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | + RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK | + RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_DETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + +}; diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h new file mode 100644 index 0000000000..5686b691d6 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_OPCODE_H +#define RXE_OPCODE_H + +/* + * contains header bit mask definitions and header lengths + * declaration of the rxe_opcode_info struct and + * rxe_wr_opcode_info struct + */ + +enum rxe_wr_mask { + WR_INLINE_MASK = BIT(0), + WR_ATOMIC_MASK = BIT(1), + WR_SEND_MASK = BIT(2), + WR_READ_MASK = BIT(3), + WR_WRITE_MASK = BIT(4), + WR_LOCAL_OP_MASK = BIT(5), + WR_FLUSH_MASK = BIT(6), + WR_ATOMIC_WRITE_MASK = BIT(7), + + WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, + WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK, + WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK, +}; + +#define WR_MAX_QPT (8) + +struct rxe_wr_opcode_info { + char *name; + enum rxe_wr_mask mask[WR_MAX_QPT]; +}; + +extern struct rxe_wr_opcode_info rxe_wr_opcode_info[]; + +enum rxe_hdr_type { + RXE_LRH, + RXE_GRH, + RXE_BTH, + RXE_RETH, + RXE_AETH, + RXE_ATMETH, + RXE_ATMACK, + RXE_IETH, + RXE_RDETH, + RXE_DETH, + RXE_IMMDT, + RXE_FETH, + RXE_PAYLOAD, + NUM_HDR_TYPES +}; + +enum rxe_hdr_mask { + RXE_LRH_MASK = BIT(RXE_LRH), + RXE_GRH_MASK = BIT(RXE_GRH), + RXE_BTH_MASK = BIT(RXE_BTH), + RXE_IMMDT_MASK = BIT(RXE_IMMDT), + RXE_RETH_MASK = BIT(RXE_RETH), + RXE_AETH_MASK = BIT(RXE_AETH), + RXE_ATMETH_MASK = BIT(RXE_ATMETH), + RXE_ATMACK_MASK = BIT(RXE_ATMACK), + RXE_IETH_MASK = BIT(RXE_IETH), + RXE_RDETH_MASK = BIT(RXE_RDETH), + RXE_DETH_MASK = BIT(RXE_DETH), + RXE_FETH_MASK = BIT(RXE_FETH), + RXE_PAYLOAD_MASK = BIT(RXE_PAYLOAD), + + RXE_REQ_MASK = BIT(NUM_HDR_TYPES + 0), + RXE_ACK_MASK = BIT(NUM_HDR_TYPES + 1), + RXE_SEND_MASK = BIT(NUM_HDR_TYPES + 2), + RXE_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), + RXE_READ_MASK = BIT(NUM_HDR_TYPES + 4), + RXE_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), + RXE_FLUSH_MASK = BIT(NUM_HDR_TYPES + 6), + + RXE_RWR_MASK = BIT(NUM_HDR_TYPES + 7), + RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 8), + + RXE_START_MASK = BIT(NUM_HDR_TYPES + 9), + RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 10), + RXE_END_MASK = BIT(NUM_HDR_TYPES + 11), + + RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), + + RXE_ATOMIC_WRITE_MASK = BIT(NUM_HDR_TYPES + 14), + + RXE_READ_OR_ATOMIC_MASK = (RXE_READ_MASK | RXE_ATOMIC_MASK), + RXE_WRITE_OR_SEND_MASK = (RXE_WRITE_MASK | RXE_SEND_MASK), + RXE_READ_OR_WRITE_MASK = (RXE_READ_MASK | RXE_WRITE_MASK), + RXE_RDMA_OP_MASK = (RXE_READ_MASK | RXE_WRITE_MASK | + RXE_ATOMIC_WRITE_MASK | RXE_FLUSH_MASK | + RXE_ATOMIC_MASK), +}; + +#define OPCODE_NONE (-1) +#define RXE_NUM_OPCODE 256 + +struct rxe_opcode_info { + char *name; + enum rxe_hdr_mask mask; + int length; + int offset[NUM_HDR_TYPES]; +}; + +extern struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE]; + +#endif /* RXE_OPCODE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h new file mode 100644 index 0000000000..d2f57ead78 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_PARAM_H +#define RXE_PARAM_H + +#include <uapi/rdma/rdma_user_rxe.h> + +#define DEFAULT_MAX_VALUE (1 << 20) + +static inline enum ib_mtu rxe_mtu_int_to_enum(int mtu) +{ + if (mtu < 256) + return 0; + else if (mtu < 512) + return IB_MTU_256; + else if (mtu < 1024) + return IB_MTU_512; + else if (mtu < 2048) + return IB_MTU_1024; + else if (mtu < 4096) + return IB_MTU_2048; + else + return IB_MTU_4096; +} + +/* Find the IB mtu for a given network MTU. */ +static inline enum ib_mtu eth_mtu_int_to_enum(int mtu) +{ + mtu -= RXE_MAX_HDR_LENGTH; + + return rxe_mtu_int_to_enum(mtu); +} + +/* default/initial rxe device parameter settings */ +enum rxe_device_param { + RXE_MAX_MR_SIZE = -1ull, + RXE_PAGE_SIZE_CAP = 0xfffff000, + RXE_MAX_QP_WR = DEFAULT_MAX_VALUE, + RXE_DEVICE_CAP_FLAGS = IB_DEVICE_BAD_PKEY_CNTR + | IB_DEVICE_BAD_QKEY_CNTR + | IB_DEVICE_AUTO_PATH_MIG + | IB_DEVICE_CHANGE_PHY_PORT + | IB_DEVICE_UD_AV_PORT_ENFORCE + | IB_DEVICE_PORT_ACTIVE_EVENT + | IB_DEVICE_SYS_IMAGE_GUID + | IB_DEVICE_RC_RNR_NAK_GEN + | IB_DEVICE_SRQ_RESIZE + | IB_DEVICE_MEM_MGT_EXTENSIONS + | IB_DEVICE_MEM_WINDOW + | IB_DEVICE_FLUSH_GLOBAL + | IB_DEVICE_FLUSH_PERSISTENT +#ifdef CONFIG_64BIT + | IB_DEVICE_MEM_WINDOW_TYPE_2B + | IB_DEVICE_ATOMIC_WRITE, +#else + | IB_DEVICE_MEM_WINDOW_TYPE_2B, +#endif /* CONFIG_64BIT */ + RXE_MAX_SGE = 32, + RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) + + sizeof(struct ib_sge) * RXE_MAX_SGE, + RXE_MAX_INLINE_DATA = RXE_MAX_WQE_SIZE - + sizeof(struct rxe_send_wqe), + RXE_MAX_SGE_RD = 32, + RXE_MAX_CQ = DEFAULT_MAX_VALUE, + RXE_MAX_LOG_CQE = 15, + RXE_MAX_PD = DEFAULT_MAX_VALUE, + RXE_MAX_QP_RD_ATOM = 128, + RXE_MAX_RES_RD_ATOM = 0x3f000, + RXE_MAX_QP_INIT_RD_ATOM = 128, + RXE_MAX_MCAST_GRP = 8192, + RXE_MAX_MCAST_QP_ATTACH = 56, + RXE_MAX_TOT_MCAST_QP_ATTACH = 0x70000, + RXE_MAX_AH = (1<<15) - 1, /* 32Ki - 1 */ + RXE_MIN_AH_INDEX = 1, + RXE_MAX_AH_INDEX = RXE_MAX_AH, + RXE_MAX_SRQ_WR = DEFAULT_MAX_VALUE, + RXE_MIN_SRQ_WR = 1, + RXE_MAX_SRQ_SGE = 27, + RXE_MIN_SRQ_SGE = 1, + RXE_MAX_FMR_PAGE_LIST_LEN = 512, + RXE_MAX_PKEYS = 64, + RXE_LOCAL_CA_ACK_DELAY = 15, + + RXE_MAX_UCONTEXT = DEFAULT_MAX_VALUE, + + RXE_NUM_PORT = 1, + + RXE_MIN_QP_INDEX = 16, + RXE_MAX_QP_INDEX = DEFAULT_MAX_VALUE, + RXE_MAX_QP = DEFAULT_MAX_VALUE - RXE_MIN_QP_INDEX, + + RXE_MIN_SRQ_INDEX = 0x00020001, + RXE_MAX_SRQ_INDEX = DEFAULT_MAX_VALUE, + RXE_MAX_SRQ = DEFAULT_MAX_VALUE - RXE_MIN_SRQ_INDEX, + + RXE_MIN_MR_INDEX = 0x00000001, + RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE >> 1, + RXE_MAX_MR = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX, + RXE_MIN_MW_INDEX = RXE_MAX_MR_INDEX + 1, + RXE_MAX_MW_INDEX = DEFAULT_MAX_VALUE, + RXE_MAX_MW = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX, + + RXE_MAX_PKT_PER_ACK = 64, + + RXE_MAX_UNACKED_PSNS = 128, + + /* Max inflight SKBs per queue pair */ + RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64, + RXE_INFLIGHT_SKBS_PER_QP_LOW = 16, + + /* Max number of interations of each work item + * before yielding the cpu to let other + * work make progress + */ + RXE_MAX_ITERATIONS = 1024, + + /* Delay before calling arbiter timer */ + RXE_NSEC_ARB_TIMER_DELAY = 200, + + /* IBTA v1.4 A3.3.1 VENDOR INFORMATION section */ + RXE_VENDOR_ID = 0XFFFFFF, +}; + +/* default/initial rxe port parameters */ +enum rxe_port_param { + RXE_PORT_GID_TBL_LEN = 1024, + RXE_PORT_PORT_CAP_FLAGS = IB_PORT_CM_SUP, + RXE_PORT_MAX_MSG_SZ = 0x800000, + RXE_PORT_BAD_PKEY_CNTR = 0, + RXE_PORT_QKEY_VIOL_CNTR = 0, + RXE_PORT_LID = 0, + RXE_PORT_SM_LID = 0, + RXE_PORT_SM_SL = 0, + RXE_PORT_LMC = 0, + RXE_PORT_MAX_VL_NUM = 1, + RXE_PORT_SUBNET_TIMEOUT = 0, + RXE_PORT_INIT_TYPE_REPLY = 0, + RXE_PORT_ACTIVE_WIDTH = IB_WIDTH_1X, + RXE_PORT_ACTIVE_SPEED = 1, + RXE_PORT_PKEY_TBL_LEN = 1, + RXE_PORT_PHYS_STATE = IB_PORT_PHYS_STATE_POLLING, + RXE_PORT_SUBNET_PREFIX = 0xfe80000000000000ULL, +}; + +/* default/initial port info parameters */ +enum rxe_port_info_param { + RXE_PORT_INFO_VL_CAP = 4, /* 1-8 */ + RXE_PORT_INFO_MTU_CAP = 5, /* 4096 */ + RXE_PORT_INFO_OPER_VL = 1, /* 1 */ +}; + +#endif /* RXE_PARAM_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c new file mode 100644 index 0000000000..6215c6de3a --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include "rxe.h" + +#define RXE_POOL_TIMEOUT (200) +#define RXE_POOL_ALIGN (16) + +static const struct rxe_type_info { + const char *name; + size_t size; + size_t elem_offset; + void (*cleanup)(struct rxe_pool_elem *elem); + u32 min_index; + u32 max_index; + u32 max_elem; +} rxe_type_info[RXE_NUM_TYPES] = { + [RXE_TYPE_UC] = { + .name = "uc", + .size = sizeof(struct rxe_ucontext), + .elem_offset = offsetof(struct rxe_ucontext, elem), + .min_index = 1, + .max_index = RXE_MAX_UCONTEXT, + .max_elem = RXE_MAX_UCONTEXT, + }, + [RXE_TYPE_PD] = { + .name = "pd", + .size = sizeof(struct rxe_pd), + .elem_offset = offsetof(struct rxe_pd, elem), + .min_index = 1, + .max_index = RXE_MAX_PD, + .max_elem = RXE_MAX_PD, + }, + [RXE_TYPE_AH] = { + .name = "ah", + .size = sizeof(struct rxe_ah), + .elem_offset = offsetof(struct rxe_ah, elem), + .min_index = RXE_MIN_AH_INDEX, + .max_index = RXE_MAX_AH_INDEX, + .max_elem = RXE_MAX_AH, + }, + [RXE_TYPE_SRQ] = { + .name = "srq", + .size = sizeof(struct rxe_srq), + .elem_offset = offsetof(struct rxe_srq, elem), + .cleanup = rxe_srq_cleanup, + .min_index = RXE_MIN_SRQ_INDEX, + .max_index = RXE_MAX_SRQ_INDEX, + .max_elem = RXE_MAX_SRQ, + }, + [RXE_TYPE_QP] = { + .name = "qp", + .size = sizeof(struct rxe_qp), + .elem_offset = offsetof(struct rxe_qp, elem), + .cleanup = rxe_qp_cleanup, + .min_index = RXE_MIN_QP_INDEX, + .max_index = RXE_MAX_QP_INDEX, + .max_elem = RXE_MAX_QP, + }, + [RXE_TYPE_CQ] = { + .name = "cq", + .size = sizeof(struct rxe_cq), + .elem_offset = offsetof(struct rxe_cq, elem), + .cleanup = rxe_cq_cleanup, + .min_index = 1, + .max_index = RXE_MAX_CQ, + .max_elem = RXE_MAX_CQ, + }, + [RXE_TYPE_MR] = { + .name = "mr", + .size = sizeof(struct rxe_mr), + .elem_offset = offsetof(struct rxe_mr, elem), + .cleanup = rxe_mr_cleanup, + .min_index = RXE_MIN_MR_INDEX, + .max_index = RXE_MAX_MR_INDEX, + .max_elem = RXE_MAX_MR, + }, + [RXE_TYPE_MW] = { + .name = "mw", + .size = sizeof(struct rxe_mw), + .elem_offset = offsetof(struct rxe_mw, elem), + .cleanup = rxe_mw_cleanup, + .min_index = RXE_MIN_MW_INDEX, + .max_index = RXE_MAX_MW_INDEX, + .max_elem = RXE_MAX_MW, + }, +}; + +void rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, + enum rxe_elem_type type) +{ + const struct rxe_type_info *info = &rxe_type_info[type]; + + memset(pool, 0, sizeof(*pool)); + + pool->rxe = rxe; + pool->name = info->name; + pool->type = type; + pool->max_elem = info->max_elem; + pool->elem_size = ALIGN(info->size, RXE_POOL_ALIGN); + pool->elem_offset = info->elem_offset; + pool->cleanup = info->cleanup; + + atomic_set(&pool->num_elem, 0); + + xa_init_flags(&pool->xa, XA_FLAGS_ALLOC); + pool->limit.min = info->min_index; + pool->limit.max = info->max_index; +} + +void rxe_pool_cleanup(struct rxe_pool *pool) +{ + WARN_ON(!xa_empty(&pool->xa)); +} + +int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, + bool sleepable) +{ + int err; + gfp_t gfp_flags; + + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) + goto err_cnt; + + elem->pool = pool; + elem->obj = (u8 *)elem - pool->elem_offset; + kref_init(&elem->ref_cnt); + init_completion(&elem->complete); + + /* AH objects are unique in that the create_ah verb + * can be called in atomic context. If the create_ah + * call is not sleepable use GFP_ATOMIC. + */ + gfp_flags = sleepable ? GFP_KERNEL : GFP_ATOMIC; + + if (sleepable) + might_sleep(); + err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit, + &pool->next, gfp_flags); + if (err < 0) + goto err_cnt; + + return 0; + +err_cnt: + atomic_dec(&pool->num_elem); + return -EINVAL; +} + +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) +{ + struct rxe_pool_elem *elem; + struct xarray *xa = &pool->xa; + void *obj; + + rcu_read_lock(); + elem = xa_load(xa, index); + if (elem && kref_get_unless_zero(&elem->ref_cnt)) + obj = elem->obj; + else + obj = NULL; + rcu_read_unlock(); + + return obj; +} + +static void rxe_elem_release(struct kref *kref) +{ + struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt); + + complete(&elem->complete); +} + +int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) +{ + struct rxe_pool *pool = elem->pool; + struct xarray *xa = &pool->xa; + static int timeout = RXE_POOL_TIMEOUT; + int ret, err = 0; + void *xa_ret; + + if (sleepable) + might_sleep(); + + /* erase xarray entry to prevent looking up + * the pool elem from its index + */ + xa_ret = xa_erase(xa, elem->index); + WARN_ON(xa_err(xa_ret)); + + /* if this is the last call to rxe_put complete the + * object. It is safe to touch obj->elem after this since + * it is freed below + */ + __rxe_put(elem); + + /* wait until all references to the object have been + * dropped before final object specific cleanup and + * return to rdma-core + */ + if (sleepable) { + if (!completion_done(&elem->complete) && timeout) { + ret = wait_for_completion_timeout(&elem->complete, + timeout); + + /* Shouldn't happen. There are still references to + * the object but, rather than deadlock, free the + * object or pass back to rdma-core. + */ + if (WARN_ON(!ret)) + err = -EINVAL; + } + } else { + unsigned long until = jiffies + timeout; + + /* AH objects are unique in that the destroy_ah verb + * can be called in atomic context. This delay + * replaces the wait_for_completion call above + * when the destroy_ah call is not sleepable + */ + while (!completion_done(&elem->complete) && + time_before(jiffies, until)) + mdelay(1); + + if (WARN_ON(!completion_done(&elem->complete))) + err = -EINVAL; + } + + if (pool->cleanup) + pool->cleanup(elem); + + atomic_dec(&pool->num_elem); + + return err; +} + +int __rxe_get(struct rxe_pool_elem *elem) +{ + return kref_get_unless_zero(&elem->ref_cnt); +} + +int __rxe_put(struct rxe_pool_elem *elem) +{ + return kref_put(&elem->ref_cnt, rxe_elem_release); +} + +void __rxe_finalize(struct rxe_pool_elem *elem) +{ + void *xa_ret; + + xa_ret = xa_store(&elem->pool->xa, elem->index, elem, GFP_KERNEL); + WARN_ON(xa_err(xa_ret)); +} diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h new file mode 100644 index 0000000000..b42e26427a --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_POOL_H +#define RXE_POOL_H + +enum rxe_elem_type { + RXE_TYPE_UC, + RXE_TYPE_PD, + RXE_TYPE_AH, + RXE_TYPE_SRQ, + RXE_TYPE_QP, + RXE_TYPE_CQ, + RXE_TYPE_MR, + RXE_TYPE_MW, + RXE_NUM_TYPES, /* keep me last */ +}; + +struct rxe_pool_elem { + struct rxe_pool *pool; + void *obj; + struct kref ref_cnt; + struct list_head list; + struct completion complete; + u32 index; +}; + +struct rxe_pool { + struct rxe_dev *rxe; + const char *name; + void (*cleanup)(struct rxe_pool_elem *elem); + enum rxe_elem_type type; + + unsigned int max_elem; + atomic_t num_elem; + size_t elem_size; + size_t elem_offset; + + struct xarray xa; + struct xa_limit limit; + u32 next; +}; + +/* initialize a pool of objects with given limit on + * number of elements. gets parameters from rxe_type_info + * pool elements will be allocated out of a slab cache + */ +void rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, + enum rxe_elem_type type); + +/* free resources from object pool */ +void rxe_pool_cleanup(struct rxe_pool *pool); + +/* connect already allocated object to pool */ +int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, + bool sleepable); +#define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem, true) +#define rxe_add_to_pool_ah(pool, obj, sleepable) __rxe_add_to_pool(pool, \ + &(obj)->elem, sleepable) + +/* lookup an indexed object from index. takes a reference on object */ +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); + +int __rxe_get(struct rxe_pool_elem *elem); +#define rxe_get(obj) __rxe_get(&(obj)->elem) + +int __rxe_put(struct rxe_pool_elem *elem); +#define rxe_put(obj) __rxe_put(&(obj)->elem) + +int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable); +#define rxe_cleanup(obj) __rxe_cleanup(&(obj)->elem, true) +#define rxe_cleanup_ah(obj, sleepable) __rxe_cleanup(&(obj)->elem, sleepable) + +#define rxe_read(obj) kref_read(&(obj)->elem.ref_cnt) + +void __rxe_finalize(struct rxe_pool_elem *elem); +#define rxe_finalize(obj) __rxe_finalize(&(obj)->elem) + +#endif /* RXE_POOL_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c new file mode 100644 index 0000000000..28e379c108 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -0,0 +1,880 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <rdma/uverbs_ioctl.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_task.h" + +static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, + int has_srq) +{ + if (cap->max_send_wr > rxe->attr.max_qp_wr) { + rxe_dbg_dev(rxe, "invalid send wr = %u > %d\n", + cap->max_send_wr, rxe->attr.max_qp_wr); + goto err1; + } + + if (cap->max_send_sge > rxe->attr.max_send_sge) { + rxe_dbg_dev(rxe, "invalid send sge = %u > %d\n", + cap->max_send_sge, rxe->attr.max_send_sge); + goto err1; + } + + if (!has_srq) { + if (cap->max_recv_wr > rxe->attr.max_qp_wr) { + rxe_dbg_dev(rxe, "invalid recv wr = %u > %d\n", + cap->max_recv_wr, rxe->attr.max_qp_wr); + goto err1; + } + + if (cap->max_recv_sge > rxe->attr.max_recv_sge) { + rxe_dbg_dev(rxe, "invalid recv sge = %u > %d\n", + cap->max_recv_sge, rxe->attr.max_recv_sge); + goto err1; + } + } + + if (cap->max_inline_data > rxe->max_inline_data) { + rxe_dbg_dev(rxe, "invalid max inline data = %u > %d\n", + cap->max_inline_data, rxe->max_inline_data); + goto err1; + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) +{ + struct ib_qp_cap *cap = &init->cap; + struct rxe_port *port; + int port_num = init->port_num; + + switch (init->qp_type) { + case IB_QPT_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + break; + default: + return -EOPNOTSUPP; + } + + if (!init->recv_cq || !init->send_cq) { + rxe_dbg_dev(rxe, "missing cq\n"); + goto err1; + } + + if (rxe_qp_chk_cap(rxe, cap, !!init->srq)) + goto err1; + + if (init->qp_type == IB_QPT_GSI) { + if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) { + rxe_dbg_dev(rxe, "invalid port = %d\n", port_num); + goto err1; + } + + port = &rxe->port; + + if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) { + rxe_dbg_dev(rxe, "GSI QP exists for port %d\n", port_num); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n) +{ + qp->resp.res_head = 0; + qp->resp.res_tail = 0; + qp->resp.resources = kcalloc(n, sizeof(struct resp_res), GFP_KERNEL); + + if (!qp->resp.resources) + return -ENOMEM; + + return 0; +} + +static void free_rd_atomic_resources(struct rxe_qp *qp) +{ + if (qp->resp.resources) { + int i; + + for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { + struct resp_res *res = &qp->resp.resources[i]; + + free_rd_atomic_resource(res); + } + kfree(qp->resp.resources); + qp->resp.resources = NULL; + } +} + +void free_rd_atomic_resource(struct resp_res *res) +{ + res->type = 0; +} + +static void cleanup_rd_atomic_resources(struct rxe_qp *qp) +{ + int i; + struct resp_res *res; + + if (qp->resp.resources) { + for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { + res = &qp->resp.resources[i]; + free_rd_atomic_resource(res); + } + } +} + +static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init) +{ + struct rxe_port *port; + u32 qpn; + + qp->sq_sig_type = init->sq_sig_type; + qp->attr.path_mtu = 1; + qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu); + + qpn = qp->elem.index; + port = &rxe->port; + + switch (init->qp_type) { + case IB_QPT_GSI: + qp->ibqp.qp_num = 1; + port->qp_gsi_index = qpn; + qp->attr.port_num = init->port_num; + break; + + default: + qp->ibqp.qp_num = qpn; + break; + } + + spin_lock_init(&qp->state_lock); + + spin_lock_init(&qp->sq.sq_lock); + spin_lock_init(&qp->rq.producer_lock); + spin_lock_init(&qp->rq.consumer_lock); + + skb_queue_head_init(&qp->req_pkts); + skb_queue_head_init(&qp->resp_pkts); + + atomic_set(&qp->ssn, 0); + atomic_set(&qp->skb_out, 0); +} + +static int rxe_init_sq(struct rxe_qp *qp, struct ib_qp_init_attr *init, + struct ib_udata *udata, + struct rxe_create_qp_resp __user *uresp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int wqe_size; + int err; + + qp->sq.max_wr = init->cap.max_send_wr; + wqe_size = max_t(int, init->cap.max_send_sge * sizeof(struct ib_sge), + init->cap.max_inline_data); + qp->sq.max_sge = wqe_size / sizeof(struct ib_sge); + qp->sq.max_inline = wqe_size; + wqe_size += sizeof(struct rxe_send_wqe); + + qp->sq.queue = rxe_queue_init(rxe, &qp->sq.max_wr, wqe_size, + QUEUE_TYPE_FROM_CLIENT); + if (!qp->sq.queue) { + rxe_err_qp(qp, "Unable to allocate send queue"); + err = -ENOMEM; + goto err_out; + } + + /* prepare info for caller to mmap send queue if user space qp */ + err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, udata, + qp->sq.queue->buf, qp->sq.queue->buf_size, + &qp->sq.queue->ip); + if (err) { + rxe_err_qp(qp, "do_mmap_info failed, err = %d", err); + goto err_free; + } + + /* return actual capabilities to caller which may be larger + * than requested + */ + init->cap.max_send_wr = qp->sq.max_wr; + init->cap.max_send_sge = qp->sq.max_sge; + init->cap.max_inline_data = qp->sq.max_inline; + + return 0; + +err_free: + vfree(qp->sq.queue->buf); + kfree(qp->sq.queue); + qp->sq.queue = NULL; +err_out: + return err; +} + +static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init, struct ib_udata *udata, + struct rxe_create_qp_resp __user *uresp) +{ + int err; + + /* if we don't finish qp create make sure queue is valid */ + skb_queue_head_init(&qp->req_pkts); + + err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); + if (err < 0) + return err; + qp->sk->sk->sk_user_data = qp; + + /* pick a source UDP port number for this QP based on + * the source QPN. this spreads traffic for different QPs + * across different NIC RX queues (while using a single + * flow for a given QP to maintain packet order). + * the port number must be in the Dynamic Ports range + * (0xc000 - 0xffff). + */ + qp->src_port = RXE_ROCE_V2_SPORT + (hash_32(qp_num(qp), 14) & 0x3fff); + + err = rxe_init_sq(qp, init, udata, uresp); + if (err) + return err; + + qp->req.wqe_index = queue_get_producer(qp->sq.queue, + QUEUE_TYPE_FROM_CLIENT); + + qp->req.opcode = -1; + qp->comp.opcode = -1; + + rxe_init_task(&qp->req.task, qp, rxe_requester); + rxe_init_task(&qp->comp.task, qp, rxe_completer); + + qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ + if (init->qp_type == IB_QPT_RC) { + timer_setup(&qp->rnr_nak_timer, rnr_nak_timer, 0); + timer_setup(&qp->retrans_timer, retransmit_timer, 0); + } + return 0; +} + +static int rxe_init_rq(struct rxe_qp *qp, struct ib_qp_init_attr *init, + struct ib_udata *udata, + struct rxe_create_qp_resp __user *uresp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int wqe_size; + int err; + + qp->rq.max_wr = init->cap.max_recv_wr; + qp->rq.max_sge = init->cap.max_recv_sge; + wqe_size = sizeof(struct rxe_recv_wqe) + + qp->rq.max_sge*sizeof(struct ib_sge); + + qp->rq.queue = rxe_queue_init(rxe, &qp->rq.max_wr, wqe_size, + QUEUE_TYPE_FROM_CLIENT); + if (!qp->rq.queue) { + rxe_err_qp(qp, "Unable to allocate recv queue"); + err = -ENOMEM; + goto err_out; + } + + /* prepare info for caller to mmap recv queue if user space qp */ + err = do_mmap_info(rxe, uresp ? &uresp->rq_mi : NULL, udata, + qp->rq.queue->buf, qp->rq.queue->buf_size, + &qp->rq.queue->ip); + if (err) { + rxe_err_qp(qp, "do_mmap_info failed, err = %d", err); + goto err_free; + } + + /* return actual capabilities to caller which may be larger + * than requested + */ + init->cap.max_recv_wr = qp->rq.max_wr; + + return 0; + +err_free: + vfree(qp->rq.queue->buf); + kfree(qp->rq.queue); + qp->rq.queue = NULL; +err_out: + return err; +} + +static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init, + struct ib_udata *udata, + struct rxe_create_qp_resp __user *uresp) +{ + int err; + + /* if we don't finish qp create make sure queue is valid */ + skb_queue_head_init(&qp->resp_pkts); + + if (!qp->srq) { + err = rxe_init_rq(qp, init, udata, uresp); + if (err) + return err; + } + + rxe_init_task(&qp->resp.task, qp, rxe_responder); + + qp->resp.opcode = OPCODE_NONE; + qp->resp.msn = 0; + + return 0; +} + +/* called by the create qp verb */ +int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, + struct ib_qp_init_attr *init, + struct rxe_create_qp_resp __user *uresp, + struct ib_pd *ibpd, + struct ib_udata *udata) +{ + int err; + struct rxe_cq *rcq = to_rcq(init->recv_cq); + struct rxe_cq *scq = to_rcq(init->send_cq); + struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; + unsigned long flags; + + rxe_get(pd); + rxe_get(rcq); + rxe_get(scq); + if (srq) + rxe_get(srq); + + qp->pd = pd; + qp->rcq = rcq; + qp->scq = scq; + qp->srq = srq; + + atomic_inc(&rcq->num_wq); + atomic_inc(&scq->num_wq); + + rxe_qp_init_misc(rxe, qp, init); + + err = rxe_qp_init_req(rxe, qp, init, udata, uresp); + if (err) + goto err1; + + err = rxe_qp_init_resp(rxe, qp, init, udata, uresp); + if (err) + goto err2; + + spin_lock_irqsave(&qp->state_lock, flags); + qp->attr.qp_state = IB_QPS_RESET; + qp->valid = 1; + spin_unlock_irqrestore(&qp->state_lock, flags); + + return 0; + +err2: + rxe_queue_cleanup(qp->sq.queue); + qp->sq.queue = NULL; +err1: + atomic_dec(&rcq->num_wq); + atomic_dec(&scq->num_wq); + + qp->pd = NULL; + qp->rcq = NULL; + qp->scq = NULL; + qp->srq = NULL; + + if (srq) + rxe_put(srq); + rxe_put(scq); + rxe_put(rcq); + rxe_put(pd); + + return err; +} + +/* called by the query qp verb */ +int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init) +{ + init->event_handler = qp->ibqp.event_handler; + init->qp_context = qp->ibqp.qp_context; + init->send_cq = qp->ibqp.send_cq; + init->recv_cq = qp->ibqp.recv_cq; + init->srq = qp->ibqp.srq; + + init->cap.max_send_wr = qp->sq.max_wr; + init->cap.max_send_sge = qp->sq.max_sge; + init->cap.max_inline_data = qp->sq.max_inline; + + if (!qp->srq) { + init->cap.max_recv_wr = qp->rq.max_wr; + init->cap.max_recv_sge = qp->rq.max_sge; + } + + init->sq_sig_type = qp->sq_sig_type; + + init->qp_type = qp->ibqp.qp_type; + init->port_num = 1; + + return 0; +} + +int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_attr *attr, int mask) +{ + if (mask & IB_QP_PORT) { + if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) { + rxe_dbg_qp(qp, "invalid port %d\n", attr->port_num); + goto err1; + } + } + + if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) + goto err1; + + if (mask & IB_QP_ACCESS_FLAGS) { + if (!(qp_type(qp) == IB_QPT_RC || qp_type(qp) == IB_QPT_UC)) + goto err1; + if (attr->qp_access_flags & ~RXE_ACCESS_SUPPORTED_QP) + goto err1; + } + + if (mask & IB_QP_AV && rxe_av_chk_attr(qp, &attr->ah_attr)) + goto err1; + + if (mask & IB_QP_ALT_PATH) { + if (rxe_av_chk_attr(qp, &attr->alt_ah_attr)) + goto err1; + if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) { + rxe_dbg_qp(qp, "invalid alt port %d\n", attr->alt_port_num); + goto err1; + } + if (attr->alt_timeout > 31) { + rxe_dbg_qp(qp, "invalid alt timeout %d > 31\n", + attr->alt_timeout); + goto err1; + } + } + + if (mask & IB_QP_PATH_MTU) { + struct rxe_port *port = &rxe->port; + + enum ib_mtu max_mtu = port->attr.max_mtu; + enum ib_mtu mtu = attr->path_mtu; + + if (mtu > max_mtu) { + rxe_dbg_qp(qp, "invalid mtu (%d) > (%d)\n", + ib_mtu_enum_to_int(mtu), + ib_mtu_enum_to_int(max_mtu)); + goto err1; + } + } + + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) { + rxe_dbg_qp(qp, "invalid max_rd_atomic %d > %d\n", + attr->max_rd_atomic, + rxe->attr.max_qp_rd_atom); + goto err1; + } + } + + if (mask & IB_QP_TIMEOUT) { + if (attr->timeout > 31) { + rxe_dbg_qp(qp, "invalid timeout %d > 31\n", + attr->timeout); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +/* move the qp to the reset state */ +static void rxe_qp_reset(struct rxe_qp *qp) +{ + /* stop tasks from running */ + rxe_disable_task(&qp->resp.task); + rxe_disable_task(&qp->comp.task); + rxe_disable_task(&qp->req.task); + + /* drain work and packet queuesc */ + rxe_requester(qp); + rxe_completer(qp); + rxe_responder(qp); + + if (qp->rq.queue) + rxe_queue_reset(qp->rq.queue); + if (qp->sq.queue) + rxe_queue_reset(qp->sq.queue); + + /* cleanup attributes */ + atomic_set(&qp->ssn, 0); + qp->req.opcode = -1; + qp->req.need_retry = 0; + qp->req.wait_for_rnr_timer = 0; + qp->req.noack_pkts = 0; + qp->resp.msn = 0; + qp->resp.opcode = -1; + qp->resp.drop_msg = 0; + qp->resp.goto_error = 0; + qp->resp.sent_psn_nak = 0; + + if (qp->resp.mr) { + rxe_put(qp->resp.mr); + qp->resp.mr = NULL; + } + + cleanup_rd_atomic_resources(qp); + + /* reenable tasks */ + rxe_enable_task(&qp->resp.task); + rxe_enable_task(&qp->comp.task); + rxe_enable_task(&qp->req.task); +} + +/* move the qp to the error state */ +void rxe_qp_error(struct rxe_qp *qp) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + qp->attr.qp_state = IB_QPS_ERR; + + /* drain work and packet queues */ + rxe_sched_task(&qp->resp.task); + rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->req.task); + spin_unlock_irqrestore(&qp->state_lock, flags); +} + +static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr, + int mask) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + qp->attr.sq_draining = 1; + rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->req.task); + spin_unlock_irqrestore(&qp->state_lock, flags); +} + +/* caller should hold qp->state_lock */ +static int __qp_chk_state(struct rxe_qp *qp, struct ib_qp_attr *attr, + int mask) +{ + enum ib_qp_state cur_state; + enum ib_qp_state new_state; + + cur_state = (mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : qp->attr.qp_state; + new_state = (mask & IB_QP_STATE) ? + attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) + return -EINVAL; + + if (mask & IB_QP_STATE && cur_state == IB_QPS_SQD) { + if (qp->attr.sq_draining && new_state != IB_QPS_ERR) + return -EINVAL; + } + + return 0; +} + +static const char *const qps2str[] = { + [IB_QPS_RESET] = "RESET", + [IB_QPS_INIT] = "INIT", + [IB_QPS_RTR] = "RTR", + [IB_QPS_RTS] = "RTS", + [IB_QPS_SQD] = "SQD", + [IB_QPS_SQE] = "SQE", + [IB_QPS_ERR] = "ERR", +}; + +/* called by the modify qp verb */ +int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata) +{ + int err; + + if (mask & IB_QP_CUR_STATE) + qp->attr.cur_qp_state = attr->qp_state; + + if (mask & IB_QP_STATE) { + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + err = __qp_chk_state(qp, attr, mask); + if (!err) { + qp->attr.qp_state = attr->qp_state; + rxe_dbg_qp(qp, "state -> %s\n", + qps2str[attr->qp_state]); + } + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (err) + return err; + + switch (attr->qp_state) { + case IB_QPS_RESET: + rxe_qp_reset(qp); + break; + case IB_QPS_SQD: + rxe_qp_sqd(qp, attr, mask); + break; + case IB_QPS_ERR: + rxe_qp_error(qp); + break; + default: + break; + } + } + + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { + int max_rd_atomic = attr->max_rd_atomic ? + roundup_pow_of_two(attr->max_rd_atomic) : 0; + + qp->attr.max_rd_atomic = max_rd_atomic; + atomic_set(&qp->req.rd_atomic, max_rd_atomic); + } + + if (mask & IB_QP_MAX_DEST_RD_ATOMIC) { + int max_dest_rd_atomic = attr->max_dest_rd_atomic ? + roundup_pow_of_two(attr->max_dest_rd_atomic) : 0; + + qp->attr.max_dest_rd_atomic = max_dest_rd_atomic; + + free_rd_atomic_resources(qp); + + err = alloc_rd_atomic_resources(qp, max_dest_rd_atomic); + if (err) + return err; + } + + if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY) + qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify; + + if (mask & IB_QP_ACCESS_FLAGS) + qp->attr.qp_access_flags = attr->qp_access_flags; + + if (mask & IB_QP_PKEY_INDEX) + qp->attr.pkey_index = attr->pkey_index; + + if (mask & IB_QP_PORT) + qp->attr.port_num = attr->port_num; + + if (mask & IB_QP_QKEY) + qp->attr.qkey = attr->qkey; + + if (mask & IB_QP_AV) + rxe_init_av(&attr->ah_attr, &qp->pri_av); + + if (mask & IB_QP_ALT_PATH) { + rxe_init_av(&attr->alt_ah_attr, &qp->alt_av); + qp->attr.alt_port_num = attr->alt_port_num; + qp->attr.alt_pkey_index = attr->alt_pkey_index; + qp->attr.alt_timeout = attr->alt_timeout; + } + + if (mask & IB_QP_PATH_MTU) { + qp->attr.path_mtu = attr->path_mtu; + qp->mtu = ib_mtu_enum_to_int(attr->path_mtu); + } + + if (mask & IB_QP_TIMEOUT) { + qp->attr.timeout = attr->timeout; + if (attr->timeout == 0) { + qp->qp_timeout_jiffies = 0; + } else { + /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */ + int j = nsecs_to_jiffies(4096ULL << attr->timeout); + + qp->qp_timeout_jiffies = j ? j : 1; + } + } + + if (mask & IB_QP_RETRY_CNT) { + qp->attr.retry_cnt = attr->retry_cnt; + qp->comp.retry_cnt = attr->retry_cnt; + rxe_dbg_qp(qp, "set retry count = %d\n", attr->retry_cnt); + } + + if (mask & IB_QP_RNR_RETRY) { + qp->attr.rnr_retry = attr->rnr_retry; + qp->comp.rnr_retry = attr->rnr_retry; + rxe_dbg_qp(qp, "set rnr retry count = %d\n", attr->rnr_retry); + } + + if (mask & IB_QP_RQ_PSN) { + qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK); + qp->resp.psn = qp->attr.rq_psn; + rxe_dbg_qp(qp, "set resp psn = 0x%x\n", qp->resp.psn); + } + + if (mask & IB_QP_MIN_RNR_TIMER) { + qp->attr.min_rnr_timer = attr->min_rnr_timer; + rxe_dbg_qp(qp, "set min rnr timer = 0x%x\n", + attr->min_rnr_timer); + } + + if (mask & IB_QP_SQ_PSN) { + qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK); + qp->req.psn = qp->attr.sq_psn; + qp->comp.psn = qp->attr.sq_psn; + rxe_dbg_qp(qp, "set req psn = 0x%x\n", qp->req.psn); + } + + if (mask & IB_QP_PATH_MIG_STATE) + qp->attr.path_mig_state = attr->path_mig_state; + + if (mask & IB_QP_DEST_QPN) + qp->attr.dest_qp_num = attr->dest_qp_num; + + return 0; +} + +/* called by the query qp verb */ +int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) +{ + unsigned long flags; + + *attr = qp->attr; + + attr->rq_psn = qp->resp.psn; + attr->sq_psn = qp->req.psn; + + attr->cap.max_send_wr = qp->sq.max_wr; + attr->cap.max_send_sge = qp->sq.max_sge; + attr->cap.max_inline_data = qp->sq.max_inline; + + if (!qp->srq) { + attr->cap.max_recv_wr = qp->rq.max_wr; + attr->cap.max_recv_sge = qp->rq.max_sge; + } + + rxe_av_to_attr(&qp->pri_av, &attr->ah_attr); + rxe_av_to_attr(&qp->alt_av, &attr->alt_ah_attr); + + /* Applications that get this state typically spin on it. + * Yield the processor + */ + spin_lock_irqsave(&qp->state_lock, flags); + if (qp->attr.sq_draining) { + spin_unlock_irqrestore(&qp->state_lock, flags); + cond_resched(); + } else { + spin_unlock_irqrestore(&qp->state_lock, flags); + } + + return 0; +} + +int rxe_qp_chk_destroy(struct rxe_qp *qp) +{ + /* See IBA o10-2.2.3 + * An attempt to destroy a QP while attached to a mcast group + * will fail immediately. + */ + if (atomic_read(&qp->mcg_num)) { + rxe_dbg_qp(qp, "Attempt to destroy while attached to multicast group\n"); + return -EBUSY; + } + + return 0; +} + +/* called when the last reference to the qp is dropped */ +static void rxe_qp_do_cleanup(struct work_struct *work) +{ + struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + qp->valid = 0; + spin_unlock_irqrestore(&qp->state_lock, flags); + qp->qp_timeout_jiffies = 0; + + if (qp_type(qp) == IB_QPT_RC) { + del_timer_sync(&qp->retrans_timer); + del_timer_sync(&qp->rnr_nak_timer); + } + + if (qp->resp.task.func) + rxe_cleanup_task(&qp->resp.task); + + if (qp->req.task.func) + rxe_cleanup_task(&qp->req.task); + + if (qp->comp.task.func) + rxe_cleanup_task(&qp->comp.task); + + /* flush out any receive wr's or pending requests */ + rxe_requester(qp); + rxe_completer(qp); + rxe_responder(qp); + + if (qp->sq.queue) + rxe_queue_cleanup(qp->sq.queue); + + if (qp->srq) + rxe_put(qp->srq); + + if (qp->rq.queue) + rxe_queue_cleanup(qp->rq.queue); + + if (qp->scq) { + atomic_dec(&qp->scq->num_wq); + rxe_put(qp->scq); + } + + if (qp->rcq) { + atomic_dec(&qp->rcq->num_wq); + rxe_put(qp->rcq); + } + + if (qp->pd) + rxe_put(qp->pd); + + if (qp->resp.mr) + rxe_put(qp->resp.mr); + + free_rd_atomic_resources(qp); + + if (qp->sk) { + if (qp_type(qp) == IB_QPT_RC) + sk_dst_reset(qp->sk->sk); + + kernel_sock_shutdown(qp->sk, SHUT_RDWR); + sock_release(qp->sk); + } +} + +/* called when the last reference to the qp is dropped */ +void rxe_qp_cleanup(struct rxe_pool_elem *elem) +{ + struct rxe_qp *qp = container_of(elem, typeof(*qp), elem); + + execute_in_process_context(rxe_qp_do_cleanup, &qp->cleanup_work); +} diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c new file mode 100644 index 0000000000..9611ee191a --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_queue.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/vmalloc.h> +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf, + struct ib_udata *udata, struct rxe_queue_buf *buf, + size_t buf_size, struct rxe_mmap_info **ip_p) +{ + int err; + struct rxe_mmap_info *ip = NULL; + + if (outbuf) { + ip = rxe_create_mmap_info(rxe, buf_size, udata, buf); + if (IS_ERR(ip)) { + err = PTR_ERR(ip); + goto err1; + } + + if (copy_to_user(outbuf, &ip->info, sizeof(ip->info))) { + err = -EFAULT; + goto err2; + } + + spin_lock_bh(&rxe->pending_lock); + list_add(&ip->pending_mmaps, &rxe->pending_mmaps); + spin_unlock_bh(&rxe->pending_lock); + } + + *ip_p = ip; + + return 0; + +err2: + kfree(ip); +err1: + return err; +} + +inline void rxe_queue_reset(struct rxe_queue *q) +{ + /* queue is comprised from header and the memory + * of the actual queue. See "struct rxe_queue_buf" in rxe_queue.h + * reset only the queue itself and not the management header + */ + memset(q->buf->data, 0, q->buf_size - sizeof(struct rxe_queue_buf)); +} + +struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, int *num_elem, + unsigned int elem_size, enum queue_type type) +{ + struct rxe_queue *q; + size_t buf_size; + unsigned int num_slots; + + /* num_elem == 0 is allowed, but uninteresting */ + if (*num_elem < 0) + return NULL; + + q = kzalloc(sizeof(*q), GFP_KERNEL); + if (!q) + return NULL; + + q->rxe = rxe; + q->type = type; + + /* used in resize, only need to copy used part of queue */ + q->elem_size = elem_size; + + /* pad element up to at least a cacheline and always a power of 2 */ + if (elem_size < cache_line_size()) + elem_size = cache_line_size(); + elem_size = roundup_pow_of_two(elem_size); + + q->log2_elem_size = order_base_2(elem_size); + + num_slots = *num_elem + 1; + num_slots = roundup_pow_of_two(num_slots); + q->index_mask = num_slots - 1; + + buf_size = sizeof(struct rxe_queue_buf) + num_slots * elem_size; + + q->buf = vmalloc_user(buf_size); + if (!q->buf) + goto err2; + + q->buf->log2_elem_size = q->log2_elem_size; + q->buf->index_mask = q->index_mask; + + q->buf_size = buf_size; + + *num_elem = num_slots - 1; + return q; + +err2: + kfree(q); + return NULL; +} + +/* copies elements from original q to new q and then swaps the contents of the + * two q headers. This is so that if anyone is holding a pointer to q it will + * still work + */ +static int resize_finish(struct rxe_queue *q, struct rxe_queue *new_q, + unsigned int num_elem) +{ + enum queue_type type = q->type; + u32 new_prod; + u32 prod; + u32 cons; + + if (!queue_empty(q, q->type) && (num_elem < queue_count(q, type))) + return -EINVAL; + + new_prod = queue_get_producer(new_q, type); + prod = queue_get_producer(q, type); + cons = queue_get_consumer(q, type); + + while ((prod - cons) & q->index_mask) { + memcpy(queue_addr_from_index(new_q, new_prod), + queue_addr_from_index(q, cons), new_q->elem_size); + new_prod = queue_next_index(new_q, new_prod); + cons = queue_next_index(q, cons); + } + + new_q->buf->producer_index = new_prod; + q->buf->consumer_index = cons; + + /* update private index copies */ + if (type == QUEUE_TYPE_TO_CLIENT) + new_q->index = new_q->buf->producer_index; + else + q->index = q->buf->consumer_index; + + /* exchange rxe_queue headers */ + swap(*q, *new_q); + + return 0; +} + +int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p, + unsigned int elem_size, struct ib_udata *udata, + struct mminfo __user *outbuf, spinlock_t *producer_lock, + spinlock_t *consumer_lock) +{ + struct rxe_queue *new_q; + unsigned int num_elem = *num_elem_p; + int err; + unsigned long producer_flags; + unsigned long consumer_flags; + + new_q = rxe_queue_init(q->rxe, &num_elem, elem_size, q->type); + if (!new_q) + return -ENOMEM; + + err = do_mmap_info(new_q->rxe, outbuf, udata, new_q->buf, + new_q->buf_size, &new_q->ip); + if (err) { + vfree(new_q->buf); + kfree(new_q); + goto err1; + } + + spin_lock_irqsave(consumer_lock, consumer_flags); + + if (producer_lock) { + spin_lock_irqsave(producer_lock, producer_flags); + err = resize_finish(q, new_q, num_elem); + spin_unlock_irqrestore(producer_lock, producer_flags); + } else { + err = resize_finish(q, new_q, num_elem); + } + + spin_unlock_irqrestore(consumer_lock, consumer_flags); + + rxe_queue_cleanup(new_q); /* new/old dep on err */ + if (err) + goto err1; + + *num_elem_p = num_elem; + return 0; + +err1: + return err; +} + +void rxe_queue_cleanup(struct rxe_queue *q) +{ + if (q->ip) + kref_put(&q->ip->ref, rxe_mmap_release); + else + vfree(q->buf); + + kfree(q); +} diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h new file mode 100644 index 0000000000..c711cb98b9 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_queue.h @@ -0,0 +1,284 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_QUEUE_H +#define RXE_QUEUE_H + +/* Implements a simple circular buffer that is shared between user + * and the driver and can be resized. The requested element size is + * rounded up to a power of 2 and the number of elements in the buffer + * is also rounded up to a power of 2. Since the queue is empty when + * the producer and consumer indices match the maximum capacity of the + * queue is one less than the number of element slots. + * + * Notes: + * - The driver indices are always masked off to q->index_mask + * before storing so do not need to be checked on reads. + * - The user whether user space or kernel is generally + * not trusted so its parameters are masked to make sure + * they do not access the queue out of bounds on reads. + * - The driver indices for queues must not be written + * by user so a local copy is used and a shared copy is + * stored when the local copy is changed. + * - By passing the type in the parameter list separate from q + * the compiler can eliminate the switch statement when the + * actual queue type is known when the function is called at + * compile time. + * - These queues are lock free. The user and driver must protect + * changes to their end of the queues with locks if more than one + * CPU can be accessing it at the same time. + */ + +/** + * enum queue_type - type of queue + * @QUEUE_TYPE_TO_CLIENT: Queue is written by rxe driver and + * read by client which may be a user space + * application or a kernel ulp. + * Used by rxe internals only. + * @QUEUE_TYPE_FROM_CLIENT: Queue is written by client and + * read by rxe driver. + * Used by rxe internals only. + * @QUEUE_TYPE_FROM_ULP: Queue is written by kernel ulp and + * read by rxe driver. + * Used by kernel verbs APIs only on + * behalf of ulps. + * @QUEUE_TYPE_TO_ULP: Queue is written by rxe driver and + * read by kernel ulp. + * Used by kernel verbs APIs only on + * behalf of ulps. + */ +enum queue_type { + QUEUE_TYPE_TO_CLIENT, + QUEUE_TYPE_FROM_CLIENT, + QUEUE_TYPE_FROM_ULP, + QUEUE_TYPE_TO_ULP, +}; + +struct rxe_queue_buf; + +struct rxe_queue { + struct rxe_dev *rxe; + struct rxe_queue_buf *buf; + struct rxe_mmap_info *ip; + size_t buf_size; + size_t elem_size; + unsigned int log2_elem_size; + u32 index_mask; + enum queue_type type; + /* private copy of index for shared queues between + * driver and clients. Driver reads and writes + * this copy and then replicates to rxe_queue_buf + * for read access by clients. + */ + u32 index; +}; + +int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf, + struct ib_udata *udata, struct rxe_queue_buf *buf, + size_t buf_size, struct rxe_mmap_info **ip_p); + +void rxe_queue_reset(struct rxe_queue *q); + +struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, int *num_elem, + unsigned int elem_size, enum queue_type type); + +int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p, + unsigned int elem_size, struct ib_udata *udata, + struct mminfo __user *outbuf, + spinlock_t *producer_lock, spinlock_t *consumer_lock); + +void rxe_queue_cleanup(struct rxe_queue *queue); + +static inline u32 queue_next_index(struct rxe_queue *q, int index) +{ + return (index + 1) & q->index_mask; +} + +static inline u32 queue_get_producer(const struct rxe_queue *q, + enum queue_type type) +{ + u32 prod; + + switch (type) { + case QUEUE_TYPE_FROM_CLIENT: + /* used by rxe, client owns the index */ + prod = smp_load_acquire(&q->buf->producer_index); + break; + case QUEUE_TYPE_TO_CLIENT: + /* used by rxe which owns the index */ + prod = q->index; + break; + case QUEUE_TYPE_FROM_ULP: + /* used by ulp which owns the index */ + prod = q->buf->producer_index; + break; + case QUEUE_TYPE_TO_ULP: + /* used by ulp, rxe owns the index */ + prod = smp_load_acquire(&q->buf->producer_index); + break; + } + + return prod; +} + +static inline u32 queue_get_consumer(const struct rxe_queue *q, + enum queue_type type) +{ + u32 cons; + + switch (type) { + case QUEUE_TYPE_FROM_CLIENT: + /* used by rxe which owns the index */ + cons = q->index; + break; + case QUEUE_TYPE_TO_CLIENT: + /* used by rxe, client owns the index */ + cons = smp_load_acquire(&q->buf->consumer_index); + break; + case QUEUE_TYPE_FROM_ULP: + /* used by ulp, rxe owns the index */ + cons = smp_load_acquire(&q->buf->consumer_index); + break; + case QUEUE_TYPE_TO_ULP: + /* used by ulp which owns the index */ + cons = q->buf->consumer_index; + break; + } + + return cons; +} + +static inline int queue_empty(struct rxe_queue *q, enum queue_type type) +{ + u32 prod = queue_get_producer(q, type); + u32 cons = queue_get_consumer(q, type); + + return ((prod - cons) & q->index_mask) == 0; +} + +static inline int queue_full(struct rxe_queue *q, enum queue_type type) +{ + u32 prod = queue_get_producer(q, type); + u32 cons = queue_get_consumer(q, type); + + return ((prod + 1 - cons) & q->index_mask) == 0; +} + +static inline u32 queue_count(const struct rxe_queue *q, + enum queue_type type) +{ + u32 prod = queue_get_producer(q, type); + u32 cons = queue_get_consumer(q, type); + + return (prod - cons) & q->index_mask; +} + +static inline void queue_advance_producer(struct rxe_queue *q, + enum queue_type type) +{ + u32 prod; + + switch (type) { + case QUEUE_TYPE_FROM_CLIENT: + /* used by rxe, client owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance client index\n", + __func__); + break; + case QUEUE_TYPE_TO_CLIENT: + /* used by rxe which owns the index */ + prod = q->index; + prod = (prod + 1) & q->index_mask; + q->index = prod; + /* release so client can read it safely */ + smp_store_release(&q->buf->producer_index, prod); + break; + case QUEUE_TYPE_FROM_ULP: + /* used by ulp which owns the index */ + prod = q->buf->producer_index; + prod = (prod + 1) & q->index_mask; + /* release so rxe can read it safely */ + smp_store_release(&q->buf->producer_index, prod); + break; + case QUEUE_TYPE_TO_ULP: + /* used by ulp, rxe owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance driver index\n", + __func__); + break; + } +} + +static inline void queue_advance_consumer(struct rxe_queue *q, + enum queue_type type) +{ + u32 cons; + + switch (type) { + case QUEUE_TYPE_FROM_CLIENT: + /* used by rxe which owns the index */ + cons = (q->index + 1) & q->index_mask; + q->index = cons; + /* release so client can read it safely */ + smp_store_release(&q->buf->consumer_index, cons); + break; + case QUEUE_TYPE_TO_CLIENT: + /* used by rxe, client owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance client index\n", + __func__); + break; + case QUEUE_TYPE_FROM_ULP: + /* used by ulp, rxe owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance driver index\n", + __func__); + break; + case QUEUE_TYPE_TO_ULP: + /* used by ulp which owns the index */ + cons = q->buf->consumer_index; + cons = (cons + 1) & q->index_mask; + /* release so rxe can read it safely */ + smp_store_release(&q->buf->consumer_index, cons); + break; + } +} + +static inline void *queue_producer_addr(struct rxe_queue *q, + enum queue_type type) +{ + u32 prod = queue_get_producer(q, type); + + return q->buf->data + (prod << q->log2_elem_size); +} + +static inline void *queue_consumer_addr(struct rxe_queue *q, + enum queue_type type) +{ + u32 cons = queue_get_consumer(q, type); + + return q->buf->data + (cons << q->log2_elem_size); +} + +static inline void *queue_addr_from_index(struct rxe_queue *q, u32 index) +{ + return q->buf->data + ((index & q->index_mask) + << q->log2_elem_size); +} + +static inline u32 queue_index_from_addr(const struct rxe_queue *q, + const void *addr) +{ + return (((u8 *)addr - q->buf->data) >> q->log2_elem_size) + & q->index_mask; +} + +static inline void *queue_head(struct rxe_queue *q, enum queue_type type) +{ + return queue_empty(q, type) ? NULL : queue_consumer_addr(q, type); +} + +#endif /* RXE_QUEUE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c new file mode 100644 index 0000000000..5861e42440 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> + +#include "rxe.h" +#include "rxe_loc.h" + +/* check that QP matches packet opcode type and is in a valid state */ +static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct rxe_qp *qp) +{ + unsigned int pkt_type; + unsigned long flags; + + if (unlikely(!qp->valid)) + return -EINVAL; + + pkt_type = pkt->opcode & 0xe0; + + switch (qp_type(qp)) { + case IB_QPT_RC: + if (unlikely(pkt_type != IB_OPCODE_RC)) + return -EINVAL; + break; + case IB_QPT_UC: + if (unlikely(pkt_type != IB_OPCODE_UC)) + return -EINVAL; + break; + case IB_QPT_UD: + case IB_QPT_GSI: + if (unlikely(pkt_type != IB_OPCODE_UD)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + spin_lock_irqsave(&qp->state_lock, flags); + if (pkt->mask & RXE_REQ_MASK) { + if (unlikely(qp_state(qp) < IB_QPS_RTR)) { + spin_unlock_irqrestore(&qp->state_lock, flags); + return -EINVAL; + } + } else { + if (unlikely(qp_state(qp) < IB_QPS_RTS)) { + spin_unlock_irqrestore(&qp->state_lock, flags); + return -EINVAL; + } + } + spin_unlock_irqrestore(&qp->state_lock, flags); + + return 0; +} + +static void set_bad_pkey_cntr(struct rxe_port *port) +{ + spin_lock_bh(&port->port_lock); + port->attr.bad_pkey_cntr = min((u32)0xffff, + port->attr.bad_pkey_cntr + 1); + spin_unlock_bh(&port->port_lock); +} + +static void set_qkey_viol_cntr(struct rxe_port *port) +{ + spin_lock_bh(&port->port_lock); + port->attr.qkey_viol_cntr = min((u32)0xffff, + port->attr.qkey_viol_cntr + 1); + spin_unlock_bh(&port->port_lock); +} + +static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + u32 qpn, struct rxe_qp *qp) +{ + struct rxe_port *port = &rxe->port; + u16 pkey = bth_pkey(pkt); + + pkt->pkey_index = 0; + + if (!pkey_match(pkey, IB_DEFAULT_PKEY_FULL)) { + set_bad_pkey_cntr(port); + return -EINVAL; + } + + if (qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) { + u32 qkey = (qpn == 1) ? GSI_QKEY : qp->attr.qkey; + + if (unlikely(deth_qkey(pkt) != qkey)) { + set_qkey_viol_cntr(port); + return -EINVAL; + } + } + + return 0; +} + +static int check_addr(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct rxe_qp *qp) +{ + struct sk_buff *skb = PKT_TO_SKB(pkt); + + if (qp_type(qp) != IB_QPT_RC && qp_type(qp) != IB_QPT_UC) + return 0; + + if (unlikely(pkt->port_num != qp->attr.port_num)) + return -EINVAL; + + if (skb->protocol == htons(ETH_P_IP)) { + struct in_addr *saddr = + &qp->pri_av.sgid_addr._sockaddr_in.sin_addr; + struct in_addr *daddr = + &qp->pri_av.dgid_addr._sockaddr_in.sin_addr; + + if ((ip_hdr(skb)->daddr != saddr->s_addr) || + (ip_hdr(skb)->saddr != daddr->s_addr)) + return -EINVAL; + + } else if (skb->protocol == htons(ETH_P_IPV6)) { + struct in6_addr *saddr = + &qp->pri_av.sgid_addr._sockaddr_in6.sin6_addr; + struct in6_addr *daddr = + &qp->pri_av.dgid_addr._sockaddr_in6.sin6_addr; + + if (memcmp(&ipv6_hdr(skb)->daddr, saddr, sizeof(*saddr)) || + memcmp(&ipv6_hdr(skb)->saddr, daddr, sizeof(*daddr))) + return -EINVAL; + } + + return 0; +} + +static int hdr_check(struct rxe_pkt_info *pkt) +{ + struct rxe_dev *rxe = pkt->rxe; + struct rxe_port *port = &rxe->port; + struct rxe_qp *qp = NULL; + u32 qpn = bth_qpn(pkt); + int index; + int err; + + if (unlikely(bth_tver(pkt) != BTH_TVER)) + goto err1; + + if (unlikely(qpn == 0)) + goto err1; + + if (qpn != IB_MULTICAST_QPN) { + index = (qpn == 1) ? port->qp_gsi_index : qpn; + + qp = rxe_pool_get_index(&rxe->qp_pool, index); + if (unlikely(!qp)) + goto err1; + + err = check_type_state(rxe, pkt, qp); + if (unlikely(err)) + goto err2; + + err = check_addr(rxe, pkt, qp); + if (unlikely(err)) + goto err2; + + err = check_keys(rxe, pkt, qpn, qp); + if (unlikely(err)) + goto err2; + } else { + if (unlikely((pkt->mask & RXE_GRH_MASK) == 0)) + goto err1; + } + + pkt->qp = qp; + return 0; + +err2: + rxe_put(qp); +err1: + return -EINVAL; +} + +static inline void rxe_rcv_pkt(struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + if (pkt->mask & RXE_REQ_MASK) + rxe_resp_queue_pkt(pkt->qp, skb); + else + rxe_comp_queue_pkt(pkt->qp, skb); +} + +static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) +{ + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + struct rxe_mcg *mcg; + struct rxe_mca *mca; + struct rxe_qp *qp; + union ib_gid dgid; + int err; + + if (skb->protocol == htons(ETH_P_IP)) + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + (struct in6_addr *)&dgid); + else if (skb->protocol == htons(ETH_P_IPV6)) + memcpy(&dgid, &ipv6_hdr(skb)->daddr, sizeof(dgid)); + + /* lookup mcast group corresponding to mgid, takes a ref */ + mcg = rxe_lookup_mcg(rxe, &dgid); + if (!mcg) + goto drop; /* mcast group not registered */ + + spin_lock_bh(&rxe->mcg_lock); + + /* this is unreliable datagram service so we let + * failures to deliver a multicast packet to a + * single QP happen and just move on and try + * the rest of them on the list + */ + list_for_each_entry(mca, &mcg->qp_list, qp_list) { + qp = mca->qp; + + /* validate qp for incoming packet */ + err = check_type_state(rxe, pkt, qp); + if (err) + continue; + + err = check_keys(rxe, pkt, bth_qpn(pkt), qp); + if (err) + continue; + + /* for all but the last QP create a new clone of the + * skb and pass to the QP. Pass the original skb to + * the last QP in the list. + */ + if (mca->qp_list.next != &mcg->qp_list) { + struct sk_buff *cskb; + struct rxe_pkt_info *cpkt; + + cskb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!cskb)) + continue; + + if (WARN_ON(!ib_device_try_get(&rxe->ib_dev))) { + kfree_skb(cskb); + break; + } + + cpkt = SKB_TO_PKT(cskb); + cpkt->qp = qp; + rxe_get(qp); + rxe_rcv_pkt(cpkt, cskb); + } else { + pkt->qp = qp; + rxe_get(qp); + rxe_rcv_pkt(pkt, skb); + skb = NULL; /* mark consumed */ + } + } + + spin_unlock_bh(&rxe->mcg_lock); + + kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); + + if (likely(!skb)) + return; + + /* This only occurs if one of the checks fails on the last + * QP in the list above + */ + +drop: + kfree_skb(skb); + ib_device_put(&rxe->ib_dev); +} + +/** + * rxe_chk_dgid - validate destination IP address + * @rxe: rxe device that received packet + * @skb: the received packet buffer + * + * Accept any loopback packets + * Extract IP address from packet and + * Accept if multicast packet + * Accept if matches an SGID table entry + */ +static int rxe_chk_dgid(struct rxe_dev *rxe, struct sk_buff *skb) +{ + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + const struct ib_gid_attr *gid_attr; + union ib_gid dgid; + union ib_gid *pdgid; + + if (pkt->mask & RXE_LOOPBACK_MASK) + return 0; + + if (skb->protocol == htons(ETH_P_IP)) { + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + (struct in6_addr *)&dgid); + pdgid = &dgid; + } else { + pdgid = (union ib_gid *)&ipv6_hdr(skb)->daddr; + } + + if (rdma_is_multicast_addr((struct in6_addr *)pdgid)) + return 0; + + gid_attr = rdma_find_gid_by_port(&rxe->ib_dev, pdgid, + IB_GID_TYPE_ROCE_UDP_ENCAP, + 1, skb->dev); + if (IS_ERR(gid_attr)) + return PTR_ERR(gid_attr); + + rdma_put_gid_attr(gid_attr); + return 0; +} + +/* rxe_rcv is called from the interface driver */ +void rxe_rcv(struct sk_buff *skb) +{ + int err; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + struct rxe_dev *rxe = pkt->rxe; + + if (unlikely(skb->len < RXE_BTH_BYTES)) + goto drop; + + if (rxe_chk_dgid(rxe, skb) < 0) + goto drop; + + pkt->opcode = bth_opcode(pkt); + pkt->psn = bth_psn(pkt); + pkt->qp = NULL; + pkt->mask |= rxe_opcode[pkt->opcode].mask; + + if (unlikely(skb->len < header_size(pkt))) + goto drop; + + err = hdr_check(pkt); + if (unlikely(err)) + goto drop; + + err = rxe_icrc_check(skb, pkt); + if (unlikely(err)) + goto drop; + + rxe_counter_inc(rxe, RXE_CNT_RCVD_PKTS); + + if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN)) + rxe_rcv_mcast_pkt(rxe, skb); + else + rxe_rcv_pkt(pkt, skb); + + return; + +drop: + if (pkt->qp) + rxe_put(pkt->qp); + + kfree_skb(skb); + ib_device_put(&rxe->ib_dev); +} diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c new file mode 100644 index 0000000000..d8c41fd626 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -0,0 +1,880 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> +#include <crypto/hash.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + u32 opcode); + +static inline void retry_first_write_send(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, int npsn) +{ + int i; + + for (i = 0; i < npsn; i++) { + int to_send = (wqe->dma.resid > qp->mtu) ? + qp->mtu : wqe->dma.resid; + + qp->req.opcode = next_opcode(qp, wqe, + wqe->wr.opcode); + + if (wqe->wr.send_flags & IB_SEND_INLINE) { + wqe->dma.resid -= to_send; + wqe->dma.sge_offset += to_send; + } else { + advance_dma_data(&wqe->dma, to_send); + } + } +} + +static void req_retry(struct rxe_qp *qp) +{ + struct rxe_send_wqe *wqe; + unsigned int wqe_index; + unsigned int mask; + int npsn; + int first = 1; + struct rxe_queue *q = qp->sq.queue; + unsigned int cons; + unsigned int prod; + + cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT); + prod = queue_get_producer(q, QUEUE_TYPE_FROM_CLIENT); + + qp->req.wqe_index = cons; + qp->req.psn = qp->comp.psn; + qp->req.opcode = -1; + + for (wqe_index = cons; wqe_index != prod; + wqe_index = queue_next_index(q, wqe_index)) { + wqe = queue_addr_from_index(qp->sq.queue, wqe_index); + mask = wr_opcode_mask(wqe->wr.opcode, qp); + + if (wqe->state == wqe_state_posted) + break; + + if (wqe->state == wqe_state_done) + continue; + + wqe->iova = (mask & WR_ATOMIC_MASK) ? + wqe->wr.wr.atomic.remote_addr : + (mask & WR_READ_OR_WRITE_MASK) ? + wqe->wr.wr.rdma.remote_addr : + 0; + + if (!first || (mask & WR_READ_MASK) == 0) { + wqe->dma.resid = wqe->dma.length; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + } + + if (first) { + first = 0; + + if (mask & WR_WRITE_OR_SEND_MASK) { + npsn = (qp->comp.psn - wqe->first_psn) & + BTH_PSN_MASK; + retry_first_write_send(qp, wqe, npsn); + } + + if (mask & WR_READ_MASK) { + npsn = (wqe->dma.length - wqe->dma.resid) / + qp->mtu; + wqe->iova += npsn * qp->mtu; + } + } + + wqe->state = wqe_state_posted; + } +} + +void rnr_nak_timer(struct timer_list *t) +{ + struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); + unsigned long flags; + + rxe_dbg_qp(qp, "nak timer fired\n"); + + spin_lock_irqsave(&qp->state_lock, flags); + if (qp->valid) { + /* request a send queue retry */ + qp->req.need_retry = 1; + qp->req.wait_for_rnr_timer = 0; + rxe_sched_task(&qp->req.task); + } + spin_unlock_irqrestore(&qp->state_lock, flags); +} + +static void req_check_sq_drain_done(struct rxe_qp *qp) +{ + struct rxe_queue *q; + unsigned int index; + unsigned int cons; + struct rxe_send_wqe *wqe; + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + if (qp_state(qp) == IB_QPS_SQD) { + q = qp->sq.queue; + index = qp->req.wqe_index; + cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT); + wqe = queue_addr_from_index(q, cons); + + /* check to see if we are drained; + * state_lock used by requester and completer + */ + do { + if (!qp->attr.sq_draining) + /* comp just finished */ + break; + + if (wqe && ((index != cons) || + (wqe->state != wqe_state_posted))) + /* comp not done yet */ + break; + + qp->attr.sq_draining = 0; + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_SQ_DRAINED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + return; + } while (0); + } + spin_unlock_irqrestore(&qp->state_lock, flags); +} + +static struct rxe_send_wqe *__req_next_wqe(struct rxe_qp *qp) +{ + struct rxe_queue *q = qp->sq.queue; + unsigned int index = qp->req.wqe_index; + unsigned int prod; + + prod = queue_get_producer(q, QUEUE_TYPE_FROM_CLIENT); + if (index == prod) + return NULL; + else + return queue_addr_from_index(q, index); +} + +static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) +{ + struct rxe_send_wqe *wqe; + unsigned long flags; + + req_check_sq_drain_done(qp); + + wqe = __req_next_wqe(qp); + if (wqe == NULL) + return NULL; + + spin_lock_irqsave(&qp->state_lock, flags); + if (unlikely((qp_state(qp) == IB_QPS_SQD) && + (wqe->state != wqe_state_processing))) { + spin_unlock_irqrestore(&qp->state_lock, flags); + return NULL; + } + spin_unlock_irqrestore(&qp->state_lock, flags); + + wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); + return wqe; +} + +/** + * rxe_wqe_is_fenced - check if next wqe is fenced + * @qp: the queue pair + * @wqe: the next wqe + * + * Returns: 1 if wqe needs to wait + * 0 if wqe is ready to go + */ +static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + /* Local invalidate fence (LIF) see IBA 10.6.5.1 + * Requires ALL previous operations on the send queue + * are complete. Make mandatory for the rxe driver. + */ + if (wqe->wr.opcode == IB_WR_LOCAL_INV) + return qp->req.wqe_index != queue_get_consumer(qp->sq.queue, + QUEUE_TYPE_FROM_CLIENT); + + /* Fence see IBA 10.8.3.3 + * Requires that all previous read and atomic operations + * are complete. + */ + return (wqe->wr.send_flags & IB_SEND_FENCE) && + atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic; +} + +static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: + if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_RC_RDMA_WRITE_LAST : + IB_OPCODE_RC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_RC_RDMA_WRITE_ONLY : + IB_OPCODE_RC_RDMA_WRITE_FIRST; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE : + IB_OPCODE_RC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE : + IB_OPCODE_RC_RDMA_WRITE_FIRST; + + case IB_WR_SEND: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? + IB_OPCODE_RC_SEND_LAST : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_RC_SEND_ONLY : + IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_SEND_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? + IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE : + IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_FLUSH: + return IB_OPCODE_RC_FLUSH; + + case IB_WR_RDMA_READ: + return IB_OPCODE_RC_RDMA_READ_REQUEST; + + case IB_WR_ATOMIC_CMP_AND_SWP: + return IB_OPCODE_RC_COMPARE_SWAP; + + case IB_WR_ATOMIC_FETCH_AND_ADD: + return IB_OPCODE_RC_FETCH_ADD; + + case IB_WR_SEND_WITH_INV: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE : + IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_ATOMIC_WRITE: + return IB_OPCODE_RC_ATOMIC_WRITE; + + case IB_WR_REG_MR: + case IB_WR_LOCAL_INV: + return opcode; + } + + return -EINVAL; +} + +static int next_opcode_uc(struct rxe_qp *qp, u32 opcode, int fits) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: + if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_UC_RDMA_WRITE_LAST : + IB_OPCODE_UC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_UC_RDMA_WRITE_ONLY : + IB_OPCODE_UC_RDMA_WRITE_FIRST; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE : + IB_OPCODE_UC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE : + IB_OPCODE_UC_RDMA_WRITE_FIRST; + + case IB_WR_SEND: + if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE) + return fits ? + IB_OPCODE_UC_SEND_LAST : + IB_OPCODE_UC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_UC_SEND_ONLY : + IB_OPCODE_UC_SEND_FIRST; + + case IB_WR_SEND_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE) + return fits ? + IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE : + IB_OPCODE_UC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE : + IB_OPCODE_UC_SEND_FIRST; + } + + return -EINVAL; +} + +static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + u32 opcode) +{ + int fits = (wqe->dma.resid <= qp->mtu); + + switch (qp_type(qp)) { + case IB_QPT_RC: + return next_opcode_rc(qp, opcode, fits); + + case IB_QPT_UC: + return next_opcode_uc(qp, opcode, fits); + + case IB_QPT_UD: + case IB_QPT_GSI: + switch (opcode) { + case IB_WR_SEND: + return IB_OPCODE_UD_SEND_ONLY; + + case IB_WR_SEND_WITH_IMM: + return IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + } + break; + + default: + break; + } + + return -EINVAL; +} + +static inline int check_init_depth(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + int depth; + + if (wqe->has_rd_atomic) + return 0; + + qp->req.need_rd_atomic = 1; + depth = atomic_dec_return(&qp->req.rd_atomic); + + if (depth >= 0) { + qp->req.need_rd_atomic = 0; + wqe->has_rd_atomic = 1; + return 0; + } + + atomic_inc(&qp->req.rd_atomic); + return -EAGAIN; +} + +static inline int get_mtu(struct rxe_qp *qp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + if ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC)) + return qp->mtu; + + return rxe->port.mtu_cap; +} + +static struct sk_buff *init_req_packet(struct rxe_qp *qp, + struct rxe_av *av, + struct rxe_send_wqe *wqe, + int opcode, u32 payload, + struct rxe_pkt_info *pkt) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct sk_buff *skb; + struct rxe_send_wr *ibwr = &wqe->wr; + int pad = (-payload) & 0x3; + int paylen; + int solicited; + u32 qp_num; + int ack_req; + + /* length from start of bth to end of icrc */ + paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; + pkt->paylen = paylen; + + /* init skb */ + skb = rxe_init_packet(rxe, av, paylen, pkt); + if (unlikely(!skb)) + return NULL; + + /* init bth */ + solicited = (ibwr->send_flags & IB_SEND_SOLICITED) && + (pkt->mask & RXE_END_MASK) && + ((pkt->mask & (RXE_SEND_MASK)) || + (pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) == + (RXE_WRITE_MASK | RXE_IMMDT_MASK)); + + qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn : + qp->attr.dest_qp_num; + + ack_req = ((pkt->mask & RXE_END_MASK) || + (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); + if (ack_req) + qp->req.noack_pkts = 0; + + bth_init(pkt, pkt->opcode, solicited, 0, pad, IB_DEFAULT_PKEY_FULL, qp_num, + ack_req, pkt->psn); + + /* init optional headers */ + if (pkt->mask & RXE_RETH_MASK) { + if (pkt->mask & RXE_FETH_MASK) + reth_set_rkey(pkt, ibwr->wr.flush.rkey); + else + reth_set_rkey(pkt, ibwr->wr.rdma.rkey); + reth_set_va(pkt, wqe->iova); + reth_set_len(pkt, wqe->dma.resid); + } + + /* Fill Flush Extension Transport Header */ + if (pkt->mask & RXE_FETH_MASK) + feth_init(pkt, ibwr->wr.flush.type, ibwr->wr.flush.level); + + if (pkt->mask & RXE_IMMDT_MASK) + immdt_set_imm(pkt, ibwr->ex.imm_data); + + if (pkt->mask & RXE_IETH_MASK) + ieth_set_rkey(pkt, ibwr->ex.invalidate_rkey); + + if (pkt->mask & RXE_ATMETH_MASK) { + atmeth_set_va(pkt, wqe->iova); + if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { + atmeth_set_swap_add(pkt, ibwr->wr.atomic.swap); + atmeth_set_comp(pkt, ibwr->wr.atomic.compare_add); + } else { + atmeth_set_swap_add(pkt, ibwr->wr.atomic.compare_add); + } + atmeth_set_rkey(pkt, ibwr->wr.atomic.rkey); + } + + if (pkt->mask & RXE_DETH_MASK) { + if (qp->ibqp.qp_num == 1) + deth_set_qkey(pkt, GSI_QKEY); + else + deth_set_qkey(pkt, ibwr->wr.ud.remote_qkey); + deth_set_sqp(pkt, qp->ibqp.qp_num); + } + + return skb; +} + +static int finish_packet(struct rxe_qp *qp, struct rxe_av *av, + struct rxe_send_wqe *wqe, struct rxe_pkt_info *pkt, + struct sk_buff *skb, u32 payload) +{ + int err; + + err = rxe_prepare(av, pkt, skb); + if (err) + return err; + + if (pkt->mask & RXE_WRITE_OR_SEND_MASK) { + if (wqe->wr.send_flags & IB_SEND_INLINE) { + u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset]; + + memcpy(payload_addr(pkt), tmp, payload); + + wqe->dma.resid -= payload; + wqe->dma.sge_offset += payload; + } else { + err = copy_data(qp->pd, 0, &wqe->dma, + payload_addr(pkt), payload, + RXE_FROM_MR_OBJ); + if (err) + return err; + } + if (bth_pad(pkt)) { + u8 *pad = payload_addr(pkt) + payload; + + memset(pad, 0, bth_pad(pkt)); + } + } else if (pkt->mask & RXE_FLUSH_MASK) { + /* oA19-2: shall have no payload. */ + wqe->dma.resid = 0; + } + + if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { + memcpy(payload_addr(pkt), wqe->dma.atomic_wr, payload); + wqe->dma.resid -= payload; + } + + return 0; +} + +static void update_wqe_state(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt) +{ + if (pkt->mask & RXE_END_MASK) { + if (qp_type(qp) == IB_QPT_RC) + wqe->state = wqe_state_pending; + } else { + wqe->state = wqe_state_processing; + } +} + +static void update_wqe_psn(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt, + u32 payload) +{ + /* number of packets left to send including current one */ + int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu; + + /* handle zero length packet case */ + if (num_pkt == 0) + num_pkt = 1; + + if (pkt->mask & RXE_START_MASK) { + wqe->first_psn = qp->req.psn; + wqe->last_psn = (qp->req.psn + num_pkt - 1) & BTH_PSN_MASK; + } + + if (pkt->mask & RXE_READ_MASK) + qp->req.psn = (wqe->first_psn + num_pkt) & BTH_PSN_MASK; + else + qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; +} + +static void save_state(struct rxe_send_wqe *wqe, + struct rxe_qp *qp, + struct rxe_send_wqe *rollback_wqe, + u32 *rollback_psn) +{ + rollback_wqe->state = wqe->state; + rollback_wqe->first_psn = wqe->first_psn; + rollback_wqe->last_psn = wqe->last_psn; + rollback_wqe->dma = wqe->dma; + *rollback_psn = qp->req.psn; +} + +static void rollback_state(struct rxe_send_wqe *wqe, + struct rxe_qp *qp, + struct rxe_send_wqe *rollback_wqe, + u32 rollback_psn) +{ + wqe->state = rollback_wqe->state; + wqe->first_psn = rollback_wqe->first_psn; + wqe->last_psn = rollback_wqe->last_psn; + wqe->dma = rollback_wqe->dma; + qp->req.psn = rollback_psn; +} + +static void update_state(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + qp->req.opcode = pkt->opcode; + + if (pkt->mask & RXE_END_MASK) + qp->req.wqe_index = queue_next_index(qp->sq.queue, + qp->req.wqe_index); + + qp->need_req_skb = 0; + + if (qp->qp_timeout_jiffies && !timer_pending(&qp->retrans_timer)) + mod_timer(&qp->retrans_timer, + jiffies + qp->qp_timeout_jiffies); +} + +static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + u8 opcode = wqe->wr.opcode; + u32 rkey; + int ret; + + switch (opcode) { + case IB_WR_LOCAL_INV: + rkey = wqe->wr.ex.invalidate_rkey; + if (rkey_is_mw(rkey)) + ret = rxe_invalidate_mw(qp, rkey); + else + ret = rxe_invalidate_mr(qp, rkey); + + if (unlikely(ret)) { + wqe->status = IB_WC_LOC_QP_OP_ERR; + return ret; + } + break; + case IB_WR_REG_MR: + ret = rxe_reg_fast_mr(qp, wqe); + if (unlikely(ret)) { + wqe->status = IB_WC_LOC_QP_OP_ERR; + return ret; + } + break; + case IB_WR_BIND_MW: + ret = rxe_bind_mw(qp, wqe); + if (unlikely(ret)) { + wqe->status = IB_WC_MW_BIND_ERR; + return ret; + } + break; + default: + rxe_dbg_qp(qp, "Unexpected send wqe opcode %d\n", opcode); + wqe->status = IB_WC_LOC_QP_OP_ERR; + return -EINVAL; + } + + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); + + /* There is no ack coming for local work requests + * which can lead to a deadlock. So go ahead and complete + * it now. + */ + rxe_sched_task(&qp->comp.task); + + return 0; +} + +int rxe_requester(struct rxe_qp *qp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_pkt_info pkt; + struct sk_buff *skb; + struct rxe_send_wqe *wqe; + enum rxe_hdr_mask mask; + u32 payload; + int mtu; + int opcode; + int err; + int ret; + struct rxe_send_wqe rollback_wqe; + u32 rollback_psn; + struct rxe_queue *q = qp->sq.queue; + struct rxe_ah *ah; + struct rxe_av *av; + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + if (unlikely(!qp->valid)) { + spin_unlock_irqrestore(&qp->state_lock, flags); + goto exit; + } + + if (unlikely(qp_state(qp) == IB_QPS_ERR)) { + wqe = __req_next_wqe(qp); + spin_unlock_irqrestore(&qp->state_lock, flags); + if (wqe) + goto err; + else + goto exit; + } + + if (unlikely(qp_state(qp) == IB_QPS_RESET)) { + qp->req.wqe_index = queue_get_consumer(q, + QUEUE_TYPE_FROM_CLIENT); + qp->req.opcode = -1; + qp->req.need_rd_atomic = 0; + qp->req.wait_psn = 0; + qp->req.need_retry = 0; + qp->req.wait_for_rnr_timer = 0; + spin_unlock_irqrestore(&qp->state_lock, flags); + goto exit; + } + spin_unlock_irqrestore(&qp->state_lock, flags); + + /* we come here if the retransmit timer has fired + * or if the rnr timer has fired. If the retransmit + * timer fires while we are processing an RNR NAK wait + * until the rnr timer has fired before starting the + * retry flow + */ + if (unlikely(qp->req.need_retry && !qp->req.wait_for_rnr_timer)) { + req_retry(qp); + qp->req.need_retry = 0; + } + + wqe = req_next_wqe(qp); + if (unlikely(!wqe)) + goto exit; + + if (rxe_wqe_is_fenced(qp, wqe)) { + qp->req.wait_fence = 1; + goto exit; + } + + if (wqe->mask & WR_LOCAL_OP_MASK) { + err = rxe_do_local_ops(qp, wqe); + if (unlikely(err)) + goto err; + else + goto done; + } + + if (unlikely(qp_type(qp) == IB_QPT_RC && + psn_compare(qp->req.psn, (qp->comp.psn + + RXE_MAX_UNACKED_PSNS)) > 0)) { + qp->req.wait_psn = 1; + goto exit; + } + + /* Limit the number of inflight SKBs per QP */ + if (unlikely(atomic_read(&qp->skb_out) > + RXE_INFLIGHT_SKBS_PER_QP_HIGH)) { + qp->need_req_skb = 1; + goto exit; + } + + opcode = next_opcode(qp, wqe, wqe->wr.opcode); + if (unlikely(opcode < 0)) { + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err; + } + + mask = rxe_opcode[opcode].mask; + if (unlikely(mask & (RXE_READ_OR_ATOMIC_MASK | + RXE_ATOMIC_WRITE_MASK))) { + if (check_init_depth(qp, wqe)) + goto exit; + } + + mtu = get_mtu(qp); + payload = (mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) ? + wqe->dma.resid : 0; + if (payload > mtu) { + if (qp_type(qp) == IB_QPT_UD) { + /* C10-93.1.1: If the total sum of all the buffer lengths specified for a + * UD message exceeds the MTU of the port as returned by QueryHCA, the CI + * shall not emit any packets for this message. Further, the CI shall not + * generate an error due to this condition. + */ + + /* fake a successful UD send */ + wqe->first_psn = qp->req.psn; + wqe->last_psn = qp->req.psn; + qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; + qp->req.opcode = IB_OPCODE_UD_SEND_ONLY; + qp->req.wqe_index = queue_next_index(qp->sq.queue, + qp->req.wqe_index); + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + rxe_sched_task(&qp->comp.task); + goto done; + } + payload = mtu; + } + + pkt.rxe = rxe; + pkt.opcode = opcode; + pkt.qp = qp; + pkt.psn = qp->req.psn; + pkt.mask = rxe_opcode[opcode].mask; + pkt.wqe = wqe; + + /* save wqe state before we build and send packet */ + save_state(wqe, qp, &rollback_wqe, &rollback_psn); + + av = rxe_get_av(&pkt, &ah); + if (unlikely(!av)) { + rxe_dbg_qp(qp, "Failed no address vector\n"); + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err; + } + + skb = init_req_packet(qp, av, wqe, opcode, payload, &pkt); + if (unlikely(!skb)) { + rxe_dbg_qp(qp, "Failed allocating skb\n"); + wqe->status = IB_WC_LOC_QP_OP_ERR; + if (ah) + rxe_put(ah); + goto err; + } + + err = finish_packet(qp, av, wqe, &pkt, skb, payload); + if (unlikely(err)) { + rxe_dbg_qp(qp, "Error during finish packet\n"); + if (err == -EFAULT) + wqe->status = IB_WC_LOC_PROT_ERR; + else + wqe->status = IB_WC_LOC_QP_OP_ERR; + kfree_skb(skb); + if (ah) + rxe_put(ah); + goto err; + } + + if (ah) + rxe_put(ah); + + /* update wqe state as though we had sent it */ + update_wqe_state(qp, wqe, &pkt); + update_wqe_psn(qp, wqe, &pkt, payload); + + err = rxe_xmit_packet(qp, &pkt, skb); + if (err) { + if (err != -EAGAIN) { + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err; + } + + /* the packet was dropped so reset wqe to the state + * before we sent it so we can try to resend + */ + rollback_state(wqe, qp, &rollback_wqe, rollback_psn); + + /* force a delay until the dropped packet is freed and + * the send queue is drained below the low water mark + */ + qp->need_req_skb = 1; + + rxe_sched_task(&qp->req.task); + goto exit; + } + + update_state(qp, &pkt); + + /* A non-zero return value will cause rxe_do_task to + * exit its loop and end the work item. A zero return + * will continue looping and return to rxe_requester + */ +done: + ret = 0; + goto out; +err: + /* update wqe_index for each wqe completion */ + qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); + wqe->state = wqe_state_error; + rxe_qp_error(qp); +exit: + ret = -EAGAIN; +out: + return ret; +} diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c new file mode 100644 index 0000000000..da470a925e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -0,0 +1,1691 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +static char *resp_state_name[] = { + [RESPST_NONE] = "NONE", + [RESPST_GET_REQ] = "GET_REQ", + [RESPST_CHK_PSN] = "CHK_PSN", + [RESPST_CHK_OP_SEQ] = "CHK_OP_SEQ", + [RESPST_CHK_OP_VALID] = "CHK_OP_VALID", + [RESPST_CHK_RESOURCE] = "CHK_RESOURCE", + [RESPST_CHK_LENGTH] = "CHK_LENGTH", + [RESPST_CHK_RKEY] = "CHK_RKEY", + [RESPST_EXECUTE] = "EXECUTE", + [RESPST_READ_REPLY] = "READ_REPLY", + [RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY", + [RESPST_ATOMIC_WRITE_REPLY] = "ATOMIC_WRITE_REPLY", + [RESPST_PROCESS_FLUSH] = "PROCESS_FLUSH", + [RESPST_COMPLETE] = "COMPLETE", + [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", + [RESPST_CLEANUP] = "CLEANUP", + [RESPST_DUPLICATE_REQUEST] = "DUPLICATE_REQUEST", + [RESPST_ERR_MALFORMED_WQE] = "ERR_MALFORMED_WQE", + [RESPST_ERR_UNSUPPORTED_OPCODE] = "ERR_UNSUPPORTED_OPCODE", + [RESPST_ERR_MISALIGNED_ATOMIC] = "ERR_MISALIGNED_ATOMIC", + [RESPST_ERR_PSN_OUT_OF_SEQ] = "ERR_PSN_OUT_OF_SEQ", + [RESPST_ERR_MISSING_OPCODE_FIRST] = "ERR_MISSING_OPCODE_FIRST", + [RESPST_ERR_MISSING_OPCODE_LAST_C] = "ERR_MISSING_OPCODE_LAST_C", + [RESPST_ERR_MISSING_OPCODE_LAST_D1E] = "ERR_MISSING_OPCODE_LAST_D1E", + [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ] = "ERR_TOO_MANY_RDMA_ATM_REQ", + [RESPST_ERR_RNR] = "ERR_RNR", + [RESPST_ERR_RKEY_VIOLATION] = "ERR_RKEY_VIOLATION", + [RESPST_ERR_INVALIDATE_RKEY] = "ERR_INVALIDATE_RKEY_VIOLATION", + [RESPST_ERR_LENGTH] = "ERR_LENGTH", + [RESPST_ERR_CQ_OVERFLOW] = "ERR_CQ_OVERFLOW", + [RESPST_ERROR] = "ERROR", + [RESPST_DONE] = "DONE", + [RESPST_EXIT] = "EXIT", +}; + +/* rxe_recv calls here to add a request packet to the input queue */ +void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) +{ + int must_sched; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + skb_queue_tail(&qp->req_pkts, skb); + + must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) || + (skb_queue_len(&qp->req_pkts) > 1); + + if (must_sched) + rxe_sched_task(&qp->resp.task); + else + rxe_run_task(&qp->resp.task); +} + +static inline enum resp_states get_req(struct rxe_qp *qp, + struct rxe_pkt_info **pkt_p) +{ + struct sk_buff *skb; + + skb = skb_peek(&qp->req_pkts); + if (!skb) + return RESPST_EXIT; + + *pkt_p = SKB_TO_PKT(skb); + + return (qp->resp.res) ? RESPST_READ_REPLY : RESPST_CHK_PSN; +} + +static enum resp_states check_psn(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + int diff = psn_compare(pkt->psn, qp->resp.psn); + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + switch (qp_type(qp)) { + case IB_QPT_RC: + if (diff > 0) { + if (qp->resp.sent_psn_nak) + return RESPST_CLEANUP; + + qp->resp.sent_psn_nak = 1; + rxe_counter_inc(rxe, RXE_CNT_OUT_OF_SEQ_REQ); + return RESPST_ERR_PSN_OUT_OF_SEQ; + + } else if (diff < 0) { + rxe_counter_inc(rxe, RXE_CNT_DUP_REQ); + return RESPST_DUPLICATE_REQUEST; + } + + if (qp->resp.sent_psn_nak) + qp->resp.sent_psn_nak = 0; + + break; + + case IB_QPT_UC: + if (qp->resp.drop_msg || diff != 0) { + if (pkt->mask & RXE_START_MASK) { + qp->resp.drop_msg = 0; + return RESPST_CHK_OP_SEQ; + } + + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + } + break; + default: + break; + } + + return RESPST_CHK_OP_SEQ; +} + +static enum resp_states check_op_seq(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + switch (qp->resp.opcode) { + case IB_OPCODE_RC_SEND_FIRST: + case IB_OPCODE_RC_SEND_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_RC_SEND_MIDDLE: + case IB_OPCODE_RC_SEND_LAST: + case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_C; + } + + case IB_OPCODE_RC_RDMA_WRITE_FIRST: + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_RC_RDMA_WRITE_LAST: + case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_C; + } + + default: + switch (pkt->opcode) { + case IB_OPCODE_RC_SEND_MIDDLE: + case IB_OPCODE_RC_SEND_LAST: + case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_RC_RDMA_WRITE_LAST: + case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_ERR_MISSING_OPCODE_FIRST; + default: + return RESPST_CHK_OP_VALID; + } + } + break; + + case IB_QPT_UC: + switch (qp->resp.opcode) { + case IB_OPCODE_UC_SEND_FIRST: + case IB_OPCODE_UC_SEND_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_UC_SEND_MIDDLE: + case IB_OPCODE_UC_SEND_LAST: + case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_D1E; + } + + case IB_OPCODE_UC_RDMA_WRITE_FIRST: + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_UC_RDMA_WRITE_LAST: + case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_D1E; + } + + default: + switch (pkt->opcode) { + case IB_OPCODE_UC_SEND_MIDDLE: + case IB_OPCODE_UC_SEND_LAST: + case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_UC_RDMA_WRITE_LAST: + case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + default: + return RESPST_CHK_OP_VALID; + } + } + break; + + default: + return RESPST_CHK_OP_VALID; + } +} + +static bool check_qp_attr_access(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + if (((pkt->mask & RXE_READ_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || + ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || + ((pkt->mask & RXE_ATOMIC_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + return false; + + if (pkt->mask & RXE_FLUSH_MASK) { + u32 flush_type = feth_plt(pkt); + + if ((flush_type & IB_FLUSH_GLOBAL && + !(qp->attr.qp_access_flags & IB_ACCESS_FLUSH_GLOBAL)) || + (flush_type & IB_FLUSH_PERSISTENT && + !(qp->attr.qp_access_flags & IB_ACCESS_FLUSH_PERSISTENT))) + return false; + } + + return true; +} + +static enum resp_states check_op_valid(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + if (!check_qp_attr_access(qp, pkt)) + return RESPST_ERR_UNSUPPORTED_OPCODE; + + break; + + case IB_QPT_UC: + if ((pkt->mask & RXE_WRITE_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) { + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + } + + break; + + case IB_QPT_UD: + case IB_QPT_GSI: + break; + + default: + WARN_ON_ONCE(1); + break; + } + + return RESPST_CHK_RESOURCE; +} + +static enum resp_states get_srq_wqe(struct rxe_qp *qp) +{ + struct rxe_srq *srq = qp->srq; + struct rxe_queue *q = srq->rq.queue; + struct rxe_recv_wqe *wqe; + struct ib_event ev; + unsigned int count; + size_t size; + unsigned long flags; + + if (srq->error) + return RESPST_ERR_RNR; + + spin_lock_irqsave(&srq->rq.consumer_lock, flags); + + wqe = queue_head(q, QUEUE_TYPE_FROM_CLIENT); + if (!wqe) { + spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); + return RESPST_ERR_RNR; + } + + /* don't trust user space data */ + if (unlikely(wqe->dma.num_sge > srq->rq.max_sge)) { + spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); + rxe_dbg_qp(qp, "invalid num_sge in SRQ entry\n"); + return RESPST_ERR_MALFORMED_WQE; + } + size = sizeof(*wqe) + wqe->dma.num_sge*sizeof(struct rxe_sge); + memcpy(&qp->resp.srq_wqe, wqe, size); + + qp->resp.wqe = &qp->resp.srq_wqe.wqe; + queue_advance_consumer(q, QUEUE_TYPE_FROM_CLIENT); + count = queue_count(q, QUEUE_TYPE_FROM_CLIENT); + + if (srq->limit && srq->ibsrq.event_handler && (count < srq->limit)) { + srq->limit = 0; + goto event; + } + + spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); + return RESPST_CHK_LENGTH; + +event: + spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context); + return RESPST_CHK_LENGTH; +} + +static enum resp_states check_resource(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_srq *srq = qp->srq; + + if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) { + /* it is the requesters job to not send + * too many read/atomic ops, we just + * recycle the responder resource queue + */ + if (likely(qp->attr.max_dest_rd_atomic > 0)) + return RESPST_CHK_LENGTH; + else + return RESPST_ERR_TOO_MANY_RDMA_ATM_REQ; + } + + if (pkt->mask & RXE_RWR_MASK) { + if (srq) + return get_srq_wqe(qp); + + qp->resp.wqe = queue_head(qp->rq.queue, + QUEUE_TYPE_FROM_CLIENT); + return (qp->resp.wqe) ? RESPST_CHK_LENGTH : RESPST_ERR_RNR; + } + + return RESPST_CHK_LENGTH; +} + +static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + /* + * See IBA C9-92 + * For UD QPs we only check if the packet will fit in the + * receive buffer later. For rmda operations additional + * length checks are performed in check_rkey. + */ + if (pkt->mask & RXE_PAYLOAD_MASK && ((qp_type(qp) == IB_QPT_RC) || + (qp_type(qp) == IB_QPT_UC))) { + unsigned int mtu = qp->mtu; + unsigned int payload = payload_size(pkt); + + if ((pkt->mask & RXE_START_MASK) && + (pkt->mask & RXE_END_MASK)) { + if (unlikely(payload > mtu)) { + rxe_dbg_qp(qp, "only packet too long"); + return RESPST_ERR_LENGTH; + } + } else if ((pkt->mask & RXE_START_MASK) || + (pkt->mask & RXE_MIDDLE_MASK)) { + if (unlikely(payload != mtu)) { + rxe_dbg_qp(qp, "first or middle packet not mtu"); + return RESPST_ERR_LENGTH; + } + } else if (pkt->mask & RXE_END_MASK) { + if (unlikely((payload == 0) || (payload > mtu))) { + rxe_dbg_qp(qp, "last packet zero or too long"); + return RESPST_ERR_LENGTH; + } + } + } + + /* See IBA C9-94 */ + if (pkt->mask & RXE_RETH_MASK) { + if (reth_len(pkt) > (1U << 31)) { + rxe_dbg_qp(qp, "dma length too long"); + return RESPST_ERR_LENGTH; + } + } + + if (pkt->mask & RXE_RDMA_OP_MASK) + return RESPST_CHK_RKEY; + else + return RESPST_EXECUTE; +} + +/* if the reth length field is zero we can assume nothing + * about the rkey value and should not validate or use it. + * Instead set qp->resp.rkey to 0 which is an invalid rkey + * value since the minimum index part is 1. + */ +static void qp_resp_from_reth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + unsigned int length = reth_len(pkt); + + qp->resp.va = reth_va(pkt); + qp->resp.offset = 0; + qp->resp.resid = length; + qp->resp.length = length; + if (pkt->mask & RXE_READ_OR_WRITE_MASK && length == 0) + qp->resp.rkey = 0; + else + qp->resp.rkey = reth_rkey(pkt); +} + +static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + qp->resp.va = atmeth_va(pkt); + qp->resp.offset = 0; + qp->resp.rkey = atmeth_rkey(pkt); + qp->resp.resid = sizeof(u64); +} + +/* resolve the packet rkey to qp->resp.mr or set qp->resp.mr to NULL + * if an invalid rkey is received or the rdma length is zero. For middle + * or last packets use the stored value of mr. + */ +static enum resp_states check_rkey(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_mr *mr = NULL; + struct rxe_mw *mw = NULL; + u64 va; + u32 rkey; + u32 resid; + u32 pktlen; + int mtu = qp->mtu; + enum resp_states state; + int access = 0; + + /* parse RETH or ATMETH header for first/only packets + * for va, length, rkey, etc. or use current value for + * middle/last packets. + */ + if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { + if (pkt->mask & RXE_RETH_MASK) + qp_resp_from_reth(qp, pkt); + + access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ + : IB_ACCESS_REMOTE_WRITE; + } else if (pkt->mask & RXE_FLUSH_MASK) { + u32 flush_type = feth_plt(pkt); + + if (pkt->mask & RXE_RETH_MASK) + qp_resp_from_reth(qp, pkt); + + if (flush_type & IB_FLUSH_GLOBAL) + access |= IB_ACCESS_FLUSH_GLOBAL; + if (flush_type & IB_FLUSH_PERSISTENT) + access |= IB_ACCESS_FLUSH_PERSISTENT; + } else if (pkt->mask & RXE_ATOMIC_MASK) { + qp_resp_from_atmeth(qp, pkt); + access = IB_ACCESS_REMOTE_ATOMIC; + } else { + /* shouldn't happen */ + WARN_ON(1); + } + + /* A zero-byte read or write op is not required to + * set an addr or rkey. See C9-88 + */ + if ((pkt->mask & RXE_READ_OR_WRITE_MASK) && + (pkt->mask & RXE_RETH_MASK) && reth_len(pkt) == 0) { + qp->resp.mr = NULL; + return RESPST_EXECUTE; + } + + va = qp->resp.va; + rkey = qp->resp.rkey; + resid = qp->resp.resid; + pktlen = payload_size(pkt); + + if (rkey_is_mw(rkey)) { + mw = rxe_lookup_mw(qp, access, rkey); + if (!mw) { + rxe_dbg_qp(qp, "no MW matches rkey %#x\n", rkey); + state = RESPST_ERR_RKEY_VIOLATION; + goto err; + } + + mr = mw->mr; + if (!mr) { + rxe_dbg_qp(qp, "MW doesn't have an MR\n"); + state = RESPST_ERR_RKEY_VIOLATION; + goto err; + } + + if (mw->access & IB_ZERO_BASED) + qp->resp.offset = mw->addr; + + rxe_get(mr); + rxe_put(mw); + mw = NULL; + } else { + mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); + if (!mr) { + rxe_dbg_qp(qp, "no MR matches rkey %#x\n", rkey); + state = RESPST_ERR_RKEY_VIOLATION; + goto err; + } + } + + if (pkt->mask & RXE_FLUSH_MASK) { + /* FLUSH MR may not set va or resid + * no need to check range since we will flush whole mr + */ + if (feth_sel(pkt) == IB_FLUSH_MR) + goto skip_check_range; + } + + if (mr_check_range(mr, va + qp->resp.offset, resid)) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err; + } + +skip_check_range: + if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { + if (resid > mtu) { + if (pktlen != mtu || bth_pad(pkt)) { + state = RESPST_ERR_LENGTH; + goto err; + } + } else { + if (pktlen != resid) { + state = RESPST_ERR_LENGTH; + goto err; + } + if ((bth_pad(pkt) != (0x3 & (-resid)))) { + /* This case may not be exactly that + * but nothing else fits. + */ + state = RESPST_ERR_LENGTH; + goto err; + } + } + } + + WARN_ON_ONCE(qp->resp.mr); + + qp->resp.mr = mr; + return RESPST_EXECUTE; + +err: + qp->resp.mr = NULL; + if (mr) + rxe_put(mr); + if (mw) + rxe_put(mw); + + return state; +} + +static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr, + int data_len) +{ + int err; + + err = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma, + data_addr, data_len, RXE_TO_MR_OBJ); + if (unlikely(err)) + return (err == -ENOSPC) ? RESPST_ERR_LENGTH + : RESPST_ERR_MALFORMED_WQE; + + return RESPST_NONE; +} + +static enum resp_states write_data_in(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + enum resp_states rc = RESPST_NONE; + int err; + int data_len = payload_size(pkt); + + err = rxe_mr_copy(qp->resp.mr, qp->resp.va + qp->resp.offset, + payload_addr(pkt), data_len, RXE_TO_MR_OBJ); + if (err) { + rc = RESPST_ERR_RKEY_VIOLATION; + goto out; + } + + qp->resp.va += data_len; + qp->resp.resid -= data_len; + +out: + return rc; +} + +static struct resp_res *rxe_prepare_res(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + int type) +{ + struct resp_res *res; + u32 pkts; + + res = &qp->resp.resources[qp->resp.res_head]; + rxe_advance_resp_resource(qp); + free_rd_atomic_resource(res); + + res->type = type; + res->replay = 0; + + switch (type) { + case RXE_READ_MASK: + res->read.va = qp->resp.va + qp->resp.offset; + res->read.va_org = qp->resp.va + qp->resp.offset; + res->read.resid = qp->resp.resid; + res->read.length = qp->resp.resid; + res->read.rkey = qp->resp.rkey; + + pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1); + res->first_psn = pkt->psn; + res->cur_psn = pkt->psn; + res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK; + + res->state = rdatm_res_state_new; + break; + case RXE_ATOMIC_MASK: + case RXE_ATOMIC_WRITE_MASK: + res->first_psn = pkt->psn; + res->last_psn = pkt->psn; + res->cur_psn = pkt->psn; + break; + case RXE_FLUSH_MASK: + res->flush.va = qp->resp.va + qp->resp.offset; + res->flush.length = qp->resp.length; + res->flush.type = feth_plt(pkt); + res->flush.level = feth_sel(pkt); + } + + return res; +} + +static enum resp_states process_flush(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + u64 length, start; + struct rxe_mr *mr = qp->resp.mr; + struct resp_res *res = qp->resp.res; + + /* oA19-14, oA19-15 */ + if (res && res->replay) + return RESPST_ACKNOWLEDGE; + else if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_FLUSH_MASK); + qp->resp.res = res; + } + + if (res->flush.level == IB_FLUSH_RANGE) { + start = res->flush.va; + length = res->flush.length; + } else { /* level == IB_FLUSH_MR */ + start = mr->ibmr.iova; + length = mr->ibmr.length; + } + + if (res->flush.type & IB_FLUSH_PERSISTENT) { + if (rxe_flush_pmem_iova(mr, start, length)) + return RESPST_ERR_RKEY_VIOLATION; + /* Make data persistent. */ + wmb(); + } else if (res->flush.type & IB_FLUSH_GLOBAL) { + /* Make data global visibility. */ + wmb(); + } + + qp->resp.msn++; + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + return RESPST_ACKNOWLEDGE; +} + +static enum resp_states atomic_reply(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_mr *mr = qp->resp.mr; + struct resp_res *res = qp->resp.res; + int err; + + if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK); + qp->resp.res = res; + } + + if (!res->replay) { + u64 iova = qp->resp.va + qp->resp.offset; + + err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, + atmeth_comp(pkt), + atmeth_swap_add(pkt), + &res->atomic.orig_val); + if (err) + return err; + + qp->resp.msn++; + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + } + + return RESPST_ACKNOWLEDGE; +} + +static enum resp_states atomic_write_reply(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct resp_res *res = qp->resp.res; + struct rxe_mr *mr; + u64 value; + u64 iova; + int err; + + if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK); + qp->resp.res = res; + } + + if (res->replay) + return RESPST_ACKNOWLEDGE; + + mr = qp->resp.mr; + value = *(u64 *)payload_addr(pkt); + iova = qp->resp.va + qp->resp.offset; + + err = rxe_mr_do_atomic_write(mr, iova, value); + if (err) + return err; + + qp->resp.resid = 0; + qp->resp.msn++; + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + return RESPST_ACKNOWLEDGE; +} + +static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, + struct rxe_pkt_info *ack, + int opcode, + int payload, + u32 psn, + u8 syndrome) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct sk_buff *skb; + int paylen; + int pad; + int err; + + /* + * allocate packet + */ + pad = (-payload) & 0x3; + paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; + + skb = rxe_init_packet(rxe, &qp->pri_av, paylen, ack); + if (!skb) + return NULL; + + ack->qp = qp; + ack->opcode = opcode; + ack->mask = rxe_opcode[opcode].mask; + ack->paylen = paylen; + ack->psn = psn; + + bth_init(ack, opcode, 0, 0, pad, IB_DEFAULT_PKEY_FULL, + qp->attr.dest_qp_num, 0, psn); + + if (ack->mask & RXE_AETH_MASK) { + aeth_set_syn(ack, syndrome); + aeth_set_msn(ack, qp->resp.msn); + } + + if (ack->mask & RXE_ATMACK_MASK) + atmack_set_orig(ack, qp->resp.res->atomic.orig_val); + + err = rxe_prepare(&qp->pri_av, ack, skb); + if (err) { + kfree_skb(skb); + return NULL; + } + + return skb; +} + +/** + * rxe_recheck_mr - revalidate MR from rkey and get a reference + * @qp: the qp + * @rkey: the rkey + * + * This code allows the MR to be invalidated or deregistered or + * the MW if one was used to be invalidated or deallocated. + * It is assumed that the access permissions if originally good + * are OK and the mappings to be unchanged. + * + * TODO: If someone reregisters an MR to change its size or + * access permissions during the processing of an RDMA read + * we should kill the responder resource and complete the + * operation with an error. + * + * Return: mr on success else NULL + */ +static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_mr *mr; + struct rxe_mw *mw; + + if (rkey_is_mw(rkey)) { + mw = rxe_pool_get_index(&rxe->mw_pool, rkey >> 8); + if (!mw) + return NULL; + + mr = mw->mr; + if (mw->rkey != rkey || mw->state != RXE_MW_STATE_VALID || + !mr || mr->state != RXE_MR_STATE_VALID) { + rxe_put(mw); + return NULL; + } + + rxe_get(mr); + rxe_put(mw); + + return mr; + } + + mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8); + if (!mr) + return NULL; + + if (mr->rkey != rkey || mr->state != RXE_MR_STATE_VALID) { + rxe_put(mr); + return NULL; + } + + return mr; +} + +/* RDMA read response. If res is not NULL, then we have a current RDMA request + * being processed or replayed. + */ +static enum resp_states read_reply(struct rxe_qp *qp, + struct rxe_pkt_info *req_pkt) +{ + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + int mtu = qp->mtu; + enum resp_states state; + int payload; + int opcode; + int err; + struct resp_res *res = qp->resp.res; + struct rxe_mr *mr; + + if (!res) { + res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK); + qp->resp.res = res; + } + + if (res->state == rdatm_res_state_new) { + if (!res->replay || qp->resp.length == 0) { + /* if length == 0 mr will be NULL (is ok) + * otherwise qp->resp.mr holds a ref on mr + * which we transfer to mr and drop below. + */ + mr = qp->resp.mr; + qp->resp.mr = NULL; + } else { + mr = rxe_recheck_mr(qp, res->read.rkey); + if (!mr) + return RESPST_ERR_RKEY_VIOLATION; + } + + if (res->read.resid <= mtu) + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY; + else + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; + } else { + /* re-lookup mr from rkey on all later packets. + * length will be non-zero. This can fail if someone + * modifies or destroys the mr since the first packet. + */ + mr = rxe_recheck_mr(qp, res->read.rkey); + if (!mr) + return RESPST_ERR_RKEY_VIOLATION; + + if (res->read.resid > mtu) + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE; + else + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST; + } + + res->state = rdatm_res_state_next; + + payload = min_t(int, res->read.resid, mtu); + + skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, + res->cur_psn, AETH_ACK_UNLIMITED); + if (!skb) { + state = RESPST_ERR_RNR; + goto err_out; + } + + err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), + payload, RXE_FROM_MR_OBJ); + if (err) { + kfree_skb(skb); + state = RESPST_ERR_RKEY_VIOLATION; + goto err_out; + } + + if (bth_pad(&ack_pkt)) { + u8 *pad = payload_addr(&ack_pkt) + payload; + + memset(pad, 0, bth_pad(&ack_pkt)); + } + + /* rxe_xmit_packet always consumes the skb */ + err = rxe_xmit_packet(qp, &ack_pkt, skb); + if (err) { + state = RESPST_ERR_RNR; + goto err_out; + } + + res->read.va += payload; + res->read.resid -= payload; + res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK; + + if (res->read.resid > 0) { + state = RESPST_DONE; + } else { + qp->resp.res = NULL; + if (!res->replay) + qp->resp.opcode = -1; + if (psn_compare(res->cur_psn, qp->resp.psn) >= 0) + qp->resp.psn = res->cur_psn; + state = RESPST_CLEANUP; + } + +err_out: + if (mr) + rxe_put(mr); + return state; +} + +static int invalidate_rkey(struct rxe_qp *qp, u32 rkey) +{ + if (rkey_is_mw(rkey)) + return rxe_invalidate_mw(qp, rkey); + else + return rxe_invalidate_mr(qp, rkey); +} + +/* Executes a new request. A retried request never reach that function (send + * and writes are discarded, and reads and atomics are retried elsewhere. + */ +static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + enum resp_states err; + struct sk_buff *skb = PKT_TO_SKB(pkt); + union rdma_network_hdr hdr; + + if (pkt->mask & RXE_SEND_MASK) { + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_GSI) { + if (skb->protocol == htons(ETH_P_IP)) { + memset(&hdr.reserved, 0, + sizeof(hdr.reserved)); + memcpy(&hdr.roce4grh, ip_hdr(skb), + sizeof(hdr.roce4grh)); + err = send_data_in(qp, &hdr, sizeof(hdr)); + } else { + err = send_data_in(qp, ipv6_hdr(skb), + sizeof(hdr)); + } + if (err) + return err; + } + err = send_data_in(qp, payload_addr(pkt), payload_size(pkt)); + if (err) + return err; + } else if (pkt->mask & RXE_WRITE_MASK) { + err = write_data_in(qp, pkt); + if (err) + return err; + } else if (pkt->mask & RXE_READ_MASK) { + /* For RDMA Read we can increment the msn now. See C9-148. */ + qp->resp.msn++; + return RESPST_READ_REPLY; + } else if (pkt->mask & RXE_ATOMIC_MASK) { + return RESPST_ATOMIC_REPLY; + } else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { + return RESPST_ATOMIC_WRITE_REPLY; + } else if (pkt->mask & RXE_FLUSH_MASK) { + return RESPST_PROCESS_FLUSH; + } else { + /* Unreachable */ + WARN_ON_ONCE(1); + } + + if (pkt->mask & RXE_IETH_MASK) { + u32 rkey = ieth_rkey(pkt); + + err = invalidate_rkey(qp, rkey); + if (err) + return RESPST_ERR_INVALIDATE_RKEY; + } + + if (pkt->mask & RXE_END_MASK) + /* We successfully processed this new request. */ + qp->resp.msn++; + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + if (pkt->mask & RXE_COMP_MASK) + return RESPST_COMPLETE; + else if (qp_type(qp) == IB_QPT_RC) + return RESPST_ACKNOWLEDGE; + else + return RESPST_CLEANUP; +} + +static enum resp_states do_complete(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_cqe cqe; + struct ib_wc *wc = &cqe.ibwc; + struct ib_uverbs_wc *uwc = &cqe.uibwc; + struct rxe_recv_wqe *wqe = qp->resp.wqe; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + unsigned long flags; + + if (!wqe) + goto finish; + + memset(&cqe, 0, sizeof(cqe)); + + if (qp->rcq->is_user) { + uwc->status = qp->resp.status; + uwc->qp_num = qp->ibqp.qp_num; + uwc->wr_id = wqe->wr_id; + } else { + wc->status = qp->resp.status; + wc->qp = &qp->ibqp; + wc->wr_id = wqe->wr_id; + } + + if (wc->status == IB_WC_SUCCESS) { + rxe_counter_inc(rxe, RXE_CNT_RDMA_RECV); + wc->opcode = (pkt->mask & RXE_IMMDT_MASK && + pkt->mask & RXE_WRITE_MASK) ? + IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; + wc->byte_len = (pkt->mask & RXE_IMMDT_MASK && + pkt->mask & RXE_WRITE_MASK) ? + qp->resp.length : wqe->dma.length - wqe->dma.resid; + + /* fields after byte_len are different between kernel and user + * space + */ + if (qp->rcq->is_user) { + uwc->wc_flags = IB_WC_GRH; + + if (pkt->mask & RXE_IMMDT_MASK) { + uwc->wc_flags |= IB_WC_WITH_IMM; + uwc->ex.imm_data = immdt_imm(pkt); + } + + if (pkt->mask & RXE_IETH_MASK) { + uwc->wc_flags |= IB_WC_WITH_INVALIDATE; + uwc->ex.invalidate_rkey = ieth_rkey(pkt); + } + + if (pkt->mask & RXE_DETH_MASK) + uwc->src_qp = deth_sqp(pkt); + + uwc->port_num = qp->attr.port_num; + } else { + struct sk_buff *skb = PKT_TO_SKB(pkt); + + wc->wc_flags = IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE; + if (skb->protocol == htons(ETH_P_IP)) + wc->network_hdr_type = RDMA_NETWORK_IPV4; + else + wc->network_hdr_type = RDMA_NETWORK_IPV6; + + if (is_vlan_dev(skb->dev)) { + wc->wc_flags |= IB_WC_WITH_VLAN; + wc->vlan_id = vlan_dev_vlan_id(skb->dev); + } + + if (pkt->mask & RXE_IMMDT_MASK) { + wc->wc_flags |= IB_WC_WITH_IMM; + wc->ex.imm_data = immdt_imm(pkt); + } + + if (pkt->mask & RXE_IETH_MASK) { + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = ieth_rkey(pkt); + } + + if (pkt->mask & RXE_DETH_MASK) + wc->src_qp = deth_sqp(pkt); + + wc->port_num = qp->attr.port_num; + } + } else { + if (wc->status != IB_WC_WR_FLUSH_ERR) + rxe_err_qp(qp, "non-flush error status = %d", + wc->status); + } + + /* have copy for srq and reference for !srq */ + if (!qp->srq) + queue_advance_consumer(qp->rq.queue, QUEUE_TYPE_FROM_CLIENT); + + qp->resp.wqe = NULL; + + if (rxe_cq_post(qp->rcq, &cqe, pkt ? bth_se(pkt) : 1)) + return RESPST_ERR_CQ_OVERFLOW; + +finish: + spin_lock_irqsave(&qp->state_lock, flags); + if (unlikely(qp_state(qp) == IB_QPS_ERR)) { + spin_unlock_irqrestore(&qp->state_lock, flags); + return RESPST_CHK_RESOURCE; + } + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (unlikely(!pkt)) + return RESPST_DONE; + if (qp_type(qp) == IB_QPT_RC) + return RESPST_ACKNOWLEDGE; + else + return RESPST_CLEANUP; +} + + +static int send_common_ack(struct rxe_qp *qp, u8 syndrome, u32 psn, + int opcode, const char *msg) +{ + int err; + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + + skb = prepare_ack_packet(qp, &ack_pkt, opcode, 0, psn, syndrome); + if (!skb) + return -ENOMEM; + + err = rxe_xmit_packet(qp, &ack_pkt, skb); + if (err) + rxe_dbg_qp(qp, "Failed sending %s\n", msg); + + return err; +} + +static int send_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) +{ + return send_common_ack(qp, syndrome, psn, + IB_OPCODE_RC_ACKNOWLEDGE, "ACK"); +} + +static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) +{ + int ret = send_common_ack(qp, syndrome, psn, + IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, "ATOMIC ACK"); + + /* have to clear this since it is used to trigger + * long read replies + */ + qp->resp.res = NULL; + return ret; +} + +static int send_read_response_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) +{ + int ret = send_common_ack(qp, syndrome, psn, + IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY, + "RDMA READ response of length zero ACK"); + + /* have to clear this since it is used to trigger + * long read replies + */ + qp->resp.res = NULL; + return ret; +} + +static enum resp_states acknowledge(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + if (qp_type(qp) != IB_QPT_RC) + return RESPST_CLEANUP; + + if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED) + send_ack(qp, qp->resp.aeth_syndrome, pkt->psn); + else if (pkt->mask & RXE_ATOMIC_MASK) + send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); + else if (pkt->mask & (RXE_FLUSH_MASK | RXE_ATOMIC_WRITE_MASK)) + send_read_response_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); + else if (bth_ack(pkt)) + send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); + + return RESPST_CLEANUP; +} + +static enum resp_states cleanup(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct sk_buff *skb; + + if (pkt) { + skb = skb_dequeue(&qp->req_pkts); + rxe_put(qp); + kfree_skb(skb); + ib_device_put(qp->ibqp.device); + } + + if (qp->resp.mr) { + rxe_put(qp->resp.mr); + qp->resp.mr = NULL; + } + + return RESPST_DONE; +} + +static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn) +{ + int i; + + for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { + struct resp_res *res = &qp->resp.resources[i]; + + if (res->type == 0) + continue; + + if (psn_compare(psn, res->first_psn) >= 0 && + psn_compare(psn, res->last_psn) <= 0) { + return res; + } + } + + return NULL; +} + +static enum resp_states duplicate_request(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + enum resp_states rc; + u32 prev_psn = (qp->resp.ack_psn - 1) & BTH_PSN_MASK; + + if (pkt->mask & RXE_SEND_MASK || + pkt->mask & RXE_WRITE_MASK) { + /* SEND. Ack again and cleanup. C9-105. */ + send_ack(qp, AETH_ACK_UNLIMITED, prev_psn); + return RESPST_CLEANUP; + } else if (pkt->mask & RXE_FLUSH_MASK) { + struct resp_res *res; + + /* Find the operation in our list of responder resources. */ + res = find_resource(qp, pkt->psn); + if (res) { + res->replay = 1; + res->cur_psn = pkt->psn; + qp->resp.res = res; + rc = RESPST_PROCESS_FLUSH; + goto out; + } + + /* Resource not found. Class D error. Drop the request. */ + rc = RESPST_CLEANUP; + goto out; + } else if (pkt->mask & RXE_READ_MASK) { + struct resp_res *res; + + res = find_resource(qp, pkt->psn); + if (!res) { + /* Resource not found. Class D error. Drop the + * request. + */ + rc = RESPST_CLEANUP; + goto out; + } else { + /* Ensure this new request is the same as the previous + * one or a subset of it. + */ + u64 iova = reth_va(pkt); + u32 resid = reth_len(pkt); + + if (iova < res->read.va_org || + resid > res->read.length || + (iova + resid) > (res->read.va_org + + res->read.length)) { + rc = RESPST_CLEANUP; + goto out; + } + + if (reth_rkey(pkt) != res->read.rkey) { + rc = RESPST_CLEANUP; + goto out; + } + + res->cur_psn = pkt->psn; + res->state = (pkt->psn == res->first_psn) ? + rdatm_res_state_new : + rdatm_res_state_replay; + res->replay = 1; + + /* Reset the resource, except length. */ + res->read.va_org = iova; + res->read.va = iova; + res->read.resid = resid; + + /* Replay the RDMA read reply. */ + qp->resp.res = res; + rc = RESPST_READ_REPLY; + goto out; + } + } else { + struct resp_res *res; + + /* Find the operation in our list of responder resources. */ + res = find_resource(qp, pkt->psn); + if (res) { + res->replay = 1; + res->cur_psn = pkt->psn; + qp->resp.res = res; + rc = pkt->mask & RXE_ATOMIC_MASK ? + RESPST_ATOMIC_REPLY : + RESPST_ATOMIC_WRITE_REPLY; + goto out; + } + + /* Resource not found. Class D error. Drop the request. */ + rc = RESPST_CLEANUP; + goto out; + } +out: + return rc; +} + +/* Process a class A or C. Both are treated the same in this implementation. */ +static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome, + enum ib_wc_status status) +{ + qp->resp.aeth_syndrome = syndrome; + qp->resp.status = status; + + /* indicate that we should go through the ERROR state */ + qp->resp.goto_error = 1; +} + +static enum resp_states do_class_d1e_error(struct rxe_qp *qp) +{ + /* UC */ + if (qp->srq) { + /* Class E */ + qp->resp.drop_msg = 1; + if (qp->resp.wqe) { + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + return RESPST_COMPLETE; + } else { + return RESPST_CLEANUP; + } + } else { + /* Class D1. This packet may be the start of a + * new message and could be valid. The previous + * message is invalid and ignored. reset the + * recv wr to its original state + */ + if (qp->resp.wqe) { + qp->resp.wqe->dma.resid = qp->resp.wqe->dma.length; + qp->resp.wqe->dma.cur_sge = 0; + qp->resp.wqe->dma.sge_offset = 0; + qp->resp.opcode = -1; + } + + if (qp->resp.mr) { + rxe_put(qp->resp.mr); + qp->resp.mr = NULL; + } + + return RESPST_CLEANUP; + } +} + +/* drain incoming request packet queue */ +static void drain_req_pkts(struct rxe_qp *qp) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&qp->req_pkts))) { + rxe_put(qp); + kfree_skb(skb); + ib_device_put(qp->ibqp.device); + } +} + +/* complete receive wqe with flush error */ +static int flush_recv_wqe(struct rxe_qp *qp, struct rxe_recv_wqe *wqe) +{ + struct rxe_cqe cqe = {}; + struct ib_wc *wc = &cqe.ibwc; + struct ib_uverbs_wc *uwc = &cqe.uibwc; + int err; + + if (qp->rcq->is_user) { + uwc->wr_id = wqe->wr_id; + uwc->status = IB_WC_WR_FLUSH_ERR; + uwc->qp_num = qp_num(qp); + } else { + wc->wr_id = wqe->wr_id; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->qp = &qp->ibqp; + } + + err = rxe_cq_post(qp->rcq, &cqe, 0); + if (err) + rxe_dbg_cq(qp->rcq, "post cq failed err = %d", err); + + return err; +} + +/* drain and optionally complete the recive queue + * if unable to complete a wqe stop completing and + * just flush the remaining wqes + */ +static void flush_recv_queue(struct rxe_qp *qp, bool notify) +{ + struct rxe_queue *q = qp->rq.queue; + struct rxe_recv_wqe *wqe; + int err; + + if (qp->srq) { + if (notify && qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + return; + } + + /* recv queue not created. nothing to do. */ + if (!qp->rq.queue) + return; + + while ((wqe = queue_head(q, q->type))) { + if (notify) { + err = flush_recv_wqe(qp, wqe); + if (err) + notify = 0; + } + queue_advance_consumer(q, q->type); + } + + qp->resp.wqe = NULL; +} + +int rxe_responder(struct rxe_qp *qp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + enum resp_states state; + struct rxe_pkt_info *pkt = NULL; + int ret; + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + if (!qp->valid || qp_state(qp) == IB_QPS_ERR || + qp_state(qp) == IB_QPS_RESET) { + bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); + + drain_req_pkts(qp); + flush_recv_queue(qp, notify); + spin_unlock_irqrestore(&qp->state_lock, flags); + goto exit; + } + spin_unlock_irqrestore(&qp->state_lock, flags); + + qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; + + state = RESPST_GET_REQ; + + while (1) { + rxe_dbg_qp(qp, "state = %s\n", resp_state_name[state]); + switch (state) { + case RESPST_GET_REQ: + state = get_req(qp, &pkt); + break; + case RESPST_CHK_PSN: + state = check_psn(qp, pkt); + break; + case RESPST_CHK_OP_SEQ: + state = check_op_seq(qp, pkt); + break; + case RESPST_CHK_OP_VALID: + state = check_op_valid(qp, pkt); + break; + case RESPST_CHK_RESOURCE: + state = check_resource(qp, pkt); + break; + case RESPST_CHK_LENGTH: + state = rxe_resp_check_length(qp, pkt); + break; + case RESPST_CHK_RKEY: + state = check_rkey(qp, pkt); + break; + case RESPST_EXECUTE: + state = execute(qp, pkt); + break; + case RESPST_COMPLETE: + state = do_complete(qp, pkt); + break; + case RESPST_READ_REPLY: + state = read_reply(qp, pkt); + break; + case RESPST_ATOMIC_REPLY: + state = atomic_reply(qp, pkt); + break; + case RESPST_ATOMIC_WRITE_REPLY: + state = atomic_write_reply(qp, pkt); + break; + case RESPST_PROCESS_FLUSH: + state = process_flush(qp, pkt); + break; + case RESPST_ACKNOWLEDGE: + state = acknowledge(qp, pkt); + break; + case RESPST_CLEANUP: + state = cleanup(qp, pkt); + break; + case RESPST_DUPLICATE_REQUEST: + state = duplicate_request(qp, pkt); + break; + case RESPST_ERR_PSN_OUT_OF_SEQ: + /* RC only - Class B. Drop packet. */ + send_ack(qp, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); + state = RESPST_CLEANUP; + break; + + case RESPST_ERR_TOO_MANY_RDMA_ATM_REQ: + case RESPST_ERR_MISSING_OPCODE_FIRST: + case RESPST_ERR_MISSING_OPCODE_LAST_C: + case RESPST_ERR_UNSUPPORTED_OPCODE: + case RESPST_ERR_MISALIGNED_ATOMIC: + /* RC Only - Class C. */ + do_class_ac_error(qp, AETH_NAK_INVALID_REQ, + IB_WC_REM_INV_REQ_ERR); + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_MISSING_OPCODE_LAST_D1E: + state = do_class_d1e_error(qp); + break; + case RESPST_ERR_RNR: + if (qp_type(qp) == IB_QPT_RC) { + rxe_counter_inc(rxe, RXE_CNT_SND_RNR); + /* RC - class B */ + send_ack(qp, AETH_RNR_NAK | + (~AETH_TYPE_MASK & + qp->attr.min_rnr_timer), + pkt->psn); + } else { + /* UD/UC - class D */ + qp->resp.drop_msg = 1; + } + state = RESPST_CLEANUP; + break; + + case RESPST_ERR_RKEY_VIOLATION: + if (qp_type(qp) == IB_QPT_RC) { + /* Class C */ + do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR, + IB_WC_REM_ACCESS_ERR); + state = RESPST_COMPLETE; + } else { + qp->resp.drop_msg = 1; + if (qp->srq) { + /* UC/SRQ Class D */ + qp->resp.status = IB_WC_REM_ACCESS_ERR; + state = RESPST_COMPLETE; + } else { + /* UC/non-SRQ Class E. */ + state = RESPST_CLEANUP; + } + } + break; + + case RESPST_ERR_INVALIDATE_RKEY: + /* RC - Class J. */ + qp->resp.goto_error = 1; + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_LENGTH: + if (qp_type(qp) == IB_QPT_RC) { + /* Class C */ + do_class_ac_error(qp, AETH_NAK_INVALID_REQ, + IB_WC_REM_INV_REQ_ERR); + state = RESPST_COMPLETE; + } else if (qp->srq) { + /* UC/UD - class E */ + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + state = RESPST_COMPLETE; + } else { + /* UC/UD - class D */ + qp->resp.drop_msg = 1; + state = RESPST_CLEANUP; + } + break; + + case RESPST_ERR_MALFORMED_WQE: + /* All, Class A. */ + do_class_ac_error(qp, AETH_NAK_REM_OP_ERR, + IB_WC_LOC_QP_OP_ERR); + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_CQ_OVERFLOW: + /* All - Class G */ + state = RESPST_ERROR; + break; + + case RESPST_DONE: + if (qp->resp.goto_error) { + state = RESPST_ERROR; + break; + } + + goto done; + + case RESPST_EXIT: + if (qp->resp.goto_error) { + state = RESPST_ERROR; + break; + } + + goto exit; + + case RESPST_ERROR: + qp->resp.goto_error = 0; + rxe_dbg_qp(qp, "moved to error state\n"); + rxe_qp_error(qp); + goto exit; + + default: + WARN_ON_ONCE(1); + } + } + + /* A non-zero return value will cause rxe_do_task to + * exit its loop and end the work item. A zero return + * will continue looping and return to rxe_responder + */ +done: + ret = 0; + goto out; +exit: + ret = -EAGAIN; +out: + return ret; +} diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c new file mode 100644 index 0000000000..3661cb627d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/vmalloc.h> +#include "rxe.h" +#include "rxe_queue.h" + +int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init) +{ + struct ib_srq_attr *attr = &init->attr; + + if (attr->max_wr > rxe->attr.max_srq_wr) { + rxe_dbg_dev(rxe, "max_wr(%d) > max_srq_wr(%d)\n", + attr->max_wr, rxe->attr.max_srq_wr); + goto err1; + } + + if (attr->max_wr <= 0) { + rxe_dbg_dev(rxe, "max_wr(%d) <= 0\n", attr->max_wr); + goto err1; + } + + if (attr->max_wr < RXE_MIN_SRQ_WR) + attr->max_wr = RXE_MIN_SRQ_WR; + + if (attr->max_sge > rxe->attr.max_srq_sge) { + rxe_dbg_dev(rxe, "max_sge(%d) > max_srq_sge(%d)\n", + attr->max_sge, rxe->attr.max_srq_sge); + goto err1; + } + + if (attr->max_sge < RXE_MIN_SRQ_SGE) + attr->max_sge = RXE_MIN_SRQ_SGE; + + return 0; + +err1: + return -EINVAL; +} + +int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_init_attr *init, struct ib_udata *udata, + struct rxe_create_srq_resp __user *uresp) +{ + struct rxe_queue *q; + int wqe_size; + int err; + + srq->ibsrq.event_handler = init->event_handler; + srq->ibsrq.srq_context = init->srq_context; + srq->limit = init->attr.srq_limit; + srq->srq_num = srq->elem.index; + srq->rq.max_wr = init->attr.max_wr; + srq->rq.max_sge = init->attr.max_sge; + + wqe_size = sizeof(struct rxe_recv_wqe) + + srq->rq.max_sge*sizeof(struct ib_sge); + + spin_lock_init(&srq->rq.producer_lock); + spin_lock_init(&srq->rq.consumer_lock); + + q = rxe_queue_init(rxe, &srq->rq.max_wr, wqe_size, + QUEUE_TYPE_FROM_CLIENT); + if (!q) { + rxe_dbg_srq(srq, "Unable to allocate queue\n"); + err = -ENOMEM; + goto err_out; + } + + err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, q->buf, + q->buf_size, &q->ip); + if (err) { + rxe_dbg_srq(srq, "Unable to init mmap info for caller\n"); + goto err_free; + } + + srq->rq.queue = q; + init->attr.max_wr = srq->rq.max_wr; + + if (uresp) { + if (copy_to_user(&uresp->srq_num, &srq->srq_num, + sizeof(uresp->srq_num))) { + rxe_queue_cleanup(q); + return -EFAULT; + } + } + + return 0; + +err_free: + vfree(q->buf); + kfree(q); +err_out: + return err; +} + +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask) +{ + if (srq->error) { + rxe_dbg_srq(srq, "in error state\n"); + goto err1; + } + + if (mask & IB_SRQ_MAX_WR) { + if (attr->max_wr > rxe->attr.max_srq_wr) { + rxe_dbg_srq(srq, "max_wr(%d) > max_srq_wr(%d)\n", + attr->max_wr, rxe->attr.max_srq_wr); + goto err1; + } + + if (attr->max_wr <= 0) { + rxe_dbg_srq(srq, "max_wr(%d) <= 0\n", attr->max_wr); + goto err1; + } + + if (srq->limit && (attr->max_wr < srq->limit)) { + rxe_dbg_srq(srq, "max_wr (%d) < srq->limit (%d)\n", + attr->max_wr, srq->limit); + goto err1; + } + + if (attr->max_wr < RXE_MIN_SRQ_WR) + attr->max_wr = RXE_MIN_SRQ_WR; + } + + if (mask & IB_SRQ_LIMIT) { + if (attr->srq_limit > rxe->attr.max_srq_wr) { + rxe_dbg_srq(srq, "srq_limit(%d) > max_srq_wr(%d)\n", + attr->srq_limit, rxe->attr.max_srq_wr); + goto err1; + } + + if (attr->srq_limit > srq->rq.queue->buf->index_mask) { + rxe_dbg_srq(srq, "srq_limit (%d) > cur limit(%d)\n", + attr->srq_limit, + srq->rq.queue->buf->index_mask); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, + struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata) +{ + struct rxe_queue *q = srq->rq.queue; + struct mminfo __user *mi = NULL; + int wqe_size; + int err; + + if (mask & IB_SRQ_MAX_WR) { + /* + * This is completely screwed up, the response is supposed to + * be in the outbuf not like this. + */ + mi = u64_to_user_ptr(ucmd->mmap_info_addr); + + wqe_size = sizeof(struct rxe_recv_wqe) + + srq->rq.max_sge*sizeof(struct ib_sge); + + err = rxe_queue_resize(q, &attr->max_wr, wqe_size, + udata, mi, &srq->rq.producer_lock, + &srq->rq.consumer_lock); + if (err) + goto err_free; + + srq->rq.max_wr = attr->max_wr; + } + + if (mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + + return 0; + +err_free: + rxe_queue_cleanup(q); + srq->rq.queue = NULL; + return err; +} + +void rxe_srq_cleanup(struct rxe_pool_elem *elem) +{ + struct rxe_srq *srq = container_of(elem, typeof(*srq), elem); + + if (srq->pd) + rxe_put(srq->pd); + + if (srq->rq.queue) + rxe_queue_cleanup(srq->rq.queue); +} diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c new file mode 100644 index 0000000000..1501120d4f --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include "rxe.h" + +static struct workqueue_struct *rxe_wq; + +int rxe_alloc_wq(void) +{ + rxe_wq = alloc_workqueue("rxe_wq", WQ_UNBOUND, WQ_MAX_ACTIVE); + if (!rxe_wq) + return -ENOMEM; + + return 0; +} + +void rxe_destroy_wq(void) +{ + destroy_workqueue(rxe_wq); +} + +/* Check if task is idle i.e. not running, not scheduled in + * work queue and not draining. If so move to busy to + * reserve a slot in do_task() by setting to busy and taking + * a qp reference to cover the gap from now until the task finishes. + * state will move out of busy if task returns a non zero value + * in do_task(). If state is already busy it is raised to armed + * to indicate to do_task that additional pass should be made + * over the task. + * Context: caller should hold task->lock. + * Returns: true if state transitioned from idle to busy else false. + */ +static bool __reserve_if_idle(struct rxe_task *task) +{ + WARN_ON(rxe_read(task->qp) <= 0); + + if (task->state == TASK_STATE_IDLE) { + rxe_get(task->qp); + task->state = TASK_STATE_BUSY; + task->num_sched++; + return true; + } + + if (task->state == TASK_STATE_BUSY) + task->state = TASK_STATE_ARMED; + + return false; +} + +/* check if task is idle or drained and not currently + * scheduled in the work queue. This routine is + * called by rxe_cleanup_task or rxe_disable_task to + * see if the queue is empty. + * Context: caller should hold task->lock. + * Returns true if done else false. + */ +static bool __is_done(struct rxe_task *task) +{ + if (work_pending(&task->work)) + return false; + + if (task->state == TASK_STATE_IDLE || + task->state == TASK_STATE_DRAINED) { + return true; + } + + return false; +} + +/* a locked version of __is_done */ +static bool is_done(struct rxe_task *task) +{ + unsigned long flags; + int done; + + spin_lock_irqsave(&task->lock, flags); + done = __is_done(task); + spin_unlock_irqrestore(&task->lock, flags); + + return done; +} + +/* do_task is a wrapper for the three tasks (requester, + * completer, responder) and calls them in a loop until + * they return a non-zero value. It is called either + * directly by rxe_run_task or indirectly if rxe_sched_task + * schedules the task. They must call __reserve_if_idle to + * move the task to busy before calling or scheduling. + * The task can also be moved to drained or invalid + * by calls to rxe_cleanup_task or rxe_disable_task. + * In that case tasks which get here are not executed but + * just flushed. The tasks are designed to look to see if + * there is work to do and then do part of it before returning + * here with a return value of zero until all the work + * has been consumed then it returns a non-zero value. + * The number of times the task can be run is limited by + * max iterations so one task cannot hold the cpu forever. + * If the limit is hit and work remains the task is rescheduled. + */ +static void do_task(struct rxe_task *task) +{ + unsigned int iterations; + unsigned long flags; + int resched = 0; + int cont; + int ret; + + WARN_ON(rxe_read(task->qp) <= 0); + + spin_lock_irqsave(&task->lock, flags); + if (task->state >= TASK_STATE_DRAINED) { + rxe_put(task->qp); + task->num_done++; + spin_unlock_irqrestore(&task->lock, flags); + return; + } + spin_unlock_irqrestore(&task->lock, flags); + + do { + iterations = RXE_MAX_ITERATIONS; + cont = 0; + + do { + ret = task->func(task->qp); + } while (ret == 0 && iterations-- > 0); + + spin_lock_irqsave(&task->lock, flags); + /* we're not done yet but we ran out of iterations. + * yield the cpu and reschedule the task + */ + if (!ret) { + task->state = TASK_STATE_IDLE; + resched = 1; + goto exit; + } + + switch (task->state) { + case TASK_STATE_BUSY: + task->state = TASK_STATE_IDLE; + break; + + /* someone tried to schedule the task while we + * were running, keep going + */ + case TASK_STATE_ARMED: + task->state = TASK_STATE_BUSY; + cont = 1; + break; + + case TASK_STATE_DRAINING: + task->state = TASK_STATE_DRAINED; + break; + + default: + WARN_ON(1); + rxe_dbg_qp(task->qp, "unexpected task state = %d", + task->state); + task->state = TASK_STATE_IDLE; + } + +exit: + if (!cont) { + task->num_done++; + if (WARN_ON(task->num_done != task->num_sched)) + rxe_dbg_qp( + task->qp, + "%ld tasks scheduled, %ld tasks done", + task->num_sched, task->num_done); + } + spin_unlock_irqrestore(&task->lock, flags); + } while (cont); + + task->ret = ret; + + if (resched) + rxe_sched_task(task); + + rxe_put(task->qp); +} + +/* wrapper around do_task to fix argument for work queue */ +static void do_work(struct work_struct *work) +{ + do_task(container_of(work, struct rxe_task, work)); +} + +int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, + int (*func)(struct rxe_qp *)) +{ + WARN_ON(rxe_read(qp) <= 0); + + task->qp = qp; + task->func = func; + task->state = TASK_STATE_IDLE; + spin_lock_init(&task->lock); + INIT_WORK(&task->work, do_work); + + return 0; +} + +/* rxe_cleanup_task is only called from rxe_do_qp_cleanup in + * process context. The qp is already completed with no + * remaining references. Once the queue is drained the + * task is moved to invalid and returns. The qp cleanup + * code then calls the task functions directly without + * using the task struct to drain any late arriving packets + * or work requests. + */ +void rxe_cleanup_task(struct rxe_task *task) +{ + unsigned long flags; + + spin_lock_irqsave(&task->lock, flags); + if (!__is_done(task) && task->state < TASK_STATE_DRAINED) { + task->state = TASK_STATE_DRAINING; + } else { + task->state = TASK_STATE_INVALID; + spin_unlock_irqrestore(&task->lock, flags); + return; + } + spin_unlock_irqrestore(&task->lock, flags); + + /* now the task cannot be scheduled or run just wait + * for the previously scheduled tasks to finish. + */ + while (!is_done(task)) + cond_resched(); + + spin_lock_irqsave(&task->lock, flags); + task->state = TASK_STATE_INVALID; + spin_unlock_irqrestore(&task->lock, flags); +} + +/* run the task inline if it is currently idle + * cannot call do_task holding the lock + */ +void rxe_run_task(struct rxe_task *task) +{ + unsigned long flags; + bool run; + + WARN_ON(rxe_read(task->qp) <= 0); + + spin_lock_irqsave(&task->lock, flags); + run = __reserve_if_idle(task); + spin_unlock_irqrestore(&task->lock, flags); + + if (run) + do_task(task); +} + +/* schedule the task to run later as a work queue entry. + * the queue_work call can be called holding + * the lock. + */ +void rxe_sched_task(struct rxe_task *task) +{ + unsigned long flags; + + WARN_ON(rxe_read(task->qp) <= 0); + + spin_lock_irqsave(&task->lock, flags); + if (__reserve_if_idle(task)) + queue_work(rxe_wq, &task->work); + spin_unlock_irqrestore(&task->lock, flags); +} + +/* rxe_disable/enable_task are only called from + * rxe_modify_qp in process context. Task is moved + * to the drained state by do_task. + */ +void rxe_disable_task(struct rxe_task *task) +{ + unsigned long flags; + + WARN_ON(rxe_read(task->qp) <= 0); + + spin_lock_irqsave(&task->lock, flags); + if (!__is_done(task) && task->state < TASK_STATE_DRAINED) { + task->state = TASK_STATE_DRAINING; + } else { + task->state = TASK_STATE_DRAINED; + spin_unlock_irqrestore(&task->lock, flags); + return; + } + spin_unlock_irqrestore(&task->lock, flags); + + while (!is_done(task)) + cond_resched(); + + spin_lock_irqsave(&task->lock, flags); + task->state = TASK_STATE_DRAINED; + spin_unlock_irqrestore(&task->lock, flags); +} + +void rxe_enable_task(struct rxe_task *task) +{ + unsigned long flags; + + WARN_ON(rxe_read(task->qp) <= 0); + + spin_lock_irqsave(&task->lock, flags); + if (task->state == TASK_STATE_INVALID) { + spin_unlock_irqrestore(&task->lock, flags); + return; + } + + task->state = TASK_STATE_IDLE; + spin_unlock_irqrestore(&task->lock, flags); +} diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h new file mode 100644 index 0000000000..a63e258b3d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_TASK_H +#define RXE_TASK_H + +enum { + TASK_STATE_IDLE = 0, + TASK_STATE_BUSY = 1, + TASK_STATE_ARMED = 2, + TASK_STATE_DRAINING = 3, + TASK_STATE_DRAINED = 4, + TASK_STATE_INVALID = 5, +}; + +/* + * data structure to describe a 'task' which is a short + * function that returns 0 as long as it needs to be + * called again. + */ +struct rxe_task { + struct work_struct work; + int state; + spinlock_t lock; + struct rxe_qp *qp; + int (*func)(struct rxe_qp *qp); + int ret; + long num_sched; + long num_done; +}; + +int rxe_alloc_wq(void); + +void rxe_destroy_wq(void); + +/* + * init rxe_task structure + * qp => parameter to pass to func + * func => function to call until it returns != 0 + */ +int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, + int (*func)(struct rxe_qp *)); + +/* cleanup task */ +void rxe_cleanup_task(struct rxe_task *task); + +void rxe_run_task(struct rxe_task *task); + +void rxe_sched_task(struct rxe_task *task); + +/* keep a task from scheduling */ +void rxe_disable_task(struct rxe_task *task); + +/* allow task to run */ +void rxe_enable_task(struct rxe_task *task); + +#endif /* RXE_TASK_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c new file mode 100644 index 0000000000..48f86839d3 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -0,0 +1,1533 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/dma-mapping.h> +#include <net/addrconf.h> +#include <rdma/uverbs_ioctl.h> + +#include "rxe.h" +#include "rxe_queue.h" +#include "rxe_hw_counters.h" + +static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr); + +/* dev */ +static int rxe_query_device(struct ib_device *ibdev, + struct ib_device_attr *attr, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(ibdev); + int err; + + if (udata->inlen || udata->outlen) { + rxe_dbg_dev(rxe, "malformed udata"); + err = -EINVAL; + goto err_out; + } + + memcpy(attr, &rxe->attr, sizeof(*attr)); + + return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +static int rxe_query_port(struct ib_device *ibdev, + u32 port_num, struct ib_port_attr *attr) +{ + struct rxe_dev *rxe = to_rdev(ibdev); + int err, ret; + + if (port_num != 1) { + err = -EINVAL; + rxe_dbg_dev(rxe, "bad port_num = %d", port_num); + goto err_out; + } + + memcpy(attr, &rxe->port.attr, sizeof(*attr)); + + mutex_lock(&rxe->usdev_lock); + ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed, + &attr->active_width); + + if (attr->state == IB_PORT_ACTIVE) + attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + else if (dev_get_flags(rxe->ndev) & IFF_UP) + attr->phys_state = IB_PORT_PHYS_STATE_POLLING; + else + attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + + mutex_unlock(&rxe->usdev_lock); + + return ret; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +static int rxe_query_pkey(struct ib_device *ibdev, + u32 port_num, u16 index, u16 *pkey) +{ + struct rxe_dev *rxe = to_rdev(ibdev); + int err; + + if (index != 0) { + err = -EINVAL; + rxe_dbg_dev(rxe, "bad pkey index = %d", index); + goto err_out; + } + + *pkey = IB_DEFAULT_PKEY_FULL; + return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +static int rxe_modify_device(struct ib_device *ibdev, + int mask, struct ib_device_modify *attr) +{ + struct rxe_dev *rxe = to_rdev(ibdev); + int err; + + if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + err = -EOPNOTSUPP; + rxe_dbg_dev(rxe, "unsupported mask = 0x%x", mask); + goto err_out; + } + + if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) + rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid); + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(rxe->ib_dev.node_desc, + attr->node_desc, sizeof(rxe->ib_dev.node_desc)); + } + + return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +static int rxe_modify_port(struct ib_device *ibdev, u32 port_num, + int mask, struct ib_port_modify *attr) +{ + struct rxe_dev *rxe = to_rdev(ibdev); + struct rxe_port *port; + int err; + + if (port_num != 1) { + err = -EINVAL; + rxe_dbg_dev(rxe, "bad port_num = %d", port_num); + goto err_out; + } + + //TODO is shutdown useful + if (mask & ~(IB_PORT_RESET_QKEY_CNTR)) { + err = -EOPNOTSUPP; + rxe_dbg_dev(rxe, "unsupported mask = 0x%x", mask); + goto err_out; + } + + port = &rxe->port; + port->attr.port_cap_flags |= attr->set_port_cap_mask; + port->attr.port_cap_flags &= ~attr->clr_port_cap_mask; + + if (mask & IB_PORT_RESET_QKEY_CNTR) + port->attr.qkey_viol_cntr = 0; + + return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +static enum rdma_link_layer rxe_get_link_layer(struct ib_device *ibdev, + u32 port_num) +{ + struct rxe_dev *rxe = to_rdev(ibdev); + int err; + + if (port_num != 1) { + err = -EINVAL; + rxe_dbg_dev(rxe, "bad port_num = %d", port_num); + goto err_out; + } + + return IB_LINK_LAYER_ETHERNET; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +static int rxe_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + struct rxe_dev *rxe = to_rdev(ibdev); + struct ib_port_attr attr = {}; + int err; + + if (port_num != 1) { + err = -EINVAL; + rxe_dbg_dev(rxe, "bad port_num = %d", port_num); + goto err_out; + } + + err = ib_query_port(ibdev, port_num, &attr); + if (err) + goto err_out; + + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +/* uc */ +static int rxe_alloc_ucontext(struct ib_ucontext *ibuc, struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(ibuc->device); + struct rxe_ucontext *uc = to_ruc(ibuc); + int err; + + err = rxe_add_to_pool(&rxe->uc_pool, uc); + if (err) + rxe_err_dev(rxe, "unable to create uc"); + + return err; +} + +static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) +{ + struct rxe_ucontext *uc = to_ruc(ibuc); + int err; + + err = rxe_cleanup(uc); + if (err) + rxe_err_uc(uc, "cleanup failed, err = %d", err); +} + +/* pd */ +static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + int err; + + err = rxe_add_to_pool(&rxe->pd_pool, pd); + if (err) { + rxe_dbg_dev(rxe, "unable to alloc pd"); + goto err_out; + } + + return 0; + +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct rxe_pd *pd = to_rpd(ibpd); + int err; + + err = rxe_cleanup(pd); + if (err) + rxe_err_pd(pd, "cleanup failed, err = %d", err); + + return 0; +} + +/* ah */ +static int rxe_create_ah(struct ib_ah *ibah, + struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(ibah->device); + struct rxe_ah *ah = to_rah(ibah); + struct rxe_create_ah_resp __user *uresp = NULL; + int err, cleanup_err; + + if (udata) { + /* test if new user provider */ + if (udata->outlen >= sizeof(*uresp)) + uresp = udata->outbuf; + ah->is_user = true; + } else { + ah->is_user = false; + } + + err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, + init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); + if (err) { + rxe_dbg_dev(rxe, "unable to create ah"); + goto err_out; + } + + /* create index > 0 */ + ah->ah_num = ah->elem.index; + + err = rxe_ah_chk_attr(ah, init_attr->ah_attr); + if (err) { + rxe_dbg_ah(ah, "bad attr"); + goto err_cleanup; + } + + if (uresp) { + /* only if new user provider */ + err = copy_to_user(&uresp->ah_num, &ah->ah_num, + sizeof(uresp->ah_num)); + if (err) { + err = -EFAULT; + rxe_dbg_ah(ah, "unable to copy to user"); + goto err_cleanup; + } + } else if (ah->is_user) { + /* only if old user provider */ + ah->ah_num = 0; + } + + rxe_init_av(init_attr->ah_attr, &ah->av); + rxe_finalize(ah); + + return 0; + +err_cleanup: + cleanup_err = rxe_cleanup(ah); + if (cleanup_err) + rxe_err_ah(ah, "cleanup failed, err = %d", cleanup_err); +err_out: + rxe_err_ah(ah, "returned err = %d", err); + return err; +} + +static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) +{ + struct rxe_ah *ah = to_rah(ibah); + int err; + + err = rxe_ah_chk_attr(ah, attr); + if (err) { + rxe_dbg_ah(ah, "bad attr"); + goto err_out; + } + + rxe_init_av(attr, &ah->av); + + return 0; + +err_out: + rxe_err_ah(ah, "returned err = %d", err); + return err; +} + +static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) +{ + struct rxe_ah *ah = to_rah(ibah); + + memset(attr, 0, sizeof(*attr)); + attr->type = ibah->type; + rxe_av_to_attr(&ah->av, attr); + + return 0; +} + +static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct rxe_ah *ah = to_rah(ibah); + int err; + + err = rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE); + if (err) + rxe_err_ah(ah, "cleanup failed, err = %d", err); + + return 0; +} + +/* srq */ +static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(ibsrq->device); + struct rxe_pd *pd = to_rpd(ibsrq->pd); + struct rxe_srq *srq = to_rsrq(ibsrq); + struct rxe_create_srq_resp __user *uresp = NULL; + int err, cleanup_err; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) { + err = -EINVAL; + rxe_err_dev(rxe, "malformed udata"); + goto err_out; + } + uresp = udata->outbuf; + } + + if (init->srq_type != IB_SRQT_BASIC) { + err = -EOPNOTSUPP; + rxe_dbg_dev(rxe, "srq type = %d, not supported", + init->srq_type); + goto err_out; + } + + err = rxe_srq_chk_init(rxe, init); + if (err) { + rxe_dbg_dev(rxe, "invalid init attributes"); + goto err_out; + } + + err = rxe_add_to_pool(&rxe->srq_pool, srq); + if (err) { + rxe_dbg_dev(rxe, "unable to create srq, err = %d", err); + goto err_out; + } + + rxe_get(pd); + srq->pd = pd; + + err = rxe_srq_from_init(rxe, srq, init, udata, uresp); + if (err) { + rxe_dbg_srq(srq, "create srq failed, err = %d", err); + goto err_cleanup; + } + + return 0; + +err_cleanup: + cleanup_err = rxe_cleanup(srq); + if (cleanup_err) + rxe_err_srq(srq, "cleanup failed, err = %d", cleanup_err); +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask mask, + struct ib_udata *udata) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + struct rxe_dev *rxe = to_rdev(ibsrq->device); + struct rxe_modify_srq_cmd cmd = {}; + int err; + + if (udata) { + if (udata->inlen < sizeof(cmd)) { + err = -EINVAL; + rxe_dbg_srq(srq, "malformed udata"); + goto err_out; + } + + err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); + if (err) { + err = -EFAULT; + rxe_dbg_srq(srq, "unable to read udata"); + goto err_out; + } + } + + err = rxe_srq_chk_attr(rxe, srq, attr, mask); + if (err) { + rxe_dbg_srq(srq, "bad init attributes"); + goto err_out; + } + + err = rxe_srq_from_attr(rxe, srq, attr, mask, &cmd, udata); + if (err) { + rxe_dbg_srq(srq, "bad attr"); + goto err_out; + } + + return 0; + +err_out: + rxe_err_srq(srq, "returned err = %d", err); + return err; +} + +static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + int err; + + if (srq->error) { + err = -EINVAL; + rxe_dbg_srq(srq, "srq in error state"); + goto err_out; + } + + attr->max_wr = srq->rq.queue->buf->index_mask; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; + +err_out: + rxe_err_srq(srq, "returned err = %d", err); + return err; +} + +static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct rxe_srq *srq = to_rsrq(ibsrq); + unsigned long flags; + + spin_lock_irqsave(&srq->rq.producer_lock, flags); + + while (wr) { + err = post_one_recv(&srq->rq, wr); + if (unlikely(err)) + break; + wr = wr->next; + } + + spin_unlock_irqrestore(&srq->rq.producer_lock, flags); + + if (err) { + *bad_wr = wr; + rxe_err_srq(srq, "returned err = %d", err); + } + + return err; +} + +static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + int err; + + err = rxe_cleanup(srq); + if (err) + rxe_err_srq(srq, "cleanup failed, err = %d", err); + + return 0; +} + +/* qp */ +static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_pd *pd = to_rpd(ibqp->pd); + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_create_qp_resp __user *uresp = NULL; + int err, cleanup_err; + + if (udata) { + if (udata->inlen) { + err = -EINVAL; + rxe_dbg_dev(rxe, "malformed udata, err = %d", err); + goto err_out; + } + + if (udata->outlen < sizeof(*uresp)) { + err = -EINVAL; + rxe_dbg_dev(rxe, "malformed udata, err = %d", err); + goto err_out; + } + + qp->is_user = true; + uresp = udata->outbuf; + } else { + qp->is_user = false; + } + + if (init->create_flags) { + err = -EOPNOTSUPP; + rxe_dbg_dev(rxe, "unsupported create_flags, err = %d", err); + goto err_out; + } + + err = rxe_qp_chk_init(rxe, init); + if (err) { + rxe_dbg_dev(rxe, "bad init attr, err = %d", err); + goto err_out; + } + + err = rxe_add_to_pool(&rxe->qp_pool, qp); + if (err) { + rxe_dbg_dev(rxe, "unable to create qp, err = %d", err); + goto err_out; + } + + err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibqp->pd, udata); + if (err) { + rxe_dbg_qp(qp, "create qp failed, err = %d", err); + goto err_cleanup; + } + + rxe_finalize(qp); + return 0; + +err_cleanup: + cleanup_err = rxe_cleanup(qp); + if (cleanup_err) + rxe_err_qp(qp, "cleanup failed, err = %d", cleanup_err); +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + int err; + + if (mask & ~IB_QP_ATTR_STANDARD_BITS) { + err = -EOPNOTSUPP; + rxe_dbg_qp(qp, "unsupported mask = 0x%x, err = %d", + mask, err); + goto err_out; + } + + err = rxe_qp_chk_attr(rxe, qp, attr, mask); + if (err) { + rxe_dbg_qp(qp, "bad mask/attr, err = %d", err); + goto err_out; + } + + err = rxe_qp_from_attr(qp, attr, mask, udata); + if (err) { + rxe_dbg_qp(qp, "modify qp failed, err = %d", err); + goto err_out; + } + + if ((mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH)) + qp->src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, + qp->ibqp.qp_num, + qp->attr.dest_qp_num); + + return 0; + +err_out: + rxe_err_qp(qp, "returned err = %d", err); + return err; +} + +static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_qp_init_attr *init) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + rxe_qp_to_init(qp, init); + rxe_qp_to_attr(qp, attr, mask); + + return 0; +} + +static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct rxe_qp *qp = to_rqp(ibqp); + int err; + + err = rxe_qp_chk_destroy(qp); + if (err) { + rxe_dbg_qp(qp, "unable to destroy qp, err = %d", err); + goto err_out; + } + + err = rxe_cleanup(qp); + if (err) + rxe_err_qp(qp, "cleanup failed, err = %d", err); + + return 0; + +err_out: + rxe_err_qp(qp, "returned err = %d", err); + return err; +} + +/* send wr */ + +/* sanity check incoming send work request */ +static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, + unsigned int *maskp, unsigned int *lengthp) +{ + int num_sge = ibwr->num_sge; + struct rxe_sq *sq = &qp->sq; + unsigned int mask = 0; + unsigned long length = 0; + int err = -EINVAL; + int i; + + do { + mask = wr_opcode_mask(ibwr->opcode, qp); + if (!mask) { + rxe_err_qp(qp, "bad wr opcode for qp type"); + break; + } + + if (num_sge > sq->max_sge) { + rxe_err_qp(qp, "num_sge > max_sge"); + break; + } + + length = 0; + for (i = 0; i < ibwr->num_sge; i++) + length += ibwr->sg_list[i].length; + + if (length > (1UL << 31)) { + rxe_err_qp(qp, "message length too long"); + break; + } + + if (mask & WR_ATOMIC_MASK) { + if (length != 8) { + rxe_err_qp(qp, "atomic length != 8"); + break; + } + if (atomic_wr(ibwr)->remote_addr & 0x7) { + rxe_err_qp(qp, "misaligned atomic address"); + break; + } + } + if (ibwr->send_flags & IB_SEND_INLINE) { + if (!(mask & WR_INLINE_MASK)) { + rxe_err_qp(qp, "opcode doesn't support inline data"); + break; + } + if (length > sq->max_inline) { + rxe_err_qp(qp, "inline length too big"); + break; + } + } + + err = 0; + } while (0); + + *maskp = mask; + *lengthp = (int)length; + + return err; +} + +static int init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, + const struct ib_send_wr *ibwr) +{ + wr->wr_id = ibwr->wr_id; + wr->opcode = ibwr->opcode; + wr->send_flags = ibwr->send_flags; + + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_GSI) { + struct ib_ah *ibah = ud_wr(ibwr)->ah; + + wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn; + wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey; + wr->wr.ud.ah_num = to_rah(ibah)->ah_num; + if (qp_type(qp) == IB_QPT_GSI) + wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; + + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + break; + case IB_WR_SEND: + break; + default: + rxe_err_qp(qp, "bad wr opcode %d for UD/GSI QP", + wr->opcode); + return -EINVAL; + } + } else { + switch (wr->opcode) { + case IB_WR_RDMA_WRITE_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + fallthrough; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; + wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; + break; + case IB_WR_SEND_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + break; + case IB_WR_RDMA_READ_WITH_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; + wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + wr->wr.atomic.remote_addr = + atomic_wr(ibwr)->remote_addr; + wr->wr.atomic.compare_add = + atomic_wr(ibwr)->compare_add; + wr->wr.atomic.swap = atomic_wr(ibwr)->swap; + wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey; + break; + case IB_WR_LOCAL_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + break; + case IB_WR_REG_MR: + wr->wr.reg.mr = reg_wr(ibwr)->mr; + wr->wr.reg.key = reg_wr(ibwr)->key; + wr->wr.reg.access = reg_wr(ibwr)->access; + break; + case IB_WR_SEND: + case IB_WR_BIND_MW: + case IB_WR_FLUSH: + case IB_WR_ATOMIC_WRITE: + break; + default: + rxe_err_qp(qp, "unsupported wr opcode %d", + wr->opcode); + return -EINVAL; + } + } + + return 0; +} + +static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, + const struct ib_send_wr *ibwr) +{ + struct ib_sge *sge = ibwr->sg_list; + u8 *p = wqe->dma.inline_data; + int i; + + for (i = 0; i < ibwr->num_sge; i++, sge++) { + memcpy(p, ib_virt_dma_to_page(sge->addr), sge->length); + p += sge->length; + } +} + +static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, + unsigned int mask, unsigned int length, + struct rxe_send_wqe *wqe) +{ + int num_sge = ibwr->num_sge; + int err; + + err = init_send_wr(qp, &wqe->wr, ibwr); + if (err) + return err; + + /* local operation */ + if (unlikely(mask & WR_LOCAL_OP_MASK)) { + wqe->mask = mask; + wqe->state = wqe_state_posted; + return 0; + } + + if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) + copy_inline_data_to_wqe(wqe, ibwr); + else + memcpy(wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + wqe->iova = mask & WR_ATOMIC_MASK ? atomic_wr(ibwr)->remote_addr : + mask & WR_READ_OR_WRITE_MASK ? rdma_wr(ibwr)->remote_addr : 0; + wqe->mask = mask; + wqe->dma.length = length; + wqe->dma.resid = length; + wqe->dma.num_sge = num_sge; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + wqe->state = wqe_state_posted; + wqe->ssn = atomic_add_return(1, &qp->ssn); + + return 0; +} + +static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr) +{ + int err; + struct rxe_sq *sq = &qp->sq; + struct rxe_send_wqe *send_wqe; + unsigned int mask; + unsigned int length; + int full; + + err = validate_send_wr(qp, ibwr, &mask, &length); + if (err) + return err; + + full = queue_full(sq->queue, QUEUE_TYPE_FROM_ULP); + if (unlikely(full)) { + rxe_err_qp(qp, "send queue full"); + return -ENOMEM; + } + + send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_FROM_ULP); + err = init_send_wqe(qp, ibwr, mask, length, send_wqe); + if (!err) + queue_advance_producer(sq->queue, QUEUE_TYPE_FROM_ULP); + + return err; +} + +static int rxe_post_send_kernel(struct rxe_qp *qp, + const struct ib_send_wr *ibwr, + const struct ib_send_wr **bad_wr) +{ + int err = 0; + unsigned long flags; + + spin_lock_irqsave(&qp->sq.sq_lock, flags); + while (ibwr) { + err = post_one_send(qp, ibwr); + if (err) { + *bad_wr = ibwr; + break; + } + ibwr = ibwr->next; + } + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + + if (!err) + rxe_sched_task(&qp->req.task); + + spin_lock_irqsave(&qp->state_lock, flags); + if (qp_state(qp) == IB_QPS_ERR) + rxe_sched_task(&qp->comp.task); + spin_unlock_irqrestore(&qp->state_lock, flags); + + return err; +} + +static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + struct rxe_qp *qp = to_rqp(ibqp); + int err; + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + /* caller has already called destroy_qp */ + if (WARN_ON_ONCE(!qp->valid)) { + spin_unlock_irqrestore(&qp->state_lock, flags); + rxe_err_qp(qp, "qp has been destroyed"); + return -EINVAL; + } + + if (unlikely(qp_state(qp) < IB_QPS_RTS)) { + spin_unlock_irqrestore(&qp->state_lock, flags); + *bad_wr = wr; + rxe_err_qp(qp, "qp not ready to send"); + return -EINVAL; + } + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->is_user) { + /* Utilize process context to do protocol processing */ + rxe_run_task(&qp->req.task); + } else { + err = rxe_post_send_kernel(qp, wr, bad_wr); + if (err) + return err; + } + + return 0; +} + +/* recv wr */ +static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) +{ + int i; + unsigned long length; + struct rxe_recv_wqe *recv_wqe; + int num_sge = ibwr->num_sge; + int full; + int err; + + full = queue_full(rq->queue, QUEUE_TYPE_FROM_ULP); + if (unlikely(full)) { + err = -ENOMEM; + rxe_dbg("queue full"); + goto err_out; + } + + if (unlikely(num_sge > rq->max_sge)) { + err = -EINVAL; + rxe_dbg("bad num_sge > max_sge"); + goto err_out; + } + + length = 0; + for (i = 0; i < num_sge; i++) + length += ibwr->sg_list[i].length; + + /* IBA max message size is 2^31 */ + if (length >= (1UL<<31)) { + err = -EINVAL; + rxe_dbg("message length too long"); + goto err_out; + } + + recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_FROM_ULP); + + recv_wqe->wr_id = ibwr->wr_id; + recv_wqe->dma.length = length; + recv_wqe->dma.resid = length; + recv_wqe->dma.num_sge = num_sge; + recv_wqe->dma.cur_sge = 0; + recv_wqe->dma.sge_offset = 0; + memcpy(recv_wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + queue_advance_producer(rq->queue, QUEUE_TYPE_FROM_ULP); + + return 0; + +err_out: + rxe_dbg("returned err = %d", err); + return err; +} + +static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_rq *rq = &qp->rq; + unsigned long flags; + + spin_lock_irqsave(&qp->state_lock, flags); + /* caller has already called destroy_qp */ + if (WARN_ON_ONCE(!qp->valid)) { + spin_unlock_irqrestore(&qp->state_lock, flags); + rxe_err_qp(qp, "qp has been destroyed"); + return -EINVAL; + } + + /* see C10-97.2.1 */ + if (unlikely((qp_state(qp) < IB_QPS_INIT))) { + spin_unlock_irqrestore(&qp->state_lock, flags); + *bad_wr = wr; + rxe_dbg_qp(qp, "qp not ready to post recv"); + return -EINVAL; + } + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (unlikely(qp->srq)) { + *bad_wr = wr; + rxe_dbg_qp(qp, "qp has srq, use post_srq_recv instead"); + return -EINVAL; + } + + spin_lock_irqsave(&rq->producer_lock, flags); + + while (wr) { + err = post_one_recv(rq, wr); + if (unlikely(err)) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + + spin_unlock_irqrestore(&rq->producer_lock, flags); + + spin_lock_irqsave(&qp->state_lock, flags); + if (qp_state(qp) == IB_QPS_ERR) + rxe_sched_task(&qp->resp.task); + spin_unlock_irqrestore(&qp->state_lock, flags); + + return err; +} + +/* cq */ +static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct ib_device *dev = ibcq->device; + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_create_cq_resp __user *uresp = NULL; + int err, cleanup_err; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) { + err = -EINVAL; + rxe_dbg_dev(rxe, "malformed udata, err = %d", err); + goto err_out; + } + uresp = udata->outbuf; + } + + if (attr->flags) { + err = -EOPNOTSUPP; + rxe_dbg_dev(rxe, "bad attr->flags, err = %d", err); + goto err_out; + } + + err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); + if (err) { + rxe_dbg_dev(rxe, "bad init attributes, err = %d", err); + goto err_out; + } + + err = rxe_add_to_pool(&rxe->cq_pool, cq); + if (err) { + rxe_dbg_dev(rxe, "unable to create cq, err = %d", err); + goto err_out; + } + + err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata, + uresp); + if (err) { + rxe_dbg_cq(cq, "create cq failed, err = %d", err); + goto err_cleanup; + } + + return 0; + +err_cleanup: + cleanup_err = rxe_cleanup(cq); + if (cleanup_err) + rxe_err_cq(cq, "cleanup failed, err = %d", cleanup_err); +err_out: + rxe_err_dev(rxe, "returned err = %d", err); + return err; +} + +static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_dev *rxe = to_rdev(ibcq->device); + struct rxe_resize_cq_resp __user *uresp = NULL; + int err; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) { + err = -EINVAL; + rxe_dbg_cq(cq, "malformed udata"); + goto err_out; + } + uresp = udata->outbuf; + } + + err = rxe_cq_chk_attr(rxe, cq, cqe, 0); + if (err) { + rxe_dbg_cq(cq, "bad attr, err = %d", err); + goto err_out; + } + + err = rxe_cq_resize_queue(cq, cqe, uresp, udata); + if (err) { + rxe_dbg_cq(cq, "resize cq failed, err = %d", err); + goto err_out; + } + + return 0; + +err_out: + rxe_err_cq(cq, "returned err = %d", err); + return err; +} + +static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + int i; + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_cqe *cqe; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + for (i = 0; i < num_entries; i++) { + cqe = queue_head(cq->queue, QUEUE_TYPE_TO_ULP); + if (!cqe) + break; /* queue empty */ + + memcpy(wc++, &cqe->ibwc, sizeof(*wc)); + queue_advance_consumer(cq->queue, QUEUE_TYPE_TO_ULP); + } + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return i; +} + +static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) +{ + struct rxe_cq *cq = to_rcq(ibcq); + int count; + + count = queue_count(cq->queue, QUEUE_TYPE_TO_ULP); + + return (count > wc_cnt) ? wc_cnt : count; +} + +static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct rxe_cq *cq = to_rcq(ibcq); + int ret = 0; + int empty; + unsigned long irq_flags; + + spin_lock_irqsave(&cq->cq_lock, irq_flags); + cq->notify |= flags & IB_CQ_SOLICITED_MASK; + empty = queue_empty(cq->queue, QUEUE_TYPE_TO_ULP); + + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty) + ret = 1; + + spin_unlock_irqrestore(&cq->cq_lock, irq_flags); + + return ret; +} + +static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct rxe_cq *cq = to_rcq(ibcq); + int err; + + /* See IBA C11-17: The CI shall return an error if this Verb is + * invoked while a Work Queue is still associated with the CQ. + */ + if (atomic_read(&cq->num_wq)) { + err = -EINVAL; + rxe_dbg_cq(cq, "still in use"); + goto err_out; + } + + err = rxe_cleanup(cq); + if (err) + rxe_err_cq(cq, "cleanup failed, err = %d", err); + + return 0; + +err_out: + rxe_err_cq(cq, "returned err = %d", err); + return err; +} + +/* mr */ +static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mr *mr; + int err; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = rxe_add_to_pool(&rxe->mr_pool, mr); + if (err) { + rxe_dbg_dev(rxe, "unable to create mr"); + goto err_free; + } + + rxe_get(pd); + mr->ibmr.pd = ibpd; + mr->ibmr.device = ibpd->device; + + rxe_mr_init_dma(access, mr); + rxe_finalize(mr); + return &mr->ibmr; + +err_free: + kfree(mr); + rxe_err_pd(pd, "returned err = %d", err); + return ERR_PTR(err); +} + +static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, + u64 length, u64 iova, int access, + struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mr *mr; + int err, cleanup_err; + + if (access & ~RXE_ACCESS_SUPPORTED_MR) { + rxe_err_pd(pd, "access = %#x not supported (%#x)", access, + RXE_ACCESS_SUPPORTED_MR); + return ERR_PTR(-EOPNOTSUPP); + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = rxe_add_to_pool(&rxe->mr_pool, mr); + if (err) { + rxe_dbg_pd(pd, "unable to create mr"); + goto err_free; + } + + rxe_get(pd); + mr->ibmr.pd = ibpd; + mr->ibmr.device = ibpd->device; + + err = rxe_mr_init_user(rxe, start, length, iova, access, mr); + if (err) { + rxe_dbg_mr(mr, "reg_user_mr failed, err = %d", err); + goto err_cleanup; + } + + rxe_finalize(mr); + return &mr->ibmr; + +err_cleanup: + cleanup_err = rxe_cleanup(mr); + if (cleanup_err) + rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err); +err_free: + kfree(mr); + rxe_err_pd(pd, "returned err = %d", err); + return ERR_PTR(err); +} + +static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags, + u64 start, u64 length, u64 iova, + int access, struct ib_pd *ibpd, + struct ib_udata *udata) +{ + struct rxe_mr *mr = to_rmr(ibmr); + struct rxe_pd *old_pd = to_rpd(ibmr->pd); + struct rxe_pd *pd = to_rpd(ibpd); + + /* for now only support the two easy cases: + * rereg_pd and rereg_access + */ + if (flags & ~RXE_MR_REREG_SUPPORTED) { + rxe_err_mr(mr, "flags = %#x not supported", flags); + return ERR_PTR(-EOPNOTSUPP); + } + + if (flags & IB_MR_REREG_PD) { + rxe_put(old_pd); + rxe_get(pd); + mr->ibmr.pd = ibpd; + } + + if (flags & IB_MR_REREG_ACCESS) { + if (access & ~RXE_ACCESS_SUPPORTED_MR) { + rxe_err_mr(mr, "access = %#x not supported", access); + return ERR_PTR(-EOPNOTSUPP); + } + mr->access = access; + } + + return NULL; +} + +static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mr *mr; + int err, cleanup_err; + + if (mr_type != IB_MR_TYPE_MEM_REG) { + err = -EINVAL; + rxe_dbg_pd(pd, "mr type %d not supported, err = %d", + mr_type, err); + goto err_out; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + err = rxe_add_to_pool(&rxe->mr_pool, mr); + if (err) + goto err_free; + + rxe_get(pd); + mr->ibmr.pd = ibpd; + mr->ibmr.device = ibpd->device; + + err = rxe_mr_init_fast(max_num_sg, mr); + if (err) { + rxe_dbg_mr(mr, "alloc_mr failed, err = %d", err); + goto err_cleanup; + } + + rxe_finalize(mr); + return &mr->ibmr; + +err_cleanup: + cleanup_err = rxe_cleanup(mr); + if (cleanup_err) + rxe_err_mr(mr, "cleanup failed, err = %d", err); +err_free: + kfree(mr); +err_out: + rxe_err_pd(pd, "returned err = %d", err); + return ERR_PTR(err); +} + +static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct rxe_mr *mr = to_rmr(ibmr); + int err, cleanup_err; + + /* See IBA 10.6.7.2.6 */ + if (atomic_read(&mr->num_mw) > 0) { + err = -EINVAL; + rxe_dbg_mr(mr, "mr has mw's bound"); + goto err_out; + } + + cleanup_err = rxe_cleanup(mr); + if (cleanup_err) + rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err); + + kfree_rcu_mightsleep(mr); + return 0; + +err_out: + rxe_err_mr(mr, "returned err = %d", err); + return err; +} + +static ssize_t parent_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct rxe_dev *rxe = + rdma_device_to_drv_device(device, struct rxe_dev, ib_dev); + + return sysfs_emit(buf, "%s\n", rxe_parent_name(rxe, 1)); +} + +static DEVICE_ATTR_RO(parent); + +static struct attribute *rxe_dev_attributes[] = { + &dev_attr_parent.attr, + NULL +}; + +static const struct attribute_group rxe_attr_group = { + .attrs = rxe_dev_attributes, +}; + +static int rxe_enable_driver(struct ib_device *ib_dev) +{ + struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); + + rxe_set_port_state(rxe); + dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev)); + return 0; +} + +static const struct ib_device_ops rxe_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_RXE, + .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, + + .alloc_hw_port_stats = rxe_ib_alloc_hw_port_stats, + .alloc_mr = rxe_alloc_mr, + .alloc_mw = rxe_alloc_mw, + .alloc_pd = rxe_alloc_pd, + .alloc_ucontext = rxe_alloc_ucontext, + .attach_mcast = rxe_attach_mcast, + .create_ah = rxe_create_ah, + .create_cq = rxe_create_cq, + .create_qp = rxe_create_qp, + .create_srq = rxe_create_srq, + .create_user_ah = rxe_create_ah, + .dealloc_driver = rxe_dealloc, + .dealloc_mw = rxe_dealloc_mw, + .dealloc_pd = rxe_dealloc_pd, + .dealloc_ucontext = rxe_dealloc_ucontext, + .dereg_mr = rxe_dereg_mr, + .destroy_ah = rxe_destroy_ah, + .destroy_cq = rxe_destroy_cq, + .destroy_qp = rxe_destroy_qp, + .destroy_srq = rxe_destroy_srq, + .detach_mcast = rxe_detach_mcast, + .device_group = &rxe_attr_group, + .enable_driver = rxe_enable_driver, + .get_dma_mr = rxe_get_dma_mr, + .get_hw_stats = rxe_ib_get_hw_stats, + .get_link_layer = rxe_get_link_layer, + .get_port_immutable = rxe_port_immutable, + .map_mr_sg = rxe_map_mr_sg, + .mmap = rxe_mmap, + .modify_ah = rxe_modify_ah, + .modify_device = rxe_modify_device, + .modify_port = rxe_modify_port, + .modify_qp = rxe_modify_qp, + .modify_srq = rxe_modify_srq, + .peek_cq = rxe_peek_cq, + .poll_cq = rxe_poll_cq, + .post_recv = rxe_post_recv, + .post_send = rxe_post_send, + .post_srq_recv = rxe_post_srq_recv, + .query_ah = rxe_query_ah, + .query_device = rxe_query_device, + .query_pkey = rxe_query_pkey, + .query_port = rxe_query_port, + .query_qp = rxe_query_qp, + .query_srq = rxe_query_srq, + .reg_user_mr = rxe_reg_user_mr, + .req_notify_cq = rxe_req_notify_cq, + .rereg_user_mr = rxe_rereg_user_mr, + .resize_cq = rxe_resize_cq, + + INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_qp, rxe_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq), + INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc), + INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw), +}; + +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) +{ + int err; + struct ib_device *dev = &rxe->ib_dev; + + strscpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); + + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = 1; + dev->num_comp_vectors = num_possible_cpus(); + dev->local_dma_lkey = 0; + addrconf_addr_eui48((unsigned char *)&dev->node_guid, + rxe->ndev->dev_addr); + + dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | + BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); + + ib_set_device_ops(dev, &rxe_dev_ops); + err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1); + if (err) + return err; + + err = rxe_icrc_init(rxe); + if (err) + return err; + + err = ib_register_device(dev, ibdev_name, NULL); + if (err) + rxe_dbg_dev(rxe, "failed with error %d\n", err); + + /* + * Note that rxe may be invalid at this point if another thread + * unregistered it. + */ + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h new file mode 100644 index 0000000000..ccb9d19ffe --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -0,0 +1,475 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_VERBS_H +#define RXE_VERBS_H + +#include <linux/interrupt.h> +#include <linux/workqueue.h> +#include "rxe_pool.h" +#include "rxe_task.h" +#include "rxe_hw_counters.h" + +static inline int pkey_match(u16 key1, u16 key2) +{ + return (((key1 & 0x7fff) != 0) && + ((key1 & 0x7fff) == (key2 & 0x7fff)) && + ((key1 & 0x8000) || (key2 & 0x8000))) ? 1 : 0; +} + +/* Return >0 if psn_a > psn_b + * 0 if psn_a == psn_b + * <0 if psn_a < psn_b + */ +static inline int psn_compare(u32 psn_a, u32 psn_b) +{ + s32 diff; + + diff = (psn_a - psn_b) << 8; + return diff; +} + +struct rxe_ucontext { + struct ib_ucontext ibuc; + struct rxe_pool_elem elem; +}; + +struct rxe_pd { + struct ib_pd ibpd; + struct rxe_pool_elem elem; +}; + +struct rxe_ah { + struct ib_ah ibah; + struct rxe_pool_elem elem; + struct rxe_av av; + bool is_user; + int ah_num; +}; + +struct rxe_cqe { + union { + struct ib_wc ibwc; + struct ib_uverbs_wc uibwc; + }; +}; + +struct rxe_cq { + struct ib_cq ibcq; + struct rxe_pool_elem elem; + struct rxe_queue *queue; + spinlock_t cq_lock; + u8 notify; + bool is_user; + atomic_t num_wq; +}; + +enum wqe_state { + wqe_state_posted, + wqe_state_processing, + wqe_state_pending, + wqe_state_done, + wqe_state_error, +}; + +struct rxe_sq { + int max_wr; + int max_sge; + int max_inline; + spinlock_t sq_lock; /* guard queue */ + struct rxe_queue *queue; +}; + +struct rxe_rq { + int max_wr; + int max_sge; + spinlock_t producer_lock; /* guard queue producer */ + spinlock_t consumer_lock; /* guard queue consumer */ + struct rxe_queue *queue; +}; + +struct rxe_srq { + struct ib_srq ibsrq; + struct rxe_pool_elem elem; + struct rxe_pd *pd; + struct rxe_rq rq; + u32 srq_num; + + int limit; + int error; +}; + +struct rxe_req_info { + int wqe_index; + u32 psn; + int opcode; + atomic_t rd_atomic; + int wait_fence; + int need_rd_atomic; + int wait_psn; + int need_retry; + int wait_for_rnr_timer; + int noack_pkts; + struct rxe_task task; +}; + +struct rxe_comp_info { + u32 psn; + int opcode; + int timeout; + int timeout_retry; + int started_retry; + u32 retry_cnt; + u32 rnr_retry; + struct rxe_task task; +}; + +enum rdatm_res_state { + rdatm_res_state_next, + rdatm_res_state_new, + rdatm_res_state_replay, +}; + +struct resp_res { + int type; + int replay; + u32 first_psn; + u32 last_psn; + u32 cur_psn; + enum rdatm_res_state state; + + union { + struct { + u64 orig_val; + } atomic; + struct { + u64 va_org; + u32 rkey; + u32 length; + u64 va; + u32 resid; + } read; + struct { + u32 length; + u64 va; + u8 type; + u8 level; + } flush; + }; +}; + +struct rxe_resp_info { + u32 msn; + u32 psn; + u32 ack_psn; + int opcode; + int drop_msg; + int goto_error; + int sent_psn_nak; + enum ib_wc_status status; + u8 aeth_syndrome; + + /* Receive only */ + struct rxe_recv_wqe *wqe; + + /* RDMA read / atomic only */ + u64 va; + u64 offset; + struct rxe_mr *mr; + u32 resid; + u32 rkey; + u32 length; + + /* SRQ only */ + struct { + struct rxe_recv_wqe wqe; + struct ib_sge sge[RXE_MAX_SGE]; + } srq_wqe; + + /* Responder resources. It's a circular list where the oldest + * resource is dropped first. + */ + struct resp_res *resources; + unsigned int res_head; + unsigned int res_tail; + struct resp_res *res; + struct rxe_task task; +}; + +struct rxe_qp { + struct ib_qp ibqp; + struct rxe_pool_elem elem; + struct ib_qp_attr attr; + unsigned int valid; + unsigned int mtu; + bool is_user; + + struct rxe_pd *pd; + struct rxe_srq *srq; + struct rxe_cq *scq; + struct rxe_cq *rcq; + + enum ib_sig_type sq_sig_type; + + struct rxe_sq sq; + struct rxe_rq rq; + + struct socket *sk; + u32 dst_cookie; + u16 src_port; + + struct rxe_av pri_av; + struct rxe_av alt_av; + + atomic_t mcg_num; + + struct sk_buff_head req_pkts; + struct sk_buff_head resp_pkts; + + struct rxe_req_info req; + struct rxe_comp_info comp; + struct rxe_resp_info resp; + + atomic_t ssn; + atomic_t skb_out; + int need_req_skb; + + /* Timer for retranmitting packet when ACKs have been lost. RC + * only. The requester sets it when it is not already + * started. The responder resets it whenever an ack is + * received. + */ + struct timer_list retrans_timer; + u64 qp_timeout_jiffies; + + /* Timer for handling RNR NAKS. */ + struct timer_list rnr_nak_timer; + + spinlock_t state_lock; /* guard requester and completer */ + + struct execute_work cleanup_work; +}; + +enum { + RXE_ACCESS_REMOTE = IB_ACCESS_REMOTE_READ + | IB_ACCESS_REMOTE_WRITE + | IB_ACCESS_REMOTE_ATOMIC, + RXE_ACCESS_SUPPORTED_MR = RXE_ACCESS_REMOTE + | IB_ACCESS_LOCAL_WRITE + | IB_ACCESS_MW_BIND + | IB_ACCESS_ON_DEMAND + | IB_ACCESS_FLUSH_GLOBAL + | IB_ACCESS_FLUSH_PERSISTENT + | IB_ACCESS_OPTIONAL, + RXE_ACCESS_SUPPORTED_QP = RXE_ACCESS_SUPPORTED_MR, + RXE_ACCESS_SUPPORTED_MW = RXE_ACCESS_SUPPORTED_MR + | IB_ZERO_BASED, +}; + +enum rxe_mr_state { + RXE_MR_STATE_INVALID, + RXE_MR_STATE_FREE, + RXE_MR_STATE_VALID, +}; + +enum rxe_mr_copy_dir { + RXE_TO_MR_OBJ, + RXE_FROM_MR_OBJ, +}; + +enum rxe_mr_lookup_type { + RXE_LOOKUP_LOCAL, + RXE_LOOKUP_REMOTE, +}; + +enum rxe_rereg { + RXE_MR_REREG_SUPPORTED = IB_MR_REREG_PD + | IB_MR_REREG_ACCESS, +}; + +static inline int rkey_is_mw(u32 rkey) +{ + u32 index = rkey >> 8; + + return (index >= RXE_MIN_MW_INDEX) && (index <= RXE_MAX_MW_INDEX); +} + +struct rxe_mr { + struct rxe_pool_elem elem; + struct ib_mr ibmr; + + struct ib_umem *umem; + + u32 lkey; + u32 rkey; + enum rxe_mr_state state; + int access; + atomic_t num_mw; + + unsigned int page_offset; + unsigned int page_shift; + u64 page_mask; + + u32 num_buf; + u32 nbuf; + + struct xarray page_list; +}; + +static inline unsigned int mr_page_size(struct rxe_mr *mr) +{ + return mr ? mr->ibmr.page_size : PAGE_SIZE; +} + +enum rxe_mw_state { + RXE_MW_STATE_INVALID = RXE_MR_STATE_INVALID, + RXE_MW_STATE_FREE = RXE_MR_STATE_FREE, + RXE_MW_STATE_VALID = RXE_MR_STATE_VALID, +}; + +struct rxe_mw { + struct ib_mw ibmw; + struct rxe_pool_elem elem; + spinlock_t lock; + enum rxe_mw_state state; + struct rxe_qp *qp; /* Type 2 only */ + struct rxe_mr *mr; + u32 rkey; + int access; + u64 addr; + u64 length; +}; + +struct rxe_mcg { + struct rb_node node; + struct kref ref_cnt; + struct rxe_dev *rxe; + struct list_head qp_list; + union ib_gid mgid; + atomic_t qp_num; + u32 qkey; + u16 pkey; +}; + +struct rxe_mca { + struct list_head qp_list; + struct rxe_qp *qp; +}; + +struct rxe_port { + struct ib_port_attr attr; + __be64 port_guid; + __be64 subnet_prefix; + spinlock_t port_lock; /* guard port */ + unsigned int mtu_cap; + /* special QPs */ + u32 qp_gsi_index; +}; + +struct rxe_dev { + struct ib_device ib_dev; + struct ib_device_attr attr; + int max_ucontext; + int max_inline_data; + struct mutex usdev_lock; + + struct net_device *ndev; + + struct rxe_pool uc_pool; + struct rxe_pool pd_pool; + struct rxe_pool ah_pool; + struct rxe_pool srq_pool; + struct rxe_pool qp_pool; + struct rxe_pool cq_pool; + struct rxe_pool mr_pool; + struct rxe_pool mw_pool; + + /* multicast support */ + spinlock_t mcg_lock; + struct rb_root mcg_tree; + atomic_t mcg_num; + atomic_t mcg_attach; + + spinlock_t pending_lock; /* guard pending_mmaps */ + struct list_head pending_mmaps; + + spinlock_t mmap_offset_lock; /* guard mmap_offset */ + u64 mmap_offset; + + atomic64_t stats_counters[RXE_NUM_OF_COUNTERS]; + + struct rxe_port port; + struct crypto_shash *tfm; +}; + +static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index) +{ + atomic64_inc(&rxe->stats_counters[index]); +} + +static inline struct rxe_dev *to_rdev(struct ib_device *dev) +{ + return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL; +} + +static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc) +{ + return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL; +} + +static inline struct rxe_pd *to_rpd(struct ib_pd *pd) +{ + return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL; +} + +static inline struct rxe_ah *to_rah(struct ib_ah *ah) +{ + return ah ? container_of(ah, struct rxe_ah, ibah) : NULL; +} + +static inline struct rxe_srq *to_rsrq(struct ib_srq *srq) +{ + return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL; +} + +static inline struct rxe_qp *to_rqp(struct ib_qp *qp) +{ + return qp ? container_of(qp, struct rxe_qp, ibqp) : NULL; +} + +static inline struct rxe_cq *to_rcq(struct ib_cq *cq) +{ + return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL; +} + +static inline struct rxe_mr *to_rmr(struct ib_mr *mr) +{ + return mr ? container_of(mr, struct rxe_mr, ibmr) : NULL; +} + +static inline struct rxe_mw *to_rmw(struct ib_mw *mw) +{ + return mw ? container_of(mw, struct rxe_mw, ibmw) : NULL; +} + +static inline struct rxe_pd *rxe_ah_pd(struct rxe_ah *ah) +{ + return to_rpd(ah->ibah.pd); +} + +static inline struct rxe_pd *mr_pd(struct rxe_mr *mr) +{ + return to_rpd(mr->ibmr.pd); +} + +static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw) +{ + return to_rpd(mw->ibmw.pd); +} + +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name); + +#endif /* RXE_VERBS_H */ |