diff options
Diffstat (limited to 'drivers/infiniband/sw/rxe')
34 files changed, 11983 insertions, 0 deletions
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig new file mode 100644 index 000000000..06b8dc509 --- /dev/null +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0-only +config RDMA_RXE + tristate "Software RDMA over Ethernet (RoCE) driver" + depends on INET && PCI && INFINIBAND + depends on INFINIBAND_VIRT_DMA + select NET_UDP_TUNNEL + select CRYPTO + select CRYPTO_CRC32 + help + This driver implements the InfiniBand RDMA transport over + the Linux network stack. It enables a system with a + standard Ethernet adapter to interoperate with a RoCE + adapter or with another system running the RXE driver. + Documentation on InfiniBand and RoCE can be downloaded at + www.infinibandta.org and www.openfabrics.org. (See also + siw which is a similar software driver for iWARP.) + + The driver is split into two layers, one interfaces with the + Linux RDMA stack and implements a kernel or user space + verbs API. The user space verbs API requires a support + library named librxe which is loaded by the generic user + space verbs API, libibverbs. The other layer interfaces + with the Linux network stack at layer 3. + + To configure and work with soft-RoCE driver please use the + following wiki page under "configure Soft-RoCE (RXE)" section: + + https://github.com/linux-rdma/rdma-core/blob/master/Documentation/rxe.md diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile new file mode 100644 index 000000000..66af72dca --- /dev/null +++ b/drivers/infiniband/sw/rxe/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_RDMA_RXE) += rdma_rxe.o + +rdma_rxe-y := \ + rxe.o \ + rxe_comp.o \ + rxe_req.o \ + rxe_resp.o \ + rxe_recv.o \ + rxe_pool.o \ + rxe_queue.o \ + rxe_verbs.o \ + rxe_av.o \ + rxe_srq.o \ + rxe_qp.o \ + rxe_cq.o \ + rxe_mr.o \ + rxe_opcode.o \ + rxe_mmap.o \ + rxe_icrc.o \ + rxe_mcast.o \ + rxe_task.o \ + rxe_net.o \ + rxe_sysfs.o \ + rxe_hw_counters.o diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c new file mode 100644 index 000000000..95f0de0c8 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <rdma/rdma_netlink.h> +#include <net/addrconf.h> +#include "rxe.h" +#include "rxe_loc.h" + +MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); +MODULE_DESCRIPTION("Soft RDMA transport"); +MODULE_LICENSE("Dual BSD/GPL"); + +bool rxe_initialized; + +/* free resources for a rxe device all objects created for this device must + * have been destroyed + */ +void rxe_dealloc(struct ib_device *ib_dev) +{ + struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); + + rxe_pool_cleanup(&rxe->uc_pool); + rxe_pool_cleanup(&rxe->pd_pool); + rxe_pool_cleanup(&rxe->ah_pool); + rxe_pool_cleanup(&rxe->srq_pool); + rxe_pool_cleanup(&rxe->qp_pool); + rxe_pool_cleanup(&rxe->cq_pool); + rxe_pool_cleanup(&rxe->mr_pool); + rxe_pool_cleanup(&rxe->mw_pool); + rxe_pool_cleanup(&rxe->mc_grp_pool); + rxe_pool_cleanup(&rxe->mc_elem_pool); + + if (rxe->tfm) + crypto_free_shash(rxe->tfm); +} + +/* initialize rxe device parameters */ +static void rxe_init_device_param(struct rxe_dev *rxe) +{ + rxe->max_inline_data = RXE_MAX_INLINE_DATA; + + rxe->attr.vendor_id = RXE_VENDOR_ID; + rxe->attr.max_mr_size = RXE_MAX_MR_SIZE; + rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP; + rxe->attr.max_qp = RXE_MAX_QP; + rxe->attr.max_qp_wr = RXE_MAX_QP_WR; + rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS; + rxe->attr.max_send_sge = RXE_MAX_SGE; + rxe->attr.max_recv_sge = RXE_MAX_SGE; + rxe->attr.max_sge_rd = RXE_MAX_SGE_RD; + rxe->attr.max_cq = RXE_MAX_CQ; + rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1; + rxe->attr.max_mr = RXE_MAX_MR; + rxe->attr.max_pd = RXE_MAX_PD; + rxe->attr.max_qp_rd_atom = RXE_MAX_QP_RD_ATOM; + rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; + rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; + rxe->attr.atomic_cap = IB_ATOMIC_HCA; + rxe->attr.max_mcast_grp = RXE_MAX_MCAST_GRP; + rxe->attr.max_mcast_qp_attach = RXE_MAX_MCAST_QP_ATTACH; + rxe->attr.max_total_mcast_qp_attach = RXE_MAX_TOT_MCAST_QP_ATTACH; + rxe->attr.max_ah = RXE_MAX_AH; + rxe->attr.max_srq = RXE_MAX_SRQ; + rxe->attr.max_srq_wr = RXE_MAX_SRQ_WR; + rxe->attr.max_srq_sge = RXE_MAX_SRQ_SGE; + rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; + rxe->attr.max_pkeys = RXE_MAX_PKEYS; + rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, + rxe->ndev->dev_addr); + + rxe->max_ucontext = RXE_MAX_UCONTEXT; +} + +/* initialize port attributes */ +static void rxe_init_port_param(struct rxe_port *port) +{ + port->attr.state = IB_PORT_DOWN; + port->attr.max_mtu = IB_MTU_4096; + port->attr.active_mtu = IB_MTU_256; + port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; + port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; + port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; + port->attr.bad_pkey_cntr = RXE_PORT_BAD_PKEY_CNTR; + port->attr.qkey_viol_cntr = RXE_PORT_QKEY_VIOL_CNTR; + port->attr.pkey_tbl_len = RXE_PORT_PKEY_TBL_LEN; + port->attr.lid = RXE_PORT_LID; + port->attr.sm_lid = RXE_PORT_SM_LID; + port->attr.lmc = RXE_PORT_LMC; + port->attr.max_vl_num = RXE_PORT_MAX_VL_NUM; + port->attr.sm_sl = RXE_PORT_SM_SL; + port->attr.subnet_timeout = RXE_PORT_SUBNET_TIMEOUT; + port->attr.init_type_reply = RXE_PORT_INIT_TYPE_REPLY; + port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; + port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; + port->attr.phys_state = RXE_PORT_PHYS_STATE; + port->mtu_cap = ib_mtu_enum_to_int(IB_MTU_256); + port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); +} + +/* initialize port state, note IB convention that HCA ports are always + * numbered from 1 + */ +static void rxe_init_ports(struct rxe_dev *rxe) +{ + struct rxe_port *port = &rxe->port; + + rxe_init_port_param(port); + addrconf_addr_eui48((unsigned char *)&port->port_guid, + rxe->ndev->dev_addr); + spin_lock_init(&port->port_lock); +} + +/* init pools of managed objects */ +static int rxe_init_pools(struct rxe_dev *rxe) +{ + int err; + + err = rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC, + rxe->max_ucontext); + if (err) + goto err1; + + err = rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD, + rxe->attr.max_pd); + if (err) + goto err2; + + err = rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH, + rxe->attr.max_ah); + if (err) + goto err3; + + err = rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ, + rxe->attr.max_srq); + if (err) + goto err4; + + err = rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP, + rxe->attr.max_qp); + if (err) + goto err5; + + err = rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ, + rxe->attr.max_cq); + if (err) + goto err6; + + err = rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR, + rxe->attr.max_mr); + if (err) + goto err7; + + err = rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW, + rxe->attr.max_mw); + if (err) + goto err8; + + err = rxe_pool_init(rxe, &rxe->mc_grp_pool, RXE_TYPE_MC_GRP, + rxe->attr.max_mcast_grp); + if (err) + goto err9; + + err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM, + rxe->attr.max_total_mcast_qp_attach); + if (err) + goto err10; + + return 0; + +err10: + rxe_pool_cleanup(&rxe->mc_grp_pool); +err9: + rxe_pool_cleanup(&rxe->mw_pool); +err8: + rxe_pool_cleanup(&rxe->mr_pool); +err7: + rxe_pool_cleanup(&rxe->cq_pool); +err6: + rxe_pool_cleanup(&rxe->qp_pool); +err5: + rxe_pool_cleanup(&rxe->srq_pool); +err4: + rxe_pool_cleanup(&rxe->ah_pool); +err3: + rxe_pool_cleanup(&rxe->pd_pool); +err2: + rxe_pool_cleanup(&rxe->uc_pool); +err1: + return err; +} + +/* initialize rxe device state */ +static int rxe_init(struct rxe_dev *rxe) +{ + int err; + + /* init default device parameters */ + rxe_init_device_param(rxe); + + rxe_init_ports(rxe); + + err = rxe_init_pools(rxe); + if (err) + return err; + + /* init pending mmap list */ + spin_lock_init(&rxe->mmap_offset_lock); + spin_lock_init(&rxe->pending_lock); + INIT_LIST_HEAD(&rxe->pending_mmaps); + + mutex_init(&rxe->usdev_lock); + + return 0; +} + +void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) +{ + struct rxe_port *port = &rxe->port; + enum ib_mtu mtu; + + mtu = eth_mtu_int_to_enum(ndev_mtu); + + /* Make sure that new MTU in range */ + mtu = mtu ? min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256; + + port->attr.active_mtu = mtu; + port->mtu_cap = ib_mtu_enum_to_int(mtu); +} + +/* called by ifc layer to create new rxe device. + * The caller should allocate memory for rxe by calling ib_alloc_device. + */ +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) +{ + int err; + + err = rxe_init(rxe); + if (err) + return err; + + rxe_set_mtu(rxe, mtu); + + return rxe_register_device(rxe, ibdev_name); +} + +static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) +{ + struct rxe_dev *exists; + int err = 0; + + if (is_vlan_dev(ndev)) { + pr_err("rxe creation allowed on top of a real device only\n"); + err = -EPERM; + goto err; + } + + exists = rxe_get_dev_from_net(ndev); + if (exists) { + ib_device_put(&exists->ib_dev); + pr_err("already configured on %s\n", ndev->name); + err = -EEXIST; + goto err; + } + + err = rxe_net_add(ibdev_name, ndev); + if (err) { + pr_err("failed to add %s\n", ndev->name); + goto err; + } +err: + return err; +} + +static struct rdma_link_ops rxe_link_ops = { + .type = "rxe", + .newlink = rxe_newlink, +}; + +static int __init rxe_module_init(void) +{ + int err; + + err = rxe_net_init(); + if (err) + return err; + + rdma_link_register(&rxe_link_ops); + rxe_initialized = true; + pr_info("loaded\n"); + return 0; +} + +static void __exit rxe_module_exit(void) +{ + rdma_link_unregister(&rxe_link_ops); + ib_unregister_driver(RDMA_DRIVER_RXE); + rxe_net_exit(); + + rxe_initialized = false; + pr_info("unloaded\n"); +} + +late_initcall(rxe_module_init); +module_exit(rxe_module_exit); + +MODULE_ALIAS_RDMA_LINK("rxe"); diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h new file mode 100644 index 000000000..623fd17df --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_H +#define RXE_H + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/crc32.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_pack.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_addr.h> +#include <crypto/hash.h> + +#include "rxe_net.h" +#include "rxe_opcode.h" +#include "rxe_hdr.h" +#include "rxe_param.h" +#include "rxe_verbs.h" +#include "rxe_loc.h" + +/* + * Version 1 and Version 2 are identical on 64 bit machines, but on 32 bit + * machines Version 2 has a different struct layout. + */ +#define RXE_UVERBS_ABI_VERSION 2 + +#define RXE_ROCE_V2_SPORT (0xc000) + +extern bool rxe_initialized; + +static inline u32 rxe_crc32(struct rxe_dev *rxe, + u32 crc, void *next, size_t len) +{ + u32 retval; + int err; + + SHASH_DESC_ON_STACK(shash, rxe->tfm); + + shash->tfm = rxe->tfm; + *(u32 *)shash_desc_ctx(shash) = crc; + err = crypto_shash_update(shash, next, len); + if (unlikely(err)) { + pr_warn_ratelimited("failed crc calculation, err: %d\n", err); + return crc32_le(crc, next, len); + } + + retval = *(u32 *)shash_desc_ctx(shash); + barrier_data(shash_desc_ctx(shash)); + return retval; +} + +void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); + +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); + +void rxe_rcv(struct sk_buff *skb); + +/* The caller must do a matching ib_device_put(&dev->ib_dev) */ +static inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev) +{ + struct ib_device *ibdev = + ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE); + + if (!ibdev) + return NULL; + return container_of(ibdev, struct rxe_dev, ib_dev); +} + +void rxe_port_up(struct rxe_dev *rxe); +void rxe_port_down(struct rxe_dev *rxe); +void rxe_set_port_state(struct rxe_dev *rxe); + +#endif /* RXE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c new file mode 100644 index 000000000..da2e867a1 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av) +{ + rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr); + rxe_av_fill_ip_info(av, attr); + memcpy(av->dmac, attr->roce.dmac, ETH_ALEN); +} + +int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(attr); + struct rxe_port *port; + int type; + + port = &rxe->port; + + if (rdma_ah_get_ah_flags(attr) & IB_AH_GRH) { + if (grh->sgid_index > port->attr.gid_tbl_len) { + pr_warn("invalid sgid index = %d\n", + grh->sgid_index); + return -EINVAL; + } + + type = rdma_gid_attr_network_type(grh->sgid_attr); + if (type < RDMA_NETWORK_IPV4 || + type > RDMA_NETWORK_IPV6) { + pr_warn("invalid network type for rdma_rxe = %d\n", + type); + return -EINVAL; + } + } + + return 0; +} + +void rxe_av_from_attr(u8 port_num, struct rxe_av *av, + struct rdma_ah_attr *attr) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(attr); + + memset(av, 0, sizeof(*av)); + memcpy(av->grh.dgid.raw, grh->dgid.raw, sizeof(grh->dgid.raw)); + av->grh.flow_label = grh->flow_label; + av->grh.sgid_index = grh->sgid_index; + av->grh.hop_limit = grh->hop_limit; + av->grh.traffic_class = grh->traffic_class; + av->port_num = port_num; +} + +void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + attr->type = RDMA_AH_ATTR_TYPE_ROCE; + + memcpy(grh->dgid.raw, av->grh.dgid.raw, sizeof(av->grh.dgid.raw)); + grh->flow_label = av->grh.flow_label; + grh->sgid_index = av->grh.sgid_index; + grh->hop_limit = av->grh.hop_limit; + grh->traffic_class = av->grh.traffic_class; + + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + rdma_ah_set_port_num(attr, av->port_num); +} + +void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr) +{ + const struct ib_gid_attr *sgid_attr = attr->grh.sgid_attr; + int ibtype; + int type; + + rdma_gid2ip((struct sockaddr *)&av->sgid_addr, &sgid_attr->gid); + rdma_gid2ip((struct sockaddr *)&av->dgid_addr, + &rdma_ah_read_grh(attr)->dgid); + + ibtype = rdma_gid_attr_network_type(sgid_attr); + + switch (ibtype) { + case RDMA_NETWORK_IPV4: + type = RXE_NETWORK_TYPE_IPV4; + break; + case RDMA_NETWORK_IPV6: + type = RXE_NETWORK_TYPE_IPV6; + break; + default: + /* not reached - checked in rxe_av_chk_attr */ + type = 0; + break; + } + + av->network_type = type; +} + +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt) +{ + if (!pkt || !pkt->qp) + return NULL; + + if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC) + return &pkt->qp->pri_av; + + return (pkt->wqe) ? &pkt->wqe->av : NULL; +} diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c new file mode 100644 index 000000000..0a1e63932 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -0,0 +1,771 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_task.h" + +enum comp_state { + COMPST_GET_ACK, + COMPST_GET_WQE, + COMPST_COMP_WQE, + COMPST_COMP_ACK, + COMPST_CHECK_PSN, + COMPST_CHECK_ACK, + COMPST_READ, + COMPST_ATOMIC, + COMPST_WRITE_SEND, + COMPST_UPDATE_COMP, + COMPST_ERROR_RETRY, + COMPST_RNR_RETRY, + COMPST_ERROR, + COMPST_EXIT, /* We have an issue, and we want to rerun the completer */ + COMPST_DONE, /* The completer finished successflly */ +}; + +static char *comp_state_name[] = { + [COMPST_GET_ACK] = "GET ACK", + [COMPST_GET_WQE] = "GET WQE", + [COMPST_COMP_WQE] = "COMP WQE", + [COMPST_COMP_ACK] = "COMP ACK", + [COMPST_CHECK_PSN] = "CHECK PSN", + [COMPST_CHECK_ACK] = "CHECK ACK", + [COMPST_READ] = "READ", + [COMPST_ATOMIC] = "ATOMIC", + [COMPST_WRITE_SEND] = "WRITE/SEND", + [COMPST_UPDATE_COMP] = "UPDATE COMP", + [COMPST_ERROR_RETRY] = "ERROR RETRY", + [COMPST_RNR_RETRY] = "RNR RETRY", + [COMPST_ERROR] = "ERROR", + [COMPST_EXIT] = "EXIT", + [COMPST_DONE] = "DONE", +}; + +static unsigned long rnrnak_usec[32] = { + [IB_RNR_TIMER_655_36] = 655360, + [IB_RNR_TIMER_000_01] = 10, + [IB_RNR_TIMER_000_02] = 20, + [IB_RNR_TIMER_000_03] = 30, + [IB_RNR_TIMER_000_04] = 40, + [IB_RNR_TIMER_000_06] = 60, + [IB_RNR_TIMER_000_08] = 80, + [IB_RNR_TIMER_000_12] = 120, + [IB_RNR_TIMER_000_16] = 160, + [IB_RNR_TIMER_000_24] = 240, + [IB_RNR_TIMER_000_32] = 320, + [IB_RNR_TIMER_000_48] = 480, + [IB_RNR_TIMER_000_64] = 640, + [IB_RNR_TIMER_000_96] = 960, + [IB_RNR_TIMER_001_28] = 1280, + [IB_RNR_TIMER_001_92] = 1920, + [IB_RNR_TIMER_002_56] = 2560, + [IB_RNR_TIMER_003_84] = 3840, + [IB_RNR_TIMER_005_12] = 5120, + [IB_RNR_TIMER_007_68] = 7680, + [IB_RNR_TIMER_010_24] = 10240, + [IB_RNR_TIMER_015_36] = 15360, + [IB_RNR_TIMER_020_48] = 20480, + [IB_RNR_TIMER_030_72] = 30720, + [IB_RNR_TIMER_040_96] = 40960, + [IB_RNR_TIMER_061_44] = 61410, + [IB_RNR_TIMER_081_92] = 81920, + [IB_RNR_TIMER_122_88] = 122880, + [IB_RNR_TIMER_163_84] = 163840, + [IB_RNR_TIMER_245_76] = 245760, + [IB_RNR_TIMER_327_68] = 327680, + [IB_RNR_TIMER_491_52] = 491520, +}; + +static inline unsigned long rnrnak_jiffies(u8 timeout) +{ + return max_t(unsigned long, + usecs_to_jiffies(rnrnak_usec[timeout]), 1); +} + +static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: return IB_WC_RDMA_WRITE; + case IB_WR_RDMA_WRITE_WITH_IMM: return IB_WC_RDMA_WRITE; + case IB_WR_SEND: return IB_WC_SEND; + case IB_WR_SEND_WITH_IMM: return IB_WC_SEND; + case IB_WR_RDMA_READ: return IB_WC_RDMA_READ; + case IB_WR_ATOMIC_CMP_AND_SWP: return IB_WC_COMP_SWAP; + case IB_WR_ATOMIC_FETCH_AND_ADD: return IB_WC_FETCH_ADD; + case IB_WR_LSO: return IB_WC_LSO; + case IB_WR_SEND_WITH_INV: return IB_WC_SEND; + case IB_WR_RDMA_READ_WITH_INV: return IB_WC_RDMA_READ; + case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV; + case IB_WR_REG_MR: return IB_WC_REG_MR; + + default: + return 0xff; + } +} + +void retransmit_timer(struct timer_list *t) +{ + struct rxe_qp *qp = from_timer(qp, t, retrans_timer); + + if (qp->valid) { + qp->comp.timeout = 1; + rxe_run_task(&qp->comp.task, 1); + } +} + +void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) +{ + int must_sched; + + skb_queue_tail(&qp->resp_pkts, skb); + + must_sched = skb_queue_len(&qp->resp_pkts) > 1; + if (must_sched != 0) + rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED); + + rxe_run_task(&qp->comp.task, must_sched); +} + +static inline enum comp_state get_wqe(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe **wqe_p) +{ + struct rxe_send_wqe *wqe; + + /* we come here whether or not we found a response packet to see if + * there are any posted WQEs + */ + wqe = queue_head(qp->sq.queue); + *wqe_p = wqe; + + /* no WQE or requester has not started it yet */ + if (!wqe || wqe->state == wqe_state_posted) + return pkt ? COMPST_DONE : COMPST_EXIT; + + /* WQE does not require an ack */ + if (wqe->state == wqe_state_done) + return COMPST_COMP_WQE; + + /* WQE caused an error */ + if (wqe->state == wqe_state_error) + return COMPST_ERROR; + + /* we have a WQE, if we also have an ack check its PSN */ + return pkt ? COMPST_CHECK_PSN : COMPST_EXIT; +} + +static inline void reset_retry_counters(struct rxe_qp *qp) +{ + qp->comp.retry_cnt = qp->attr.retry_cnt; + qp->comp.rnr_retry = qp->attr.rnr_retry; + qp->comp.started_retry = 0; +} + +static inline enum comp_state check_psn(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + s32 diff; + + /* check to see if response is past the oldest WQE. if it is, complete + * send/write or error read/atomic + */ + diff = psn_compare(pkt->psn, wqe->last_psn); + if (diff > 0) { + if (wqe->state == wqe_state_pending) { + if (wqe->mask & WR_ATOMIC_OR_READ_MASK) + return COMPST_ERROR_RETRY; + + reset_retry_counters(qp); + return COMPST_COMP_WQE; + } else { + return COMPST_DONE; + } + } + + /* compare response packet to expected response */ + diff = psn_compare(pkt->psn, qp->comp.psn); + if (diff < 0) { + /* response is most likely a retried packet if it matches an + * uncompleted WQE go complete it else ignore it + */ + if (pkt->psn == wqe->last_psn) + return COMPST_COMP_ACK; + else + return COMPST_DONE; + } else if ((diff > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK)) { + return COMPST_DONE; + } else { + return COMPST_CHECK_ACK; + } +} + +static inline enum comp_state check_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + unsigned int mask = pkt->mask; + u8 syn; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + /* Check the sequence only */ + switch (qp->comp.opcode) { + case -1: + /* Will catch all *_ONLY cases. */ + if (!(mask & RXE_START_MASK)) + return COMPST_ERROR; + + break; + + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && + pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { + /* read retries of partial data may restart from + * read response first or response only. + */ + if ((pkt->psn == wqe->first_psn && + pkt->opcode == + IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) || + (wqe->first_psn == wqe->last_psn && + pkt->opcode == + IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY)) + break; + + return COMPST_ERROR; + } + break; + default: + WARN_ON_ONCE(1); + } + + /* Check operation validity. */ + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST: + case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY: + syn = aeth_syn(pkt); + + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return COMPST_ERROR; + + fallthrough; + /* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH) + */ + case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: + if (wqe->wr.opcode != IB_WR_RDMA_READ && + wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV) { + wqe->status = IB_WC_FATAL_ERR; + return COMPST_ERROR; + } + reset_retry_counters(qp); + return COMPST_READ; + + case IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE: + syn = aeth_syn(pkt); + + if ((syn & AETH_TYPE_MASK) != AETH_ACK) + return COMPST_ERROR; + + if (wqe->wr.opcode != IB_WR_ATOMIC_CMP_AND_SWP && + wqe->wr.opcode != IB_WR_ATOMIC_FETCH_AND_ADD) + return COMPST_ERROR; + reset_retry_counters(qp); + return COMPST_ATOMIC; + + case IB_OPCODE_RC_ACKNOWLEDGE: + syn = aeth_syn(pkt); + switch (syn & AETH_TYPE_MASK) { + case AETH_ACK: + reset_retry_counters(qp); + return COMPST_WRITE_SEND; + + case AETH_RNR_NAK: + rxe_counter_inc(rxe, RXE_CNT_RCV_RNR); + return COMPST_RNR_RETRY; + + case AETH_NAK: + switch (syn) { + case AETH_NAK_PSN_SEQ_ERROR: + /* a nak implicitly acks all packets with psns + * before + */ + if (psn_compare(pkt->psn, qp->comp.psn) > 0) { + rxe_counter_inc(rxe, + RXE_CNT_RCV_SEQ_ERR); + qp->comp.psn = pkt->psn; + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 0); + } + } + return COMPST_ERROR_RETRY; + + case AETH_NAK_INVALID_REQ: + wqe->status = IB_WC_REM_INV_REQ_ERR; + return COMPST_ERROR; + + case AETH_NAK_REM_ACC_ERR: + wqe->status = IB_WC_REM_ACCESS_ERR; + return COMPST_ERROR; + + case AETH_NAK_REM_OP_ERR: + wqe->status = IB_WC_REM_OP_ERR; + return COMPST_ERROR; + + default: + pr_warn("unexpected nak %x\n", syn); + wqe->status = IB_WC_REM_OP_ERR; + return COMPST_ERROR; + } + + default: + return COMPST_ERROR; + } + break; + + default: + pr_warn("unexpected opcode\n"); + } + + return COMPST_ERROR; +} + +static inline enum comp_state do_read(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + int ret; + + ret = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, + &wqe->dma, payload_addr(pkt), + payload_size(pkt), to_mem_obj, NULL); + if (ret) + return COMPST_ERROR; + + if (wqe->dma.resid == 0 && (pkt->mask & RXE_END_MASK)) + return COMPST_COMP_ACK; + else + return COMPST_UPDATE_COMP; +} + +static inline enum comp_state do_atomic(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + int ret; + + u64 atomic_orig = atmack_orig(pkt); + + ret = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, + &wqe->dma, &atomic_orig, + sizeof(u64), to_mem_obj, NULL); + if (ret) + return COMPST_ERROR; + else + return COMPST_COMP_ACK; +} + +static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_cqe *cqe) +{ + memset(cqe, 0, sizeof(*cqe)); + + if (!qp->is_user) { + struct ib_wc *wc = &cqe->ibwc; + + wc->wr_id = wqe->wr.wr_id; + wc->status = wqe->status; + wc->opcode = wr_to_wc_opcode(wqe->wr.opcode); + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + wc->wc_flags = IB_WC_WITH_IMM; + wc->byte_len = wqe->dma.length; + wc->qp = &qp->ibqp; + } else { + struct ib_uverbs_wc *uwc = &cqe->uibwc; + + uwc->wr_id = wqe->wr.wr_id; + uwc->status = wqe->status; + uwc->opcode = wr_to_wc_opcode(wqe->wr.opcode); + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + uwc->wc_flags = IB_WC_WITH_IMM; + uwc->byte_len = wqe->dma.length; + uwc->qp_num = qp->ibqp.qp_num; + } +} + +/* + * IBA Spec. Section 10.7.3.1 SIGNALED COMPLETIONS + * ---------8<---------8<------------- + * ...Note that if a completion error occurs, a Work Completion + * will always be generated, even if the signaling + * indicator requests an Unsignaled Completion. + * ---------8<---------8<------------- + */ +static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_cqe cqe; + + if ((qp->sq_sig_type == IB_SIGNAL_ALL_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + wqe->status != IB_WC_SUCCESS) { + make_send_cqe(qp, wqe, &cqe); + advance_consumer(qp->sq.queue); + rxe_cq_post(qp->scq, &cqe, 0); + } else { + advance_consumer(qp->sq.queue); + } + + if (wqe->wr.opcode == IB_WR_SEND || + wqe->wr.opcode == IB_WR_SEND_WITH_IMM || + wqe->wr.opcode == IB_WR_SEND_WITH_INV) + rxe_counter_inc(rxe, RXE_CNT_RDMA_SEND); + + /* + * we completed something so let req run again + * if it is trying to fence + */ + if (qp->req.wait_fence) { + qp->req.wait_fence = 0; + rxe_run_task(&qp->req.task, 0); + } +} + +static inline enum comp_state complete_ack(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + unsigned long flags; + + if (wqe->has_rd_atomic) { + wqe->has_rd_atomic = 0; + atomic_inc(&qp->req.rd_atomic); + if (qp->req.need_rd_atomic) { + qp->comp.timeout_retry = 0; + qp->req.need_rd_atomic = 0; + rxe_run_task(&qp->req.task, 0); + } + } + + if (unlikely(qp->req.state == QP_STATE_DRAIN)) { + /* state_lock used by requester & completer */ + spin_lock_irqsave(&qp->state_lock, flags); + if ((qp->req.state == QP_STATE_DRAIN) && + (qp->comp.psn == qp->req.psn)) { + qp->req.state = QP_STATE_DRAINED; + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_SQ_DRAINED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + } else { + spin_unlock_irqrestore(&qp->state_lock, flags); + } + } + + do_complete(qp, wqe); + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + return COMPST_UPDATE_COMP; + else + return COMPST_DONE; +} + +static inline enum comp_state complete_wqe(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_send_wqe *wqe) +{ + if (pkt && wqe->state == wqe_state_pending) { + if (psn_compare(wqe->last_psn, qp->comp.psn) >= 0) { + qp->comp.psn = (wqe->last_psn + 1) & BTH_PSN_MASK; + qp->comp.opcode = -1; + } + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 1); + } + } + + do_complete(qp, wqe); + + return COMPST_GET_WQE; +} + +static void rxe_drain_resp_pkts(struct rxe_qp *qp, bool notify) +{ + struct sk_buff *skb; + struct rxe_send_wqe *wqe; + + while ((skb = skb_dequeue(&qp->resp_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + + while ((wqe = queue_head(qp->sq.queue))) { + if (notify) { + wqe->status = IB_WC_WR_FLUSH_ERR; + do_complete(qp, wqe); + } else { + advance_consumer(qp->sq.queue); + } + } +} + +int rxe_completer(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_send_wqe *wqe = NULL; + struct sk_buff *skb = NULL; + struct rxe_pkt_info *pkt = NULL; + enum comp_state state; + + rxe_add_ref(qp); + + if (!qp->valid || qp->req.state == QP_STATE_ERROR || + qp->req.state == QP_STATE_RESET) { + rxe_drain_resp_pkts(qp, qp->valid && + qp->req.state == QP_STATE_ERROR); + goto exit; + } + + if (qp->comp.timeout) { + qp->comp.timeout_retry = 1; + qp->comp.timeout = 0; + } else { + qp->comp.timeout_retry = 0; + } + + if (qp->req.need_retry) + goto exit; + + state = COMPST_GET_ACK; + + while (1) { + pr_debug("qp#%d state = %s\n", qp_num(qp), + comp_state_name[state]); + switch (state) { + case COMPST_GET_ACK: + skb = skb_dequeue(&qp->resp_pkts); + if (skb) { + pkt = SKB_TO_PKT(skb); + qp->comp.timeout_retry = 0; + } + state = COMPST_GET_WQE; + break; + + case COMPST_GET_WQE: + state = get_wqe(qp, pkt, &wqe); + break; + + case COMPST_CHECK_PSN: + state = check_psn(qp, pkt, wqe); + break; + + case COMPST_CHECK_ACK: + state = check_ack(qp, pkt, wqe); + break; + + case COMPST_READ: + state = do_read(qp, pkt, wqe); + break; + + case COMPST_ATOMIC: + state = do_atomic(qp, pkt, wqe); + break; + + case COMPST_WRITE_SEND: + if (wqe->state == wqe_state_pending && + wqe->last_psn == pkt->psn) + state = COMPST_COMP_ACK; + else + state = COMPST_UPDATE_COMP; + break; + + case COMPST_COMP_ACK: + state = complete_ack(qp, pkt, wqe); + break; + + case COMPST_COMP_WQE: + state = complete_wqe(qp, pkt, wqe); + break; + + case COMPST_UPDATE_COMP: + if (pkt->mask & RXE_END_MASK) + qp->comp.opcode = -1; + else + qp->comp.opcode = pkt->opcode; + + if (psn_compare(pkt->psn, qp->comp.psn) >= 0) + qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + + if (qp->req.wait_psn) { + qp->req.wait_psn = 0; + rxe_run_task(&qp->req.task, 1); + } + + state = COMPST_DONE; + break; + + case COMPST_DONE: + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + } + goto done; + + case COMPST_EXIT: + if (qp->comp.timeout_retry && wqe) { + state = COMPST_ERROR_RETRY; + break; + } + + /* re reset the timeout counter if + * (1) QP is type RC + * (2) the QP is alive + * (3) there is a packet sent by the requester that + * might be acked (we still might get spurious + * timeouts but try to keep them as few as possible) + * (4) the timeout parameter is set + */ + if ((qp_type(qp) == IB_QPT_RC) && + (qp->req.state == QP_STATE_READY) && + (psn_compare(qp->req.psn, qp->comp.psn) > 0) && + qp->qp_timeout_jiffies) + mod_timer(&qp->retrans_timer, + jiffies + qp->qp_timeout_jiffies); + goto exit; + + case COMPST_ERROR_RETRY: + /* we come here if the retry timer fired and we did + * not receive a response packet. try to retry the send + * queue if that makes sense and the limits have not + * been exceeded. remember that some timeouts are + * spurious since we do not reset the timer but kick + * it down the road or let it expire + */ + + /* there is nothing to retry in this case */ + if (!wqe || (wqe->state == wqe_state_posted)) + goto exit; + + /* if we've started a retry, don't start another + * retry sequence, unless this is a timeout. + */ + if (qp->comp.started_retry && + !qp->comp.timeout_retry) { + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + } + + goto done; + } + + if (qp->comp.retry_cnt > 0) { + if (qp->comp.retry_cnt != 7) + qp->comp.retry_cnt--; + + /* no point in retrying if we have already + * seen the last ack that the requester could + * have caused + */ + if (psn_compare(qp->req.psn, + qp->comp.psn) > 0) { + /* tell the requester to retry the + * send queue next time around + */ + rxe_counter_inc(rxe, + RXE_CNT_COMP_RETRY); + qp->req.need_retry = 1; + qp->comp.started_retry = 1; + rxe_run_task(&qp->req.task, 0); + } + + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + } + + goto done; + + } else { + rxe_counter_inc(rxe, RXE_CNT_RETRY_EXCEEDED); + wqe->status = IB_WC_RETRY_EXC_ERR; + state = COMPST_ERROR; + } + break; + + case COMPST_RNR_RETRY: + if (qp->comp.rnr_retry > 0) { + if (qp->comp.rnr_retry != 7) + qp->comp.rnr_retry--; + + qp->req.need_retry = 1; + pr_debug("qp#%d set rnr nak timer\n", + qp_num(qp)); + mod_timer(&qp->rnr_nak_timer, + jiffies + rnrnak_jiffies(aeth_syn(pkt) + & ~AETH_TYPE_MASK)); + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + goto exit; + } else { + rxe_counter_inc(rxe, + RXE_CNT_RNR_RETRY_EXCEEDED); + wqe->status = IB_WC_RNR_RETRY_EXC_ERR; + state = COMPST_ERROR; + } + break; + + case COMPST_ERROR: + WARN_ON_ONCE(wqe->status == IB_WC_SUCCESS); + do_complete(qp, wqe); + rxe_qp_error(qp); + + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + } + + goto exit; + } + } + +exit: + /* we come here if we are done with processing and want the task to + * exit from the loop calling us + */ + WARN_ON_ONCE(skb); + rxe_drop_ref(qp); + return -EAGAIN; + +done: + /* we come here if we have processed a packet we want the task to call + * us again to see if there is anything else to do + */ + WARN_ON_ONCE(skb); + rxe_drop_ref(qp); + return 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c new file mode 100644 index 000000000..43394c3f2 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ +#include <linux/vmalloc.h> +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, + int cqe, int comp_vector) +{ + int count; + + if (cqe <= 0) { + pr_warn("cqe(%d) <= 0\n", cqe); + goto err1; + } + + if (cqe > rxe->attr.max_cqe) { + pr_warn("cqe(%d) > max_cqe(%d)\n", + cqe, rxe->attr.max_cqe); + goto err1; + } + + if (cq) { + count = queue_count(cq->queue); + if (cqe < count) { + pr_warn("cqe(%d) < current # elements in queue (%d)", + cqe, count); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static void rxe_send_complete(struct tasklet_struct *t) +{ + struct rxe_cq *cq = from_tasklet(cq, t, comp_task); + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + if (cq->is_dying) { + spin_unlock_irqrestore(&cq->cq_lock, flags); + return; + } + spin_unlock_irqrestore(&cq->cq_lock, flags); + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, + int comp_vector, struct ib_udata *udata, + struct rxe_create_cq_resp __user *uresp) +{ + int err; + + cq->queue = rxe_queue_init(rxe, &cqe, + sizeof(struct rxe_cqe)); + if (!cq->queue) { + pr_warn("unable to create cq\n"); + return -ENOMEM; + } + + err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, + cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); + if (err) { + vfree(cq->queue->buf); + kfree(cq->queue); + return err; + } + + if (uresp) + cq->is_user = 1; + + cq->is_dying = false; + + tasklet_setup(&cq->comp_task, rxe_send_complete); + + spin_lock_init(&cq->cq_lock); + cq->ibcq.cqe = cqe; + return 0; +} + +int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe, + struct rxe_resize_cq_resp __user *uresp, + struct ib_udata *udata) +{ + int err; + + err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe, + sizeof(struct rxe_cqe), udata, + uresp ? &uresp->mi : NULL, NULL, &cq->cq_lock); + if (!err) + cq->ibcq.cqe = cqe; + + return err; +} + +int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) +{ + struct ib_event ev; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + + if (unlikely(queue_full(cq->queue))) { + spin_unlock_irqrestore(&cq->cq_lock, flags); + if (cq->ibcq.event_handler) { + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + + return -EBUSY; + } + + memcpy(producer_addr(cq->queue), cqe, sizeof(*cqe)); + + /* make sure all changes to the CQ are written before we update the + * producer pointer + */ + smp_wmb(); + + advance_producer(cq->queue); + spin_unlock_irqrestore(&cq->cq_lock, flags); + + if ((cq->notify == IB_CQ_NEXT_COMP) || + (cq->notify == IB_CQ_SOLICITED && solicited)) { + cq->notify = 0; + tasklet_schedule(&cq->comp_task); + } + + return 0; +} + +void rxe_cq_disable(struct rxe_cq *cq) +{ + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + cq->is_dying = true; + spin_unlock_irqrestore(&cq->cq_lock, flags); +} + +void rxe_cq_cleanup(struct rxe_pool_entry *arg) +{ + struct rxe_cq *cq = container_of(arg, typeof(*cq), pelem); + + if (cq->queue) + rxe_queue_cleanup(cq->queue); +} diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h new file mode 100644 index 000000000..3b483b75d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -0,0 +1,933 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_HDR_H +#define RXE_HDR_H + +/* extracted information about a packet carried in an sk_buff struct fits in + * the skbuff cb array. Must be at most 48 bytes. stored in control block of + * sk_buff for received packets. + */ +struct rxe_pkt_info { + struct rxe_dev *rxe; /* device that owns packet */ + struct rxe_qp *qp; /* qp that owns packet */ + struct rxe_send_wqe *wqe; /* send wqe */ + u8 *hdr; /* points to bth */ + u32 mask; /* useful info about pkt */ + u32 psn; /* bth psn of packet */ + u16 pkey_index; /* partition of pkt */ + u16 paylen; /* length of bth - icrc */ + u8 port_num; /* port pkt received on */ + u8 opcode; /* bth opcode of packet */ + u8 offset; /* bth offset from pkt->hdr */ +}; + +/* Macros should be used only for received skb */ +static inline struct rxe_pkt_info *SKB_TO_PKT(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct rxe_pkt_info) > sizeof(skb->cb)); + return (void *)skb->cb; +} + +static inline struct sk_buff *PKT_TO_SKB(struct rxe_pkt_info *pkt) +{ + return container_of((void *)pkt, struct sk_buff, cb); +} + +/* + * IBA header types and methods + * + * Some of these are for reference and completeness only since + * rxe does not currently support RD transport + * most of this could be moved into IB core. ib_pack.h has + * part of this but is incomplete + * + * Header specific routines to insert/extract values to/from headers + * the routines that are named __hhh_(set_)fff() take a pointer to a + * hhh header and get(set) the fff field. The routines named + * hhh_(set_)fff take a packet info struct and find the + * header and field based on the opcode in the packet. + * Conversion to/from network byte order from cpu order is also done. + */ + +#define RXE_ICRC_SIZE (4) +#define RXE_MAX_HDR_LENGTH (80) + +/****************************************************************************** + * Base Transport Header + ******************************************************************************/ +struct rxe_bth { + u8 opcode; + u8 flags; + __be16 pkey; + __be32 qpn; + __be32 apsn; +}; + +#define BTH_TVER (0) +#define BTH_DEF_PKEY (0xffff) + +#define BTH_SE_MASK (0x80) +#define BTH_MIG_MASK (0x40) +#define BTH_PAD_MASK (0x30) +#define BTH_TVER_MASK (0x0f) +#define BTH_FECN_MASK (0x80000000) +#define BTH_BECN_MASK (0x40000000) +#define BTH_RESV6A_MASK (0x3f000000) +#define BTH_QPN_MASK (0x00ffffff) +#define BTH_ACK_MASK (0x80000000) +#define BTH_RESV7_MASK (0x7f000000) +#define BTH_PSN_MASK (0x00ffffff) + +static inline u8 __bth_opcode(void *arg) +{ + struct rxe_bth *bth = arg; + + return bth->opcode; +} + +static inline void __bth_set_opcode(void *arg, u8 opcode) +{ + struct rxe_bth *bth = arg; + + bth->opcode = opcode; +} + +static inline u8 __bth_se(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (BTH_SE_MASK & bth->flags); +} + +static inline void __bth_set_se(void *arg, int se) +{ + struct rxe_bth *bth = arg; + + if (se) + bth->flags |= BTH_SE_MASK; + else + bth->flags &= ~BTH_SE_MASK; +} + +static inline u8 __bth_mig(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (BTH_MIG_MASK & bth->flags); +} + +static inline void __bth_set_mig(void *arg, u8 mig) +{ + struct rxe_bth *bth = arg; + + if (mig) + bth->flags |= BTH_MIG_MASK; + else + bth->flags &= ~BTH_MIG_MASK; +} + +static inline u8 __bth_pad(void *arg) +{ + struct rxe_bth *bth = arg; + + return (BTH_PAD_MASK & bth->flags) >> 4; +} + +static inline void __bth_set_pad(void *arg, u8 pad) +{ + struct rxe_bth *bth = arg; + + bth->flags = (BTH_PAD_MASK & (pad << 4)) | + (~BTH_PAD_MASK & bth->flags); +} + +static inline u8 __bth_tver(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_TVER_MASK & bth->flags; +} + +static inline void __bth_set_tver(void *arg, u8 tver) +{ + struct rxe_bth *bth = arg; + + bth->flags = (BTH_TVER_MASK & tver) | + (~BTH_TVER_MASK & bth->flags); +} + +static inline u16 __bth_pkey(void *arg) +{ + struct rxe_bth *bth = arg; + + return be16_to_cpu(bth->pkey); +} + +static inline void __bth_set_pkey(void *arg, u16 pkey) +{ + struct rxe_bth *bth = arg; + + bth->pkey = cpu_to_be16(pkey); +} + +static inline u32 __bth_qpn(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_QPN_MASK & be32_to_cpu(bth->qpn); +} + +static inline void __bth_set_qpn(void *arg, u32 qpn) +{ + struct rxe_bth *bth = arg; + u32 resvqpn = be32_to_cpu(bth->qpn); + + bth->qpn = cpu_to_be32((BTH_QPN_MASK & qpn) | + (~BTH_QPN_MASK & resvqpn)); +} + +static inline int __bth_fecn(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_FECN_MASK) & bth->qpn); +} + +static inline void __bth_set_fecn(void *arg, int fecn) +{ + struct rxe_bth *bth = arg; + + if (fecn) + bth->qpn |= cpu_to_be32(BTH_FECN_MASK); + else + bth->qpn &= ~cpu_to_be32(BTH_FECN_MASK); +} + +static inline int __bth_becn(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_BECN_MASK) & bth->qpn); +} + +static inline void __bth_set_becn(void *arg, int becn) +{ + struct rxe_bth *bth = arg; + + if (becn) + bth->qpn |= cpu_to_be32(BTH_BECN_MASK); + else + bth->qpn &= ~cpu_to_be32(BTH_BECN_MASK); +} + +static inline u8 __bth_resv6a(void *arg) +{ + struct rxe_bth *bth = arg; + + return (BTH_RESV6A_MASK & be32_to_cpu(bth->qpn)) >> 24; +} + +static inline void __bth_set_resv6a(void *arg) +{ + struct rxe_bth *bth = arg; + + bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK); +} + +static inline int __bth_ack(void *arg) +{ + struct rxe_bth *bth = arg; + + return 0 != (cpu_to_be32(BTH_ACK_MASK) & bth->apsn); +} + +static inline void __bth_set_ack(void *arg, int ack) +{ + struct rxe_bth *bth = arg; + + if (ack) + bth->apsn |= cpu_to_be32(BTH_ACK_MASK); + else + bth->apsn &= ~cpu_to_be32(BTH_ACK_MASK); +} + +static inline void __bth_set_resv7(void *arg) +{ + struct rxe_bth *bth = arg; + + bth->apsn &= ~cpu_to_be32(BTH_RESV7_MASK); +} + +static inline u32 __bth_psn(void *arg) +{ + struct rxe_bth *bth = arg; + + return BTH_PSN_MASK & be32_to_cpu(bth->apsn); +} + +static inline void __bth_set_psn(void *arg, u32 psn) +{ + struct rxe_bth *bth = arg; + u32 apsn = be32_to_cpu(bth->apsn); + + bth->apsn = cpu_to_be32((BTH_PSN_MASK & psn) | + (~BTH_PSN_MASK & apsn)); +} + +static inline u8 bth_opcode(struct rxe_pkt_info *pkt) +{ + return __bth_opcode(pkt->hdr + pkt->offset); +} + +static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode) +{ + __bth_set_opcode(pkt->hdr + pkt->offset, opcode); +} + +static inline u8 bth_se(struct rxe_pkt_info *pkt) +{ + return __bth_se(pkt->hdr + pkt->offset); +} + +static inline void bth_set_se(struct rxe_pkt_info *pkt, int se) +{ + __bth_set_se(pkt->hdr + pkt->offset, se); +} + +static inline u8 bth_mig(struct rxe_pkt_info *pkt) +{ + return __bth_mig(pkt->hdr + pkt->offset); +} + +static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig) +{ + __bth_set_mig(pkt->hdr + pkt->offset, mig); +} + +static inline u8 bth_pad(struct rxe_pkt_info *pkt) +{ + return __bth_pad(pkt->hdr + pkt->offset); +} + +static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad) +{ + __bth_set_pad(pkt->hdr + pkt->offset, pad); +} + +static inline u8 bth_tver(struct rxe_pkt_info *pkt) +{ + return __bth_tver(pkt->hdr + pkt->offset); +} + +static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver) +{ + __bth_set_tver(pkt->hdr + pkt->offset, tver); +} + +static inline u16 bth_pkey(struct rxe_pkt_info *pkt) +{ + return __bth_pkey(pkt->hdr + pkt->offset); +} + +static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey) +{ + __bth_set_pkey(pkt->hdr + pkt->offset, pkey); +} + +static inline u32 bth_qpn(struct rxe_pkt_info *pkt) +{ + return __bth_qpn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn) +{ + __bth_set_qpn(pkt->hdr + pkt->offset, qpn); +} + +static inline int bth_fecn(struct rxe_pkt_info *pkt) +{ + return __bth_fecn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn) +{ + __bth_set_fecn(pkt->hdr + pkt->offset, fecn); +} + +static inline int bth_becn(struct rxe_pkt_info *pkt) +{ + return __bth_becn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn) +{ + __bth_set_becn(pkt->hdr + pkt->offset, becn); +} + +static inline u8 bth_resv6a(struct rxe_pkt_info *pkt) +{ + return __bth_resv6a(pkt->hdr + pkt->offset); +} + +static inline void bth_set_resv6a(struct rxe_pkt_info *pkt) +{ + __bth_set_resv6a(pkt->hdr + pkt->offset); +} + +static inline int bth_ack(struct rxe_pkt_info *pkt) +{ + return __bth_ack(pkt->hdr + pkt->offset); +} + +static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack) +{ + __bth_set_ack(pkt->hdr + pkt->offset, ack); +} + +static inline void bth_set_resv7(struct rxe_pkt_info *pkt) +{ + __bth_set_resv7(pkt->hdr + pkt->offset); +} + +static inline u32 bth_psn(struct rxe_pkt_info *pkt) +{ + return __bth_psn(pkt->hdr + pkt->offset); +} + +static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn) +{ + __bth_set_psn(pkt->hdr + pkt->offset, psn); +} + +static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se, + int mig, int pad, u16 pkey, u32 qpn, int ack_req, + u32 psn) +{ + struct rxe_bth *bth = (struct rxe_bth *)(pkt->hdr + pkt->offset); + + bth->opcode = opcode; + bth->flags = (pad << 4) & BTH_PAD_MASK; + if (se) + bth->flags |= BTH_SE_MASK; + if (mig) + bth->flags |= BTH_MIG_MASK; + bth->pkey = cpu_to_be16(pkey); + bth->qpn = cpu_to_be32(qpn & BTH_QPN_MASK); + psn &= BTH_PSN_MASK; + if (ack_req) + psn |= BTH_ACK_MASK; + bth->apsn = cpu_to_be32(psn); +} + +/****************************************************************************** + * Reliable Datagram Extended Transport Header + ******************************************************************************/ +struct rxe_rdeth { + __be32 een; +}; + +#define RDETH_EEN_MASK (0x00ffffff) + +static inline u8 __rdeth_een(void *arg) +{ + struct rxe_rdeth *rdeth = arg; + + return RDETH_EEN_MASK & be32_to_cpu(rdeth->een); +} + +static inline void __rdeth_set_een(void *arg, u32 een) +{ + struct rxe_rdeth *rdeth = arg; + + rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een); +} + +static inline u8 rdeth_een(struct rxe_pkt_info *pkt) +{ + return __rdeth_een(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RDETH]); +} + +static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een) +{ + __rdeth_set_een(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RDETH], een); +} + +/****************************************************************************** + * Datagram Extended Transport Header + ******************************************************************************/ +struct rxe_deth { + __be32 qkey; + __be32 sqp; +}; + +#define GSI_QKEY (0x80010000) +#define DETH_SQP_MASK (0x00ffffff) + +static inline u32 __deth_qkey(void *arg) +{ + struct rxe_deth *deth = arg; + + return be32_to_cpu(deth->qkey); +} + +static inline void __deth_set_qkey(void *arg, u32 qkey) +{ + struct rxe_deth *deth = arg; + + deth->qkey = cpu_to_be32(qkey); +} + +static inline u32 __deth_sqp(void *arg) +{ + struct rxe_deth *deth = arg; + + return DETH_SQP_MASK & be32_to_cpu(deth->sqp); +} + +static inline void __deth_set_sqp(void *arg, u32 sqp) +{ + struct rxe_deth *deth = arg; + + deth->sqp = cpu_to_be32(DETH_SQP_MASK & sqp); +} + +static inline u32 deth_qkey(struct rxe_pkt_info *pkt) +{ + return __deth_qkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH]); +} + +static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey) +{ + __deth_set_qkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH], qkey); +} + +static inline u32 deth_sqp(struct rxe_pkt_info *pkt) +{ + return __deth_sqp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH]); +} + +static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp) +{ + __deth_set_sqp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_DETH], sqp); +} + +/****************************************************************************** + * RDMA Extended Transport Header + ******************************************************************************/ +struct rxe_reth { + __be64 va; + __be32 rkey; + __be32 len; +}; + +static inline u64 __reth_va(void *arg) +{ + struct rxe_reth *reth = arg; + + return be64_to_cpu(reth->va); +} + +static inline void __reth_set_va(void *arg, u64 va) +{ + struct rxe_reth *reth = arg; + + reth->va = cpu_to_be64(va); +} + +static inline u32 __reth_rkey(void *arg) +{ + struct rxe_reth *reth = arg; + + return be32_to_cpu(reth->rkey); +} + +static inline void __reth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_reth *reth = arg; + + reth->rkey = cpu_to_be32(rkey); +} + +static inline u32 __reth_len(void *arg) +{ + struct rxe_reth *reth = arg; + + return be32_to_cpu(reth->len); +} + +static inline void __reth_set_len(void *arg, u32 len) +{ + struct rxe_reth *reth = arg; + + reth->len = cpu_to_be32(len); +} + +static inline u64 reth_va(struct rxe_pkt_info *pkt) +{ + return __reth_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va) +{ + __reth_set_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], va); +} + +static inline u32 reth_rkey(struct rxe_pkt_info *pkt) +{ + return __reth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __reth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], rkey); +} + +static inline u32 reth_len(struct rxe_pkt_info *pkt) +{ + return __reth_len(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH]); +} + +static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len) +{ + __reth_set_len(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_RETH], len); +} + +/****************************************************************************** + * Atomic Extended Transport Header + ******************************************************************************/ +struct rxe_atmeth { + __be64 va; + __be32 rkey; + __be64 swap_add; + __be64 comp; +} __packed; + +static inline u64 __atmeth_va(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->va); +} + +static inline void __atmeth_set_va(void *arg, u64 va) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->va = cpu_to_be64(va); +} + +static inline u32 __atmeth_rkey(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be32_to_cpu(atmeth->rkey); +} + +static inline void __atmeth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->rkey = cpu_to_be32(rkey); +} + +static inline u64 __atmeth_swap_add(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->swap_add); +} + +static inline void __atmeth_set_swap_add(void *arg, u64 swap_add) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->swap_add = cpu_to_be64(swap_add); +} + +static inline u64 __atmeth_comp(void *arg) +{ + struct rxe_atmeth *atmeth = arg; + + return be64_to_cpu(atmeth->comp); +} + +static inline void __atmeth_set_comp(void *arg, u64 comp) +{ + struct rxe_atmeth *atmeth = arg; + + atmeth->comp = cpu_to_be64(comp); +} + +static inline u64 atmeth_va(struct rxe_pkt_info *pkt) +{ + return __atmeth_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va) +{ + __atmeth_set_va(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], va); +} + +static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt) +{ + return __atmeth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __atmeth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], rkey); +} + +static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt) +{ + return __atmeth_swap_add(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add) +{ + __atmeth_set_swap_add(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], swap_add); +} + +static inline u64 atmeth_comp(struct rxe_pkt_info *pkt) +{ + return __atmeth_comp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH]); +} + +static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp) +{ + __atmeth_set_comp(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMETH], comp); +} + +/****************************************************************************** + * Ack Extended Transport Header + ******************************************************************************/ +struct rxe_aeth { + __be32 smsn; +}; + +#define AETH_SYN_MASK (0xff000000) +#define AETH_MSN_MASK (0x00ffffff) + +enum aeth_syndrome { + AETH_TYPE_MASK = 0xe0, + AETH_ACK = 0x00, + AETH_RNR_NAK = 0x20, + AETH_RSVD = 0x40, + AETH_NAK = 0x60, + AETH_ACK_UNLIMITED = 0x1f, + AETH_NAK_PSN_SEQ_ERROR = 0x60, + AETH_NAK_INVALID_REQ = 0x61, + AETH_NAK_REM_ACC_ERR = 0x62, + AETH_NAK_REM_OP_ERR = 0x63, + AETH_NAK_INV_RD_REQ = 0x64, +}; + +static inline u8 __aeth_syn(void *arg) +{ + struct rxe_aeth *aeth = arg; + + return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24; +} + +static inline void __aeth_set_syn(void *arg, u8 syn) +{ + struct rxe_aeth *aeth = arg; + u32 smsn = be32_to_cpu(aeth->smsn); + + aeth->smsn = cpu_to_be32((AETH_SYN_MASK & (syn << 24)) | + (~AETH_SYN_MASK & smsn)); +} + +static inline u32 __aeth_msn(void *arg) +{ + struct rxe_aeth *aeth = arg; + + return AETH_MSN_MASK & be32_to_cpu(aeth->smsn); +} + +static inline void __aeth_set_msn(void *arg, u32 msn) +{ + struct rxe_aeth *aeth = arg; + u32 smsn = be32_to_cpu(aeth->smsn); + + aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) | + (~AETH_MSN_MASK & smsn)); +} + +static inline u8 aeth_syn(struct rxe_pkt_info *pkt) +{ + return __aeth_syn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH]); +} + +static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn) +{ + __aeth_set_syn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH], syn); +} + +static inline u32 aeth_msn(struct rxe_pkt_info *pkt) +{ + return __aeth_msn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH]); +} + +static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn) +{ + __aeth_set_msn(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_AETH], msn); +} + +/****************************************************************************** + * Atomic Ack Extended Transport Header + ******************************************************************************/ +struct rxe_atmack { + __be64 orig; +}; + +static inline u64 __atmack_orig(void *arg) +{ + struct rxe_atmack *atmack = arg; + + return be64_to_cpu(atmack->orig); +} + +static inline void __atmack_set_orig(void *arg, u64 orig) +{ + struct rxe_atmack *atmack = arg; + + atmack->orig = cpu_to_be64(orig); +} + +static inline u64 atmack_orig(struct rxe_pkt_info *pkt) +{ + return __atmack_orig(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMACK]); +} + +static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig) +{ + __atmack_set_orig(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig); +} + +/****************************************************************************** + * Immediate Extended Transport Header + ******************************************************************************/ +struct rxe_immdt { + __be32 imm; +}; + +static inline __be32 __immdt_imm(void *arg) +{ + struct rxe_immdt *immdt = arg; + + return immdt->imm; +} + +static inline void __immdt_set_imm(void *arg, __be32 imm) +{ + struct rxe_immdt *immdt = arg; + + immdt->imm = imm; +} + +static inline __be32 immdt_imm(struct rxe_pkt_info *pkt) +{ + return __immdt_imm(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IMMDT]); +} + +static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm) +{ + __immdt_set_imm(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm); +} + +/****************************************************************************** + * Invalidate Extended Transport Header + ******************************************************************************/ +struct rxe_ieth { + __be32 rkey; +}; + +static inline u32 __ieth_rkey(void *arg) +{ + struct rxe_ieth *ieth = arg; + + return be32_to_cpu(ieth->rkey); +} + +static inline void __ieth_set_rkey(void *arg, u32 rkey) +{ + struct rxe_ieth *ieth = arg; + + ieth->rkey = cpu_to_be32(rkey); +} + +static inline u32 ieth_rkey(struct rxe_pkt_info *pkt) +{ + return __ieth_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IETH]); +} + +static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey) +{ + __ieth_set_rkey(pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey); +} + +enum rxe_hdr_length { + RXE_BTH_BYTES = sizeof(struct rxe_bth), + RXE_DETH_BYTES = sizeof(struct rxe_deth), + RXE_IMMDT_BYTES = sizeof(struct rxe_immdt), + RXE_RETH_BYTES = sizeof(struct rxe_reth), + RXE_AETH_BYTES = sizeof(struct rxe_aeth), + RXE_ATMACK_BYTES = sizeof(struct rxe_atmack), + RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth), + RXE_IETH_BYTES = sizeof(struct rxe_ieth), + RXE_RDETH_BYTES = sizeof(struct rxe_rdeth), +}; + +static inline size_t header_size(struct rxe_pkt_info *pkt) +{ + return pkt->offset + rxe_opcode[pkt->opcode].length; +} + +static inline void *payload_addr(struct rxe_pkt_info *pkt) +{ + return pkt->hdr + pkt->offset + + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]; +} + +static inline size_t payload_size(struct rxe_pkt_info *pkt) +{ + return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD] + - bth_pad(pkt) - RXE_ICRC_SIZE; +} + +#endif /* RXE_HDR_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c new file mode 100644 index 000000000..ac9154f05 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + */ + +#include "rxe.h" +#include "rxe_hw_counters.h" + +static const char * const rxe_counter_name[] = { + [RXE_CNT_SENT_PKTS] = "sent_pkts", + [RXE_CNT_RCVD_PKTS] = "rcvd_pkts", + [RXE_CNT_DUP_REQ] = "duplicate_request", + [RXE_CNT_OUT_OF_SEQ_REQ] = "out_of_seq_request", + [RXE_CNT_RCV_RNR] = "rcvd_rnr_err", + [RXE_CNT_SND_RNR] = "send_rnr_err", + [RXE_CNT_RCV_SEQ_ERR] = "rcvd_seq_err", + [RXE_CNT_COMPLETER_SCHED] = "ack_deferred", + [RXE_CNT_RETRY_EXCEEDED] = "retry_exceeded_err", + [RXE_CNT_RNR_RETRY_EXCEEDED] = "retry_rnr_exceeded_err", + [RXE_CNT_COMP_RETRY] = "completer_retry_err", + [RXE_CNT_SEND_ERR] = "send_err", + [RXE_CNT_LINK_DOWNED] = "link_downed", + [RXE_CNT_RDMA_SEND] = "rdma_sends", + [RXE_CNT_RDMA_RECV] = "rdma_recvs", +}; + +int rxe_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index) +{ + struct rxe_dev *dev = to_rdev(ibdev); + unsigned int cnt; + + if (!port || !stats) + return -EINVAL; + + for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_name); cnt++) + stats->value[cnt] = atomic64_read(&dev->stats_counters[cnt]); + + return ARRAY_SIZE(rxe_counter_name); +} + +struct rdma_hw_stats *rxe_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(rxe_counter_name) != RXE_NUM_OF_COUNTERS); + /* We support only per port stats */ + if (!port_num) + return NULL; + + return rdma_alloc_hw_stats_struct(rxe_counter_name, + ARRAY_SIZE(rxe_counter_name), + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.h b/drivers/infiniband/sw/rxe/rxe_hw_counters.h new file mode 100644 index 000000000..49ee6f966 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved. + */ + +#ifndef RXE_HW_COUNTERS_H +#define RXE_HW_COUNTERS_H + +/* + * when adding counters to enum also add + * them to rxe_counter_name[] vector. + */ +enum rxe_counters { + RXE_CNT_SENT_PKTS, + RXE_CNT_RCVD_PKTS, + RXE_CNT_DUP_REQ, + RXE_CNT_OUT_OF_SEQ_REQ, + RXE_CNT_RCV_RNR, + RXE_CNT_SND_RNR, + RXE_CNT_RCV_SEQ_ERR, + RXE_CNT_COMPLETER_SCHED, + RXE_CNT_RETRY_EXCEEDED, + RXE_CNT_RNR_RETRY_EXCEEDED, + RXE_CNT_COMP_RETRY, + RXE_CNT_SEND_ERR, + RXE_CNT_LINK_DOWNED, + RXE_CNT_RDMA_SEND, + RXE_CNT_RDMA_RECV, + RXE_NUM_OF_COUNTERS +}; + +struct rdma_hw_stats *rxe_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num); +int rxe_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index); +#endif /* RXE_HW_COUNTERS_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c new file mode 100644 index 000000000..66b2aad54 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* Compute a partial ICRC for all the IB transport headers. */ +u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + unsigned int bth_offset = 0; + struct iphdr *ip4h = NULL; + struct ipv6hdr *ip6h = NULL; + struct udphdr *udph; + struct rxe_bth *bth; + int crc; + int length; + int hdr_size = sizeof(struct udphdr) + + (skb->protocol == htons(ETH_P_IP) ? + sizeof(struct iphdr) : sizeof(struct ipv6hdr)); + /* pseudo header buffer size is calculate using ipv6 header size since + * it is bigger than ipv4 + */ + u8 pshdr[sizeof(struct udphdr) + + sizeof(struct ipv6hdr) + + RXE_BTH_BYTES]; + + /* This seed is the result of computing a CRC with a seed of + * 0xfffffff and 8 bytes of 0xff representing a masked LRH. + */ + crc = 0xdebb20e3; + + if (skb->protocol == htons(ETH_P_IP)) { /* IPv4 */ + memcpy(pshdr, ip_hdr(skb), hdr_size); + ip4h = (struct iphdr *)pshdr; + udph = (struct udphdr *)(ip4h + 1); + + ip4h->ttl = 0xff; + ip4h->check = CSUM_MANGLED_0; + ip4h->tos = 0xff; + } else { /* IPv6 */ + memcpy(pshdr, ipv6_hdr(skb), hdr_size); + ip6h = (struct ipv6hdr *)pshdr; + udph = (struct udphdr *)(ip6h + 1); + + memset(ip6h->flow_lbl, 0xff, sizeof(ip6h->flow_lbl)); + ip6h->priority = 0xf; + ip6h->hop_limit = 0xff; + } + udph->check = CSUM_MANGLED_0; + + bth_offset += hdr_size; + + memcpy(&pshdr[bth_offset], pkt->hdr, RXE_BTH_BYTES); + bth = (struct rxe_bth *)&pshdr[bth_offset]; + + /* exclude bth.resv8a */ + bth->qpn |= cpu_to_be32(~BTH_QPN_MASK); + + length = hdr_size + RXE_BTH_BYTES; + crc = rxe_crc32(pkt->rxe, crc, pshdr, length); + + /* And finish to compute the CRC on the remainder of the headers. */ + crc = rxe_crc32(pkt->rxe, crc, pkt->hdr + RXE_BTH_BYTES, + rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES); + return crc; +} diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h new file mode 100644 index 000000000..0d758760b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -0,0 +1,262 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_LOC_H +#define RXE_LOC_H + +/* rxe_av.c */ +void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av); + +int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr); + +void rxe_av_from_attr(u8 port_num, struct rxe_av *av, + struct rdma_ah_attr *attr); + +void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr); + +void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr); + +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt); + +/* rxe_cq.c */ +int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, + int cqe, int comp_vector); + +int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, + int comp_vector, struct ib_udata *udata, + struct rxe_create_cq_resp __user *uresp); + +int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe, + struct rxe_resize_cq_resp __user *uresp, + struct ib_udata *udata); + +int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited); + +void rxe_cq_disable(struct rxe_cq *cq); + +void rxe_cq_cleanup(struct rxe_pool_entry *arg); + +/* rxe_mcast.c */ +int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, + struct rxe_mc_grp **grp_p); + +int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + struct rxe_mc_grp *grp); + +int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + union ib_gid *mgid); + +void rxe_drop_all_mcast_groups(struct rxe_qp *qp); + +void rxe_mc_cleanup(struct rxe_pool_entry *arg); + +/* rxe_mmap.c */ +struct rxe_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + struct kref ref; + void *obj; + + struct mminfo info; +}; + +void rxe_mmap_release(struct kref *ref); + +struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev, u32 size, + struct ib_udata *udata, void *obj); + +int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +/* rxe_mr.c */ +enum copy_direction { + to_mem_obj, + from_mem_obj, +}; + +void rxe_mem_init_dma(struct rxe_pd *pd, + int access, struct rxe_mem *mem); + +int rxe_mem_init_user(struct rxe_pd *pd, u64 start, + u64 length, u64 iova, int access, struct ib_udata *udata, + struct rxe_mem *mr); + +int rxe_mem_init_fast(struct rxe_pd *pd, + int max_pages, struct rxe_mem *mem); + +int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, + int length, enum copy_direction dir, u32 *crcp); + +int copy_data(struct rxe_pd *pd, int access, + struct rxe_dma_info *dma, void *addr, int length, + enum copy_direction dir, u32 *crcp); + +void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length); + +enum lookup_type { + lookup_local, + lookup_remote, +}; + +struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key, + enum lookup_type type); + +int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length); + +void rxe_mem_cleanup(struct rxe_pool_entry *arg); + +int advance_dma_data(struct rxe_dma_info *dma, unsigned int length); + +/* rxe_net.c */ +void rxe_loopback(struct sk_buff *skb); +int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb); +struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, + int paylen, struct rxe_pkt_info *pkt); +int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc); +const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num); +struct device *rxe_dma_device(struct rxe_dev *rxe); +int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid); +int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid); + +/* rxe_qp.c */ +int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init); + +int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, + struct ib_qp_init_attr *init, + struct rxe_create_qp_resp __user *uresp, + struct ib_pd *ibpd, struct ib_udata *udata); + +int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init); + +int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_attr *attr, int mask); + +int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata); + +int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask); + +void rxe_qp_error(struct rxe_qp *qp); + +void rxe_qp_destroy(struct rxe_qp *qp); + +void rxe_qp_cleanup(struct rxe_pool_entry *arg); + +static inline int qp_num(struct rxe_qp *qp) +{ + return qp->ibqp.qp_num; +} + +static inline enum ib_qp_type qp_type(struct rxe_qp *qp) +{ + return qp->ibqp.qp_type; +} + +static inline enum ib_qp_state qp_state(struct rxe_qp *qp) +{ + return qp->attr.qp_state; +} + +static inline int qp_mtu(struct rxe_qp *qp) +{ + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) + return qp->attr.path_mtu; + else + return IB_MTU_4096; +} + +static inline int rcv_wqe_size(int max_sge) +{ + return sizeof(struct rxe_recv_wqe) + + max_sge * sizeof(struct ib_sge); +} + +void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res); + +static inline void rxe_advance_resp_resource(struct rxe_qp *qp) +{ + qp->resp.res_head++; + if (unlikely(qp->resp.res_head == qp->attr.max_dest_rd_atomic)) + qp->resp.res_head = 0; +} + +void retransmit_timer(struct timer_list *t); +void rnr_nak_timer(struct timer_list *t); + +/* rxe_srq.c */ +#define IB_SRQ_INIT_MASK (~IB_SRQ_LIMIT) + +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask); + +int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_init_attr *init, struct ib_udata *udata, + struct rxe_create_srq_resp __user *uresp); + +int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, + struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata); + +void rxe_dealloc(struct ib_device *ib_dev); + +int rxe_completer(void *arg); +int rxe_requester(void *arg); +int rxe_responder(void *arg); + +u32 rxe_icrc_hdr(struct rxe_pkt_info *pkt, struct sk_buff *skb); + +void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb); + +void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb); + +static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) +{ + return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type]; +} + +static inline int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + struct sk_buff *skb) +{ + int err; + int is_request = pkt->mask & RXE_REQ_MASK; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + if ((is_request && (qp->req.state != QP_STATE_READY)) || + (!is_request && (qp->resp.state != QP_STATE_READY))) { + pr_info("Packet dropped. QP is not in ready state\n"); + goto drop; + } + + if (pkt->mask & RXE_LOOPBACK_MASK) { + memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); + rxe_loopback(skb); + err = 0; + } else { + err = rxe_send(pkt, skb); + } + + if (err) { + rxe->xmit_errors++; + rxe_counter_inc(rxe, RXE_CNT_SEND_ERR); + return err; + } + + if ((qp_type(qp) != IB_QPT_RC) && + (pkt->mask & RXE_END_MASK)) { + pkt->wqe->state = wqe_state_done; + rxe_run_task(&qp->comp.task, 1); + } + + rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); + goto done; + +drop: + kfree_skb(skb); + err = 0; +done: + return err; +} + +#endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c new file mode 100644 index 000000000..c02315aed --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, + struct rxe_mc_grp **grp_p) +{ + int err; + struct rxe_mc_grp *grp; + + if (rxe->attr.max_mcast_qp_attach == 0) { + err = -EINVAL; + goto err1; + } + + grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid); + if (grp) + goto done; + + grp = rxe_alloc(&rxe->mc_grp_pool); + if (!grp) { + err = -ENOMEM; + goto err1; + } + + INIT_LIST_HEAD(&grp->qp_list); + spin_lock_init(&grp->mcg_lock); + grp->rxe = rxe; + + rxe_add_key(grp, mgid); + + err = rxe_mcast_add(rxe, mgid); + if (err) + goto err2; + +done: + *grp_p = grp; + return 0; + +err2: + rxe_drop_ref(grp); +err1: + return err; +} + +int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + struct rxe_mc_grp *grp) +{ + int err; + struct rxe_mc_elem *elem; + + /* check to see of the qp is already a member of the group */ + spin_lock_bh(&qp->grp_lock); + spin_lock_bh(&grp->mcg_lock); + list_for_each_entry(elem, &grp->qp_list, qp_list) { + if (elem->qp == qp) { + err = 0; + goto out; + } + } + + if (grp->num_qp >= rxe->attr.max_mcast_qp_attach) { + err = -ENOMEM; + goto out; + } + + elem = rxe_alloc(&rxe->mc_elem_pool); + if (!elem) { + err = -ENOMEM; + goto out; + } + + /* each qp holds a ref on the grp */ + rxe_add_ref(grp); + + grp->num_qp++; + elem->qp = qp; + elem->grp = grp; + + list_add(&elem->qp_list, &grp->qp_list); + list_add(&elem->grp_list, &qp->grp_list); + + err = 0; +out: + spin_unlock_bh(&grp->mcg_lock); + spin_unlock_bh(&qp->grp_lock); + return err; +} + +int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + union ib_gid *mgid) +{ + struct rxe_mc_grp *grp; + struct rxe_mc_elem *elem, *tmp; + + grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid); + if (!grp) + goto err1; + + spin_lock_bh(&qp->grp_lock); + spin_lock_bh(&grp->mcg_lock); + + list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) { + if (elem->qp == qp) { + list_del(&elem->qp_list); + list_del(&elem->grp_list); + grp->num_qp--; + + spin_unlock_bh(&grp->mcg_lock); + spin_unlock_bh(&qp->grp_lock); + rxe_drop_ref(elem); + rxe_drop_ref(grp); /* ref held by QP */ + rxe_drop_ref(grp); /* ref from get_key */ + return 0; + } + } + + spin_unlock_bh(&grp->mcg_lock); + spin_unlock_bh(&qp->grp_lock); + rxe_drop_ref(grp); /* ref from get_key */ +err1: + return -EINVAL; +} + +void rxe_drop_all_mcast_groups(struct rxe_qp *qp) +{ + struct rxe_mc_grp *grp; + struct rxe_mc_elem *elem; + + while (1) { + spin_lock_bh(&qp->grp_lock); + if (list_empty(&qp->grp_list)) { + spin_unlock_bh(&qp->grp_lock); + break; + } + elem = list_first_entry(&qp->grp_list, struct rxe_mc_elem, + grp_list); + list_del(&elem->grp_list); + spin_unlock_bh(&qp->grp_lock); + + grp = elem->grp; + spin_lock_bh(&grp->mcg_lock); + list_del(&elem->qp_list); + grp->num_qp--; + spin_unlock_bh(&grp->mcg_lock); + rxe_drop_ref(grp); + rxe_drop_ref(elem); + } +} + +void rxe_mc_cleanup(struct rxe_pool_entry *arg) +{ + struct rxe_mc_grp *grp = container_of(arg, typeof(*grp), pelem); + struct rxe_dev *rxe = grp->rxe; + + rxe_drop_key(grp); + rxe_mcast_delete(rxe, &grp->mgid); +} diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c new file mode 100644 index 000000000..035f226af --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mmap.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> +#include <linux/errno.h> +#include <rdma/uverbs_ioctl.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +void rxe_mmap_release(struct kref *ref) +{ + struct rxe_mmap_info *ip = container_of(ref, + struct rxe_mmap_info, ref); + struct rxe_dev *rxe = to_rdev(ip->context->device); + + spin_lock_bh(&rxe->pending_lock); + + if (!list_empty(&ip->pending_mmaps)) + list_del(&ip->pending_mmaps); + + spin_unlock_bh(&rxe->pending_lock); + + vfree(ip->obj); /* buf */ + kfree(ip); +} + +/* + * open and close keep track of how many times the memory region is mapped, + * to avoid releasing it. + */ +static void rxe_vma_open(struct vm_area_struct *vma) +{ + struct rxe_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void rxe_vma_close(struct vm_area_struct *vma) +{ + struct rxe_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, rxe_mmap_release); +} + +static const struct vm_operations_struct rxe_vm_ops = { + .open = rxe_vma_open, + .close = rxe_vma_close, +}; + +/** + * rxe_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct rxe_dev *rxe = to_rdev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct rxe_mmap_info *ip, *pp; + int ret; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_bh(&rxe->pending_lock); + list_for_each_entry_safe(ip, pp, &rxe->pending_mmaps, pending_mmaps) { + if (context != ip->context || (__u64)offset != ip->info.offset) + continue; + + /* Don't allow a mmap larger than the object. */ + if (size > ip->info.size) { + pr_err("mmap region is larger than the object!\n"); + spin_unlock_bh(&rxe->pending_lock); + ret = -EINVAL; + goto done; + } + + goto found_it; + } + pr_warn("unable to find pending mmap info\n"); + spin_unlock_bh(&rxe->pending_lock); + ret = -EINVAL; + goto done; + +found_it: + list_del_init(&ip->pending_mmaps); + spin_unlock_bh(&rxe->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) { + pr_err("err %d from remap_vmalloc_range\n", ret); + goto done; + } + + vma->vm_ops = &rxe_vm_ops; + vma->vm_private_data = ip; + rxe_vma_open(vma); +done: + return ret; +} + +/* + * Allocate information for rxe_mmap + */ +struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe, u32 size, + struct ib_udata *udata, void *obj) +{ + struct rxe_mmap_info *ip; + + if (!udata) + return ERR_PTR(-EINVAL); + + ip = kmalloc(sizeof(*ip), GFP_KERNEL); + if (!ip) + return ERR_PTR(-ENOMEM); + + size = PAGE_ALIGN(size); + + spin_lock_bh(&rxe->mmap_offset_lock); + + if (rxe->mmap_offset == 0) + rxe->mmap_offset = ALIGN(PAGE_SIZE, SHMLBA); + + ip->info.offset = rxe->mmap_offset; + rxe->mmap_offset += ALIGN(size, SHMLBA); + + spin_unlock_bh(&rxe->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->info.size = size; + ip->context = + container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + ip->obj = obj; + kref_init(&ip->ref); + + return ip; +} diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c new file mode 100644 index 000000000..026285f7f --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -0,0 +1,556 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* + * lfsr (linear feedback shift register) with period 255 + */ +static u8 rxe_get_key(void) +{ + static u32 key = 1; + + key = key << 1; + + key |= (0 != (key & 0x100)) ^ (0 != (key & 0x10)) + ^ (0 != (key & 0x80)) ^ (0 != (key & 0x40)); + + key &= 0xff; + + return key; +} + +int mem_check_range(struct rxe_mem *mem, u64 iova, size_t length) +{ + switch (mem->type) { + case RXE_MEM_TYPE_DMA: + return 0; + + case RXE_MEM_TYPE_MR: + case RXE_MEM_TYPE_FMR: + if (iova < mem->iova || + length > mem->length || + iova > mem->iova + mem->length - length) + return -EFAULT; + return 0; + + default: + return -EFAULT; + } +} + +#define IB_ACCESS_REMOTE (IB_ACCESS_REMOTE_READ \ + | IB_ACCESS_REMOTE_WRITE \ + | IB_ACCESS_REMOTE_ATOMIC) + +static void rxe_mem_init(int access, struct rxe_mem *mem) +{ + u32 lkey = mem->pelem.index << 8 | rxe_get_key(); + u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0; + + mem->ibmr.lkey = lkey; + mem->ibmr.rkey = rkey; + mem->state = RXE_MEM_STATE_INVALID; + mem->type = RXE_MEM_TYPE_NONE; + mem->map_shift = ilog2(RXE_BUF_PER_MAP); +} + +void rxe_mem_cleanup(struct rxe_pool_entry *arg) +{ + struct rxe_mem *mem = container_of(arg, typeof(*mem), pelem); + int i; + + ib_umem_release(mem->umem); + + if (mem->map) { + for (i = 0; i < mem->num_map; i++) + kfree(mem->map[i]); + + kfree(mem->map); + } +} + +static int rxe_mem_alloc(struct rxe_mem *mem, int num_buf) +{ + int i; + int num_map; + struct rxe_map **map = mem->map; + + num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP; + + mem->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL); + if (!mem->map) + goto err1; + + for (i = 0; i < num_map; i++) { + mem->map[i] = kmalloc(sizeof(**map), GFP_KERNEL); + if (!mem->map[i]) + goto err2; + } + + BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP)); + + mem->map_shift = ilog2(RXE_BUF_PER_MAP); + mem->map_mask = RXE_BUF_PER_MAP - 1; + + mem->num_buf = num_buf; + mem->num_map = num_map; + mem->max_buf = num_map * RXE_BUF_PER_MAP; + + return 0; + +err2: + for (i--; i >= 0; i--) + kfree(mem->map[i]); + + kfree(mem->map); +err1: + return -ENOMEM; +} + +void rxe_mem_init_dma(struct rxe_pd *pd, + int access, struct rxe_mem *mem) +{ + rxe_mem_init(access, mem); + + mem->ibmr.pd = &pd->ibpd; + mem->access = access; + mem->state = RXE_MEM_STATE_VALID; + mem->type = RXE_MEM_TYPE_DMA; +} + +int rxe_mem_init_user(struct rxe_pd *pd, u64 start, + u64 length, u64 iova, int access, struct ib_udata *udata, + struct rxe_mem *mem) +{ + struct rxe_map **map; + struct rxe_phys_buf *buf = NULL; + struct ib_umem *umem; + struct sg_page_iter sg_iter; + int num_buf; + void *vaddr; + int err; + + umem = ib_umem_get(pd->ibpd.device, start, length, access); + if (IS_ERR(umem)) { + pr_warn("err %d from rxe_umem_get\n", + (int)PTR_ERR(umem)); + err = PTR_ERR(umem); + goto err1; + } + + mem->umem = umem; + num_buf = ib_umem_num_pages(umem); + + rxe_mem_init(access, mem); + + err = rxe_mem_alloc(mem, num_buf); + if (err) { + pr_warn("err %d from rxe_mem_alloc\n", err); + ib_umem_release(umem); + goto err1; + } + + mem->page_shift = PAGE_SHIFT; + mem->page_mask = PAGE_SIZE - 1; + + num_buf = 0; + map = mem->map; + if (length > 0) { + buf = map[0]->buf; + + for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { + if (num_buf >= RXE_BUF_PER_MAP) { + map++; + buf = map[0]->buf; + num_buf = 0; + } + + vaddr = page_address(sg_page_iter_page(&sg_iter)); + if (!vaddr) { + pr_warn("null vaddr\n"); + ib_umem_release(umem); + err = -ENOMEM; + goto err1; + } + + buf->addr = (uintptr_t)vaddr; + buf->size = PAGE_SIZE; + num_buf++; + buf++; + + } + } + + mem->ibmr.pd = &pd->ibpd; + mem->umem = umem; + mem->access = access; + mem->length = length; + mem->iova = iova; + mem->va = start; + mem->offset = ib_umem_offset(umem); + mem->state = RXE_MEM_STATE_VALID; + mem->type = RXE_MEM_TYPE_MR; + + return 0; + +err1: + return err; +} + +int rxe_mem_init_fast(struct rxe_pd *pd, + int max_pages, struct rxe_mem *mem) +{ + int err; + + rxe_mem_init(0, mem); + + /* In fastreg, we also set the rkey */ + mem->ibmr.rkey = mem->ibmr.lkey; + + err = rxe_mem_alloc(mem, max_pages); + if (err) + goto err1; + + mem->ibmr.pd = &pd->ibpd; + mem->max_buf = max_pages; + mem->state = RXE_MEM_STATE_FREE; + mem->type = RXE_MEM_TYPE_MR; + + return 0; + +err1: + return err; +} + +static void lookup_iova( + struct rxe_mem *mem, + u64 iova, + int *m_out, + int *n_out, + size_t *offset_out) +{ + size_t offset = iova - mem->iova + mem->offset; + int map_index; + int buf_index; + u64 length; + + if (likely(mem->page_shift)) { + *offset_out = offset & mem->page_mask; + offset >>= mem->page_shift; + *n_out = offset & mem->map_mask; + *m_out = offset >> mem->map_shift; + } else { + map_index = 0; + buf_index = 0; + + length = mem->map[map_index]->buf[buf_index].size; + + while (offset >= length) { + offset -= length; + buf_index++; + + if (buf_index == RXE_BUF_PER_MAP) { + map_index++; + buf_index = 0; + } + length = mem->map[map_index]->buf[buf_index].size; + } + + *m_out = map_index; + *n_out = buf_index; + *offset_out = offset; + } +} + +void *iova_to_vaddr(struct rxe_mem *mem, u64 iova, int length) +{ + size_t offset; + int m, n; + void *addr; + + if (mem->state != RXE_MEM_STATE_VALID) { + pr_warn("mem not in valid state\n"); + addr = NULL; + goto out; + } + + if (!mem->map) { + addr = (void *)(uintptr_t)iova; + goto out; + } + + if (mem_check_range(mem, iova, length)) { + pr_warn("range violation\n"); + addr = NULL; + goto out; + } + + lookup_iova(mem, iova, &m, &n, &offset); + + if (offset + length > mem->map[m]->buf[n].size) { + pr_warn("crosses page boundary\n"); + addr = NULL; + goto out; + } + + addr = (void *)(uintptr_t)mem->map[m]->buf[n].addr + offset; + +out: + return addr; +} + +/* copy data from a range (vaddr, vaddr+length-1) to or from + * a mem object starting at iova. Compute incremental value of + * crc32 if crcp is not zero. caller must hold a reference to mem + */ +int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length, + enum copy_direction dir, u32 *crcp) +{ + int err; + int bytes; + u8 *va; + struct rxe_map **map; + struct rxe_phys_buf *buf; + int m; + int i; + size_t offset; + u32 crc = crcp ? (*crcp) : 0; + + if (length == 0) + return 0; + + if (mem->type == RXE_MEM_TYPE_DMA) { + u8 *src, *dest; + + src = (dir == to_mem_obj) ? + addr : ((void *)(uintptr_t)iova); + + dest = (dir == to_mem_obj) ? + ((void *)(uintptr_t)iova) : addr; + + memcpy(dest, src, length); + + if (crcp) + *crcp = rxe_crc32(to_rdev(mem->ibmr.device), + *crcp, dest, length); + + return 0; + } + + WARN_ON_ONCE(!mem->map); + + err = mem_check_range(mem, iova, length); + if (err) { + err = -EFAULT; + goto err1; + } + + lookup_iova(mem, iova, &m, &i, &offset); + + map = mem->map + m; + buf = map[0]->buf + i; + + while (length > 0) { + u8 *src, *dest; + + va = (u8 *)(uintptr_t)buf->addr + offset; + src = (dir == to_mem_obj) ? addr : va; + dest = (dir == to_mem_obj) ? va : addr; + + bytes = buf->size - offset; + + if (bytes > length) + bytes = length; + + memcpy(dest, src, bytes); + + if (crcp) + crc = rxe_crc32(to_rdev(mem->ibmr.device), + crc, dest, bytes); + + length -= bytes; + addr += bytes; + + offset = 0; + buf++; + i++; + + if (i == RXE_BUF_PER_MAP) { + i = 0; + map++; + buf = map[0]->buf; + } + } + + if (crcp) + *crcp = crc; + + return 0; + +err1: + return err; +} + +/* copy data in or out of a wqe, i.e. sg list + * under the control of a dma descriptor + */ +int copy_data( + struct rxe_pd *pd, + int access, + struct rxe_dma_info *dma, + void *addr, + int length, + enum copy_direction dir, + u32 *crcp) +{ + int bytes; + struct rxe_sge *sge = &dma->sge[dma->cur_sge]; + int offset = dma->sge_offset; + int resid = dma->resid; + struct rxe_mem *mem = NULL; + u64 iova; + int err; + + if (length == 0) + return 0; + + if (length > resid) { + err = -EINVAL; + goto err2; + } + + if (sge->length && (offset < sge->length)) { + mem = lookup_mem(pd, access, sge->lkey, lookup_local); + if (!mem) { + err = -EINVAL; + goto err1; + } + } + + while (length > 0) { + bytes = length; + + if (offset >= sge->length) { + if (mem) { + rxe_drop_ref(mem); + mem = NULL; + } + sge++; + dma->cur_sge++; + offset = 0; + + if (dma->cur_sge >= dma->num_sge) { + err = -ENOSPC; + goto err2; + } + + if (sge->length) { + mem = lookup_mem(pd, access, sge->lkey, + lookup_local); + if (!mem) { + err = -EINVAL; + goto err1; + } + } else { + continue; + } + } + + if (bytes > sge->length - offset) + bytes = sge->length - offset; + + if (bytes > 0) { + iova = sge->addr + offset; + + err = rxe_mem_copy(mem, iova, addr, bytes, dir, crcp); + if (err) + goto err2; + + offset += bytes; + resid -= bytes; + length -= bytes; + addr += bytes; + } + } + + dma->sge_offset = offset; + dma->resid = resid; + + if (mem) + rxe_drop_ref(mem); + + return 0; + +err2: + if (mem) + rxe_drop_ref(mem); +err1: + return err; +} + +int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) +{ + struct rxe_sge *sge = &dma->sge[dma->cur_sge]; + int offset = dma->sge_offset; + int resid = dma->resid; + + while (length) { + unsigned int bytes; + + if (offset >= sge->length) { + sge++; + dma->cur_sge++; + offset = 0; + if (dma->cur_sge >= dma->num_sge) + return -ENOSPC; + } + + bytes = length; + + if (bytes > sge->length - offset) + bytes = sge->length - offset; + + offset += bytes; + resid -= bytes; + length -= bytes; + } + + dma->sge_offset = offset; + dma->resid = resid; + + return 0; +} + +/* (1) find the mem (mr or mw) corresponding to lkey/rkey + * depending on lookup_type + * (2) verify that the (qp) pd matches the mem pd + * (3) verify that the mem can support the requested access + * (4) verify that mem state is valid + */ +struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key, + enum lookup_type type) +{ + struct rxe_mem *mem; + struct rxe_dev *rxe = to_rdev(pd->ibpd.device); + int index = key >> 8; + + mem = rxe_pool_get_index(&rxe->mr_pool, index); + if (!mem) + return NULL; + + if (unlikely((type == lookup_local && mr_lkey(mem) != key) || + (type == lookup_remote && mr_rkey(mem) != key) || + mr_pd(mem) != pd || + (access && !(access & mem->access)) || + mem->state != RXE_MEM_STATE_VALID)) { + rxe_drop_ref(mem); + mem = NULL; + } + + return mem; +} diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c new file mode 100644 index 000000000..c071d5b1b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -0,0 +1,660 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> +#include <linux/if_arp.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <linux/if_vlan.h> +#include <net/udp_tunnel.h> +#include <net/sch_generic.h> +#include <linux/netfilter.h> +#include <rdma/ib_addr.h> + +#include "rxe.h" +#include "rxe_net.h" +#include "rxe_loc.h" + +static struct rxe_recv_sockets recv_sockets; + +int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) +{ + int err; + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + err = dev_mc_add(rxe->ndev, ll_addr); + + return err; +} + +int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) +{ + int err; + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + err = dev_mc_del(rxe->ndev, ll_addr); + + return err; +} + +static struct dst_entry *rxe_find_route4(struct net_device *ndev, + struct in_addr *saddr, + struct in_addr *daddr) +{ + struct rtable *rt; + struct flowi4 fl = { { 0 } }; + + memset(&fl, 0, sizeof(fl)); + fl.flowi4_oif = ndev->ifindex; + memcpy(&fl.saddr, saddr, sizeof(*saddr)); + memcpy(&fl.daddr, daddr, sizeof(*daddr)); + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(&init_net, &fl); + if (IS_ERR(rt)) { + pr_err_ratelimited("no route to %pI4\n", &daddr->s_addr); + return NULL; + } + + return &rt->dst; +} + +#if IS_ENABLED(CONFIG_IPV6) +static struct dst_entry *rxe_find_route6(struct net_device *ndev, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + struct dst_entry *ndst; + struct flowi6 fl6 = { { 0 } }; + + memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = ndev->ifindex; + memcpy(&fl6.saddr, saddr, sizeof(*saddr)); + memcpy(&fl6.daddr, daddr, sizeof(*daddr)); + fl6.flowi6_proto = IPPROTO_UDP; + + ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk), + recv_sockets.sk6->sk, &fl6, + NULL); + if (IS_ERR(ndst)) { + pr_err_ratelimited("no route to %pI6\n", daddr); + return NULL; + } + + if (unlikely(ndst->error)) { + pr_err("no route to %pI6\n", daddr); + goto put; + } + + return ndst; +put: + dst_release(ndst); + return NULL; +} + +#else + +static struct dst_entry *rxe_find_route6(struct net_device *ndev, + struct in6_addr *saddr, + struct in6_addr *daddr) +{ + return NULL; +} + +#endif + +static struct dst_entry *rxe_find_route(struct net_device *ndev, + struct rxe_qp *qp, + struct rxe_av *av) +{ + struct dst_entry *dst = NULL; + + if (qp_type(qp) == IB_QPT_RC) + dst = sk_dst_get(qp->sk->sk); + + if (!dst || !dst_check(dst, qp->dst_cookie)) { + if (dst) + dst_release(dst); + + if (av->network_type == RXE_NETWORK_TYPE_IPV4) { + struct in_addr *saddr; + struct in_addr *daddr; + + saddr = &av->sgid_addr._sockaddr_in.sin_addr; + daddr = &av->dgid_addr._sockaddr_in.sin_addr; + dst = rxe_find_route4(ndev, saddr, daddr); + } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) { + struct in6_addr *saddr6; + struct in6_addr *daddr6; + + saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; + daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; + dst = rxe_find_route6(ndev, saddr6, daddr6); +#if IS_ENABLED(CONFIG_IPV6) + if (dst) + qp->dst_cookie = + rt6_get_cookie((struct rt6_info *)dst); +#endif + } + + if (dst && (qp_type(qp) == IB_QPT_RC)) { + dst_hold(dst); + sk_dst_set(qp->sk->sk, dst); + } + } + return dst; +} + +static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct udphdr *udph; + struct net_device *ndev = skb->dev; + struct net_device *rdev = ndev; + struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + if (!rxe && is_vlan_dev(rdev)) { + rdev = vlan_dev_real_dev(ndev); + rxe = rxe_get_dev_from_net(rdev); + } + if (!rxe) + goto drop; + + if (skb_linearize(skb)) { + pr_err("skb_linearize failed\n"); + ib_device_put(&rxe->ib_dev); + goto drop; + } + + udph = udp_hdr(skb); + pkt->rxe = rxe; + pkt->port_num = 1; + pkt->hdr = (u8 *)(udph + 1); + pkt->mask = RXE_GRH_MASK; + pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); + + rxe_rcv(skb); + + /* + * FIXME: this is in the wrong place, it needs to be done when pkt is + * destroyed + */ + ib_device_put(&rxe->ib_dev); + + return 0; +drop: + kfree_skb(skb); + + return 0; +} + +static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, + bool ipv6) +{ + int err; + struct socket *sock; + struct udp_port_cfg udp_cfg = { }; + struct udp_tunnel_sock_cfg tnl_cfg = { }; + + if (ipv6) { + udp_cfg.family = AF_INET6; + udp_cfg.ipv6_v6only = 1; + } else { + udp_cfg.family = AF_INET; + } + + udp_cfg.local_udp_port = port; + + /* Create UDP socket */ + err = udp_sock_create(net, &udp_cfg, &sock); + if (err < 0) + return ERR_PTR(err); + + tnl_cfg.encap_type = 1; + tnl_cfg.encap_rcv = rxe_udp_encap_recv; + + /* Setup UDP tunnel */ + setup_udp_tunnel_sock(net, sock, &tnl_cfg); + + return sock; +} + +static void rxe_release_udp_tunnel(struct socket *sk) +{ + if (sk) + udp_tunnel_sock_release(sk); +} + +static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port, + __be16 dst_port) +{ + struct udphdr *udph; + + __skb_push(skb, sizeof(*udph)); + skb_reset_transport_header(skb); + udph = udp_hdr(skb); + + udph->dest = dst_port; + udph->source = src_port; + udph->len = htons(skb->len); + udph->check = 0; +} + +static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb, + __be32 saddr, __be32 daddr, __u8 proto, + __u8 tos, __u8 ttl, __be16 df, bool xnet) +{ + struct iphdr *iph; + + skb_scrub_packet(skb, xnet); + + skb_clear_hash(skb); + skb_dst_set(skb, dst_clone(dst)); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + + iph->version = IPVERSION; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = proto; + iph->tos = tos; + iph->daddr = daddr; + iph->saddr = saddr; + iph->ttl = ttl; + __ip_select_ident(dev_net(dst->dev), iph, + skb_shinfo(skb)->gso_segs ?: 1); + iph->tot_len = htons(skb->len); + ip_send_check(iph); +} + +static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, + struct in6_addr *saddr, struct in6_addr *daddr, + __u8 proto, __u8 prio, __u8 ttl) +{ + struct ipv6hdr *ip6h; + + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED + | IPSKB_REROUTED); + skb_dst_set(skb, dst_clone(dst)); + + __skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, prio, htonl(0)); + ip6h->payload_len = htons(skb->len); + ip6h->nexthdr = proto; + ip6h->hop_limit = ttl; + ip6h->daddr = *daddr; + ip6h->saddr = *saddr; + ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); +} + +static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + struct rxe_qp *qp = pkt->qp; + struct dst_entry *dst; + bool xnet = false; + __be16 df = htons(IP_DF); + struct rxe_av *av = rxe_get_av(pkt); + struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; + struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; + + dst = rxe_find_route(skb->dev, qp, av); + if (!dst) { + pr_err("Host not reachable\n"); + return -EHOSTUNREACH; + } + + prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), + cpu_to_be16(ROCE_V2_UDP_DPORT)); + + prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, + av->grh.traffic_class, av->grh.hop_limit, df, xnet); + + dst_release(dst); + return 0; +} + +static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + struct rxe_qp *qp = pkt->qp; + struct dst_entry *dst; + struct rxe_av *av = rxe_get_av(pkt); + struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; + struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; + + dst = rxe_find_route(skb->dev, qp, av); + if (!dst) { + pr_err("Host not reachable\n"); + return -EHOSTUNREACH; + } + + prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), + cpu_to_be16(ROCE_V2_UDP_DPORT)); + + prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, + av->grh.traffic_class, + av->grh.hop_limit); + + dst_release(dst); + return 0; +} + +int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc) +{ + int err = 0; + + if (skb->protocol == htons(ETH_P_IP)) + err = prepare4(pkt, skb); + else if (skb->protocol == htons(ETH_P_IPV6)) + err = prepare6(pkt, skb); + + *crc = rxe_icrc_hdr(pkt, skb); + + if (ether_addr_equal(skb->dev->dev_addr, rxe_get_av(pkt)->dmac)) + pkt->mask |= RXE_LOOPBACK_MASK; + + return err; +} + +static void rxe_skb_tx_dtor(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct rxe_qp *qp = sk->sk_user_data; + int skb_out = atomic_dec_return(&qp->skb_out); + + if (unlikely(qp->need_req_skb && + skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) + rxe_run_task(&qp->req.task, 1); + + rxe_drop_ref(qp); +} + +int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + int err; + + skb->destructor = rxe_skb_tx_dtor; + skb->sk = pkt->qp->sk->sk; + + rxe_add_ref(pkt->qp); + atomic_inc(&pkt->qp->skb_out); + + if (skb->protocol == htons(ETH_P_IP)) { + err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); + } else { + pr_err("Unknown layer 3 protocol: %d\n", skb->protocol); + atomic_dec(&pkt->qp->skb_out); + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + return -EINVAL; + } + + if (unlikely(net_xmit_eval(err))) { + pr_debug("error sending packet: %d\n", err); + return -EAGAIN; + } + + return 0; +} + +void rxe_loopback(struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) + skb_pull(skb, sizeof(struct iphdr)); + else + skb_pull(skb, sizeof(struct ipv6hdr)); + + rxe_rcv(skb); +} + +struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, + int paylen, struct rxe_pkt_info *pkt) +{ + unsigned int hdr_len; + struct sk_buff *skb = NULL; + struct net_device *ndev; + const struct ib_gid_attr *attr; + const int port_num = 1; + + attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index); + if (IS_ERR(attr)) + return NULL; + + if (av->network_type == RXE_NETWORK_TYPE_IPV4) + hdr_len = ETH_HLEN + sizeof(struct udphdr) + + sizeof(struct iphdr); + else + hdr_len = ETH_HLEN + sizeof(struct udphdr) + + sizeof(struct ipv6hdr); + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(attr); + if (IS_ERR(ndev)) { + rcu_read_unlock(); + goto out; + } + skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev), + GFP_ATOMIC); + + if (unlikely(!skb)) { + rcu_read_unlock(); + goto out; + } + + skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev)); + + /* FIXME: hold reference to this netdev until life of this skb. */ + skb->dev = ndev; + rcu_read_unlock(); + + if (av->network_type == RXE_NETWORK_TYPE_IPV4) + skb->protocol = htons(ETH_P_IP); + else + skb->protocol = htons(ETH_P_IPV6); + + pkt->rxe = rxe; + pkt->port_num = port_num; + pkt->hdr = skb_put_zero(skb, paylen); + pkt->mask |= RXE_GRH_MASK; + +out: + rdma_put_gid_attr(attr); + return skb; +} + +/* + * this is required by rxe_cfg to match rxe devices in + * /sys/class/infiniband up with their underlying ethernet devices + */ +const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) +{ + return rxe->ndev->name; +} + +int rxe_net_add(const char *ibdev_name, struct net_device *ndev) +{ + int err; + struct rxe_dev *rxe = NULL; + + rxe = ib_alloc_device(rxe_dev, ib_dev); + if (!rxe) + return -ENOMEM; + + rxe->ndev = ndev; + + err = rxe_add(rxe, ndev->mtu, ibdev_name); + if (err) { + ib_dealloc_device(&rxe->ib_dev); + return err; + } + + return 0; +} + +static void rxe_port_event(struct rxe_dev *rxe, + enum ib_event_type event) +{ + struct ib_event ev; + + ev.device = &rxe->ib_dev; + ev.element.port_num = 1; + ev.event = event; + + ib_dispatch_event(&ev); +} + +/* Caller must hold net_info_lock */ +void rxe_port_up(struct rxe_dev *rxe) +{ + struct rxe_port *port; + + port = &rxe->port; + port->attr.state = IB_PORT_ACTIVE; + + rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); + dev_info(&rxe->ib_dev.dev, "set active\n"); +} + +/* Caller must hold net_info_lock */ +void rxe_port_down(struct rxe_dev *rxe) +{ + struct rxe_port *port; + + port = &rxe->port; + port->attr.state = IB_PORT_DOWN; + + rxe_port_event(rxe, IB_EVENT_PORT_ERR); + rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); + dev_info(&rxe->ib_dev.dev, "set down\n"); +} + +void rxe_set_port_state(struct rxe_dev *rxe) +{ + if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) + rxe_port_up(rxe); + else + rxe_port_down(rxe); +} + +static int rxe_notify(struct notifier_block *not_blk, + unsigned long event, + void *arg) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(arg); + struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); + + if (!rxe) + return NOTIFY_OK; + + switch (event) { + case NETDEV_UNREGISTER: + ib_unregister_device_queued(&rxe->ib_dev); + break; + case NETDEV_UP: + rxe_port_up(rxe); + break; + case NETDEV_DOWN: + rxe_port_down(rxe); + break; + case NETDEV_CHANGEMTU: + pr_info("%s changed mtu to %d\n", ndev->name, ndev->mtu); + rxe_set_mtu(rxe, ndev->mtu); + break; + case NETDEV_CHANGE: + rxe_set_port_state(rxe); + break; + case NETDEV_REBOOT: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + default: + pr_info("ignoring netdev event = %ld for %s\n", + event, ndev->name); + break; + } + + ib_device_put(&rxe->ib_dev); + return NOTIFY_OK; +} + +static struct notifier_block rxe_net_notifier = { + .notifier_call = rxe_notify, +}; + +static int rxe_net_ipv4_init(void) +{ + recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, + htons(ROCE_V2_UDP_DPORT), false); + if (IS_ERR(recv_sockets.sk4)) { + recv_sockets.sk4 = NULL; + pr_err("Failed to create IPv4 UDP tunnel\n"); + return -1; + } + + return 0; +} + +static int rxe_net_ipv6_init(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + + recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, + htons(ROCE_V2_UDP_DPORT), true); + if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) { + recv_sockets.sk6 = NULL; + pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n"); + return 0; + } + + if (IS_ERR(recv_sockets.sk6)) { + recv_sockets.sk6 = NULL; + pr_err("Failed to create IPv6 UDP tunnel\n"); + return -1; + } +#endif + return 0; +} + +void rxe_net_exit(void) +{ + rxe_release_udp_tunnel(recv_sockets.sk6); + rxe_release_udp_tunnel(recv_sockets.sk4); + unregister_netdevice_notifier(&rxe_net_notifier); +} + +int rxe_net_init(void) +{ + int err; + + recv_sockets.sk6 = NULL; + + err = rxe_net_ipv4_init(); + if (err) + return err; + err = rxe_net_ipv6_init(); + if (err) + goto err_out; + err = register_netdevice_notifier(&rxe_net_notifier); + if (err) { + pr_err("Failed to register netdev notifier\n"); + goto err_out; + } + return 0; +err_out: + rxe_net_exit(); + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h new file mode 100644 index 000000000..45d80d00f --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_NET_H +#define RXE_NET_H + +#include <net/sock.h> +#include <net/if_inet6.h> +#include <linux/module.h> + +struct rxe_recv_sockets { + struct socket *sk4; + struct socket *sk6; +}; + +int rxe_net_add(const char *ibdev_name, struct net_device *ndev); + +int rxe_net_init(void); +void rxe_net_exit(void); + +#endif /* RXE_NET_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c new file mode 100644 index 000000000..66ffb516b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <rdma/ib_pack.h> +#include "rxe_opcode.h" +#include "rxe_hdr.h" + +/* useful information about work request opcodes and pkt opcodes in + * table form + */ +struct rxe_wr_opcode_info rxe_wr_opcode_info[] = { + [IB_WR_RDMA_WRITE] = { + .name = "IB_WR_RDMA_WRITE", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK, + }, + }, + [IB_WR_RDMA_WRITE_WITH_IMM] = { + .name = "IB_WR_RDMA_WRITE_WITH_IMM", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK, + }, + }, + [IB_WR_SEND] = { + .name = "IB_WR_SEND", + .mask = { + [IB_QPT_SMI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_SEND_WITH_IMM] = { + .name = "IB_WR_SEND_WITH_IMM", + .mask = { + [IB_QPT_SMI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_RDMA_READ] = { + .name = "IB_WR_RDMA_READ", + .mask = { + [IB_QPT_RC] = WR_READ_MASK, + }, + }, + [IB_WR_ATOMIC_CMP_AND_SWP] = { + .name = "IB_WR_ATOMIC_CMP_AND_SWP", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_MASK, + }, + }, + [IB_WR_ATOMIC_FETCH_AND_ADD] = { + .name = "IB_WR_ATOMIC_FETCH_AND_ADD", + .mask = { + [IB_QPT_RC] = WR_ATOMIC_MASK, + }, + }, + [IB_WR_LSO] = { + .name = "IB_WR_LSO", + .mask = { + /* not supported */ + }, + }, + [IB_WR_SEND_WITH_INV] = { + .name = "IB_WR_SEND_WITH_INV", + .mask = { + [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, + [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK, + }, + }, + [IB_WR_RDMA_READ_WITH_INV] = { + .name = "IB_WR_RDMA_READ_WITH_INV", + .mask = { + [IB_QPT_RC] = WR_READ_MASK, + }, + }, + [IB_WR_LOCAL_INV] = { + .name = "IB_WR_LOCAL_INV", + .mask = { + [IB_QPT_RC] = WR_REG_MASK, + }, + }, + [IB_WR_REG_MR] = { + .name = "IB_WR_REG_MR", + .mask = { + [IB_QPT_RC] = WR_REG_MASK, + }, + }, +}; + +struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = { + [IB_OPCODE_RC_SEND_FIRST] = { + .name = "IB_OPCODE_RC_SEND_FIRST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_MIDDLE] = { + .name = "IB_OPCODE_RC_SEND_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST] = { + .name = "IB_OPCODE_RC_SEND_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY] = { + .name = "IB_OPCODE_RC_SEND_ONLY", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_FIRST", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = { + .name = "IB_OPCODE_RC_RDMA_READ_REQUEST", + .mask = RXE_RETH_MASK | RXE_REQ_MASK | RXE_READ_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_ACK_MASK | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = { + .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY", + .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RC_ACKNOWLEDGE", + .mask = RXE_AETH_MASK | RXE_ACK_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE", + .mask = RXE_AETH_MASK | RXE_ATMACK_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_AETH] = RXE_BTH_BYTES, + [RXE_ATMACK] = RXE_BTH_BYTES + + RXE_AETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMACK_BYTES + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RC_COMPARE_SWAP] = { + .name = "IB_OPCODE_RC_COMPARE_SWAP", + .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_ATMETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES, + } + }, + [IB_OPCODE_RC_FETCH_ADD] = { + .name = "IB_OPCODE_RC_FETCH_ADD", + .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_ATMETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_ATMETH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = { + .name = "IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE", + .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IETH_BYTES, + } + }, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = { + .name = "IB_OPCODE_RC_SEND_ONLY_INV", + .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_END_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_IETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IETH_BYTES, + } + }, + + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = { + .name = "IB_OPCODE_UC_SEND_FIRST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_MIDDLE] = { + .name = "IB_OPCODE_UC_SEND_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_LAST] = { + .name = "IB_OPCODE_UC_SEND_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_SEND_ONLY] = { + .name = "IB_OPCODE_UC_SEND_ONLY", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK + | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_FIRST", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_MIDDLE", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_LAST", + .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_PAYLOAD] = RXE_BTH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_IMMDT] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY", + .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + + /* RD */ + [IB_OPCODE_RD_SEND_FIRST] = { + .name = "IB_OPCODE_RD_SEND_FIRST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_MIDDLE] = { + .name = "IB_OPCODE_RD_SEND_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_SEND_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_LAST] = { + .name = "IB_OPCODE_RD_SEND_LAST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_SEND_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_SEND_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_SEND_ONLY] = { + .name = "IB_OPCODE_RD_SEND_ONLY", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_FIRST] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_FIRST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_MIDDLE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_LAST] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_LAST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_ONLY] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_WRITE_MASK | RXE_START_MASK + | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_WRITE_MASK + | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES + + RXE_DETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES + + RXE_RETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_REQUEST] = { + .name = "IB_OPCODE_RD_RDMA_READ_REQUEST", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK + | RXE_REQ_MASK | RXE_READ_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_RETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RETH_BYTES + + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK + | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_START_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE", + .mask = RXE_RDETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK + | RXE_MIDDLE_MASK, + .length = RXE_BTH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK + | RXE_ACK_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY] = { + .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK + | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RD_ACKNOWLEDGE", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ACK_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE] = { + .name = "IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE", + .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ATMACK_MASK + | RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_AETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMACK] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_AETH_BYTES, + } + }, + [IB_OPCODE_RD_COMPARE_SWAP] = { + .name = "RD_COMPARE_SWAP", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK + | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + + RXE_ATMETH_BYTES + + RXE_DETH_BYTES + + + RXE_RDETH_BYTES, + } + }, + [IB_OPCODE_RD_FETCH_ADD] = { + .name = "IB_OPCODE_RD_FETCH_ADD", + .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK + | RXE_REQ_MASK | RXE_ATOMIC_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES + + RXE_RDETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_RDETH] = RXE_BTH_BYTES, + [RXE_DETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES, + [RXE_ATMETH] = RXE_BTH_BYTES + + RXE_RDETH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + + RXE_ATMETH_BYTES + + RXE_DETH_BYTES + + + RXE_RDETH_BYTES, + } + }, + + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = { + .name = "IB_OPCODE_UD_SEND_ONLY", + .mask = RXE_DETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK + | RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK + | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_DETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_DETH] = RXE_BTH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_DETH_BYTES, + } + }, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = { + .name = "IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE", + .mask = RXE_DETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK + | RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK + | RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK, + .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES, + .offset = { + [RXE_BTH] = 0, + [RXE_DETH] = RXE_BTH_BYTES, + [RXE_IMMDT] = RXE_BTH_BYTES + + RXE_DETH_BYTES, + [RXE_PAYLOAD] = RXE_BTH_BYTES + + RXE_DETH_BYTES + + RXE_IMMDT_BYTES, + } + }, + +}; diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h new file mode 100644 index 000000000..1041ac9a9 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_opcode.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_OPCODE_H +#define RXE_OPCODE_H + +/* + * contains header bit mask definitions and header lengths + * declaration of the rxe_opcode_info struct and + * rxe_wr_opcode_info struct + */ + +enum rxe_wr_mask { + WR_INLINE_MASK = BIT(0), + WR_ATOMIC_MASK = BIT(1), + WR_SEND_MASK = BIT(2), + WR_READ_MASK = BIT(3), + WR_WRITE_MASK = BIT(4), + WR_LOCAL_MASK = BIT(5), + WR_REG_MASK = BIT(6), + + WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK, + WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK, + WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK, + WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK, +}; + +#define WR_MAX_QPT (8) + +struct rxe_wr_opcode_info { + char *name; + enum rxe_wr_mask mask[WR_MAX_QPT]; +}; + +extern struct rxe_wr_opcode_info rxe_wr_opcode_info[]; + +enum rxe_hdr_type { + RXE_LRH, + RXE_GRH, + RXE_BTH, + RXE_RETH, + RXE_AETH, + RXE_ATMETH, + RXE_ATMACK, + RXE_IETH, + RXE_RDETH, + RXE_DETH, + RXE_IMMDT, + RXE_PAYLOAD, + NUM_HDR_TYPES +}; + +enum rxe_hdr_mask { + RXE_LRH_MASK = BIT(RXE_LRH), + RXE_GRH_MASK = BIT(RXE_GRH), + RXE_BTH_MASK = BIT(RXE_BTH), + RXE_IMMDT_MASK = BIT(RXE_IMMDT), + RXE_RETH_MASK = BIT(RXE_RETH), + RXE_AETH_MASK = BIT(RXE_AETH), + RXE_ATMETH_MASK = BIT(RXE_ATMETH), + RXE_ATMACK_MASK = BIT(RXE_ATMACK), + RXE_IETH_MASK = BIT(RXE_IETH), + RXE_RDETH_MASK = BIT(RXE_RDETH), + RXE_DETH_MASK = BIT(RXE_DETH), + RXE_PAYLOAD_MASK = BIT(RXE_PAYLOAD), + + RXE_REQ_MASK = BIT(NUM_HDR_TYPES + 0), + RXE_ACK_MASK = BIT(NUM_HDR_TYPES + 1), + RXE_SEND_MASK = BIT(NUM_HDR_TYPES + 2), + RXE_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), + RXE_READ_MASK = BIT(NUM_HDR_TYPES + 4), + RXE_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), + + RXE_RWR_MASK = BIT(NUM_HDR_TYPES + 6), + RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 7), + + RXE_START_MASK = BIT(NUM_HDR_TYPES + 8), + RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), + RXE_END_MASK = BIT(NUM_HDR_TYPES + 10), + + RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), + + RXE_READ_OR_ATOMIC = (RXE_READ_MASK | RXE_ATOMIC_MASK), + RXE_WRITE_OR_SEND = (RXE_WRITE_MASK | RXE_SEND_MASK), +}; + +#define OPCODE_NONE (-1) +#define RXE_NUM_OPCODE 256 + +struct rxe_opcode_info { + char *name; + enum rxe_hdr_mask mask; + int length; + int offset[NUM_HDR_TYPES]; +}; + +extern struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE]; + +#endif /* RXE_OPCODE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h new file mode 100644 index 000000000..dca86422b --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_PARAM_H +#define RXE_PARAM_H + +#include <uapi/rdma/rdma_user_rxe.h> + +static inline enum ib_mtu rxe_mtu_int_to_enum(int mtu) +{ + if (mtu < 256) + return 0; + else if (mtu < 512) + return IB_MTU_256; + else if (mtu < 1024) + return IB_MTU_512; + else if (mtu < 2048) + return IB_MTU_1024; + else if (mtu < 4096) + return IB_MTU_2048; + else + return IB_MTU_4096; +} + +/* Find the IB mtu for a given network MTU. */ +static inline enum ib_mtu eth_mtu_int_to_enum(int mtu) +{ + mtu -= RXE_MAX_HDR_LENGTH; + + return rxe_mtu_int_to_enum(mtu); +} + +/* default/initial rxe device parameter settings */ +enum rxe_device_param { + RXE_MAX_MR_SIZE = -1ull, + RXE_PAGE_SIZE_CAP = 0xfffff000, + RXE_MAX_QP = 0x10000, + RXE_MAX_QP_WR = 0x4000, + RXE_DEVICE_CAP_FLAGS = IB_DEVICE_BAD_PKEY_CNTR + | IB_DEVICE_BAD_QKEY_CNTR + | IB_DEVICE_AUTO_PATH_MIG + | IB_DEVICE_CHANGE_PHY_PORT + | IB_DEVICE_UD_AV_PORT_ENFORCE + | IB_DEVICE_PORT_ACTIVE_EVENT + | IB_DEVICE_SYS_IMAGE_GUID + | IB_DEVICE_RC_RNR_NAK_GEN + | IB_DEVICE_SRQ_RESIZE + | IB_DEVICE_MEM_MGT_EXTENSIONS + | IB_DEVICE_ALLOW_USER_UNREG, + RXE_MAX_SGE = 32, + RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) + + sizeof(struct ib_sge) * RXE_MAX_SGE, + RXE_MAX_INLINE_DATA = RXE_MAX_WQE_SIZE - + sizeof(struct rxe_send_wqe), + RXE_MAX_SGE_RD = 32, + RXE_MAX_CQ = 16384, + RXE_MAX_LOG_CQE = 15, + RXE_MAX_MR = 256 * 1024, + RXE_MAX_PD = 0x7ffc, + RXE_MAX_QP_RD_ATOM = 128, + RXE_MAX_RES_RD_ATOM = 0x3f000, + RXE_MAX_QP_INIT_RD_ATOM = 128, + RXE_MAX_MCAST_GRP = 8192, + RXE_MAX_MCAST_QP_ATTACH = 56, + RXE_MAX_TOT_MCAST_QP_ATTACH = 0x70000, + RXE_MAX_AH = 100, + RXE_MAX_SRQ = 960, + RXE_MAX_SRQ_WR = 0x4000, + RXE_MIN_SRQ_WR = 1, + RXE_MAX_SRQ_SGE = 27, + RXE_MIN_SRQ_SGE = 1, + RXE_MAX_FMR_PAGE_LIST_LEN = 512, + RXE_MAX_PKEYS = 1, + RXE_LOCAL_CA_ACK_DELAY = 15, + + RXE_MAX_UCONTEXT = 512, + + RXE_NUM_PORT = 1, + + RXE_MIN_QP_INDEX = 16, + RXE_MAX_QP_INDEX = 0x00020000, + + RXE_MIN_SRQ_INDEX = 0x00020001, + RXE_MAX_SRQ_INDEX = 0x00040000, + + RXE_MIN_MR_INDEX = 0x00000001, + RXE_MAX_MR_INDEX = 0x00040000, + RXE_MIN_MW_INDEX = 0x00040001, + RXE_MAX_MW_INDEX = 0x00060000, + RXE_MAX_PKT_PER_ACK = 64, + + RXE_MAX_UNACKED_PSNS = 128, + + /* Max inflight SKBs per queue pair */ + RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64, + RXE_INFLIGHT_SKBS_PER_QP_LOW = 16, + + /* Max number of interations of each tasklet + * before yielding the cpu to let other + * work make progress + */ + RXE_MAX_ITERATIONS = 1024, + + /* Delay before calling arbiter timer */ + RXE_NSEC_ARB_TIMER_DELAY = 200, + + /* IBTA v1.4 A3.3.1 VENDOR INFORMATION section */ + RXE_VENDOR_ID = 0XFFFFFF, +}; + +/* default/initial rxe port parameters */ +enum rxe_port_param { + RXE_PORT_GID_TBL_LEN = 1024, + RXE_PORT_PORT_CAP_FLAGS = IB_PORT_CM_SUP, + RXE_PORT_MAX_MSG_SZ = 0x800000, + RXE_PORT_BAD_PKEY_CNTR = 0, + RXE_PORT_QKEY_VIOL_CNTR = 0, + RXE_PORT_LID = 0, + RXE_PORT_SM_LID = 0, + RXE_PORT_SM_SL = 0, + RXE_PORT_LMC = 0, + RXE_PORT_MAX_VL_NUM = 1, + RXE_PORT_SUBNET_TIMEOUT = 0, + RXE_PORT_INIT_TYPE_REPLY = 0, + RXE_PORT_ACTIVE_WIDTH = IB_WIDTH_1X, + RXE_PORT_ACTIVE_SPEED = 1, + RXE_PORT_PKEY_TBL_LEN = 1, + RXE_PORT_PHYS_STATE = IB_PORT_PHYS_STATE_POLLING, + RXE_PORT_SUBNET_PREFIX = 0xfe80000000000000ULL, +}; + +/* default/initial port info parameters */ +enum rxe_port_info_param { + RXE_PORT_INFO_VL_CAP = 4, /* 1-8 */ + RXE_PORT_INFO_MTU_CAP = 5, /* 4096 */ + RXE_PORT_INFO_OPER_VL = 1, /* 1 */ +}; + +#endif /* RXE_PARAM_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c new file mode 100644 index 000000000..b374eb53e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -0,0 +1,459 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include "rxe.h" +#include "rxe_loc.h" + +/* info about object pools + * note that mr and mw share a single index space + * so that one can map an lkey to the correct type of object + */ +struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { + [RXE_TYPE_UC] = { + .name = "rxe-uc", + .size = sizeof(struct rxe_ucontext), + .flags = RXE_POOL_NO_ALLOC, + }, + [RXE_TYPE_PD] = { + .name = "rxe-pd", + .size = sizeof(struct rxe_pd), + .flags = RXE_POOL_NO_ALLOC, + }, + [RXE_TYPE_AH] = { + .name = "rxe-ah", + .size = sizeof(struct rxe_ah), + .flags = RXE_POOL_ATOMIC | RXE_POOL_NO_ALLOC, + }, + [RXE_TYPE_SRQ] = { + .name = "rxe-srq", + .size = sizeof(struct rxe_srq), + .flags = RXE_POOL_INDEX | RXE_POOL_NO_ALLOC, + .min_index = RXE_MIN_SRQ_INDEX, + .max_index = RXE_MAX_SRQ_INDEX, + }, + [RXE_TYPE_QP] = { + .name = "rxe-qp", + .size = sizeof(struct rxe_qp), + .cleanup = rxe_qp_cleanup, + .flags = RXE_POOL_INDEX, + .min_index = RXE_MIN_QP_INDEX, + .max_index = RXE_MAX_QP_INDEX, + }, + [RXE_TYPE_CQ] = { + .name = "rxe-cq", + .size = sizeof(struct rxe_cq), + .flags = RXE_POOL_NO_ALLOC, + .cleanup = rxe_cq_cleanup, + }, + [RXE_TYPE_MR] = { + .name = "rxe-mr", + .size = sizeof(struct rxe_mem), + .cleanup = rxe_mem_cleanup, + .flags = RXE_POOL_INDEX, + .max_index = RXE_MAX_MR_INDEX, + .min_index = RXE_MIN_MR_INDEX, + }, + [RXE_TYPE_MW] = { + .name = "rxe-mw", + .size = sizeof(struct rxe_mem), + .flags = RXE_POOL_INDEX, + .max_index = RXE_MAX_MW_INDEX, + .min_index = RXE_MIN_MW_INDEX, + }, + [RXE_TYPE_MC_GRP] = { + .name = "rxe-mc_grp", + .size = sizeof(struct rxe_mc_grp), + .cleanup = rxe_mc_cleanup, + .flags = RXE_POOL_KEY, + .key_offset = offsetof(struct rxe_mc_grp, mgid), + .key_size = sizeof(union ib_gid), + }, + [RXE_TYPE_MC_ELEM] = { + .name = "rxe-mc_elem", + .size = sizeof(struct rxe_mc_elem), + .flags = RXE_POOL_ATOMIC, + }, +}; + +static inline const char *pool_name(struct rxe_pool *pool) +{ + return rxe_type_info[pool->type].name; +} + +static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min) +{ + int err = 0; + size_t size; + + if ((max - min + 1) < pool->max_elem) { + pr_warn("not enough indices for max_elem\n"); + err = -EINVAL; + goto out; + } + + pool->max_index = max; + pool->min_index = min; + + size = BITS_TO_LONGS(max - min + 1) * sizeof(long); + pool->table = kmalloc(size, GFP_KERNEL); + if (!pool->table) { + err = -ENOMEM; + goto out; + } + + pool->table_size = size; + bitmap_zero(pool->table, max - min + 1); + +out: + return err; +} + +int rxe_pool_init( + struct rxe_dev *rxe, + struct rxe_pool *pool, + enum rxe_elem_type type, + unsigned int max_elem) +{ + int err = 0; + size_t size = rxe_type_info[type].size; + + memset(pool, 0, sizeof(*pool)); + + pool->rxe = rxe; + pool->type = type; + pool->max_elem = max_elem; + pool->elem_size = ALIGN(size, RXE_POOL_ALIGN); + pool->flags = rxe_type_info[type].flags; + pool->tree = RB_ROOT; + pool->cleanup = rxe_type_info[type].cleanup; + + atomic_set(&pool->num_elem, 0); + + kref_init(&pool->ref_cnt); + + rwlock_init(&pool->pool_lock); + + if (rxe_type_info[type].flags & RXE_POOL_INDEX) { + err = rxe_pool_init_index(pool, + rxe_type_info[type].max_index, + rxe_type_info[type].min_index); + if (err) + goto out; + } + + if (rxe_type_info[type].flags & RXE_POOL_KEY) { + pool->key_offset = rxe_type_info[type].key_offset; + pool->key_size = rxe_type_info[type].key_size; + } + + pool->state = RXE_POOL_STATE_VALID; + +out: + return err; +} + +static void rxe_pool_release(struct kref *kref) +{ + struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt); + + pool->state = RXE_POOL_STATE_INVALID; + kfree(pool->table); +} + +static void rxe_pool_put(struct rxe_pool *pool) +{ + kref_put(&pool->ref_cnt, rxe_pool_release); +} + +void rxe_pool_cleanup(struct rxe_pool *pool) +{ + unsigned long flags; + + write_lock_irqsave(&pool->pool_lock, flags); + pool->state = RXE_POOL_STATE_INVALID; + if (atomic_read(&pool->num_elem) > 0) + pr_warn("%s pool destroyed with unfree'd elem\n", + pool_name(pool)); + write_unlock_irqrestore(&pool->pool_lock, flags); + + rxe_pool_put(pool); +} + +static u32 alloc_index(struct rxe_pool *pool) +{ + u32 index; + u32 range = pool->max_index - pool->min_index + 1; + + index = find_next_zero_bit(pool->table, range, pool->last); + if (index >= range) + index = find_first_zero_bit(pool->table, range); + + WARN_ON_ONCE(index >= range); + set_bit(index, pool->table); + pool->last = index; + return index + pool->min_index; +} + +static void insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new) +{ + struct rb_node **link = &pool->tree.rb_node; + struct rb_node *parent = NULL; + struct rxe_pool_entry *elem; + + while (*link) { + parent = *link; + elem = rb_entry(parent, struct rxe_pool_entry, node); + + if (elem->index == new->index) { + pr_warn("element already exists!\n"); + goto out; + } + + if (elem->index > new->index) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &pool->tree); +out: + return; +} + +static void insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new) +{ + struct rb_node **link = &pool->tree.rb_node; + struct rb_node *parent = NULL; + struct rxe_pool_entry *elem; + int cmp; + + while (*link) { + parent = *link; + elem = rb_entry(parent, struct rxe_pool_entry, node); + + cmp = memcmp((u8 *)elem + pool->key_offset, + (u8 *)new + pool->key_offset, pool->key_size); + + if (cmp == 0) { + pr_warn("key already exists!\n"); + goto out; + } + + if (cmp > 0) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &pool->tree); +out: + return; +} + +void rxe_add_key(void *arg, void *key) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + write_lock_irqsave(&pool->pool_lock, flags); + memcpy((u8 *)elem + pool->key_offset, key, pool->key_size); + insert_key(pool, elem); + write_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_drop_key(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + write_lock_irqsave(&pool->pool_lock, flags); + rb_erase(&elem->node, &pool->tree); + write_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_add_index(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + write_lock_irqsave(&pool->pool_lock, flags); + elem->index = alloc_index(pool); + insert_index(pool, elem); + write_unlock_irqrestore(&pool->pool_lock, flags); +} + +void rxe_drop_index(void *arg) +{ + struct rxe_pool_entry *elem = arg; + struct rxe_pool *pool = elem->pool; + unsigned long flags; + + write_lock_irqsave(&pool->pool_lock, flags); + clear_bit(elem->index - pool->min_index, pool->table); + rb_erase(&elem->node, &pool->tree); + write_unlock_irqrestore(&pool->pool_lock, flags); +} + +void *rxe_alloc(struct rxe_pool *pool) +{ + struct rxe_pool_entry *elem; + unsigned long flags; + + might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); + + read_lock_irqsave(&pool->pool_lock, flags); + if (pool->state != RXE_POOL_STATE_VALID) { + read_unlock_irqrestore(&pool->pool_lock, flags); + return NULL; + } + kref_get(&pool->ref_cnt); + read_unlock_irqrestore(&pool->pool_lock, flags); + + if (!ib_device_try_get(&pool->rxe->ib_dev)) + goto out_put_pool; + + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) + goto out_cnt; + + elem = kzalloc(rxe_type_info[pool->type].size, + (pool->flags & RXE_POOL_ATOMIC) ? + GFP_ATOMIC : GFP_KERNEL); + if (!elem) + goto out_cnt; + + elem->pool = pool; + kref_init(&elem->ref_cnt); + + return elem; + +out_cnt: + atomic_dec(&pool->num_elem); + ib_device_put(&pool->rxe->ib_dev); +out_put_pool: + rxe_pool_put(pool); + return NULL; +} + +int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem) +{ + unsigned long flags; + + might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); + + read_lock_irqsave(&pool->pool_lock, flags); + if (pool->state != RXE_POOL_STATE_VALID) { + read_unlock_irqrestore(&pool->pool_lock, flags); + return -EINVAL; + } + kref_get(&pool->ref_cnt); + read_unlock_irqrestore(&pool->pool_lock, flags); + + if (!ib_device_try_get(&pool->rxe->ib_dev)) + goto out_put_pool; + + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) + goto out_cnt; + + elem->pool = pool; + kref_init(&elem->ref_cnt); + + return 0; + +out_cnt: + atomic_dec(&pool->num_elem); + ib_device_put(&pool->rxe->ib_dev); +out_put_pool: + rxe_pool_put(pool); + return -EINVAL; +} + +void rxe_elem_release(struct kref *kref) +{ + struct rxe_pool_entry *elem = + container_of(kref, struct rxe_pool_entry, ref_cnt); + struct rxe_pool *pool = elem->pool; + + if (pool->cleanup) + pool->cleanup(elem); + + if (!(pool->flags & RXE_POOL_NO_ALLOC)) + kfree(elem); + atomic_dec(&pool->num_elem); + ib_device_put(&pool->rxe->ib_dev); + rxe_pool_put(pool); +} + +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) +{ + struct rb_node *node = NULL; + struct rxe_pool_entry *elem = NULL; + unsigned long flags; + + read_lock_irqsave(&pool->pool_lock, flags); + + if (pool->state != RXE_POOL_STATE_VALID) + goto out; + + node = pool->tree.rb_node; + + while (node) { + elem = rb_entry(node, struct rxe_pool_entry, node); + + if (elem->index > index) + node = node->rb_left; + else if (elem->index < index) + node = node->rb_right; + else { + kref_get(&elem->ref_cnt); + break; + } + } + +out: + read_unlock_irqrestore(&pool->pool_lock, flags); + return node ? elem : NULL; +} + +void *rxe_pool_get_key(struct rxe_pool *pool, void *key) +{ + struct rb_node *node = NULL; + struct rxe_pool_entry *elem = NULL; + int cmp; + unsigned long flags; + + read_lock_irqsave(&pool->pool_lock, flags); + + if (pool->state != RXE_POOL_STATE_VALID) + goto out; + + node = pool->tree.rb_node; + + while (node) { + elem = rb_entry(node, struct rxe_pool_entry, node); + + cmp = memcmp((u8 *)elem + pool->key_offset, + key, pool->key_size); + + if (cmp > 0) + node = node->rb_left; + else if (cmp < 0) + node = node->rb_right; + else + break; + } + + if (node) + kref_get(&elem->ref_cnt); + +out: + read_unlock_irqrestore(&pool->pool_lock, flags); + return node ? elem : NULL; +} diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h new file mode 100644 index 000000000..432745ffc --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_POOL_H +#define RXE_POOL_H + +#define RXE_POOL_ALIGN (16) +#define RXE_POOL_CACHE_FLAGS (0) + +enum rxe_pool_flags { + RXE_POOL_ATOMIC = BIT(0), + RXE_POOL_INDEX = BIT(1), + RXE_POOL_KEY = BIT(2), + RXE_POOL_NO_ALLOC = BIT(4), +}; + +enum rxe_elem_type { + RXE_TYPE_UC, + RXE_TYPE_PD, + RXE_TYPE_AH, + RXE_TYPE_SRQ, + RXE_TYPE_QP, + RXE_TYPE_CQ, + RXE_TYPE_MR, + RXE_TYPE_MW, + RXE_TYPE_MC_GRP, + RXE_TYPE_MC_ELEM, + RXE_NUM_TYPES, /* keep me last */ +}; + +struct rxe_pool_entry; + +struct rxe_type_info { + const char *name; + size_t size; + void (*cleanup)(struct rxe_pool_entry *obj); + enum rxe_pool_flags flags; + u32 max_index; + u32 min_index; + size_t key_offset; + size_t key_size; +}; + +extern struct rxe_type_info rxe_type_info[]; + +enum rxe_pool_state { + RXE_POOL_STATE_INVALID, + RXE_POOL_STATE_VALID, +}; + +struct rxe_pool_entry { + struct rxe_pool *pool; + struct kref ref_cnt; + struct list_head list; + + /* only used if indexed or keyed */ + struct rb_node node; + u32 index; +}; + +struct rxe_pool { + struct rxe_dev *rxe; + rwlock_t pool_lock; /* protects pool add/del/search */ + size_t elem_size; + struct kref ref_cnt; + void (*cleanup)(struct rxe_pool_entry *obj); + enum rxe_pool_state state; + enum rxe_pool_flags flags; + enum rxe_elem_type type; + + unsigned int max_elem; + atomic_t num_elem; + + /* only used if indexed or keyed */ + struct rb_root tree; + unsigned long *table; + size_t table_size; + u32 max_index; + u32 min_index; + u32 last; + size_t key_offset; + size_t key_size; +}; + +/* initialize a pool of objects with given limit on + * number of elements. gets parameters from rxe_type_info + * pool elements will be allocated out of a slab cache + */ +int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, + enum rxe_elem_type type, u32 max_elem); + +/* free resources from object pool */ +void rxe_pool_cleanup(struct rxe_pool *pool); + +/* allocate an object from pool */ +void *rxe_alloc(struct rxe_pool *pool); + +/* connect already allocated object to pool */ +int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem); + +/* assign an index to an indexed object and insert object into + * pool's rb tree + */ +void rxe_add_index(void *elem); + +/* drop an index and remove object from rb tree */ +void rxe_drop_index(void *elem); + +/* assign a key to a keyed object and insert object into + * pool's rb tree + */ +void rxe_add_key(void *elem, void *key); + +/* remove elem from rb tree */ +void rxe_drop_key(void *elem); + +/* lookup an indexed object from index. takes a reference on object */ +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); + +/* lookup keyed object from key. takes a reference on the object */ +void *rxe_pool_get_key(struct rxe_pool *pool, void *key); + +/* cleanup an object when all references are dropped */ +void rxe_elem_release(struct kref *kref); + +/* take a reference on an object */ +#define rxe_add_ref(elem) kref_get(&(elem)->pelem.ref_cnt) + +/* drop a reference on an object */ +#define rxe_drop_ref(elem) kref_put(&(elem)->pelem.ref_cnt, rxe_elem_release) + +#endif /* RXE_POOL_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c new file mode 100644 index 000000000..4c938d841 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -0,0 +1,828 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <rdma/uverbs_ioctl.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_task.h" + +static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, + int has_srq) +{ + if (cap->max_send_wr > rxe->attr.max_qp_wr) { + pr_warn("invalid send wr = %d > %d\n", + cap->max_send_wr, rxe->attr.max_qp_wr); + goto err1; + } + + if (cap->max_send_sge > rxe->attr.max_send_sge) { + pr_warn("invalid send sge = %d > %d\n", + cap->max_send_sge, rxe->attr.max_send_sge); + goto err1; + } + + if (!has_srq) { + if (cap->max_recv_wr > rxe->attr.max_qp_wr) { + pr_warn("invalid recv wr = %d > %d\n", + cap->max_recv_wr, rxe->attr.max_qp_wr); + goto err1; + } + + if (cap->max_recv_sge > rxe->attr.max_recv_sge) { + pr_warn("invalid recv sge = %d > %d\n", + cap->max_recv_sge, rxe->attr.max_recv_sge); + goto err1; + } + } + + if (cap->max_inline_data > rxe->max_inline_data) { + pr_warn("invalid max inline data = %d > %d\n", + cap->max_inline_data, rxe->max_inline_data); + goto err1; + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) +{ + struct ib_qp_cap *cap = &init->cap; + struct rxe_port *port; + int port_num = init->port_num; + + if (!init->recv_cq || !init->send_cq) { + pr_warn("missing cq\n"); + goto err1; + } + + if (rxe_qp_chk_cap(rxe, cap, !!init->srq)) + goto err1; + + if (init->qp_type == IB_QPT_SMI || init->qp_type == IB_QPT_GSI) { + if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) { + pr_warn("invalid port = %d\n", port_num); + goto err1; + } + + port = &rxe->port; + + if (init->qp_type == IB_QPT_SMI && port->qp_smi_index) { + pr_warn("SMI QP exists for port %d\n", port_num); + goto err1; + } + + if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) { + pr_warn("GSI QP exists for port %d\n", port_num); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n) +{ + qp->resp.res_head = 0; + qp->resp.res_tail = 0; + qp->resp.resources = kcalloc(n, sizeof(struct resp_res), GFP_KERNEL); + + if (!qp->resp.resources) + return -ENOMEM; + + return 0; +} + +static void free_rd_atomic_resources(struct rxe_qp *qp) +{ + if (qp->resp.resources) { + int i; + + for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { + struct resp_res *res = &qp->resp.resources[i]; + + free_rd_atomic_resource(qp, res); + } + kfree(qp->resp.resources); + qp->resp.resources = NULL; + } +} + +void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res) +{ + if (res->type == RXE_ATOMIC_MASK) { + kfree_skb(res->atomic.skb); + } else if (res->type == RXE_READ_MASK) { + if (res->read.mr) + rxe_drop_ref(res->read.mr); + } + res->type = 0; +} + +static void cleanup_rd_atomic_resources(struct rxe_qp *qp) +{ + int i; + struct resp_res *res; + + if (qp->resp.resources) { + for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { + res = &qp->resp.resources[i]; + free_rd_atomic_resource(qp, res); + } + } +} + +static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init) +{ + struct rxe_port *port; + u32 qpn; + + qp->sq_sig_type = init->sq_sig_type; + qp->attr.path_mtu = 1; + qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu); + + qpn = qp->pelem.index; + port = &rxe->port; + + switch (init->qp_type) { + case IB_QPT_SMI: + qp->ibqp.qp_num = 0; + port->qp_smi_index = qpn; + qp->attr.port_num = init->port_num; + break; + + case IB_QPT_GSI: + qp->ibqp.qp_num = 1; + port->qp_gsi_index = qpn; + qp->attr.port_num = init->port_num; + break; + + default: + qp->ibqp.qp_num = qpn; + break; + } + + INIT_LIST_HEAD(&qp->grp_list); + + skb_queue_head_init(&qp->send_pkts); + + spin_lock_init(&qp->grp_lock); + spin_lock_init(&qp->state_lock); + + spin_lock_init(&qp->req.task.state_lock); + spin_lock_init(&qp->resp.task.state_lock); + spin_lock_init(&qp->comp.task.state_lock); + + spin_lock_init(&qp->sq.sq_lock); + spin_lock_init(&qp->rq.producer_lock); + spin_lock_init(&qp->rq.consumer_lock); + + skb_queue_head_init(&qp->req_pkts); + skb_queue_head_init(&qp->resp_pkts); + + atomic_set(&qp->ssn, 0); + atomic_set(&qp->skb_out, 0); +} + +static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init, struct ib_udata *udata, + struct rxe_create_qp_resp __user *uresp) +{ + int err; + int wqe_size; + + err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); + if (err < 0) + return err; + qp->sk->sk->sk_user_data = qp; + + /* pick a source UDP port number for this QP based on + * the source QPN. this spreads traffic for different QPs + * across different NIC RX queues (while using a single + * flow for a given QP to maintain packet order). + * the port number must be in the Dynamic Ports range + * (0xc000 - 0xffff). + */ + qp->src_port = RXE_ROCE_V2_SPORT + + (hash_32_generic(qp_num(qp), 14) & 0x3fff); + qp->sq.max_wr = init->cap.max_send_wr; + + /* These caps are limited by rxe_qp_chk_cap() done by the caller */ + wqe_size = max_t(int, init->cap.max_send_sge * sizeof(struct ib_sge), + init->cap.max_inline_data); + qp->sq.max_sge = init->cap.max_send_sge = + wqe_size / sizeof(struct ib_sge); + qp->sq.max_inline = init->cap.max_inline_data = wqe_size; + wqe_size += sizeof(struct rxe_send_wqe); + + qp->sq.queue = rxe_queue_init(rxe, &qp->sq.max_wr, wqe_size); + if (!qp->sq.queue) + return -ENOMEM; + + err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, udata, + qp->sq.queue->buf, qp->sq.queue->buf_size, + &qp->sq.queue->ip); + + if (err) { + vfree(qp->sq.queue->buf); + kfree(qp->sq.queue); + qp->sq.queue = NULL; + return err; + } + + qp->req.wqe_index = producer_index(qp->sq.queue); + qp->req.state = QP_STATE_RESET; + qp->req.opcode = -1; + qp->comp.opcode = -1; + + rxe_init_task(&qp->req.task, qp, rxe_requester); + rxe_init_task(&qp->comp.task, qp, rxe_completer); + + qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ + if (init->qp_type == IB_QPT_RC) { + timer_setup(&qp->rnr_nak_timer, rnr_nak_timer, 0); + timer_setup(&qp->retrans_timer, retransmit_timer, 0); + } + return 0; +} + +static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_init_attr *init, + struct ib_udata *udata, + struct rxe_create_qp_resp __user *uresp) +{ + int err; + int wqe_size; + + if (!qp->srq) { + qp->rq.max_wr = init->cap.max_recv_wr; + qp->rq.max_sge = init->cap.max_recv_sge; + + wqe_size = rcv_wqe_size(qp->rq.max_sge); + + pr_debug("qp#%d max_wr = %d, max_sge = %d, wqe_size = %d\n", + qp_num(qp), qp->rq.max_wr, qp->rq.max_sge, wqe_size); + + qp->rq.queue = rxe_queue_init(rxe, + &qp->rq.max_wr, + wqe_size); + if (!qp->rq.queue) + return -ENOMEM; + + err = do_mmap_info(rxe, uresp ? &uresp->rq_mi : NULL, udata, + qp->rq.queue->buf, qp->rq.queue->buf_size, + &qp->rq.queue->ip); + if (err) { + vfree(qp->rq.queue->buf); + kfree(qp->rq.queue); + qp->rq.queue = NULL; + return err; + } + } + + rxe_init_task(&qp->resp.task, qp, rxe_responder); + + qp->resp.opcode = OPCODE_NONE; + qp->resp.msn = 0; + qp->resp.state = QP_STATE_RESET; + + return 0; +} + +/* called by the create qp verb */ +int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, + struct ib_qp_init_attr *init, + struct rxe_create_qp_resp __user *uresp, + struct ib_pd *ibpd, + struct ib_udata *udata) +{ + int err; + struct rxe_cq *rcq = to_rcq(init->recv_cq); + struct rxe_cq *scq = to_rcq(init->send_cq); + struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; + + rxe_add_ref(pd); + rxe_add_ref(rcq); + rxe_add_ref(scq); + if (srq) + rxe_add_ref(srq); + + qp->pd = pd; + qp->rcq = rcq; + qp->scq = scq; + qp->srq = srq; + + rxe_qp_init_misc(rxe, qp, init); + + err = rxe_qp_init_req(rxe, qp, init, udata, uresp); + if (err) + goto err1; + + err = rxe_qp_init_resp(rxe, qp, init, udata, uresp); + if (err) + goto err2; + + qp->attr.qp_state = IB_QPS_RESET; + qp->valid = 1; + + return 0; + +err2: + rxe_queue_cleanup(qp->sq.queue); +err1: + qp->pd = NULL; + qp->rcq = NULL; + qp->scq = NULL; + qp->srq = NULL; + + if (srq) + rxe_drop_ref(srq); + rxe_drop_ref(scq); + rxe_drop_ref(rcq); + rxe_drop_ref(pd); + + return err; +} + +/* called by the query qp verb */ +int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init) +{ + init->event_handler = qp->ibqp.event_handler; + init->qp_context = qp->ibqp.qp_context; + init->send_cq = qp->ibqp.send_cq; + init->recv_cq = qp->ibqp.recv_cq; + init->srq = qp->ibqp.srq; + + init->cap.max_send_wr = qp->sq.max_wr; + init->cap.max_send_sge = qp->sq.max_sge; + init->cap.max_inline_data = qp->sq.max_inline; + + if (!qp->srq) { + init->cap.max_recv_wr = qp->rq.max_wr; + init->cap.max_recv_sge = qp->rq.max_sge; + } + + init->sq_sig_type = qp->sq_sig_type; + + init->qp_type = qp->ibqp.qp_type; + init->port_num = 1; + + return 0; +} + +/* called by the modify qp verb, this routine checks all the parameters before + * making any changes + */ +int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, + struct ib_qp_attr *attr, int mask) +{ + enum ib_qp_state cur_state = (mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : qp->attr.qp_state; + enum ib_qp_state new_state = (mask & IB_QP_STATE) ? + attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) { + pr_warn("invalid mask or state for qp\n"); + goto err1; + } + + if (mask & IB_QP_STATE) { + if (cur_state == IB_QPS_SQD) { + if (qp->req.state == QP_STATE_DRAIN && + new_state != IB_QPS_ERR) + goto err1; + } + } + + if (mask & IB_QP_PORT) { + if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) { + pr_warn("invalid port %d\n", attr->port_num); + goto err1; + } + } + + if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq)) + goto err1; + + if (mask & IB_QP_AV && rxe_av_chk_attr(rxe, &attr->ah_attr)) + goto err1; + + if (mask & IB_QP_ALT_PATH) { + if (rxe_av_chk_attr(rxe, &attr->alt_ah_attr)) + goto err1; + if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) { + pr_warn("invalid alt port %d\n", attr->alt_port_num); + goto err1; + } + if (attr->alt_timeout > 31) { + pr_warn("invalid QP alt timeout %d > 31\n", + attr->alt_timeout); + goto err1; + } + } + + if (mask & IB_QP_PATH_MTU) { + struct rxe_port *port = &rxe->port; + + enum ib_mtu max_mtu = port->attr.max_mtu; + enum ib_mtu mtu = attr->path_mtu; + + if (mtu > max_mtu) { + pr_debug("invalid mtu (%d) > (%d)\n", + ib_mtu_enum_to_int(mtu), + ib_mtu_enum_to_int(max_mtu)); + goto err1; + } + } + + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { + if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) { + pr_warn("invalid max_rd_atomic %d > %d\n", + attr->max_rd_atomic, + rxe->attr.max_qp_rd_atom); + goto err1; + } + } + + if (mask & IB_QP_TIMEOUT) { + if (attr->timeout > 31) { + pr_warn("invalid QP timeout %d > 31\n", + attr->timeout); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +/* move the qp to the reset state */ +static void rxe_qp_reset(struct rxe_qp *qp) +{ + /* stop tasks from running */ + rxe_disable_task(&qp->resp.task); + + /* stop request/comp */ + if (qp->sq.queue) { + if (qp_type(qp) == IB_QPT_RC) + rxe_disable_task(&qp->comp.task); + rxe_disable_task(&qp->req.task); + } + + /* move qp to the reset state */ + qp->req.state = QP_STATE_RESET; + qp->resp.state = QP_STATE_RESET; + + /* let state machines reset themselves drain work and packet queues + * etc. + */ + __rxe_do_task(&qp->resp.task); + + if (qp->sq.queue) { + __rxe_do_task(&qp->comp.task); + __rxe_do_task(&qp->req.task); + rxe_queue_reset(qp->sq.queue); + } + + /* cleanup attributes */ + atomic_set(&qp->ssn, 0); + qp->req.opcode = -1; + qp->req.need_retry = 0; + qp->req.noack_pkts = 0; + qp->resp.msn = 0; + qp->resp.opcode = -1; + qp->resp.drop_msg = 0; + qp->resp.goto_error = 0; + qp->resp.sent_psn_nak = 0; + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + cleanup_rd_atomic_resources(qp); + + /* reenable tasks */ + rxe_enable_task(&qp->resp.task); + + if (qp->sq.queue) { + if (qp_type(qp) == IB_QPT_RC) + rxe_enable_task(&qp->comp.task); + + rxe_enable_task(&qp->req.task); + } +} + +/* drain the send queue */ +static void rxe_qp_drain(struct rxe_qp *qp) +{ + if (qp->sq.queue) { + if (qp->req.state != QP_STATE_DRAINED) { + qp->req.state = QP_STATE_DRAIN; + if (qp_type(qp) == IB_QPT_RC) + rxe_run_task(&qp->comp.task, 1); + else + __rxe_do_task(&qp->comp.task); + rxe_run_task(&qp->req.task, 1); + } + } +} + +/* move the qp to the error state */ +void rxe_qp_error(struct rxe_qp *qp) +{ + qp->req.state = QP_STATE_ERROR; + qp->resp.state = QP_STATE_ERROR; + qp->attr.qp_state = IB_QPS_ERR; + + /* drain work and packet queues */ + rxe_run_task(&qp->resp.task, 1); + + if (qp_type(qp) == IB_QPT_RC) + rxe_run_task(&qp->comp.task, 1); + else + __rxe_do_task(&qp->comp.task); + rxe_run_task(&qp->req.task, 1); +} + +/* called by the modify qp verb */ +int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, + struct ib_udata *udata) +{ + int err; + + if (mask & IB_QP_MAX_QP_RD_ATOMIC) { + int max_rd_atomic = attr->max_rd_atomic ? + roundup_pow_of_two(attr->max_rd_atomic) : 0; + + qp->attr.max_rd_atomic = max_rd_atomic; + atomic_set(&qp->req.rd_atomic, max_rd_atomic); + } + + if (mask & IB_QP_MAX_DEST_RD_ATOMIC) { + int max_dest_rd_atomic = attr->max_dest_rd_atomic ? + roundup_pow_of_two(attr->max_dest_rd_atomic) : 0; + + qp->attr.max_dest_rd_atomic = max_dest_rd_atomic; + + free_rd_atomic_resources(qp); + + err = alloc_rd_atomic_resources(qp, max_dest_rd_atomic); + if (err) + return err; + } + + if (mask & IB_QP_CUR_STATE) + qp->attr.cur_qp_state = attr->qp_state; + + if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY) + qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify; + + if (mask & IB_QP_ACCESS_FLAGS) + qp->attr.qp_access_flags = attr->qp_access_flags; + + if (mask & IB_QP_PKEY_INDEX) + qp->attr.pkey_index = attr->pkey_index; + + if (mask & IB_QP_PORT) + qp->attr.port_num = attr->port_num; + + if (mask & IB_QP_QKEY) + qp->attr.qkey = attr->qkey; + + if (mask & IB_QP_AV) + rxe_init_av(&attr->ah_attr, &qp->pri_av); + + if (mask & IB_QP_ALT_PATH) { + rxe_init_av(&attr->alt_ah_attr, &qp->alt_av); + qp->attr.alt_port_num = attr->alt_port_num; + qp->attr.alt_pkey_index = attr->alt_pkey_index; + qp->attr.alt_timeout = attr->alt_timeout; + } + + if (mask & IB_QP_PATH_MTU) { + qp->attr.path_mtu = attr->path_mtu; + qp->mtu = ib_mtu_enum_to_int(attr->path_mtu); + } + + if (mask & IB_QP_TIMEOUT) { + qp->attr.timeout = attr->timeout; + if (attr->timeout == 0) { + qp->qp_timeout_jiffies = 0; + } else { + /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */ + int j = nsecs_to_jiffies(4096ULL << attr->timeout); + + qp->qp_timeout_jiffies = j ? j : 1; + } + } + + if (mask & IB_QP_RETRY_CNT) { + qp->attr.retry_cnt = attr->retry_cnt; + qp->comp.retry_cnt = attr->retry_cnt; + pr_debug("qp#%d set retry count = %d\n", qp_num(qp), + attr->retry_cnt); + } + + if (mask & IB_QP_RNR_RETRY) { + qp->attr.rnr_retry = attr->rnr_retry; + qp->comp.rnr_retry = attr->rnr_retry; + pr_debug("qp#%d set rnr retry count = %d\n", qp_num(qp), + attr->rnr_retry); + } + + if (mask & IB_QP_RQ_PSN) { + qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK); + qp->resp.psn = qp->attr.rq_psn; + pr_debug("qp#%d set resp psn = 0x%x\n", qp_num(qp), + qp->resp.psn); + } + + if (mask & IB_QP_MIN_RNR_TIMER) { + qp->attr.min_rnr_timer = attr->min_rnr_timer; + pr_debug("qp#%d set min rnr timer = 0x%x\n", qp_num(qp), + attr->min_rnr_timer); + } + + if (mask & IB_QP_SQ_PSN) { + qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK); + qp->req.psn = qp->attr.sq_psn; + qp->comp.psn = qp->attr.sq_psn; + pr_debug("qp#%d set req psn = 0x%x\n", qp_num(qp), qp->req.psn); + } + + if (mask & IB_QP_PATH_MIG_STATE) + qp->attr.path_mig_state = attr->path_mig_state; + + if (mask & IB_QP_DEST_QPN) + qp->attr.dest_qp_num = attr->dest_qp_num; + + if (mask & IB_QP_STATE) { + qp->attr.qp_state = attr->qp_state; + + switch (attr->qp_state) { + case IB_QPS_RESET: + pr_debug("qp#%d state -> RESET\n", qp_num(qp)); + rxe_qp_reset(qp); + break; + + case IB_QPS_INIT: + pr_debug("qp#%d state -> INIT\n", qp_num(qp)); + qp->req.state = QP_STATE_INIT; + qp->resp.state = QP_STATE_INIT; + break; + + case IB_QPS_RTR: + pr_debug("qp#%d state -> RTR\n", qp_num(qp)); + qp->resp.state = QP_STATE_READY; + break; + + case IB_QPS_RTS: + pr_debug("qp#%d state -> RTS\n", qp_num(qp)); + qp->req.state = QP_STATE_READY; + break; + + case IB_QPS_SQD: + pr_debug("qp#%d state -> SQD\n", qp_num(qp)); + rxe_qp_drain(qp); + break; + + case IB_QPS_SQE: + pr_warn("qp#%d state -> SQE !!?\n", qp_num(qp)); + /* Not possible from modify_qp. */ + break; + + case IB_QPS_ERR: + pr_debug("qp#%d state -> ERR\n", qp_num(qp)); + rxe_qp_error(qp); + break; + } + } + + return 0; +} + +/* called by the query qp verb */ +int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) +{ + *attr = qp->attr; + + attr->rq_psn = qp->resp.psn; + attr->sq_psn = qp->req.psn; + + attr->cap.max_send_wr = qp->sq.max_wr; + attr->cap.max_send_sge = qp->sq.max_sge; + attr->cap.max_inline_data = qp->sq.max_inline; + + if (!qp->srq) { + attr->cap.max_recv_wr = qp->rq.max_wr; + attr->cap.max_recv_sge = qp->rq.max_sge; + } + + rxe_av_to_attr(&qp->pri_av, &attr->ah_attr); + rxe_av_to_attr(&qp->alt_av, &attr->alt_ah_attr); + + if (qp->req.state == QP_STATE_DRAIN) { + attr->sq_draining = 1; + /* applications that get this state + * typically spin on it. yield the + * processor + */ + cond_resched(); + } else { + attr->sq_draining = 0; + } + + pr_debug("attr->sq_draining = %d\n", attr->sq_draining); + + return 0; +} + +/* called by the destroy qp verb */ +void rxe_qp_destroy(struct rxe_qp *qp) +{ + qp->valid = 0; + qp->qp_timeout_jiffies = 0; + rxe_cleanup_task(&qp->resp.task); + + if (qp_type(qp) == IB_QPT_RC) { + del_timer_sync(&qp->retrans_timer); + del_timer_sync(&qp->rnr_nak_timer); + } + + rxe_cleanup_task(&qp->req.task); + rxe_cleanup_task(&qp->comp.task); + + /* flush out any receive wr's or pending requests */ + if (qp->req.task.func) + __rxe_do_task(&qp->req.task); + + if (qp->sq.queue) { + __rxe_do_task(&qp->comp.task); + __rxe_do_task(&qp->req.task); + } +} + +/* called when the last reference to the qp is dropped */ +static void rxe_qp_do_cleanup(struct work_struct *work) +{ + struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); + + rxe_drop_all_mcast_groups(qp); + + if (qp->sq.queue) + rxe_queue_cleanup(qp->sq.queue); + + if (qp->srq) + rxe_drop_ref(qp->srq); + + if (qp->rq.queue) + rxe_queue_cleanup(qp->rq.queue); + + if (qp->scq) + rxe_drop_ref(qp->scq); + if (qp->rcq) + rxe_drop_ref(qp->rcq); + if (qp->pd) + rxe_drop_ref(qp->pd); + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + free_rd_atomic_resources(qp); + + if (qp->sk) { + if (qp_type(qp) == IB_QPT_RC) + sk_dst_reset(qp->sk->sk); + + kernel_sock_shutdown(qp->sk, SHUT_RDWR); + sock_release(qp->sk); + } +} + +/* called when the last reference to the qp is dropped */ +void rxe_qp_cleanup(struct rxe_pool_entry *arg) +{ + struct rxe_qp *qp = container_of(arg, typeof(*qp), pelem); + + execute_in_process_context(rxe_qp_do_cleanup, &qp->cleanup_work); +} diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c new file mode 100644 index 000000000..fa69241b1 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_queue.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/vmalloc.h> +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf, + struct ib_udata *udata, struct rxe_queue_buf *buf, + size_t buf_size, struct rxe_mmap_info **ip_p) +{ + int err; + struct rxe_mmap_info *ip = NULL; + + if (outbuf) { + ip = rxe_create_mmap_info(rxe, buf_size, udata, buf); + if (IS_ERR(ip)) { + err = PTR_ERR(ip); + goto err1; + } + + if (copy_to_user(outbuf, &ip->info, sizeof(ip->info))) { + err = -EFAULT; + goto err2; + } + + spin_lock_bh(&rxe->pending_lock); + list_add(&ip->pending_mmaps, &rxe->pending_mmaps); + spin_unlock_bh(&rxe->pending_lock); + } + + *ip_p = ip; + + return 0; + +err2: + kfree(ip); +err1: + return err; +} + +inline void rxe_queue_reset(struct rxe_queue *q) +{ + /* queue is comprised from header and the memory + * of the actual queue. See "struct rxe_queue_buf" in rxe_queue.h + * reset only the queue itself and not the management header + */ + memset(q->buf->data, 0, q->buf_size - sizeof(struct rxe_queue_buf)); +} + +struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, + int *num_elem, + unsigned int elem_size) +{ + struct rxe_queue *q; + size_t buf_size; + unsigned int num_slots; + + /* num_elem == 0 is allowed, but uninteresting */ + if (*num_elem < 0) + goto err1; + + q = kmalloc(sizeof(*q), GFP_KERNEL); + if (!q) + goto err1; + + q->rxe = rxe; + + /* used in resize, only need to copy used part of queue */ + q->elem_size = elem_size; + + /* pad element up to at least a cacheline and always a power of 2 */ + if (elem_size < cache_line_size()) + elem_size = cache_line_size(); + elem_size = roundup_pow_of_two(elem_size); + + q->log2_elem_size = order_base_2(elem_size); + + num_slots = *num_elem + 1; + num_slots = roundup_pow_of_two(num_slots); + q->index_mask = num_slots - 1; + + buf_size = sizeof(struct rxe_queue_buf) + num_slots * elem_size; + + q->buf = vmalloc_user(buf_size); + if (!q->buf) + goto err2; + + q->buf->log2_elem_size = q->log2_elem_size; + q->buf->index_mask = q->index_mask; + + q->buf_size = buf_size; + + *num_elem = num_slots - 1; + return q; + +err2: + kfree(q); +err1: + return NULL; +} + +/* copies elements from original q to new q and then swaps the contents of the + * two q headers. This is so that if anyone is holding a pointer to q it will + * still work + */ +static int resize_finish(struct rxe_queue *q, struct rxe_queue *new_q, + unsigned int num_elem) +{ + if (!queue_empty(q) && (num_elem < queue_count(q))) + return -EINVAL; + + while (!queue_empty(q)) { + memcpy(producer_addr(new_q), consumer_addr(q), + new_q->elem_size); + advance_producer(new_q); + advance_consumer(q); + } + + swap(*q, *new_q); + + return 0; +} + +int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p, + unsigned int elem_size, struct ib_udata *udata, + struct mminfo __user *outbuf, spinlock_t *producer_lock, + spinlock_t *consumer_lock) +{ + struct rxe_queue *new_q; + unsigned int num_elem = *num_elem_p; + int err; + unsigned long flags = 0, flags1; + + new_q = rxe_queue_init(q->rxe, &num_elem, elem_size); + if (!new_q) + return -ENOMEM; + + err = do_mmap_info(new_q->rxe, outbuf, udata, new_q->buf, + new_q->buf_size, &new_q->ip); + if (err) { + vfree(new_q->buf); + kfree(new_q); + goto err1; + } + + spin_lock_irqsave(consumer_lock, flags1); + + if (producer_lock) { + spin_lock_irqsave(producer_lock, flags); + err = resize_finish(q, new_q, num_elem); + spin_unlock_irqrestore(producer_lock, flags); + } else { + err = resize_finish(q, new_q, num_elem); + } + + spin_unlock_irqrestore(consumer_lock, flags1); + + rxe_queue_cleanup(new_q); /* new/old dep on err */ + if (err) + goto err1; + + *num_elem_p = num_elem; + return 0; + +err1: + return err; +} + +void rxe_queue_cleanup(struct rxe_queue *q) +{ + if (q->ip) + kref_put(&q->ip->ref, rxe_mmap_release); + else + vfree(q->buf); + + kfree(q); +} diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h new file mode 100644 index 000000000..7d434a683 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_queue.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_QUEUE_H +#define RXE_QUEUE_H + +/* implements a simple circular buffer that can optionally be + * shared between user space and the kernel and can be resized + + * the requested element size is rounded up to a power of 2 + * and the number of elements in the buffer is also rounded + * up to a power of 2. Since the queue is empty when the + * producer and consumer indices match the maximum capacity + * of the queue is one less than the number of element slots + */ + +/* this data structure is shared between user space and kernel + * space for those cases where the queue is shared. It contains + * the producer and consumer indices. Is also contains a copy + * of the queue size parameters for user space to use but the + * kernel must use the parameters in the rxe_queue struct + * this MUST MATCH the corresponding librxe struct + * for performance reasons arrange to have producer and consumer + * pointers in separate cache lines + * the kernel should always mask the indices to avoid accessing + * memory outside of the data area + */ +struct rxe_queue_buf { + __u32 log2_elem_size; + __u32 index_mask; + __u32 pad_1[30]; + __u32 producer_index; + __u32 pad_2[31]; + __u32 consumer_index; + __u32 pad_3[31]; + __u8 data[]; +}; + +struct rxe_queue { + struct rxe_dev *rxe; + struct rxe_queue_buf *buf; + struct rxe_mmap_info *ip; + size_t buf_size; + size_t elem_size; + unsigned int log2_elem_size; + unsigned int index_mask; +}; + +int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf, + struct ib_udata *udata, struct rxe_queue_buf *buf, + size_t buf_size, struct rxe_mmap_info **ip_p); + +void rxe_queue_reset(struct rxe_queue *q); + +struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, + int *num_elem, + unsigned int elem_size); + +int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p, + unsigned int elem_size, struct ib_udata *udata, + struct mminfo __user *outbuf, + /* Protect producers while resizing queue */ + spinlock_t *producer_lock, + /* Protect consumers while resizing queue */ + spinlock_t *consumer_lock); + +void rxe_queue_cleanup(struct rxe_queue *queue); + +static inline int next_index(struct rxe_queue *q, int index) +{ + return (index + 1) & q->buf->index_mask; +} + +static inline int queue_empty(struct rxe_queue *q) +{ + return ((q->buf->producer_index - q->buf->consumer_index) + & q->index_mask) == 0; +} + +static inline int queue_full(struct rxe_queue *q) +{ + return ((q->buf->producer_index + 1 - q->buf->consumer_index) + & q->index_mask) == 0; +} + +static inline void advance_producer(struct rxe_queue *q) +{ + q->buf->producer_index = (q->buf->producer_index + 1) + & q->index_mask; +} + +static inline void advance_consumer(struct rxe_queue *q) +{ + q->buf->consumer_index = (q->buf->consumer_index + 1) + & q->index_mask; +} + +static inline void *producer_addr(struct rxe_queue *q) +{ + return q->buf->data + ((q->buf->producer_index & q->index_mask) + << q->log2_elem_size); +} + +static inline void *consumer_addr(struct rxe_queue *q) +{ + return q->buf->data + ((q->buf->consumer_index & q->index_mask) + << q->log2_elem_size); +} + +static inline unsigned int producer_index(struct rxe_queue *q) +{ + return q->buf->producer_index; +} + +static inline unsigned int consumer_index(struct rxe_queue *q) +{ + return q->buf->consumer_index; +} + +static inline void *addr_from_index(struct rxe_queue *q, unsigned int index) +{ + return q->buf->data + ((index & q->index_mask) + << q->buf->log2_elem_size); +} + +static inline unsigned int index_from_addr(const struct rxe_queue *q, + const void *addr) +{ + return (((u8 *)addr - q->buf->data) >> q->log2_elem_size) + & q->index_mask; +} + +static inline unsigned int queue_count(const struct rxe_queue *q) +{ + return (q->buf->producer_index - q->buf->consumer_index) + & q->index_mask; +} + +static inline void *queue_head(struct rxe_queue *q) +{ + return queue_empty(q) ? NULL : consumer_addr(q); +} + +#endif /* RXE_QUEUE_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c new file mode 100644 index 000000000..cb69a125e --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> + +#include "rxe.h" +#include "rxe_loc.h" + +/* check that QP matches packet opcode type and is in a valid state */ +static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct rxe_qp *qp) +{ + unsigned int pkt_type; + + if (unlikely(!qp->valid)) + goto err1; + + pkt_type = pkt->opcode & 0xe0; + + switch (qp_type(qp)) { + case IB_QPT_RC: + if (unlikely(pkt_type != IB_OPCODE_RC)) { + pr_warn_ratelimited("bad qp type\n"); + goto err1; + } + break; + case IB_QPT_UC: + if (unlikely(pkt_type != IB_OPCODE_UC)) { + pr_warn_ratelimited("bad qp type\n"); + goto err1; + } + break; + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + if (unlikely(pkt_type != IB_OPCODE_UD)) { + pr_warn_ratelimited("bad qp type\n"); + goto err1; + } + break; + default: + pr_warn_ratelimited("unsupported qp type\n"); + goto err1; + } + + if (pkt->mask & RXE_REQ_MASK) { + if (unlikely(qp->resp.state != QP_STATE_READY)) + goto err1; + } else if (unlikely(qp->req.state < QP_STATE_READY || + qp->req.state > QP_STATE_DRAINED)) { + goto err1; + } + + return 0; + +err1: + return -EINVAL; +} + +static void set_bad_pkey_cntr(struct rxe_port *port) +{ + spin_lock_bh(&port->port_lock); + port->attr.bad_pkey_cntr = min((u32)0xffff, + port->attr.bad_pkey_cntr + 1); + spin_unlock_bh(&port->port_lock); +} + +static void set_qkey_viol_cntr(struct rxe_port *port) +{ + spin_lock_bh(&port->port_lock); + port->attr.qkey_viol_cntr = min((u32)0xffff, + port->attr.qkey_viol_cntr + 1); + spin_unlock_bh(&port->port_lock); +} + +static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + u32 qpn, struct rxe_qp *qp) +{ + struct rxe_port *port = &rxe->port; + u16 pkey = bth_pkey(pkt); + + pkt->pkey_index = 0; + + if (!pkey_match(pkey, IB_DEFAULT_PKEY_FULL)) { + pr_warn_ratelimited("bad pkey = 0x%x\n", pkey); + set_bad_pkey_cntr(port); + goto err1; + } + + if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) && + pkt->mask) { + u32 qkey = (qpn == 1) ? GSI_QKEY : qp->attr.qkey; + + if (unlikely(deth_qkey(pkt) != qkey)) { + pr_warn_ratelimited("bad qkey, got 0x%x expected 0x%x for qpn 0x%x\n", + deth_qkey(pkt), qkey, qpn); + set_qkey_viol_cntr(port); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + +static int check_addr(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, + struct rxe_qp *qp) +{ + struct sk_buff *skb = PKT_TO_SKB(pkt); + + if (qp_type(qp) != IB_QPT_RC && qp_type(qp) != IB_QPT_UC) + goto done; + + if (unlikely(pkt->port_num != qp->attr.port_num)) { + pr_warn_ratelimited("port %d != qp port %d\n", + pkt->port_num, qp->attr.port_num); + goto err1; + } + + if (skb->protocol == htons(ETH_P_IP)) { + struct in_addr *saddr = + &qp->pri_av.sgid_addr._sockaddr_in.sin_addr; + struct in_addr *daddr = + &qp->pri_av.dgid_addr._sockaddr_in.sin_addr; + + if (ip_hdr(skb)->daddr != saddr->s_addr) { + pr_warn_ratelimited("dst addr %pI4 != qp source addr %pI4\n", + &ip_hdr(skb)->daddr, + &saddr->s_addr); + goto err1; + } + + if (ip_hdr(skb)->saddr != daddr->s_addr) { + pr_warn_ratelimited("source addr %pI4 != qp dst addr %pI4\n", + &ip_hdr(skb)->saddr, + &daddr->s_addr); + goto err1; + } + + } else if (skb->protocol == htons(ETH_P_IPV6)) { + struct in6_addr *saddr = + &qp->pri_av.sgid_addr._sockaddr_in6.sin6_addr; + struct in6_addr *daddr = + &qp->pri_av.dgid_addr._sockaddr_in6.sin6_addr; + + if (memcmp(&ipv6_hdr(skb)->daddr, saddr, sizeof(*saddr))) { + pr_warn_ratelimited("dst addr %pI6 != qp source addr %pI6\n", + &ipv6_hdr(skb)->daddr, saddr); + goto err1; + } + + if (memcmp(&ipv6_hdr(skb)->saddr, daddr, sizeof(*daddr))) { + pr_warn_ratelimited("source addr %pI6 != qp dst addr %pI6\n", + &ipv6_hdr(skb)->saddr, daddr); + goto err1; + } + } + +done: + return 0; + +err1: + return -EINVAL; +} + +static int hdr_check(struct rxe_pkt_info *pkt) +{ + struct rxe_dev *rxe = pkt->rxe; + struct rxe_port *port = &rxe->port; + struct rxe_qp *qp = NULL; + u32 qpn = bth_qpn(pkt); + int index; + int err; + + if (unlikely(bth_tver(pkt) != BTH_TVER)) { + pr_warn_ratelimited("bad tver\n"); + goto err1; + } + + if (unlikely(qpn == 0)) { + pr_warn_once("QP 0 not supported"); + goto err1; + } + + if (qpn != IB_MULTICAST_QPN) { + index = (qpn == 1) ? port->qp_gsi_index : qpn; + + qp = rxe_pool_get_index(&rxe->qp_pool, index); + if (unlikely(!qp)) { + pr_warn_ratelimited("no qp matches qpn 0x%x\n", qpn); + goto err1; + } + + err = check_type_state(rxe, pkt, qp); + if (unlikely(err)) + goto err2; + + err = check_addr(rxe, pkt, qp); + if (unlikely(err)) + goto err2; + + err = check_keys(rxe, pkt, qpn, qp); + if (unlikely(err)) + goto err2; + } else { + if (unlikely((pkt->mask & RXE_GRH_MASK) == 0)) { + pr_warn_ratelimited("no grh for mcast qpn\n"); + goto err1; + } + } + + pkt->qp = qp; + return 0; + +err2: + rxe_drop_ref(qp); +err1: + return -EINVAL; +} + +static inline void rxe_rcv_pkt(struct rxe_pkt_info *pkt, struct sk_buff *skb) +{ + if (pkt->mask & RXE_REQ_MASK) + rxe_resp_queue_pkt(pkt->qp, skb); + else + rxe_comp_queue_pkt(pkt->qp, skb); +} + +static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) +{ + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + struct rxe_mc_grp *mcg; + struct rxe_mc_elem *mce; + struct rxe_qp *qp; + union ib_gid dgid; + struct sk_buff *per_qp_skb; + struct rxe_pkt_info *per_qp_pkt; + int err; + + if (skb->protocol == htons(ETH_P_IP)) + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + (struct in6_addr *)&dgid); + else if (skb->protocol == htons(ETH_P_IPV6)) + memcpy(&dgid, &ipv6_hdr(skb)->daddr, sizeof(dgid)); + + /* lookup mcast group corresponding to mgid, takes a ref */ + mcg = rxe_pool_get_key(&rxe->mc_grp_pool, &dgid); + if (!mcg) + goto err1; /* mcast group not registered */ + + spin_lock_bh(&mcg->mcg_lock); + + list_for_each_entry(mce, &mcg->qp_list, qp_list) { + qp = mce->qp; + + /* validate qp for incoming packet */ + err = check_type_state(rxe, pkt, qp); + if (err) + continue; + + err = check_keys(rxe, pkt, bth_qpn(pkt), qp); + if (err) + continue; + + /* for all but the last qp create a new clone of the + * skb and pass to the qp. If an error occurs in the + * checks for the last qp in the list we need to + * free the skb since it hasn't been passed on to + * rxe_rcv_pkt() which would free it later. + */ + if (mce->qp_list.next != &mcg->qp_list) { + per_qp_skb = skb_clone(skb, GFP_ATOMIC); + } else { + per_qp_skb = skb; + /* show we have consumed the skb */ + skb = NULL; + } + + if (unlikely(!per_qp_skb)) + continue; + + per_qp_pkt = SKB_TO_PKT(per_qp_skb); + per_qp_pkt->qp = qp; + rxe_add_ref(qp); + rxe_rcv_pkt(per_qp_pkt, per_qp_skb); + } + + spin_unlock_bh(&mcg->mcg_lock); + + rxe_drop_ref(mcg); /* drop ref from rxe_pool_get_key. */ + +err1: + /* free skb if not consumed */ + kfree_skb(skb); +} + +/** + * rxe_chk_dgid - validate destination IP address + * @rxe: rxe device that received packet + * @skb: the received packet buffer + * + * Accept any loopback packets + * Extract IP address from packet and + * Accept if multicast packet + * Accept if matches an SGID table entry + */ +static int rxe_chk_dgid(struct rxe_dev *rxe, struct sk_buff *skb) +{ + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + const struct ib_gid_attr *gid_attr; + union ib_gid dgid; + union ib_gid *pdgid; + + if (pkt->mask & RXE_LOOPBACK_MASK) + return 0; + + if (skb->protocol == htons(ETH_P_IP)) { + ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, + (struct in6_addr *)&dgid); + pdgid = &dgid; + } else { + pdgid = (union ib_gid *)&ipv6_hdr(skb)->daddr; + } + + if (rdma_is_multicast_addr((struct in6_addr *)pdgid)) + return 0; + + gid_attr = rdma_find_gid_by_port(&rxe->ib_dev, pdgid, + IB_GID_TYPE_ROCE_UDP_ENCAP, + 1, skb->dev); + if (IS_ERR(gid_attr)) + return PTR_ERR(gid_attr); + + rdma_put_gid_attr(gid_attr); + return 0; +} + +/* rxe_rcv is called from the interface driver */ +void rxe_rcv(struct sk_buff *skb) +{ + int err; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + struct rxe_dev *rxe = pkt->rxe; + __be32 *icrcp; + u32 calc_icrc, pack_icrc; + + pkt->offset = 0; + + if (unlikely(skb->len < pkt->offset + RXE_BTH_BYTES)) + goto drop; + + if (rxe_chk_dgid(rxe, skb) < 0) { + pr_warn_ratelimited("failed checking dgid\n"); + goto drop; + } + + pkt->opcode = bth_opcode(pkt); + pkt->psn = bth_psn(pkt); + pkt->qp = NULL; + pkt->mask |= rxe_opcode[pkt->opcode].mask; + + if (unlikely(skb->len < header_size(pkt))) + goto drop; + + err = hdr_check(pkt); + if (unlikely(err)) + goto drop; + + /* Verify ICRC */ + icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE); + pack_icrc = be32_to_cpu(*icrcp); + + calc_icrc = rxe_icrc_hdr(pkt, skb); + calc_icrc = rxe_crc32(rxe, calc_icrc, (u8 *)payload_addr(pkt), + payload_size(pkt) + bth_pad(pkt)); + calc_icrc = (__force u32)cpu_to_be32(~calc_icrc); + if (unlikely(calc_icrc != pack_icrc)) { + if (skb->protocol == htons(ETH_P_IPV6)) + pr_warn_ratelimited("bad ICRC from %pI6c\n", + &ipv6_hdr(skb)->saddr); + else if (skb->protocol == htons(ETH_P_IP)) + pr_warn_ratelimited("bad ICRC from %pI4\n", + &ip_hdr(skb)->saddr); + else + pr_warn_ratelimited("bad ICRC from unknown\n"); + + goto drop; + } + + rxe_counter_inc(rxe, RXE_CNT_RCVD_PKTS); + + if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN)) + rxe_rcv_mcast_pkt(rxe, skb); + else + rxe_rcv_pkt(pkt, skb); + + return; + +drop: + if (pkt->qp) + rxe_drop_ref(pkt->qp); + + kfree_skb(skb); +} diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c new file mode 100644 index 000000000..69238f856 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -0,0 +1,735 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> +#include <crypto/hash.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + u32 opcode); + +static inline void retry_first_write_send(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + unsigned int mask, int npsn) +{ + int i; + + for (i = 0; i < npsn; i++) { + int to_send = (wqe->dma.resid > qp->mtu) ? + qp->mtu : wqe->dma.resid; + + qp->req.opcode = next_opcode(qp, wqe, + wqe->wr.opcode); + + if (wqe->wr.send_flags & IB_SEND_INLINE) { + wqe->dma.resid -= to_send; + wqe->dma.sge_offset += to_send; + } else { + advance_dma_data(&wqe->dma, to_send); + } + if (mask & WR_WRITE_MASK) + wqe->iova += qp->mtu; + } +} + +static void req_retry(struct rxe_qp *qp) +{ + struct rxe_send_wqe *wqe; + unsigned int wqe_index; + unsigned int mask; + int npsn; + int first = 1; + + qp->req.wqe_index = consumer_index(qp->sq.queue); + qp->req.psn = qp->comp.psn; + qp->req.opcode = -1; + + for (wqe_index = consumer_index(qp->sq.queue); + wqe_index != producer_index(qp->sq.queue); + wqe_index = next_index(qp->sq.queue, wqe_index)) { + wqe = addr_from_index(qp->sq.queue, wqe_index); + mask = wr_opcode_mask(wqe->wr.opcode, qp); + + if (wqe->state == wqe_state_posted) + break; + + if (wqe->state == wqe_state_done) + continue; + + wqe->iova = (mask & WR_ATOMIC_MASK) ? + wqe->wr.wr.atomic.remote_addr : + (mask & WR_READ_OR_WRITE_MASK) ? + wqe->wr.wr.rdma.remote_addr : + 0; + + if (!first || (mask & WR_READ_MASK) == 0) { + wqe->dma.resid = wqe->dma.length; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + } + + if (first) { + first = 0; + + if (mask & WR_WRITE_OR_SEND_MASK) { + npsn = (qp->comp.psn - wqe->first_psn) & + BTH_PSN_MASK; + retry_first_write_send(qp, wqe, mask, npsn); + } + + if (mask & WR_READ_MASK) { + npsn = (wqe->dma.length - wqe->dma.resid) / + qp->mtu; + wqe->iova += npsn * qp->mtu; + } + } + + wqe->state = wqe_state_posted; + } +} + +void rnr_nak_timer(struct timer_list *t) +{ + struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); + + pr_debug("qp#%d rnr nak timer fired\n", qp_num(qp)); + rxe_run_task(&qp->req.task, 1); +} + +static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp) +{ + struct rxe_send_wqe *wqe = queue_head(qp->sq.queue); + unsigned long flags; + + if (unlikely(qp->req.state == QP_STATE_DRAIN)) { + /* check to see if we are drained; + * state_lock used by requester and completer + */ + spin_lock_irqsave(&qp->state_lock, flags); + do { + if (qp->req.state != QP_STATE_DRAIN) { + /* comp just finished */ + spin_unlock_irqrestore(&qp->state_lock, + flags); + break; + } + + if (wqe && ((qp->req.wqe_index != + consumer_index(qp->sq.queue)) || + (wqe->state != wqe_state_posted))) { + /* comp not done yet */ + spin_unlock_irqrestore(&qp->state_lock, + flags); + break; + } + + qp->req.state = QP_STATE_DRAINED; + spin_unlock_irqrestore(&qp->state_lock, flags); + + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_SQ_DRAINED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + } while (0); + } + + if (qp->req.wqe_index == producer_index(qp->sq.queue)) + return NULL; + + wqe = addr_from_index(qp->sq.queue, qp->req.wqe_index); + + if (unlikely((qp->req.state == QP_STATE_DRAIN || + qp->req.state == QP_STATE_DRAINED) && + (wqe->state != wqe_state_processing))) + return NULL; + + if (unlikely((wqe->wr.send_flags & IB_SEND_FENCE) && + (qp->req.wqe_index != consumer_index(qp->sq.queue)))) { + qp->req.wait_fence = 1; + return NULL; + } + + wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp); + return wqe; +} + +static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: + if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_RC_RDMA_WRITE_LAST : + IB_OPCODE_RC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_RC_RDMA_WRITE_ONLY : + IB_OPCODE_RC_RDMA_WRITE_FIRST; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE : + IB_OPCODE_RC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE : + IB_OPCODE_RC_RDMA_WRITE_FIRST; + + case IB_WR_SEND: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? + IB_OPCODE_RC_SEND_LAST : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_RC_SEND_ONLY : + IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_SEND_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? + IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE : + IB_OPCODE_RC_SEND_FIRST; + + case IB_WR_RDMA_READ: + return IB_OPCODE_RC_RDMA_READ_REQUEST; + + case IB_WR_ATOMIC_CMP_AND_SWP: + return IB_OPCODE_RC_COMPARE_SWAP; + + case IB_WR_ATOMIC_FETCH_AND_ADD: + return IB_OPCODE_RC_FETCH_ADD; + + case IB_WR_SEND_WITH_INV: + if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE) + return fits ? IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE : + IB_OPCODE_RC_SEND_MIDDLE; + else + return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE : + IB_OPCODE_RC_SEND_FIRST; + case IB_WR_REG_MR: + case IB_WR_LOCAL_INV: + return opcode; + } + + return -EINVAL; +} + +static int next_opcode_uc(struct rxe_qp *qp, u32 opcode, int fits) +{ + switch (opcode) { + case IB_WR_RDMA_WRITE: + if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_UC_RDMA_WRITE_LAST : + IB_OPCODE_UC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_UC_RDMA_WRITE_ONLY : + IB_OPCODE_UC_RDMA_WRITE_FIRST; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST || + qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE) + return fits ? + IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE : + IB_OPCODE_UC_RDMA_WRITE_MIDDLE; + else + return fits ? + IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE : + IB_OPCODE_UC_RDMA_WRITE_FIRST; + + case IB_WR_SEND: + if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE) + return fits ? + IB_OPCODE_UC_SEND_LAST : + IB_OPCODE_UC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_UC_SEND_ONLY : + IB_OPCODE_UC_SEND_FIRST; + + case IB_WR_SEND_WITH_IMM: + if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST || + qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE) + return fits ? + IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE : + IB_OPCODE_UC_SEND_MIDDLE; + else + return fits ? + IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE : + IB_OPCODE_UC_SEND_FIRST; + } + + return -EINVAL; +} + +static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + u32 opcode) +{ + int fits = (wqe->dma.resid <= qp->mtu); + + switch (qp_type(qp)) { + case IB_QPT_RC: + return next_opcode_rc(qp, opcode, fits); + + case IB_QPT_UC: + return next_opcode_uc(qp, opcode, fits); + + case IB_QPT_SMI: + case IB_QPT_UD: + case IB_QPT_GSI: + switch (opcode) { + case IB_WR_SEND: + return IB_OPCODE_UD_SEND_ONLY; + + case IB_WR_SEND_WITH_IMM: + return IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + } + break; + + default: + break; + } + + return -EINVAL; +} + +static inline int check_init_depth(struct rxe_qp *qp, struct rxe_send_wqe *wqe) +{ + int depth; + + if (wqe->has_rd_atomic) + return 0; + + qp->req.need_rd_atomic = 1; + depth = atomic_dec_return(&qp->req.rd_atomic); + + if (depth >= 0) { + qp->req.need_rd_atomic = 0; + wqe->has_rd_atomic = 1; + return 0; + } + + atomic_inc(&qp->req.rd_atomic); + return -EAGAIN; +} + +static inline int get_mtu(struct rxe_qp *qp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + if ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC)) + return qp->mtu; + + return rxe->port.mtu_cap; +} + +static struct sk_buff *init_req_packet(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + int opcode, int payload, + struct rxe_pkt_info *pkt) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct sk_buff *skb; + struct rxe_send_wr *ibwr = &wqe->wr; + struct rxe_av *av; + int pad = (-payload) & 0x3; + int paylen; + int solicited; + u16 pkey; + u32 qp_num; + int ack_req; + + /* length from start of bth to end of icrc */ + paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; + + /* pkt->hdr, rxe, port_num and mask are initialized in ifc + * layer + */ + pkt->opcode = opcode; + pkt->qp = qp; + pkt->psn = qp->req.psn; + pkt->mask = rxe_opcode[opcode].mask; + pkt->paylen = paylen; + pkt->offset = 0; + pkt->wqe = wqe; + + /* init skb */ + av = rxe_get_av(pkt); + skb = rxe_init_packet(rxe, av, paylen, pkt); + if (unlikely(!skb)) + return NULL; + + /* init bth */ + solicited = (ibwr->send_flags & IB_SEND_SOLICITED) && + (pkt->mask & RXE_END_MASK) && + ((pkt->mask & (RXE_SEND_MASK)) || + (pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) == + (RXE_WRITE_MASK | RXE_IMMDT_MASK)); + + pkey = IB_DEFAULT_PKEY_FULL; + + qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn : + qp->attr.dest_qp_num; + + ack_req = ((pkt->mask & RXE_END_MASK) || + (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); + if (ack_req) + qp->req.noack_pkts = 0; + + bth_init(pkt, pkt->opcode, solicited, 0, pad, pkey, qp_num, + ack_req, pkt->psn); + + /* init optional headers */ + if (pkt->mask & RXE_RETH_MASK) { + reth_set_rkey(pkt, ibwr->wr.rdma.rkey); + reth_set_va(pkt, wqe->iova); + reth_set_len(pkt, wqe->dma.resid); + } + + if (pkt->mask & RXE_IMMDT_MASK) + immdt_set_imm(pkt, ibwr->ex.imm_data); + + if (pkt->mask & RXE_IETH_MASK) + ieth_set_rkey(pkt, ibwr->ex.invalidate_rkey); + + if (pkt->mask & RXE_ATMETH_MASK) { + atmeth_set_va(pkt, wqe->iova); + if (opcode == IB_OPCODE_RC_COMPARE_SWAP || + opcode == IB_OPCODE_RD_COMPARE_SWAP) { + atmeth_set_swap_add(pkt, ibwr->wr.atomic.swap); + atmeth_set_comp(pkt, ibwr->wr.atomic.compare_add); + } else { + atmeth_set_swap_add(pkt, ibwr->wr.atomic.compare_add); + } + atmeth_set_rkey(pkt, ibwr->wr.atomic.rkey); + } + + if (pkt->mask & RXE_DETH_MASK) { + if (qp->ibqp.qp_num == 1) + deth_set_qkey(pkt, GSI_QKEY); + else + deth_set_qkey(pkt, ibwr->wr.ud.remote_qkey); + deth_set_sqp(pkt, qp->ibqp.qp_num); + } + + return skb; +} + +static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt, struct sk_buff *skb, + int paylen) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + u32 crc = 0; + u32 *p; + int err; + + err = rxe_prepare(pkt, skb, &crc); + if (err) + return err; + + if (pkt->mask & RXE_WRITE_OR_SEND) { + if (wqe->wr.send_flags & IB_SEND_INLINE) { + u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset]; + + crc = rxe_crc32(rxe, crc, tmp, paylen); + memcpy(payload_addr(pkt), tmp, paylen); + + wqe->dma.resid -= paylen; + wqe->dma.sge_offset += paylen; + } else { + err = copy_data(qp->pd, 0, &wqe->dma, + payload_addr(pkt), paylen, + from_mem_obj, + &crc); + if (err) + return err; + } + if (bth_pad(pkt)) { + u8 *pad = payload_addr(pkt) + paylen; + + memset(pad, 0, bth_pad(pkt)); + crc = rxe_crc32(rxe, crc, pad, bth_pad(pkt)); + } + } + p = payload_addr(pkt) + paylen + bth_pad(pkt); + + *p = ~crc; + + return 0; +} + +static void update_wqe_state(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt) +{ + if (pkt->mask & RXE_END_MASK) { + if (qp_type(qp) == IB_QPT_RC) + wqe->state = wqe_state_pending; + } else { + wqe->state = wqe_state_processing; + } +} + +static void update_wqe_psn(struct rxe_qp *qp, + struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt, + int payload) +{ + /* number of packets left to send including current one */ + int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu; + + /* handle zero length packet case */ + if (num_pkt == 0) + num_pkt = 1; + + if (pkt->mask & RXE_START_MASK) { + wqe->first_psn = qp->req.psn; + wqe->last_psn = (qp->req.psn + num_pkt - 1) & BTH_PSN_MASK; + } + + if (pkt->mask & RXE_READ_MASK) + qp->req.psn = (wqe->first_psn + num_pkt) & BTH_PSN_MASK; + else + qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; +} + +static void save_state(struct rxe_send_wqe *wqe, + struct rxe_qp *qp, + struct rxe_send_wqe *rollback_wqe, + u32 *rollback_psn) +{ + rollback_wqe->state = wqe->state; + rollback_wqe->first_psn = wqe->first_psn; + rollback_wqe->last_psn = wqe->last_psn; + *rollback_psn = qp->req.psn; +} + +static void rollback_state(struct rxe_send_wqe *wqe, + struct rxe_qp *qp, + struct rxe_send_wqe *rollback_wqe, + u32 rollback_psn) +{ + wqe->state = rollback_wqe->state; + wqe->first_psn = rollback_wqe->first_psn; + wqe->last_psn = rollback_wqe->last_psn; + qp->req.psn = rollback_psn; +} + +static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe, + struct rxe_pkt_info *pkt, int payload) +{ + qp->req.opcode = pkt->opcode; + + if (pkt->mask & RXE_END_MASK) + qp->req.wqe_index = next_index(qp->sq.queue, qp->req.wqe_index); + + qp->need_req_skb = 0; + + if (qp->qp_timeout_jiffies && !timer_pending(&qp->retrans_timer)) + mod_timer(&qp->retrans_timer, + jiffies + qp->qp_timeout_jiffies); +} + +int rxe_requester(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + struct rxe_pkt_info pkt; + struct sk_buff *skb; + struct rxe_send_wqe *wqe; + enum rxe_hdr_mask mask; + int payload; + int mtu; + int opcode; + int ret; + struct rxe_send_wqe rollback_wqe; + u32 rollback_psn; + + rxe_add_ref(qp); + +next_wqe: + if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR)) + goto exit; + + if (unlikely(qp->req.state == QP_STATE_RESET)) { + qp->req.wqe_index = consumer_index(qp->sq.queue); + qp->req.opcode = -1; + qp->req.need_rd_atomic = 0; + qp->req.wait_psn = 0; + qp->req.need_retry = 0; + goto exit; + } + + if (unlikely(qp->req.need_retry)) { + req_retry(qp); + qp->req.need_retry = 0; + } + + wqe = req_next_wqe(qp); + if (unlikely(!wqe)) + goto exit; + + if (wqe->mask & WR_REG_MASK) { + if (wqe->wr.opcode == IB_WR_LOCAL_INV) { + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_mem *rmr; + + rmr = rxe_pool_get_index(&rxe->mr_pool, + wqe->wr.ex.invalidate_rkey >> 8); + if (!rmr) { + pr_err("No mr for key %#x\n", + wqe->wr.ex.invalidate_rkey); + wqe->state = wqe_state_error; + wqe->status = IB_WC_MW_BIND_ERR; + goto exit; + } + rmr->state = RXE_MEM_STATE_FREE; + rxe_drop_ref(rmr); + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + } else if (wqe->wr.opcode == IB_WR_REG_MR) { + struct rxe_mem *rmr = to_rmr(wqe->wr.wr.reg.mr); + + rmr->state = RXE_MEM_STATE_VALID; + rmr->access = wqe->wr.wr.reg.access; + rmr->ibmr.lkey = wqe->wr.wr.reg.key; + rmr->ibmr.rkey = wqe->wr.wr.reg.key; + rmr->iova = wqe->wr.wr.reg.mr->iova; + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + } else { + goto exit; + } + if ((wqe->wr.send_flags & IB_SEND_SIGNALED) || + qp->sq_sig_type == IB_SIGNAL_ALL_WR) + rxe_run_task(&qp->comp.task, 1); + qp->req.wqe_index = next_index(qp->sq.queue, + qp->req.wqe_index); + goto next_wqe; + } + + if (unlikely(qp_type(qp) == IB_QPT_RC && + psn_compare(qp->req.psn, (qp->comp.psn + + RXE_MAX_UNACKED_PSNS)) > 0)) { + qp->req.wait_psn = 1; + goto exit; + } + + /* Limit the number of inflight SKBs per QP */ + if (unlikely(atomic_read(&qp->skb_out) > + RXE_INFLIGHT_SKBS_PER_QP_HIGH)) { + qp->need_req_skb = 1; + goto exit; + } + + opcode = next_opcode(qp, wqe, wqe->wr.opcode); + if (unlikely(opcode < 0)) { + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err; + } + + mask = rxe_opcode[opcode].mask; + if (unlikely(mask & RXE_READ_OR_ATOMIC)) { + if (check_init_depth(qp, wqe)) + goto exit; + } + + mtu = get_mtu(qp); + payload = (mask & RXE_WRITE_OR_SEND) ? wqe->dma.resid : 0; + if (payload > mtu) { + if (qp_type(qp) == IB_QPT_UD) { + /* C10-93.1.1: If the total sum of all the buffer lengths specified for a + * UD message exceeds the MTU of the port as returned by QueryHCA, the CI + * shall not emit any packets for this message. Further, the CI shall not + * generate an error due to this condition. + */ + + /* fake a successful UD send */ + wqe->first_psn = qp->req.psn; + wqe->last_psn = qp->req.psn; + qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; + qp->req.opcode = IB_OPCODE_UD_SEND_ONLY; + qp->req.wqe_index = next_index(qp->sq.queue, + qp->req.wqe_index); + wqe->state = wqe_state_done; + wqe->status = IB_WC_SUCCESS; + __rxe_do_task(&qp->comp.task); + rxe_drop_ref(qp); + return 0; + } + payload = mtu; + } + + skb = init_req_packet(qp, wqe, opcode, payload, &pkt); + if (unlikely(!skb)) { + pr_err("qp#%d Failed allocating skb\n", qp_num(qp)); + goto err; + } + + if (fill_packet(qp, wqe, &pkt, skb, payload)) { + pr_debug("qp#%d Error during fill packet\n", qp_num(qp)); + kfree_skb(skb); + goto err; + } + + /* + * To prevent a race on wqe access between requester and completer, + * wqe members state and psn need to be set before calling + * rxe_xmit_packet(). + * Otherwise, completer might initiate an unjustified retry flow. + */ + save_state(wqe, qp, &rollback_wqe, &rollback_psn); + update_wqe_state(qp, wqe, &pkt); + update_wqe_psn(qp, wqe, &pkt, payload); + ret = rxe_xmit_packet(qp, &pkt, skb); + if (ret) { + qp->need_req_skb = 1; + + rollback_state(wqe, qp, &rollback_wqe, rollback_psn); + + if (ret == -EAGAIN) { + rxe_run_task(&qp->req.task, 1); + goto exit; + } + + goto err; + } + + update_state(qp, wqe, &pkt, payload); + + goto next_wqe; + +err: + wqe->status = IB_WC_LOC_PROT_ERR; + wqe->state = wqe_state_error; + __rxe_do_task(&qp->comp.task); + +exit: + rxe_drop_ref(qp); + return -EAGAIN; +} diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c new file mode 100644 index 000000000..83c032120 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -0,0 +1,1381 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/skbuff.h> + +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +enum resp_states { + RESPST_NONE, + RESPST_GET_REQ, + RESPST_CHK_PSN, + RESPST_CHK_OP_SEQ, + RESPST_CHK_OP_VALID, + RESPST_CHK_RESOURCE, + RESPST_CHK_LENGTH, + RESPST_CHK_RKEY, + RESPST_EXECUTE, + RESPST_READ_REPLY, + RESPST_COMPLETE, + RESPST_ACKNOWLEDGE, + RESPST_CLEANUP, + RESPST_DUPLICATE_REQUEST, + RESPST_ERR_MALFORMED_WQE, + RESPST_ERR_UNSUPPORTED_OPCODE, + RESPST_ERR_MISALIGNED_ATOMIC, + RESPST_ERR_PSN_OUT_OF_SEQ, + RESPST_ERR_MISSING_OPCODE_FIRST, + RESPST_ERR_MISSING_OPCODE_LAST_C, + RESPST_ERR_MISSING_OPCODE_LAST_D1E, + RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, + RESPST_ERR_RNR, + RESPST_ERR_RKEY_VIOLATION, + RESPST_ERR_LENGTH, + RESPST_ERR_CQ_OVERFLOW, + RESPST_ERROR, + RESPST_RESET, + RESPST_DONE, + RESPST_EXIT, +}; + +static char *resp_state_name[] = { + [RESPST_NONE] = "NONE", + [RESPST_GET_REQ] = "GET_REQ", + [RESPST_CHK_PSN] = "CHK_PSN", + [RESPST_CHK_OP_SEQ] = "CHK_OP_SEQ", + [RESPST_CHK_OP_VALID] = "CHK_OP_VALID", + [RESPST_CHK_RESOURCE] = "CHK_RESOURCE", + [RESPST_CHK_LENGTH] = "CHK_LENGTH", + [RESPST_CHK_RKEY] = "CHK_RKEY", + [RESPST_EXECUTE] = "EXECUTE", + [RESPST_READ_REPLY] = "READ_REPLY", + [RESPST_COMPLETE] = "COMPLETE", + [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", + [RESPST_CLEANUP] = "CLEANUP", + [RESPST_DUPLICATE_REQUEST] = "DUPLICATE_REQUEST", + [RESPST_ERR_MALFORMED_WQE] = "ERR_MALFORMED_WQE", + [RESPST_ERR_UNSUPPORTED_OPCODE] = "ERR_UNSUPPORTED_OPCODE", + [RESPST_ERR_MISALIGNED_ATOMIC] = "ERR_MISALIGNED_ATOMIC", + [RESPST_ERR_PSN_OUT_OF_SEQ] = "ERR_PSN_OUT_OF_SEQ", + [RESPST_ERR_MISSING_OPCODE_FIRST] = "ERR_MISSING_OPCODE_FIRST", + [RESPST_ERR_MISSING_OPCODE_LAST_C] = "ERR_MISSING_OPCODE_LAST_C", + [RESPST_ERR_MISSING_OPCODE_LAST_D1E] = "ERR_MISSING_OPCODE_LAST_D1E", + [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ] = "ERR_TOO_MANY_RDMA_ATM_REQ", + [RESPST_ERR_RNR] = "ERR_RNR", + [RESPST_ERR_RKEY_VIOLATION] = "ERR_RKEY_VIOLATION", + [RESPST_ERR_LENGTH] = "ERR_LENGTH", + [RESPST_ERR_CQ_OVERFLOW] = "ERR_CQ_OVERFLOW", + [RESPST_ERROR] = "ERROR", + [RESPST_RESET] = "RESET", + [RESPST_DONE] = "DONE", + [RESPST_EXIT] = "EXIT", +}; + +/* rxe_recv calls here to add a request packet to the input queue */ +void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) +{ + int must_sched; + struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); + + skb_queue_tail(&qp->req_pkts, skb); + + must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) || + (skb_queue_len(&qp->req_pkts) > 1); + + rxe_run_task(&qp->resp.task, must_sched); +} + +static inline enum resp_states get_req(struct rxe_qp *qp, + struct rxe_pkt_info **pkt_p) +{ + struct sk_buff *skb; + + if (qp->resp.state == QP_STATE_ERROR) { + while ((skb = skb_dequeue(&qp->req_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + + /* go drain recv wr queue */ + return RESPST_CHK_RESOURCE; + } + + skb = skb_peek(&qp->req_pkts); + if (!skb) + return RESPST_EXIT; + + *pkt_p = SKB_TO_PKT(skb); + + return (qp->resp.res) ? RESPST_READ_REPLY : RESPST_CHK_PSN; +} + +static enum resp_states check_psn(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + int diff = psn_compare(pkt->psn, qp->resp.psn); + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + switch (qp_type(qp)) { + case IB_QPT_RC: + if (diff > 0) { + if (qp->resp.sent_psn_nak) + return RESPST_CLEANUP; + + qp->resp.sent_psn_nak = 1; + rxe_counter_inc(rxe, RXE_CNT_OUT_OF_SEQ_REQ); + return RESPST_ERR_PSN_OUT_OF_SEQ; + + } else if (diff < 0) { + rxe_counter_inc(rxe, RXE_CNT_DUP_REQ); + return RESPST_DUPLICATE_REQUEST; + } + + if (qp->resp.sent_psn_nak) + qp->resp.sent_psn_nak = 0; + + break; + + case IB_QPT_UC: + if (qp->resp.drop_msg || diff != 0) { + if (pkt->mask & RXE_START_MASK) { + qp->resp.drop_msg = 0; + return RESPST_CHK_OP_SEQ; + } + + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + } + break; + default: + break; + } + + return RESPST_CHK_OP_SEQ; +} + +static enum resp_states check_op_seq(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + switch (qp->resp.opcode) { + case IB_OPCODE_RC_SEND_FIRST: + case IB_OPCODE_RC_SEND_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_RC_SEND_MIDDLE: + case IB_OPCODE_RC_SEND_LAST: + case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_C; + } + + case IB_OPCODE_RC_RDMA_WRITE_FIRST: + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_RC_RDMA_WRITE_LAST: + case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_C; + } + + default: + switch (pkt->opcode) { + case IB_OPCODE_RC_SEND_MIDDLE: + case IB_OPCODE_RC_SEND_LAST: + case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: + case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_RC_RDMA_WRITE_LAST: + case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_ERR_MISSING_OPCODE_FIRST; + default: + return RESPST_CHK_OP_VALID; + } + } + break; + + case IB_QPT_UC: + switch (qp->resp.opcode) { + case IB_OPCODE_UC_SEND_FIRST: + case IB_OPCODE_UC_SEND_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_UC_SEND_MIDDLE: + case IB_OPCODE_UC_SEND_LAST: + case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_D1E; + } + + case IB_OPCODE_UC_RDMA_WRITE_FIRST: + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + switch (pkt->opcode) { + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_UC_RDMA_WRITE_LAST: + case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + return RESPST_CHK_OP_VALID; + default: + return RESPST_ERR_MISSING_OPCODE_LAST_D1E; + } + + default: + switch (pkt->opcode) { + case IB_OPCODE_UC_SEND_MIDDLE: + case IB_OPCODE_UC_SEND_LAST: + case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: + case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: + case IB_OPCODE_UC_RDMA_WRITE_LAST: + case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + default: + return RESPST_CHK_OP_VALID; + } + } + break; + + default: + return RESPST_CHK_OP_VALID; + } +} + +static enum resp_states check_op_valid(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + if (((pkt->mask & RXE_READ_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || + ((pkt->mask & RXE_WRITE_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || + ((pkt->mask & RXE_ATOMIC_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) { + return RESPST_ERR_UNSUPPORTED_OPCODE; + } + + break; + + case IB_QPT_UC: + if ((pkt->mask & RXE_WRITE_MASK) && + !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) { + qp->resp.drop_msg = 1; + return RESPST_CLEANUP; + } + + break; + + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + break; + + default: + WARN_ON_ONCE(1); + break; + } + + return RESPST_CHK_RESOURCE; +} + +static enum resp_states get_srq_wqe(struct rxe_qp *qp) +{ + struct rxe_srq *srq = qp->srq; + struct rxe_queue *q = srq->rq.queue; + struct rxe_recv_wqe *wqe; + struct ib_event ev; + + if (srq->error) + return RESPST_ERR_RNR; + + spin_lock_bh(&srq->rq.consumer_lock); + + wqe = queue_head(q); + if (!wqe) { + spin_unlock_bh(&srq->rq.consumer_lock); + return RESPST_ERR_RNR; + } + + /* note kernel and user space recv wqes have same size */ + memcpy(&qp->resp.srq_wqe, wqe, sizeof(qp->resp.srq_wqe)); + + qp->resp.wqe = &qp->resp.srq_wqe.wqe; + advance_consumer(q); + + if (srq->limit && srq->ibsrq.event_handler && + (queue_count(q) < srq->limit)) { + srq->limit = 0; + goto event; + } + + spin_unlock_bh(&srq->rq.consumer_lock); + return RESPST_CHK_LENGTH; + +event: + spin_unlock_bh(&srq->rq.consumer_lock); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context); + return RESPST_CHK_LENGTH; +} + +static enum resp_states check_resource(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_srq *srq = qp->srq; + + if (qp->resp.state == QP_STATE_ERROR) { + if (qp->resp.wqe) { + qp->resp.status = IB_WC_WR_FLUSH_ERR; + return RESPST_COMPLETE; + } else if (!srq) { + qp->resp.wqe = queue_head(qp->rq.queue); + if (qp->resp.wqe) { + qp->resp.status = IB_WC_WR_FLUSH_ERR; + return RESPST_COMPLETE; + } else { + return RESPST_EXIT; + } + } else { + return RESPST_EXIT; + } + } + + if (pkt->mask & RXE_READ_OR_ATOMIC) { + /* it is the requesters job to not send + * too many read/atomic ops, we just + * recycle the responder resource queue + */ + if (likely(qp->attr.max_dest_rd_atomic > 0)) + return RESPST_CHK_LENGTH; + else + return RESPST_ERR_TOO_MANY_RDMA_ATM_REQ; + } + + if (pkt->mask & RXE_RWR_MASK) { + if (srq) + return get_srq_wqe(qp); + + qp->resp.wqe = queue_head(qp->rq.queue); + return (qp->resp.wqe) ? RESPST_CHK_LENGTH : RESPST_ERR_RNR; + } + + return RESPST_CHK_LENGTH; +} + +static enum resp_states check_length(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + switch (qp_type(qp)) { + case IB_QPT_RC: + return RESPST_CHK_RKEY; + + case IB_QPT_UC: + return RESPST_CHK_RKEY; + + default: + return RESPST_CHK_RKEY; + } +} + +static enum resp_states check_rkey(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_mem *mem = NULL; + u64 va; + u32 rkey; + u32 resid; + u32 pktlen; + int mtu = qp->mtu; + enum resp_states state; + int access; + + if (pkt->mask & (RXE_READ_MASK | RXE_WRITE_MASK)) { + if (pkt->mask & RXE_RETH_MASK) { + qp->resp.va = reth_va(pkt); + qp->resp.rkey = reth_rkey(pkt); + qp->resp.resid = reth_len(pkt); + qp->resp.length = reth_len(pkt); + } + access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ + : IB_ACCESS_REMOTE_WRITE; + } else if (pkt->mask & RXE_ATOMIC_MASK) { + qp->resp.va = atmeth_va(pkt); + qp->resp.rkey = atmeth_rkey(pkt); + qp->resp.resid = sizeof(u64); + access = IB_ACCESS_REMOTE_ATOMIC; + } else { + return RESPST_EXECUTE; + } + + /* A zero-byte op is not required to set an addr or rkey. */ + if ((pkt->mask & (RXE_READ_MASK | RXE_WRITE_OR_SEND)) && + (pkt->mask & RXE_RETH_MASK) && + reth_len(pkt) == 0) { + return RESPST_EXECUTE; + } + + va = qp->resp.va; + rkey = qp->resp.rkey; + resid = qp->resp.resid; + pktlen = payload_size(pkt); + + mem = lookup_mem(qp->pd, access, rkey, lookup_remote); + if (!mem) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err; + } + + if (unlikely(mem->state == RXE_MEM_STATE_FREE)) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err; + } + + if (mem_check_range(mem, va, resid)) { + state = RESPST_ERR_RKEY_VIOLATION; + goto err; + } + + if (pkt->mask & RXE_WRITE_MASK) { + if (resid > mtu) { + if (pktlen != mtu || bth_pad(pkt)) { + state = RESPST_ERR_LENGTH; + goto err; + } + } else { + if (pktlen != resid) { + state = RESPST_ERR_LENGTH; + goto err; + } + if ((bth_pad(pkt) != (0x3 & (-resid)))) { + /* This case may not be exactly that + * but nothing else fits. + */ + state = RESPST_ERR_LENGTH; + goto err; + } + } + } + + WARN_ON_ONCE(qp->resp.mr); + + qp->resp.mr = mem; + return RESPST_EXECUTE; + +err: + if (mem) + rxe_drop_ref(mem); + return state; +} + +static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr, + int data_len) +{ + int err; + + err = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma, + data_addr, data_len, to_mem_obj, NULL); + if (unlikely(err)) + return (err == -ENOSPC) ? RESPST_ERR_LENGTH + : RESPST_ERR_MALFORMED_WQE; + + return RESPST_NONE; +} + +static enum resp_states write_data_in(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + enum resp_states rc = RESPST_NONE; + int err; + int data_len = payload_size(pkt); + + err = rxe_mem_copy(qp->resp.mr, qp->resp.va, payload_addr(pkt), + data_len, to_mem_obj, NULL); + if (err) { + rc = RESPST_ERR_RKEY_VIOLATION; + goto out; + } + + qp->resp.va += data_len; + qp->resp.resid -= data_len; + +out: + return rc; +} + +/* Guarantee atomicity of atomic operations at the machine level. */ +static DEFINE_SPINLOCK(atomic_ops_lock); + +static enum resp_states process_atomic(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + u64 iova = atmeth_va(pkt); + u64 *vaddr; + enum resp_states ret; + struct rxe_mem *mr = qp->resp.mr; + + if (mr->state != RXE_MEM_STATE_VALID) { + ret = RESPST_ERR_RKEY_VIOLATION; + goto out; + } + + vaddr = iova_to_vaddr(mr, iova, sizeof(u64)); + + /* check vaddr is 8 bytes aligned. */ + if (!vaddr || (uintptr_t)vaddr & 7) { + ret = RESPST_ERR_MISALIGNED_ATOMIC; + goto out; + } + + spin_lock_bh(&atomic_ops_lock); + + qp->resp.atomic_orig = *vaddr; + + if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP || + pkt->opcode == IB_OPCODE_RD_COMPARE_SWAP) { + if (*vaddr == atmeth_comp(pkt)) + *vaddr = atmeth_swap_add(pkt); + } else { + *vaddr += atmeth_swap_add(pkt); + } + + spin_unlock_bh(&atomic_ops_lock); + + ret = RESPST_NONE; +out: + return ret; +} + +static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, + struct rxe_pkt_info *pkt, + struct rxe_pkt_info *ack, + int opcode, + int payload, + u32 psn, + u8 syndrome, + u32 *crcp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct sk_buff *skb; + u32 crc = 0; + u32 *p; + int paylen; + int pad; + int err; + + /* + * allocate packet + */ + pad = (-payload) & 0x3; + paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; + + skb = rxe_init_packet(rxe, &qp->pri_av, paylen, ack); + if (!skb) + return NULL; + + ack->qp = qp; + ack->opcode = opcode; + ack->mask = rxe_opcode[opcode].mask; + ack->offset = pkt->offset; + ack->paylen = paylen; + + /* fill in bth using the request packet headers */ + memcpy(ack->hdr, pkt->hdr, pkt->offset + RXE_BTH_BYTES); + + bth_set_opcode(ack, opcode); + bth_set_qpn(ack, qp->attr.dest_qp_num); + bth_set_pad(ack, pad); + bth_set_se(ack, 0); + bth_set_psn(ack, psn); + bth_set_ack(ack, 0); + ack->psn = psn; + + if (ack->mask & RXE_AETH_MASK) { + aeth_set_syn(ack, syndrome); + aeth_set_msn(ack, qp->resp.msn); + } + + if (ack->mask & RXE_ATMACK_MASK) + atmack_set_orig(ack, qp->resp.atomic_orig); + + err = rxe_prepare(ack, skb, &crc); + if (err) { + kfree_skb(skb); + return NULL; + } + + if (crcp) { + /* CRC computation will be continued by the caller */ + *crcp = crc; + } else { + p = payload_addr(ack) + payload + bth_pad(ack); + *p = ~crc; + } + + return skb; +} + +/* RDMA read response. If res is not NULL, then we have a current RDMA request + * being processed or replayed. + */ +static enum resp_states read_reply(struct rxe_qp *qp, + struct rxe_pkt_info *req_pkt) +{ + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + int mtu = qp->mtu; + enum resp_states state; + int payload; + int opcode; + int err; + struct resp_res *res = qp->resp.res; + u32 icrc; + u32 *p; + + if (!res) { + /* This is the first time we process that request. Get a + * resource + */ + res = &qp->resp.resources[qp->resp.res_head]; + + free_rd_atomic_resource(qp, res); + rxe_advance_resp_resource(qp); + + res->type = RXE_READ_MASK; + res->replay = 0; + + res->read.va = qp->resp.va; + res->read.va_org = qp->resp.va; + + res->first_psn = req_pkt->psn; + + if (reth_len(req_pkt)) { + res->last_psn = (req_pkt->psn + + (reth_len(req_pkt) + mtu - 1) / + mtu - 1) & BTH_PSN_MASK; + } else { + res->last_psn = res->first_psn; + } + res->cur_psn = req_pkt->psn; + + res->read.resid = qp->resp.resid; + res->read.length = qp->resp.resid; + res->read.rkey = qp->resp.rkey; + + /* note res inherits the reference to mr from qp */ + res->read.mr = qp->resp.mr; + qp->resp.mr = NULL; + + qp->resp.res = res; + res->state = rdatm_res_state_new; + } + + if (res->state == rdatm_res_state_new) { + if (res->read.resid <= mtu) + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY; + else + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; + } else { + if (res->read.resid > mtu) + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE; + else + opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST; + } + + res->state = rdatm_res_state_next; + + payload = min_t(int, res->read.resid, mtu); + + skb = prepare_ack_packet(qp, req_pkt, &ack_pkt, opcode, payload, + res->cur_psn, AETH_ACK_UNLIMITED, &icrc); + if (!skb) + return RESPST_ERR_RNR; + + err = rxe_mem_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt), + payload, from_mem_obj, &icrc); + if (err) + pr_err("Failed copying memory\n"); + + if (bth_pad(&ack_pkt)) { + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + u8 *pad = payload_addr(&ack_pkt) + payload; + + memset(pad, 0, bth_pad(&ack_pkt)); + icrc = rxe_crc32(rxe, icrc, pad, bth_pad(&ack_pkt)); + } + p = payload_addr(&ack_pkt) + payload + bth_pad(&ack_pkt); + *p = ~icrc; + + err = rxe_xmit_packet(qp, &ack_pkt, skb); + if (err) { + pr_err("Failed sending RDMA reply.\n"); + return RESPST_ERR_RNR; + } + + res->read.va += payload; + res->read.resid -= payload; + res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK; + + if (res->read.resid > 0) { + state = RESPST_DONE; + } else { + qp->resp.res = NULL; + if (!res->replay) + qp->resp.opcode = -1; + if (psn_compare(res->cur_psn, qp->resp.psn) >= 0) + qp->resp.psn = res->cur_psn; + state = RESPST_CLEANUP; + } + + return state; +} + +static void build_rdma_network_hdr(union rdma_network_hdr *hdr, + struct rxe_pkt_info *pkt) +{ + struct sk_buff *skb = PKT_TO_SKB(pkt); + + memset(hdr, 0, sizeof(*hdr)); + if (skb->protocol == htons(ETH_P_IP)) + memcpy(&hdr->roce4grh, ip_hdr(skb), sizeof(hdr->roce4grh)); + else if (skb->protocol == htons(ETH_P_IPV6)) + memcpy(&hdr->ibgrh, ipv6_hdr(skb), sizeof(hdr->ibgrh)); +} + +/* Executes a new request. A retried request never reach that function (send + * and writes are discarded, and reads and atomics are retried elsewhere. + */ +static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) +{ + enum resp_states err; + + if (pkt->mask & RXE_SEND_MASK) { + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) { + union rdma_network_hdr hdr; + + build_rdma_network_hdr(&hdr, pkt); + + err = send_data_in(qp, &hdr, sizeof(hdr)); + if (err) + return err; + } + err = send_data_in(qp, payload_addr(pkt), payload_size(pkt)); + if (err) + return err; + } else if (pkt->mask & RXE_WRITE_MASK) { + err = write_data_in(qp, pkt); + if (err) + return err; + } else if (pkt->mask & RXE_READ_MASK) { + /* For RDMA Read we can increment the msn now. See C9-148. */ + qp->resp.msn++; + return RESPST_READ_REPLY; + } else if (pkt->mask & RXE_ATOMIC_MASK) { + err = process_atomic(qp, pkt); + if (err) + return err; + } else { + /* Unreachable */ + WARN_ON_ONCE(1); + } + + /* next expected psn, read handles this separately */ + qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; + + qp->resp.opcode = pkt->opcode; + qp->resp.status = IB_WC_SUCCESS; + + if (pkt->mask & RXE_COMP_MASK) { + /* We successfully processed this new request. */ + qp->resp.msn++; + return RESPST_COMPLETE; + } else if (qp_type(qp) == IB_QPT_RC) + return RESPST_ACKNOWLEDGE; + else + return RESPST_CLEANUP; +} + +static enum resp_states do_complete(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct rxe_cqe cqe; + struct ib_wc *wc = &cqe.ibwc; + struct ib_uverbs_wc *uwc = &cqe.uibwc; + struct rxe_recv_wqe *wqe = qp->resp.wqe; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + + if (unlikely(!wqe)) + return RESPST_CLEANUP; + + memset(&cqe, 0, sizeof(cqe)); + + if (qp->rcq->is_user) { + uwc->status = qp->resp.status; + uwc->qp_num = qp->ibqp.qp_num; + uwc->wr_id = wqe->wr_id; + } else { + wc->status = qp->resp.status; + wc->qp = &qp->ibqp; + wc->wr_id = wqe->wr_id; + } + + if (wc->status == IB_WC_SUCCESS) { + rxe_counter_inc(rxe, RXE_CNT_RDMA_RECV); + wc->opcode = (pkt->mask & RXE_IMMDT_MASK && + pkt->mask & RXE_WRITE_MASK) ? + IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; + wc->vendor_err = 0; + wc->byte_len = (pkt->mask & RXE_IMMDT_MASK && + pkt->mask & RXE_WRITE_MASK) ? + qp->resp.length : wqe->dma.length - wqe->dma.resid; + + /* fields after byte_len are different between kernel and user + * space + */ + if (qp->rcq->is_user) { + uwc->wc_flags = IB_WC_GRH; + + if (pkt->mask & RXE_IMMDT_MASK) { + uwc->wc_flags |= IB_WC_WITH_IMM; + uwc->ex.imm_data = immdt_imm(pkt); + } + + if (pkt->mask & RXE_IETH_MASK) { + uwc->wc_flags |= IB_WC_WITH_INVALIDATE; + uwc->ex.invalidate_rkey = ieth_rkey(pkt); + } + + uwc->qp_num = qp->ibqp.qp_num; + + if (pkt->mask & RXE_DETH_MASK) + uwc->src_qp = deth_sqp(pkt); + + uwc->port_num = qp->attr.port_num; + } else { + struct sk_buff *skb = PKT_TO_SKB(pkt); + + wc->wc_flags = IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE; + if (skb->protocol == htons(ETH_P_IP)) + wc->network_hdr_type = RDMA_NETWORK_IPV4; + else + wc->network_hdr_type = RDMA_NETWORK_IPV6; + + if (is_vlan_dev(skb->dev)) { + wc->wc_flags |= IB_WC_WITH_VLAN; + wc->vlan_id = vlan_dev_vlan_id(skb->dev); + } + + if (pkt->mask & RXE_IMMDT_MASK) { + wc->wc_flags |= IB_WC_WITH_IMM; + wc->ex.imm_data = immdt_imm(pkt); + } + + if (pkt->mask & RXE_IETH_MASK) { + struct rxe_mem *rmr; + + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + wc->ex.invalidate_rkey = ieth_rkey(pkt); + + rmr = rxe_pool_get_index(&rxe->mr_pool, + wc->ex.invalidate_rkey >> 8); + if (unlikely(!rmr)) { + pr_err("Bad rkey %#x invalidation\n", + wc->ex.invalidate_rkey); + return RESPST_ERROR; + } + rmr->state = RXE_MEM_STATE_FREE; + rxe_drop_ref(rmr); + } + + wc->qp = &qp->ibqp; + + if (pkt->mask & RXE_DETH_MASK) + wc->src_qp = deth_sqp(pkt); + + wc->port_num = qp->attr.port_num; + } + } + + /* have copy for srq and reference for !srq */ + if (!qp->srq) + advance_consumer(qp->rq.queue); + + qp->resp.wqe = NULL; + + if (rxe_cq_post(qp->rcq, &cqe, pkt ? bth_se(pkt) : 1)) + return RESPST_ERR_CQ_OVERFLOW; + + if (qp->resp.state == QP_STATE_ERROR) + return RESPST_CHK_RESOURCE; + + if (!pkt) + return RESPST_DONE; + else if (qp_type(qp) == IB_QPT_RC) + return RESPST_ACKNOWLEDGE; + else + return RESPST_CLEANUP; +} + +static int send_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + u8 syndrome, u32 psn) +{ + int err = 0; + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + + skb = prepare_ack_packet(qp, pkt, &ack_pkt, IB_OPCODE_RC_ACKNOWLEDGE, + 0, psn, syndrome, NULL); + if (!skb) { + err = -ENOMEM; + goto err1; + } + + err = rxe_xmit_packet(qp, &ack_pkt, skb); + if (err) + pr_err_ratelimited("Failed sending ack\n"); + +err1: + return err; +} + +static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, + u8 syndrome) +{ + int rc = 0; + struct rxe_pkt_info ack_pkt; + struct sk_buff *skb; + struct resp_res *res; + + skb = prepare_ack_packet(qp, pkt, &ack_pkt, + IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, 0, pkt->psn, + syndrome, NULL); + if (!skb) { + rc = -ENOMEM; + goto out; + } + + res = &qp->resp.resources[qp->resp.res_head]; + free_rd_atomic_resource(qp, res); + rxe_advance_resp_resource(qp); + + memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(ack_pkt)); + memset((unsigned char *)SKB_TO_PKT(skb) + sizeof(ack_pkt), 0, + sizeof(skb->cb) - sizeof(ack_pkt)); + + skb_get(skb); + res->type = RXE_ATOMIC_MASK; + res->atomic.skb = skb; + res->first_psn = ack_pkt.psn; + res->last_psn = ack_pkt.psn; + res->cur_psn = ack_pkt.psn; + + rc = rxe_xmit_packet(qp, &ack_pkt, skb); + if (rc) { + pr_err_ratelimited("Failed sending ack\n"); + rxe_drop_ref(qp); + } +out: + return rc; +} + +static enum resp_states acknowledge(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + if (qp_type(qp) != IB_QPT_RC) + return RESPST_CLEANUP; + + if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED) + send_ack(qp, pkt, qp->resp.aeth_syndrome, pkt->psn); + else if (pkt->mask & RXE_ATOMIC_MASK) + send_atomic_ack(qp, pkt, AETH_ACK_UNLIMITED); + else if (bth_ack(pkt)) + send_ack(qp, pkt, AETH_ACK_UNLIMITED, pkt->psn); + + return RESPST_CLEANUP; +} + +static enum resp_states cleanup(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct sk_buff *skb; + + if (pkt) { + skb = skb_dequeue(&qp->req_pkts); + rxe_drop_ref(qp); + kfree_skb(skb); + } + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + return RESPST_DONE; +} + +static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn) +{ + int i; + + for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { + struct resp_res *res = &qp->resp.resources[i]; + + if (res->type == 0) + continue; + + if (psn_compare(psn, res->first_psn) >= 0 && + psn_compare(psn, res->last_psn) <= 0) { + return res; + } + } + + return NULL; +} + +static enum resp_states duplicate_request(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + enum resp_states rc; + u32 prev_psn = (qp->resp.ack_psn - 1) & BTH_PSN_MASK; + + if (pkt->mask & RXE_SEND_MASK || + pkt->mask & RXE_WRITE_MASK) { + /* SEND. Ack again and cleanup. C9-105. */ + if (bth_ack(pkt)) + send_ack(qp, pkt, AETH_ACK_UNLIMITED, prev_psn); + rc = RESPST_CLEANUP; + goto out; + } else if (pkt->mask & RXE_READ_MASK) { + struct resp_res *res; + + res = find_resource(qp, pkt->psn); + if (!res) { + /* Resource not found. Class D error. Drop the + * request. + */ + rc = RESPST_CLEANUP; + goto out; + } else { + /* Ensure this new request is the same as the previous + * one or a subset of it. + */ + u64 iova = reth_va(pkt); + u32 resid = reth_len(pkt); + + if (iova < res->read.va_org || + resid > res->read.length || + (iova + resid) > (res->read.va_org + + res->read.length)) { + rc = RESPST_CLEANUP; + goto out; + } + + if (reth_rkey(pkt) != res->read.rkey) { + rc = RESPST_CLEANUP; + goto out; + } + + res->cur_psn = pkt->psn; + res->state = (pkt->psn == res->first_psn) ? + rdatm_res_state_new : + rdatm_res_state_replay; + res->replay = 1; + + /* Reset the resource, except length. */ + res->read.va_org = iova; + res->read.va = iova; + res->read.resid = resid; + + /* Replay the RDMA read reply. */ + qp->resp.res = res; + rc = RESPST_READ_REPLY; + goto out; + } + } else { + struct resp_res *res; + + /* Find the operation in our list of responder resources. */ + res = find_resource(qp, pkt->psn); + if (res) { + skb_get(res->atomic.skb); + /* Resend the result. */ + rc = rxe_xmit_packet(qp, pkt, res->atomic.skb); + if (rc) { + pr_err("Failed resending result. This flow is not handled - skb ignored\n"); + rc = RESPST_CLEANUP; + goto out; + } + } + + /* Resource not found. Class D error. Drop the request. */ + rc = RESPST_CLEANUP; + goto out; + } +out: + return rc; +} + +/* Process a class A or C. Both are treated the same in this implementation. */ +static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome, + enum ib_wc_status status) +{ + qp->resp.aeth_syndrome = syndrome; + qp->resp.status = status; + + /* indicate that we should go through the ERROR state */ + qp->resp.goto_error = 1; +} + +static enum resp_states do_class_d1e_error(struct rxe_qp *qp) +{ + /* UC */ + if (qp->srq) { + /* Class E */ + qp->resp.drop_msg = 1; + if (qp->resp.wqe) { + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + return RESPST_COMPLETE; + } else { + return RESPST_CLEANUP; + } + } else { + /* Class D1. This packet may be the start of a + * new message and could be valid. The previous + * message is invalid and ignored. reset the + * recv wr to its original state + */ + if (qp->resp.wqe) { + qp->resp.wqe->dma.resid = qp->resp.wqe->dma.length; + qp->resp.wqe->dma.cur_sge = 0; + qp->resp.wqe->dma.sge_offset = 0; + qp->resp.opcode = -1; + } + + if (qp->resp.mr) { + rxe_drop_ref(qp->resp.mr); + qp->resp.mr = NULL; + } + + return RESPST_CLEANUP; + } +} + +static void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&qp->req_pkts))) { + rxe_drop_ref(qp); + kfree_skb(skb); + } + + if (notify) + return; + + while (!qp->srq && qp->rq.queue && queue_head(qp->rq.queue)) + advance_consumer(qp->rq.queue); +} + +int rxe_responder(void *arg) +{ + struct rxe_qp *qp = (struct rxe_qp *)arg; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + enum resp_states state; + struct rxe_pkt_info *pkt = NULL; + int ret = 0; + + rxe_add_ref(qp); + + qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; + + if (!qp->valid) { + ret = -EINVAL; + goto done; + } + + switch (qp->resp.state) { + case QP_STATE_RESET: + state = RESPST_RESET; + break; + + default: + state = RESPST_GET_REQ; + break; + } + + while (1) { + pr_debug("qp#%d state = %s\n", qp_num(qp), + resp_state_name[state]); + switch (state) { + case RESPST_GET_REQ: + state = get_req(qp, &pkt); + break; + case RESPST_CHK_PSN: + state = check_psn(qp, pkt); + break; + case RESPST_CHK_OP_SEQ: + state = check_op_seq(qp, pkt); + break; + case RESPST_CHK_OP_VALID: + state = check_op_valid(qp, pkt); + break; + case RESPST_CHK_RESOURCE: + state = check_resource(qp, pkt); + break; + case RESPST_CHK_LENGTH: + state = check_length(qp, pkt); + break; + case RESPST_CHK_RKEY: + state = check_rkey(qp, pkt); + break; + case RESPST_EXECUTE: + state = execute(qp, pkt); + break; + case RESPST_COMPLETE: + state = do_complete(qp, pkt); + break; + case RESPST_READ_REPLY: + state = read_reply(qp, pkt); + break; + case RESPST_ACKNOWLEDGE: + state = acknowledge(qp, pkt); + break; + case RESPST_CLEANUP: + state = cleanup(qp, pkt); + break; + case RESPST_DUPLICATE_REQUEST: + state = duplicate_request(qp, pkt); + break; + case RESPST_ERR_PSN_OUT_OF_SEQ: + /* RC only - Class B. Drop packet. */ + send_ack(qp, pkt, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); + state = RESPST_CLEANUP; + break; + + case RESPST_ERR_TOO_MANY_RDMA_ATM_REQ: + case RESPST_ERR_MISSING_OPCODE_FIRST: + case RESPST_ERR_MISSING_OPCODE_LAST_C: + case RESPST_ERR_UNSUPPORTED_OPCODE: + case RESPST_ERR_MISALIGNED_ATOMIC: + /* RC Only - Class C. */ + do_class_ac_error(qp, AETH_NAK_INVALID_REQ, + IB_WC_REM_INV_REQ_ERR); + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_MISSING_OPCODE_LAST_D1E: + state = do_class_d1e_error(qp); + break; + case RESPST_ERR_RNR: + if (qp_type(qp) == IB_QPT_RC) { + rxe_counter_inc(rxe, RXE_CNT_SND_RNR); + /* RC - class B */ + send_ack(qp, pkt, AETH_RNR_NAK | + (~AETH_TYPE_MASK & + qp->attr.min_rnr_timer), + pkt->psn); + } else { + /* UD/UC - class D */ + qp->resp.drop_msg = 1; + } + state = RESPST_CLEANUP; + break; + + case RESPST_ERR_RKEY_VIOLATION: + if (qp_type(qp) == IB_QPT_RC) { + /* Class C */ + do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR, + IB_WC_REM_ACCESS_ERR); + state = RESPST_COMPLETE; + } else { + qp->resp.drop_msg = 1; + if (qp->srq) { + /* UC/SRQ Class D */ + qp->resp.status = IB_WC_REM_ACCESS_ERR; + state = RESPST_COMPLETE; + } else { + /* UC/non-SRQ Class E. */ + state = RESPST_CLEANUP; + } + } + break; + + case RESPST_ERR_LENGTH: + if (qp_type(qp) == IB_QPT_RC) { + /* Class C */ + do_class_ac_error(qp, AETH_NAK_INVALID_REQ, + IB_WC_REM_INV_REQ_ERR); + state = RESPST_COMPLETE; + } else if (qp->srq) { + /* UC/UD - class E */ + qp->resp.status = IB_WC_REM_INV_REQ_ERR; + state = RESPST_COMPLETE; + } else { + /* UC/UD - class D */ + qp->resp.drop_msg = 1; + state = RESPST_CLEANUP; + } + break; + + case RESPST_ERR_MALFORMED_WQE: + /* All, Class A. */ + do_class_ac_error(qp, AETH_NAK_REM_OP_ERR, + IB_WC_LOC_QP_OP_ERR); + state = RESPST_COMPLETE; + break; + + case RESPST_ERR_CQ_OVERFLOW: + /* All - Class G */ + state = RESPST_ERROR; + break; + + case RESPST_DONE: + if (qp->resp.goto_error) { + state = RESPST_ERROR; + break; + } + + goto done; + + case RESPST_EXIT: + if (qp->resp.goto_error) { + state = RESPST_ERROR; + break; + } + + goto exit; + + case RESPST_RESET: + rxe_drain_req_pkts(qp, false); + qp->resp.wqe = NULL; + goto exit; + + case RESPST_ERROR: + qp->resp.goto_error = 0; + pr_warn("qp#%d moved to error state\n", qp_num(qp)); + rxe_qp_error(qp); + goto exit; + + default: + WARN_ON_ONCE(1); + } + } + +exit: + ret = -EAGAIN; +done: + rxe_drop_ref(qp); + return ret; +} diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c new file mode 100644 index 000000000..41b0d1e11 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/vmalloc.h> +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" + +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask) +{ + if (srq && srq->error) { + pr_warn("srq in error state\n"); + goto err1; + } + + if (mask & IB_SRQ_MAX_WR) { + if (attr->max_wr > rxe->attr.max_srq_wr) { + pr_warn("max_wr(%d) > max_srq_wr(%d)\n", + attr->max_wr, rxe->attr.max_srq_wr); + goto err1; + } + + if (attr->max_wr <= 0) { + pr_warn("max_wr(%d) <= 0\n", attr->max_wr); + goto err1; + } + + if (srq && srq->limit && (attr->max_wr < srq->limit)) { + pr_warn("max_wr (%d) < srq->limit (%d)\n", + attr->max_wr, srq->limit); + goto err1; + } + + if (attr->max_wr < RXE_MIN_SRQ_WR) + attr->max_wr = RXE_MIN_SRQ_WR; + } + + if (mask & IB_SRQ_LIMIT) { + if (attr->srq_limit > rxe->attr.max_srq_wr) { + pr_warn("srq_limit(%d) > max_srq_wr(%d)\n", + attr->srq_limit, rxe->attr.max_srq_wr); + goto err1; + } + + if (srq && (attr->srq_limit > srq->rq.queue->buf->index_mask)) { + pr_warn("srq_limit (%d) > cur limit(%d)\n", + attr->srq_limit, + srq->rq.queue->buf->index_mask); + goto err1; + } + } + + if (mask == IB_SRQ_INIT_MASK) { + if (attr->max_sge > rxe->attr.max_srq_sge) { + pr_warn("max_sge(%d) > max_srq_sge(%d)\n", + attr->max_sge, rxe->attr.max_srq_sge); + goto err1; + } + + if (attr->max_sge < RXE_MIN_SRQ_SGE) + attr->max_sge = RXE_MIN_SRQ_SGE; + } + + return 0; + +err1: + return -EINVAL; +} + +int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_init_attr *init, struct ib_udata *udata, + struct rxe_create_srq_resp __user *uresp) +{ + int err; + int srq_wqe_size; + struct rxe_queue *q; + + srq->ibsrq.event_handler = init->event_handler; + srq->ibsrq.srq_context = init->srq_context; + srq->limit = init->attr.srq_limit; + srq->srq_num = srq->pelem.index; + srq->rq.max_wr = init->attr.max_wr; + srq->rq.max_sge = init->attr.max_sge; + + srq_wqe_size = rcv_wqe_size(srq->rq.max_sge); + + spin_lock_init(&srq->rq.producer_lock); + spin_lock_init(&srq->rq.consumer_lock); + + q = rxe_queue_init(rxe, &srq->rq.max_wr, + srq_wqe_size); + if (!q) { + pr_warn("unable to allocate queue for srq\n"); + return -ENOMEM; + } + + srq->rq.queue = q; + + err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, q->buf, + q->buf_size, &q->ip); + if (err) { + vfree(q->buf); + kfree(q); + return err; + } + + if (uresp) { + if (copy_to_user(&uresp->srq_num, &srq->srq_num, + sizeof(uresp->srq_num))) { + rxe_queue_cleanup(q); + return -EFAULT; + } + } + + return 0; +} + +int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, + struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata) +{ + int err; + struct rxe_queue *q = srq->rq.queue; + struct mminfo __user *mi = NULL; + + if (mask & IB_SRQ_MAX_WR) { + /* + * This is completely screwed up, the response is supposed to + * be in the outbuf not like this. + */ + mi = u64_to_user_ptr(ucmd->mmap_info_addr); + + err = rxe_queue_resize(q, &attr->max_wr, + rcv_wqe_size(srq->rq.max_sge), udata, mi, + &srq->rq.producer_lock, + &srq->rq.consumer_lock); + if (err) + goto err2; + } + + if (mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + + return 0; + +err2: + rxe_queue_cleanup(q); + srq->rq.queue = NULL; + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c new file mode 100644 index 000000000..666202ddf --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include "rxe.h" +#include "rxe_net.h" + +/* Copy argument and remove trailing CR. Return the new length. */ +static int sanitize_arg(const char *val, char *intf, int intf_len) +{ + int len; + + if (!val) + return 0; + + /* Remove newline. */ + for (len = 0; len < intf_len - 1 && val[len] && val[len] != '\n'; len++) + intf[len] = val[len]; + intf[len] = 0; + + if (len == 0 || (val[len] != 0 && val[len] != '\n')) + return 0; + + return len; +} + +static int rxe_param_set_add(const char *val, const struct kernel_param *kp) +{ + int len; + int err = 0; + char intf[32]; + struct net_device *ndev; + struct rxe_dev *exists; + + if (!rxe_initialized) { + pr_err("Module parameters are not supported, use rdma link add or rxe_cfg\n"); + return -EAGAIN; + } + + len = sanitize_arg(val, intf, sizeof(intf)); + if (!len) { + pr_err("add: invalid interface name\n"); + return -EINVAL; + } + + ndev = dev_get_by_name(&init_net, intf); + if (!ndev) { + pr_err("interface %s not found\n", intf); + return -EINVAL; + } + + if (is_vlan_dev(ndev)) { + pr_err("rxe creation allowed on top of a real device only\n"); + err = -EPERM; + goto err; + } + + exists = rxe_get_dev_from_net(ndev); + if (exists) { + ib_device_put(&exists->ib_dev); + pr_err("already configured on %s\n", intf); + err = -EINVAL; + goto err; + } + + err = rxe_net_add("rxe%d", ndev); + if (err) { + pr_err("failed to add %s\n", intf); + goto err; + } + +err: + dev_put(ndev); + return err; +} + +static int rxe_param_set_remove(const char *val, const struct kernel_param *kp) +{ + int len; + char intf[32]; + struct ib_device *ib_dev; + + len = sanitize_arg(val, intf, sizeof(intf)); + if (!len) { + pr_err("add: invalid interface name\n"); + return -EINVAL; + } + + if (strncmp("all", intf, len) == 0) { + pr_info("rxe_sys: remove all"); + ib_unregister_driver(RDMA_DRIVER_RXE); + return 0; + } + + ib_dev = ib_device_get_by_name(intf, RDMA_DRIVER_RXE); + if (!ib_dev) { + pr_err("not configured on %s\n", intf); + return -EINVAL; + } + + ib_unregister_device_and_put(ib_dev); + + return 0; +} + +static const struct kernel_param_ops rxe_add_ops = { + .set = rxe_param_set_add, +}; + +static const struct kernel_param_ops rxe_remove_ops = { + .set = rxe_param_set_remove, +}; + +module_param_cb(add, &rxe_add_ops, NULL, 0200); +MODULE_PARM_DESC(add, "DEPRECATED. Create RXE device over network interface"); +module_param_cb(remove, &rxe_remove_ops, NULL, 0200); +MODULE_PARM_DESC(remove, "DEPRECATED. Remove RXE device over network interface"); diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c new file mode 100644 index 000000000..5aa69947a --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/hardirq.h> + +#include "rxe.h" + +int __rxe_do_task(struct rxe_task *task) + +{ + int ret; + + while ((ret = task->func(task->arg)) == 0) + ; + + task->ret = ret; + + return ret; +} + +/* + * this locking is due to a potential race where + * a second caller finds the task already running + * but looks just after the last call to func + */ +void rxe_do_task(struct tasklet_struct *t) +{ + int cont; + int ret; + unsigned long flags; + struct rxe_task *task = from_tasklet(task, t, tasklet); + unsigned int iterations = RXE_MAX_ITERATIONS; + + spin_lock_irqsave(&task->state_lock, flags); + switch (task->state) { + case TASK_STATE_START: + task->state = TASK_STATE_BUSY; + spin_unlock_irqrestore(&task->state_lock, flags); + break; + + case TASK_STATE_BUSY: + task->state = TASK_STATE_ARMED; + fallthrough; + case TASK_STATE_ARMED: + spin_unlock_irqrestore(&task->state_lock, flags); + return; + + default: + spin_unlock_irqrestore(&task->state_lock, flags); + pr_warn("%s failed with bad state %d\n", __func__, task->state); + return; + } + + do { + cont = 0; + ret = task->func(task->arg); + + spin_lock_irqsave(&task->state_lock, flags); + switch (task->state) { + case TASK_STATE_BUSY: + if (ret) { + task->state = TASK_STATE_START; + } else if (iterations--) { + cont = 1; + } else { + /* reschedule the tasklet and exit + * the loop to give up the cpu + */ + tasklet_schedule(&task->tasklet); + task->state = TASK_STATE_START; + } + break; + + /* someone tried to run the task since the last time we called + * func, so we will call one more time regardless of the + * return value + */ + case TASK_STATE_ARMED: + task->state = TASK_STATE_BUSY; + cont = 1; + break; + + default: + pr_warn("%s failed with bad state %d\n", __func__, + task->state); + } + spin_unlock_irqrestore(&task->state_lock, flags); + } while (cont); + + task->ret = ret; +} + +int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)) +{ + task->arg = arg; + task->func = func; + task->destroyed = false; + + tasklet_setup(&task->tasklet, rxe_do_task); + + task->state = TASK_STATE_START; + spin_lock_init(&task->state_lock); + + return 0; +} + +void rxe_cleanup_task(struct rxe_task *task) +{ + unsigned long flags; + bool idle; + + /* + * Mark the task, then wait for it to finish. It might be + * running in a non-tasklet (direct call) context. + */ + task->destroyed = true; + + do { + spin_lock_irqsave(&task->state_lock, flags); + idle = (task->state == TASK_STATE_START); + spin_unlock_irqrestore(&task->state_lock, flags); + } while (!idle); + + tasklet_kill(&task->tasklet); +} + +void rxe_run_task(struct rxe_task *task, int sched) +{ + if (task->destroyed) + return; + + if (sched) + tasklet_schedule(&task->tasklet); + else + rxe_do_task(&task->tasklet); +} + +void rxe_disable_task(struct rxe_task *task) +{ + tasklet_disable(&task->tasklet); +} + +void rxe_enable_task(struct rxe_task *task) +{ + tasklet_enable(&task->tasklet); +} diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h new file mode 100644 index 000000000..b3dfd970d --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_TASK_H +#define RXE_TASK_H + +enum { + TASK_STATE_START = 0, + TASK_STATE_BUSY = 1, + TASK_STATE_ARMED = 2, +}; + +/* + * data structure to describe a 'task' which is a short + * function that returns 0 as long as it needs to be + * called again. + */ +struct rxe_task { + struct tasklet_struct tasklet; + int state; + spinlock_t state_lock; /* spinlock for task state */ + void *arg; + int (*func)(void *arg); + int ret; + bool destroyed; +}; + +/* + * init rxe_task structure + * arg => parameter to pass to fcn + * func => function to call until it returns != 0 + */ +int rxe_init_task(struct rxe_task *task, void *arg, int (*func)(void *)); + +/* cleanup task */ +void rxe_cleanup_task(struct rxe_task *task); + +/* + * raw call to func in loop without any checking + * can call when tasklets are disabled + */ +int __rxe_do_task(struct rxe_task *task); + +/* + * common function called by any of the main tasklets + * If there is any chance that there is additional + * work to do someone must reschedule the task before + * leaving + */ +void rxe_do_task(struct tasklet_struct *t); + +/* run a task, else schedule it to run as a tasklet, The decision + * to run or schedule tasklet is based on the parameter sched. + */ +void rxe_run_task(struct rxe_task *task, int sched); + +/* keep a task from scheduling */ +void rxe_disable_task(struct rxe_task *task); + +/* allow task to run */ +void rxe_enable_task(struct rxe_task *task); + +#endif /* RXE_TASK_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c new file mode 100644 index 000000000..512868c23 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -0,0 +1,1187 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#include <linux/dma-mapping.h> +#include <net/addrconf.h> +#include <rdma/uverbs_ioctl.h> +#include "rxe.h" +#include "rxe_loc.h" +#include "rxe_queue.h" +#include "rxe_hw_counters.h" + +static int rxe_query_device(struct ib_device *dev, + struct ib_device_attr *attr, + struct ib_udata *uhw) +{ + struct rxe_dev *rxe = to_rdev(dev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + *attr = rxe->attr; + return 0; +} + +static int rxe_query_port(struct ib_device *dev, + u8 port_num, struct ib_port_attr *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_port *port; + int rc; + + port = &rxe->port; + + /* *attr being zeroed by the caller, avoid zeroing it here */ + *attr = port->attr; + + mutex_lock(&rxe->usdev_lock); + rc = ib_get_eth_speed(dev, port_num, &attr->active_speed, + &attr->active_width); + + if (attr->state == IB_PORT_ACTIVE) + attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + else if (dev_get_flags(rxe->ndev) & IFF_UP) + attr->phys_state = IB_PORT_PHYS_STATE_POLLING; + else + attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + + mutex_unlock(&rxe->usdev_lock); + + return rc; +} + +static int rxe_query_pkey(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey) +{ + if (index > 0) + return -EINVAL; + + *pkey = IB_DEFAULT_PKEY_FULL; + return 0; +} + +static int rxe_modify_device(struct ib_device *dev, + int mask, struct ib_device_modify *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + + if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) + return -EOPNOTSUPP; + + if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) + rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid); + + if (mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(rxe->ib_dev.node_desc, + attr->node_desc, sizeof(rxe->ib_dev.node_desc)); + } + + return 0; +} + +static int rxe_modify_port(struct ib_device *dev, + u8 port_num, int mask, struct ib_port_modify *attr) +{ + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_port *port; + + port = &rxe->port; + + port->attr.port_cap_flags |= attr->set_port_cap_mask; + port->attr.port_cap_flags &= ~attr->clr_port_cap_mask; + + if (mask & IB_PORT_RESET_QKEY_CNTR) + port->attr.qkey_viol_cntr = 0; + + return 0; +} + +static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev, + u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int rxe_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(uctx->device); + struct rxe_ucontext *uc = to_ruc(uctx); + + return rxe_add_to_pool(&rxe->uc_pool, &uc->pelem); +} + +static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) +{ + struct rxe_ucontext *uc = to_ruc(ibuc); + + rxe_drop_ref(uc); +} + +static int rxe_port_immutable(struct ib_device *dev, u8 port_num, + struct ib_port_immutable *immutable) +{ + int err; + struct ib_port_attr attr; + + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + err = ib_query_port(dev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + + return rxe_add_to_pool(&rxe->pd_pool, &pd->pelem); +} + +static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct rxe_pd *pd = to_rpd(ibpd); + + rxe_drop_ref(pd); + return 0; +} + +static int rxe_create_ah(struct ib_ah *ibah, + struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) + +{ + int err; + struct rxe_dev *rxe = to_rdev(ibah->device); + struct rxe_ah *ah = to_rah(ibah); + + err = rxe_av_chk_attr(rxe, init_attr->ah_attr); + if (err) + return err; + + err = rxe_add_to_pool(&rxe->ah_pool, &ah->pelem); + if (err) + return err; + + rxe_init_av(init_attr->ah_attr, &ah->av); + return 0; +} + +static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibah->device); + struct rxe_ah *ah = to_rah(ibah); + + err = rxe_av_chk_attr(rxe, attr); + if (err) + return err; + + rxe_init_av(attr, &ah->av); + return 0; +} + +static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) +{ + struct rxe_ah *ah = to_rah(ibah); + + memset(attr, 0, sizeof(*attr)); + attr->type = ibah->type; + rxe_av_to_attr(&ah->av, attr); + return 0; +} + +static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct rxe_ah *ah = to_rah(ibah); + + rxe_drop_ref(ah); + return 0; +} + +static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) +{ + int err; + int i; + u32 length; + struct rxe_recv_wqe *recv_wqe; + int num_sge = ibwr->num_sge; + + if (unlikely(queue_full(rq->queue))) { + err = -ENOMEM; + goto err1; + } + + if (unlikely(num_sge > rq->max_sge)) { + err = -EINVAL; + goto err1; + } + + length = 0; + for (i = 0; i < num_sge; i++) + length += ibwr->sg_list[i].length; + + recv_wqe = producer_addr(rq->queue); + recv_wqe->wr_id = ibwr->wr_id; + recv_wqe->num_sge = num_sge; + + memcpy(recv_wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + recv_wqe->dma.length = length; + recv_wqe->dma.resid = length; + recv_wqe->dma.num_sge = num_sge; + recv_wqe->dma.cur_sge = 0; + recv_wqe->dma.sge_offset = 0; + + /* make sure all changes to the work queue are written before we + * update the producer pointer + */ + smp_wmb(); + + advance_producer(rq->queue); + return 0; + +err1: + return err; +} + +static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibsrq->device); + struct rxe_pd *pd = to_rpd(ibsrq->pd); + struct rxe_srq *srq = to_rsrq(ibsrq); + struct rxe_create_srq_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return -EINVAL; + uresp = udata->outbuf; + } + + err = rxe_srq_chk_attr(rxe, NULL, &init->attr, IB_SRQ_INIT_MASK); + if (err) + goto err1; + + err = rxe_add_to_pool(&rxe->srq_pool, &srq->pelem); + if (err) + goto err1; + + rxe_add_ref(pd); + srq->pd = pd; + + err = rxe_srq_from_init(rxe, srq, init, udata, uresp); + if (err) + goto err2; + + return 0; + +err2: + rxe_drop_ref(pd); + rxe_drop_ref(srq); +err1: + return err; +} + +static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask mask, + struct ib_udata *udata) +{ + int err; + struct rxe_srq *srq = to_rsrq(ibsrq); + struct rxe_dev *rxe = to_rdev(ibsrq->device); + struct rxe_modify_srq_cmd ucmd = {}; + + if (udata) { + if (udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) + return err; + } + + err = rxe_srq_chk_attr(rxe, srq, attr, mask); + if (err) + goto err1; + + err = rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd, udata); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + + if (srq->error) + return -EINVAL; + + attr->max_wr = srq->rq.queue->buf->index_mask; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) +{ + struct rxe_srq *srq = to_rsrq(ibsrq); + + if (srq->rq.queue) + rxe_queue_cleanup(srq->rq.queue); + + rxe_drop_ref(srq->pd); + rxe_drop_ref(srq); + return 0; +} + +static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + int err = 0; + unsigned long flags; + struct rxe_srq *srq = to_rsrq(ibsrq); + + spin_lock_irqsave(&srq->rq.producer_lock, flags); + + while (wr) { + err = post_one_recv(&srq->rq, wr); + if (unlikely(err)) + break; + wr = wr->next; + } + + spin_unlock_irqrestore(&srq->rq.producer_lock, flags); + + if (err) + *bad_wr = wr; + + return err; +} + +static struct ib_qp *rxe_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init, + struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_qp *qp; + struct rxe_create_qp_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return ERR_PTR(-EINVAL); + uresp = udata->outbuf; + } + + err = rxe_qp_chk_init(rxe, init); + if (err) + goto err1; + + qp = rxe_alloc(&rxe->qp_pool); + if (!qp) { + err = -ENOMEM; + goto err1; + } + + if (udata) { + if (udata->inlen) { + err = -EINVAL; + goto err2; + } + qp->is_user = 1; + } + + rxe_add_index(qp); + + err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibpd, udata); + if (err) + goto err3; + + return &qp->ibqp; + +err3: + rxe_drop_index(qp); +err2: + rxe_drop_ref(qp); +err1: + return ERR_PTR(err); +} + +static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + + err = rxe_qp_chk_attr(rxe, qp, attr, mask); + if (err) + goto err1; + + err = rxe_qp_from_attr(qp, attr, mask, udata); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_qp_init_attr *init) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + rxe_qp_to_init(qp, init); + rxe_qp_to_attr(qp, attr, mask); + + return 0; +} + +static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + rxe_qp_destroy(qp); + rxe_drop_index(qp); + rxe_drop_ref(qp); + return 0; +} + +static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, + unsigned int mask, unsigned int length) +{ + int num_sge = ibwr->num_sge; + struct rxe_sq *sq = &qp->sq; + + if (unlikely(num_sge > sq->max_sge)) + goto err1; + + if (unlikely(mask & WR_ATOMIC_MASK)) { + if (length < 8) + goto err1; + + if (atomic_wr(ibwr)->remote_addr & 0x7) + goto err1; + } + + if (unlikely((ibwr->send_flags & IB_SEND_INLINE) && + (length > sq->max_inline))) + goto err1; + + return 0; + +err1: + return -EINVAL; +} + +static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, + const struct ib_send_wr *ibwr) +{ + wr->wr_id = ibwr->wr_id; + wr->num_sge = ibwr->num_sge; + wr->opcode = ibwr->opcode; + wr->send_flags = ibwr->send_flags; + + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) { + wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn; + wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey; + if (qp_type(qp) == IB_QPT_GSI) + wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; + if (wr->opcode == IB_WR_SEND_WITH_IMM) + wr->ex.imm_data = ibwr->ex.imm_data; + } else { + switch (wr->opcode) { + case IB_WR_RDMA_WRITE_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + fallthrough; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; + wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; + break; + case IB_WR_SEND_WITH_IMM: + wr->ex.imm_data = ibwr->ex.imm_data; + break; + case IB_WR_SEND_WITH_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + wr->wr.atomic.remote_addr = + atomic_wr(ibwr)->remote_addr; + wr->wr.atomic.compare_add = + atomic_wr(ibwr)->compare_add; + wr->wr.atomic.swap = atomic_wr(ibwr)->swap; + wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey; + break; + case IB_WR_LOCAL_INV: + wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; + break; + case IB_WR_REG_MR: + wr->wr.reg.mr = reg_wr(ibwr)->mr; + wr->wr.reg.key = reg_wr(ibwr)->key; + wr->wr.reg.access = reg_wr(ibwr)->access; + break; + default: + break; + } + } +} + +static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, + unsigned int mask, unsigned int length, + struct rxe_send_wqe *wqe) +{ + int num_sge = ibwr->num_sge; + struct ib_sge *sge; + int i; + u8 *p; + + init_send_wr(qp, &wqe->wr, ibwr); + + if (qp_type(qp) == IB_QPT_UD || + qp_type(qp) == IB_QPT_SMI || + qp_type(qp) == IB_QPT_GSI) + memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av)); + + if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) { + p = wqe->dma.inline_data; + + sge = ibwr->sg_list; + for (i = 0; i < num_sge; i++, sge++) { + memcpy(p, (void *)(uintptr_t)sge->addr, + sge->length); + + p += sge->length; + } + } else if (mask & WR_REG_MASK) { + wqe->mask = mask; + wqe->state = wqe_state_posted; + return 0; + } else + memcpy(wqe->dma.sge, ibwr->sg_list, + num_sge * sizeof(struct ib_sge)); + + wqe->iova = mask & WR_ATOMIC_MASK ? atomic_wr(ibwr)->remote_addr : + mask & WR_READ_OR_WRITE_MASK ? rdma_wr(ibwr)->remote_addr : 0; + wqe->mask = mask; + wqe->dma.length = length; + wqe->dma.resid = length; + wqe->dma.num_sge = num_sge; + wqe->dma.cur_sge = 0; + wqe->dma.sge_offset = 0; + wqe->state = wqe_state_posted; + wqe->ssn = atomic_add_return(1, &qp->ssn); + + return 0; +} + +static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr, + unsigned int mask, u32 length) +{ + int err; + struct rxe_sq *sq = &qp->sq; + struct rxe_send_wqe *send_wqe; + unsigned long flags; + + err = validate_send_wr(qp, ibwr, mask, length); + if (err) + return err; + + spin_lock_irqsave(&qp->sq.sq_lock, flags); + + if (unlikely(queue_full(sq->queue))) { + err = -ENOMEM; + goto err1; + } + + send_wqe = producer_addr(sq->queue); + + err = init_send_wqe(qp, ibwr, mask, length, send_wqe); + if (unlikely(err)) + goto err1; + + /* + * make sure all changes to the work queue are + * written before we update the producer pointer + */ + smp_wmb(); + + advance_producer(sq->queue); + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + + return 0; + +err1: + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); + return err; +} + +static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + int err = 0; + unsigned int mask; + unsigned int length = 0; + int i; + struct ib_send_wr *next; + + while (wr) { + mask = wr_opcode_mask(wr->opcode, qp); + if (unlikely(!mask)) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely((wr->send_flags & IB_SEND_INLINE) && + !(mask & WR_INLINE_MASK))) { + err = -EINVAL; + *bad_wr = wr; + break; + } + + next = wr->next; + + length = 0; + for (i = 0; i < wr->num_sge; i++) + length += wr->sg_list[i].length; + + err = post_one_send(qp, wr, mask, length); + + if (err) { + *bad_wr = wr; + break; + } + wr = next; + } + + rxe_run_task(&qp->req.task, 1); + if (unlikely(qp->req.state == QP_STATE_ERROR)) + rxe_run_task(&qp->comp.task, 1); + + return err; +} + +static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + struct rxe_qp *qp = to_rqp(ibqp); + + if (unlikely(!qp->valid)) { + *bad_wr = wr; + return -EINVAL; + } + + if (unlikely(qp->req.state < QP_STATE_READY)) { + *bad_wr = wr; + return -EINVAL; + } + + if (qp->is_user) { + /* Utilize process context to do protocol processing */ + rxe_run_task(&qp->req.task, 0); + return 0; + } else + return rxe_post_send_kernel(qp, wr, bad_wr); +} + +static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + int err = 0; + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_rq *rq = &qp->rq; + unsigned long flags; + + if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid)) { + *bad_wr = wr; + err = -EINVAL; + goto err1; + } + + if (unlikely(qp->srq)) { + *bad_wr = wr; + err = -EINVAL; + goto err1; + } + + spin_lock_irqsave(&rq->producer_lock, flags); + + while (wr) { + err = post_one_recv(rq, wr); + if (unlikely(err)) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + + spin_unlock_irqrestore(&rq->producer_lock, flags); + + if (qp->resp.state == QP_STATE_ERROR) + rxe_run_task(&qp->resp.task, 1); + +err1: + return err; +} + +static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + int err; + struct ib_device *dev = ibcq->device; + struct rxe_dev *rxe = to_rdev(dev); + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_create_cq_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return -EINVAL; + uresp = udata->outbuf; + } + + if (attr->flags) + return -EINVAL; + + err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); + if (err) + return err; + + err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata, + uresp); + if (err) + return err; + + return rxe_add_to_pool(&rxe->cq_pool, &cq->pelem); +} + +static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct rxe_cq *cq = to_rcq(ibcq); + + rxe_cq_disable(cq); + + rxe_drop_ref(cq); + return 0; +} + +static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + int err; + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_dev *rxe = to_rdev(ibcq->device); + struct rxe_resize_cq_resp __user *uresp = NULL; + + if (udata) { + if (udata->outlen < sizeof(*uresp)) + return -EINVAL; + uresp = udata->outbuf; + } + + err = rxe_cq_chk_attr(rxe, cq, cqe, 0); + if (err) + goto err1; + + err = rxe_cq_resize_queue(cq, cqe, uresp, udata); + if (err) + goto err1; + + return 0; + +err1: + return err; +} + +static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + int i; + struct rxe_cq *cq = to_rcq(ibcq); + struct rxe_cqe *cqe; + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); + for (i = 0; i < num_entries; i++) { + cqe = queue_head(cq->queue); + if (!cqe) + break; + + memcpy(wc++, &cqe->ibwc, sizeof(*wc)); + advance_consumer(cq->queue); + } + spin_unlock_irqrestore(&cq->cq_lock, flags); + + return i; +} + +static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) +{ + struct rxe_cq *cq = to_rcq(ibcq); + int count = queue_count(cq->queue); + + return (count > wc_cnt) ? wc_cnt : count; +} + +static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct rxe_cq *cq = to_rcq(ibcq); + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->cq_lock, irq_flags); + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = flags & IB_CQ_SOLICITED_MASK; + + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !queue_empty(cq->queue)) + ret = 1; + + spin_unlock_irqrestore(&cq->cq_lock, irq_flags); + + return ret; +} + +static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) + return ERR_PTR(-ENOMEM); + + rxe_add_index(mr); + rxe_add_ref(pd); + rxe_mem_init_dma(pd, access, mr); + + return &mr->ibmr; +} + +static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, + u64 start, + u64 length, + u64 iova, + int access, struct ib_udata *udata) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err2; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = rxe_mem_init_user(pd, start, length, iova, + access, udata, mr); + if (err) + goto err3; + + return &mr->ibmr; + +err3: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err2: + return ERR_PTR(err); +} + +static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct rxe_mem *mr = to_rmr(ibmr); + + mr->state = RXE_MEM_STATE_ZOMBIE; + rxe_drop_ref(mr_pd(mr)); + rxe_drop_index(mr); + rxe_drop_ref(mr); + return 0; +} + +static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_mem *mr; + int err; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = rxe_alloc(&rxe->mr_pool); + if (!mr) { + err = -ENOMEM; + goto err1; + } + + rxe_add_index(mr); + + rxe_add_ref(pd); + + err = rxe_mem_init_fast(pd, max_num_sg, mr); + if (err) + goto err2; + + return &mr->ibmr; + +err2: + rxe_drop_ref(pd); + rxe_drop_index(mr); + rxe_drop_ref(mr); +err1: + return ERR_PTR(err); +} + +static int rxe_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct rxe_mem *mr = to_rmr(ibmr); + struct rxe_map *map; + struct rxe_phys_buf *buf; + + if (unlikely(mr->nbuf == mr->num_buf)) + return -ENOMEM; + + map = mr->map[mr->nbuf / RXE_BUF_PER_MAP]; + buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP]; + + buf->addr = addr; + buf->size = ibmr->page_size; + mr->nbuf++; + + return 0; +} + +static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct rxe_mem *mr = to_rmr(ibmr); + int n; + + mr->nbuf = 0; + + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page); + + mr->va = ibmr->iova; + mr->iova = ibmr->iova; + mr->length = ibmr->length; + mr->page_shift = ilog2(ibmr->page_size); + mr->page_mask = ibmr->page_size - 1; + mr->offset = mr->iova & mr->page_mask; + + return n; +} + +static int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_mc_grp *grp; + + /* takes a ref on grp if successful */ + err = rxe_mcast_get_grp(rxe, mgid, &grp); + if (err) + return err; + + err = rxe_mcast_add_grp_elem(rxe, qp, grp); + + rxe_drop_ref(grp); + return err; +} + +static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + + return rxe_mcast_drop_grp_elem(rxe, qp, mgid); +} + +static ssize_t parent_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct rxe_dev *rxe = + rdma_device_to_drv_device(device, struct rxe_dev, ib_dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", rxe_parent_name(rxe, 1)); +} + +static DEVICE_ATTR_RO(parent); + +static struct attribute *rxe_dev_attributes[] = { + &dev_attr_parent.attr, + NULL +}; + +static const struct attribute_group rxe_attr_group = { + .attrs = rxe_dev_attributes, +}; + +static int rxe_enable_driver(struct ib_device *ib_dev) +{ + struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); + + rxe_set_port_state(rxe); + dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev)); + return 0; +} + +static const struct ib_device_ops rxe_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_RXE, + .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, + + .alloc_hw_stats = rxe_ib_alloc_hw_stats, + .alloc_mr = rxe_alloc_mr, + .alloc_pd = rxe_alloc_pd, + .alloc_ucontext = rxe_alloc_ucontext, + .attach_mcast = rxe_attach_mcast, + .create_ah = rxe_create_ah, + .create_cq = rxe_create_cq, + .create_qp = rxe_create_qp, + .create_srq = rxe_create_srq, + .dealloc_driver = rxe_dealloc, + .dealloc_pd = rxe_dealloc_pd, + .dealloc_ucontext = rxe_dealloc_ucontext, + .dereg_mr = rxe_dereg_mr, + .destroy_ah = rxe_destroy_ah, + .destroy_cq = rxe_destroy_cq, + .destroy_qp = rxe_destroy_qp, + .destroy_srq = rxe_destroy_srq, + .detach_mcast = rxe_detach_mcast, + .enable_driver = rxe_enable_driver, + .get_dma_mr = rxe_get_dma_mr, + .get_hw_stats = rxe_ib_get_hw_stats, + .get_link_layer = rxe_get_link_layer, + .get_port_immutable = rxe_port_immutable, + .map_mr_sg = rxe_map_mr_sg, + .mmap = rxe_mmap, + .modify_ah = rxe_modify_ah, + .modify_device = rxe_modify_device, + .modify_port = rxe_modify_port, + .modify_qp = rxe_modify_qp, + .modify_srq = rxe_modify_srq, + .peek_cq = rxe_peek_cq, + .poll_cq = rxe_poll_cq, + .post_recv = rxe_post_recv, + .post_send = rxe_post_send, + .post_srq_recv = rxe_post_srq_recv, + .query_ah = rxe_query_ah, + .query_device = rxe_query_device, + .query_pkey = rxe_query_pkey, + .query_port = rxe_query_port, + .query_qp = rxe_query_qp, + .query_srq = rxe_query_srq, + .reg_user_mr = rxe_reg_user_mr, + .req_notify_cq = rxe_req_notify_cq, + .resize_cq = rxe_resize_cq, + + INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq), + INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc), +}; + +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) +{ + int err; + struct ib_device *dev = &rxe->ib_dev; + struct crypto_shash *tfm; + + strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); + + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = 1; + dev->num_comp_vectors = num_possible_cpus(); + dev->local_dma_lkey = 0; + addrconf_addr_eui48((unsigned char *)&dev->node_guid, + rxe->ndev->dev_addr); + + dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) + | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) + | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) + | BIT_ULL(IB_USER_VERBS_CMD_POST_SRQ_RECV) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) + | BIT_ULL(IB_USER_VERBS_CMD_POST_RECV) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_PEEK_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) + | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) + | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) + ; + + ib_set_device_ops(dev, &rxe_dev_ops); + err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1); + if (err) + return err; + + tfm = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(tfm)) { + pr_err("failed to allocate crc algorithm err:%ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + rxe->tfm = tfm; + + rdma_set_device_sysfs_group(dev, &rxe_attr_group); + err = ib_register_device(dev, ibdev_name, NULL); + if (err) + pr_warn("%s failed with error %d\n", __func__, err); + + /* + * Note that rxe may be invalid at this point if another thread + * unregistered it. + */ + return err; +} diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h new file mode 100644 index 000000000..4bf5d85a1 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -0,0 +1,455 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. + */ + +#ifndef RXE_VERBS_H +#define RXE_VERBS_H + +#include <linux/interrupt.h> +#include <linux/workqueue.h> +#include <rdma/rdma_user_rxe.h> +#include "rxe_pool.h" +#include "rxe_task.h" +#include "rxe_hw_counters.h" + +static inline int pkey_match(u16 key1, u16 key2) +{ + return (((key1 & 0x7fff) != 0) && + ((key1 & 0x7fff) == (key2 & 0x7fff)) && + ((key1 & 0x8000) || (key2 & 0x8000))) ? 1 : 0; +} + +/* Return >0 if psn_a > psn_b + * 0 if psn_a == psn_b + * <0 if psn_a < psn_b + */ +static inline int psn_compare(u32 psn_a, u32 psn_b) +{ + s32 diff; + + diff = (psn_a - psn_b) << 8; + return diff; +} + +struct rxe_ucontext { + struct ib_ucontext ibuc; + struct rxe_pool_entry pelem; +}; + +struct rxe_pd { + struct ib_pd ibpd; + struct rxe_pool_entry pelem; +}; + +struct rxe_ah { + struct ib_ah ibah; + struct rxe_pool_entry pelem; + struct rxe_pd *pd; + struct rxe_av av; +}; + +struct rxe_cqe { + union { + struct ib_wc ibwc; + struct ib_uverbs_wc uibwc; + }; +}; + +struct rxe_cq { + struct ib_cq ibcq; + struct rxe_pool_entry pelem; + struct rxe_queue *queue; + spinlock_t cq_lock; + u8 notify; + bool is_dying; + int is_user; + struct tasklet_struct comp_task; +}; + +enum wqe_state { + wqe_state_posted, + wqe_state_processing, + wqe_state_pending, + wqe_state_done, + wqe_state_error, +}; + +struct rxe_sq { + int max_wr; + int max_sge; + int max_inline; + spinlock_t sq_lock; /* guard queue */ + struct rxe_queue *queue; +}; + +struct rxe_rq { + int max_wr; + int max_sge; + spinlock_t producer_lock; /* guard queue producer */ + spinlock_t consumer_lock; /* guard queue consumer */ + struct rxe_queue *queue; +}; + +struct rxe_srq { + struct ib_srq ibsrq; + struct rxe_pool_entry pelem; + struct rxe_pd *pd; + struct rxe_rq rq; + u32 srq_num; + + int limit; + int error; +}; + +enum rxe_qp_state { + QP_STATE_RESET, + QP_STATE_INIT, + QP_STATE_READY, + QP_STATE_DRAIN, /* req only */ + QP_STATE_DRAINED, /* req only */ + QP_STATE_ERROR +}; + +struct rxe_req_info { + enum rxe_qp_state state; + int wqe_index; + u32 psn; + int opcode; + atomic_t rd_atomic; + int wait_fence; + int need_rd_atomic; + int wait_psn; + int need_retry; + int noack_pkts; + struct rxe_task task; +}; + +struct rxe_comp_info { + u32 psn; + int opcode; + int timeout; + int timeout_retry; + int started_retry; + u32 retry_cnt; + u32 rnr_retry; + struct rxe_task task; +}; + +enum rdatm_res_state { + rdatm_res_state_next, + rdatm_res_state_new, + rdatm_res_state_replay, +}; + +struct resp_res { + int type; + int replay; + u32 first_psn; + u32 last_psn; + u32 cur_psn; + enum rdatm_res_state state; + + union { + struct { + struct sk_buff *skb; + } atomic; + struct { + struct rxe_mem *mr; + u64 va_org; + u32 rkey; + u32 length; + u64 va; + u32 resid; + } read; + }; +}; + +struct rxe_resp_info { + enum rxe_qp_state state; + u32 msn; + u32 psn; + u32 ack_psn; + int opcode; + int drop_msg; + int goto_error; + int sent_psn_nak; + enum ib_wc_status status; + u8 aeth_syndrome; + + /* Receive only */ + struct rxe_recv_wqe *wqe; + + /* RDMA read / atomic only */ + u64 va; + struct rxe_mem *mr; + u32 resid; + u32 rkey; + u32 length; + u64 atomic_orig; + + /* SRQ only */ + struct { + struct rxe_recv_wqe wqe; + struct ib_sge sge[RXE_MAX_SGE]; + } srq_wqe; + + /* Responder resources. It's a circular list where the oldest + * resource is dropped first. + */ + struct resp_res *resources; + unsigned int res_head; + unsigned int res_tail; + struct resp_res *res; + struct rxe_task task; +}; + +struct rxe_qp { + struct rxe_pool_entry pelem; + struct ib_qp ibqp; + struct ib_qp_attr attr; + unsigned int valid; + unsigned int mtu; + int is_user; + + struct rxe_pd *pd; + struct rxe_srq *srq; + struct rxe_cq *scq; + struct rxe_cq *rcq; + + enum ib_sig_type sq_sig_type; + + struct rxe_sq sq; + struct rxe_rq rq; + + struct socket *sk; + u32 dst_cookie; + u16 src_port; + + struct rxe_av pri_av; + struct rxe_av alt_av; + + /* list of mcast groups qp has joined (for cleanup) */ + struct list_head grp_list; + spinlock_t grp_lock; /* guard grp_list */ + + struct sk_buff_head req_pkts; + struct sk_buff_head resp_pkts; + struct sk_buff_head send_pkts; + + struct rxe_req_info req; + struct rxe_comp_info comp; + struct rxe_resp_info resp; + + atomic_t ssn; + atomic_t skb_out; + int need_req_skb; + + /* Timer for retranmitting packet when ACKs have been lost. RC + * only. The requester sets it when it is not already + * started. The responder resets it whenever an ack is + * received. + */ + struct timer_list retrans_timer; + u64 qp_timeout_jiffies; + + /* Timer for handling RNR NAKS. */ + struct timer_list rnr_nak_timer; + + spinlock_t state_lock; /* guard requester and completer */ + + struct execute_work cleanup_work; +}; + +enum rxe_mem_state { + RXE_MEM_STATE_ZOMBIE, + RXE_MEM_STATE_INVALID, + RXE_MEM_STATE_FREE, + RXE_MEM_STATE_VALID, +}; + +enum rxe_mem_type { + RXE_MEM_TYPE_NONE, + RXE_MEM_TYPE_DMA, + RXE_MEM_TYPE_MR, + RXE_MEM_TYPE_FMR, + RXE_MEM_TYPE_MW, +}; + +#define RXE_BUF_PER_MAP (PAGE_SIZE / sizeof(struct rxe_phys_buf)) + +struct rxe_phys_buf { + u64 addr; + u64 size; +}; + +struct rxe_map { + struct rxe_phys_buf buf[RXE_BUF_PER_MAP]; +}; + +struct rxe_mem { + struct rxe_pool_entry pelem; + union { + struct ib_mr ibmr; + struct ib_mw ibmw; + }; + + struct ib_umem *umem; + + enum rxe_mem_state state; + enum rxe_mem_type type; + u64 va; + u64 iova; + size_t length; + u32 offset; + int access; + + int page_shift; + int page_mask; + int map_shift; + int map_mask; + + u32 num_buf; + u32 nbuf; + + u32 max_buf; + u32 num_map; + + struct rxe_map **map; +}; + +struct rxe_mc_grp { + struct rxe_pool_entry pelem; + spinlock_t mcg_lock; /* guard group */ + struct rxe_dev *rxe; + struct list_head qp_list; + union ib_gid mgid; + int num_qp; + u32 qkey; + u16 pkey; +}; + +struct rxe_mc_elem { + struct rxe_pool_entry pelem; + struct list_head qp_list; + struct list_head grp_list; + struct rxe_qp *qp; + struct rxe_mc_grp *grp; +}; + +struct rxe_port { + struct ib_port_attr attr; + __be64 port_guid; + __be64 subnet_prefix; + spinlock_t port_lock; /* guard port */ + unsigned int mtu_cap; + /* special QPs */ + u32 qp_smi_index; + u32 qp_gsi_index; +}; + +struct rxe_dev { + struct ib_device ib_dev; + struct ib_device_attr attr; + int max_ucontext; + int max_inline_data; + struct mutex usdev_lock; + + struct net_device *ndev; + + int xmit_errors; + + struct rxe_pool uc_pool; + struct rxe_pool pd_pool; + struct rxe_pool ah_pool; + struct rxe_pool srq_pool; + struct rxe_pool qp_pool; + struct rxe_pool cq_pool; + struct rxe_pool mr_pool; + struct rxe_pool mw_pool; + struct rxe_pool mc_grp_pool; + struct rxe_pool mc_elem_pool; + + spinlock_t pending_lock; /* guard pending_mmaps */ + struct list_head pending_mmaps; + + spinlock_t mmap_offset_lock; /* guard mmap_offset */ + u64 mmap_offset; + + atomic64_t stats_counters[RXE_NUM_OF_COUNTERS]; + + struct rxe_port port; + struct crypto_shash *tfm; +}; + +static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index) +{ + atomic64_inc(&rxe->stats_counters[index]); +} + +static inline struct rxe_dev *to_rdev(struct ib_device *dev) +{ + return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL; +} + +static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc) +{ + return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL; +} + +static inline struct rxe_pd *to_rpd(struct ib_pd *pd) +{ + return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL; +} + +static inline struct rxe_ah *to_rah(struct ib_ah *ah) +{ + return ah ? container_of(ah, struct rxe_ah, ibah) : NULL; +} + +static inline struct rxe_srq *to_rsrq(struct ib_srq *srq) +{ + return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL; +} + +static inline struct rxe_qp *to_rqp(struct ib_qp *qp) +{ + return qp ? container_of(qp, struct rxe_qp, ibqp) : NULL; +} + +static inline struct rxe_cq *to_rcq(struct ib_cq *cq) +{ + return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL; +} + +static inline struct rxe_mem *to_rmr(struct ib_mr *mr) +{ + return mr ? container_of(mr, struct rxe_mem, ibmr) : NULL; +} + +static inline struct rxe_mem *to_rmw(struct ib_mw *mw) +{ + return mw ? container_of(mw, struct rxe_mem, ibmw) : NULL; +} + +static inline struct rxe_pd *mr_pd(struct rxe_mem *mr) +{ + return to_rpd(mr->ibmr.pd); +} + +static inline u32 mr_lkey(struct rxe_mem *mr) +{ + return mr->ibmr.lkey; +} + +static inline u32 mr_rkey(struct rxe_mem *mr) +{ + return mr->ibmr.rkey; +} + +int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name); + +void rxe_mc_cleanup(struct rxe_pool_entry *arg); + +#endif /* RXE_VERBS_H */ |