Diffstat (limited to 'net/sunrpc/xprtrdma')
-rw-r--r-- | net/sunrpc/xprtrdma/Makefile | 9
-rw-r--r-- | net/sunrpc/xprtrdma/backchannel.c | 345
-rw-r--r-- | net/sunrpc/xprtrdma/fmr_ops.c | 348
-rw-r--r-- | net/sunrpc/xprtrdma/frwr_ops.c | 615
-rw-r--r-- | net/sunrpc/xprtrdma/module.c | 52
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 1404
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma.c | 266
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 355
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 804
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_rw.c | 877
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_sendto.c | 929
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c | 717
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 921
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 1572
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 675
15 files changed, 9889 insertions, 0 deletions
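Orientation for the per-file diffs that follow: the two client-side memory-registration modes added here (fmr_ops.c and frwr_ops.c) each fill in a `struct rpcrdma_memreg_ops` table, and generic code such as rpc_rdma.c dispatches through `ri_ops->ro_map()` rather than calling a specific mode directly. Below is a minimal, user-space sketch of that ops-table pattern; the `demo_*` names, segment limits, and struct layout are illustrative stand-ins, not the kernel's actual definitions.

/* Simplified illustration of the ops-table dispatch used by fmr_ops.c
 * and frwr_ops.c: each registration mode provides a vtable, and common
 * code calls through whichever table was selected for the device.
 */
#include <stdio.h>

struct demo_mr_seg { unsigned int len; };

struct demo_memreg_ops {
	const char *ro_displayname;
	/* register up to "nsegs" segments; return how many were taken */
	int (*ro_map)(struct demo_mr_seg *seg, int nsegs);
};

static int demo_fmr_map(struct demo_mr_seg *seg, int nsegs)
{
	(void)seg;		/* not inspected in this sketch */
	return nsegs > 64 ? 64 : nsegs;	/* FMR-style: 64 pages per MR */
}

static int demo_frwr_map(struct demo_mr_seg *seg, int nsegs)
{
	(void)seg;
	return nsegs > 256 ? 256 : nsegs; /* FRWR-style: device FR depth */
}

static const struct demo_memreg_ops demo_fmr_ops = {
	.ro_displayname = "fmr",
	.ro_map = demo_fmr_map,
};

static const struct demo_memreg_ops demo_frwr_ops = {
	.ro_displayname = "frwr",
	.ro_map = demo_frwr_map,
};

int main(void)
{
	const struct demo_memreg_ops *modes[] = { &demo_fmr_ops, &demo_frwr_ops };
	struct demo_mr_seg seg = { .len = 4096 };
	unsigned int i;

	/* Generic code picks one mode when the transport is set up;
	 * here both are exercised through the same ro_map entry point. */
	for (i = 0; i < 2; i++)
		printf("%s mapped %d segment(s)\n",
		       modes[i]->ro_displayname,
		       modes[i]->ro_map(&seg, 100));
	return 0;
}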
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile new file mode 100644 index 000000000..8bf19e142 --- /dev/null +++ b/net/sunrpc/xprtrdma/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o + +rpcrdma-y := transport.o rpc_rdma.o verbs.o \ + fmr_ops.o frwr_ops.o \ + svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ + svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ + module.o +rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c new file mode 100644 index 000000000..90adeff4c --- /dev/null +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA. + */ + +#include <linux/module.h> +#include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svc_xprt.h> +#include <linux/sunrpc/svc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +#undef RPCRDMA_BACKCHANNEL_DEBUG + +static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + spin_lock(&buf->rb_reqslock); + list_del(&req->rl_all); + spin_unlock(&buf->rb_reqslock); + + rpcrdma_destroy_req(req); +} + +static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, + unsigned int count) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpc_rqst *rqst; + unsigned int i; + + for (i = 0; i < (count << 1); i++) { + struct rpcrdma_regbuf *rb; + struct rpcrdma_req *req; + size_t size; + + req = rpcrdma_create_req(r_xprt); + if (IS_ERR(req)) + return PTR_ERR(req); + rqst = &req->rl_slot; + + rqst->rq_xprt = xprt; + INIT_LIST_HEAD(&rqst->rq_list); + INIT_LIST_HEAD(&rqst->rq_bc_list); + __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); + spin_lock_bh(&xprt->bc_pa_lock); + list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); + + size = r_xprt->rx_data.inline_rsize; + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); + if (IS_ERR(rb)) + goto out_fail; + req->rl_sendbuf = rb; + xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, + min_t(size_t, size, PAGE_SIZE)); + } + return 0; + +out_fail: + rpcrdma_bc_free_rqst(r_xprt, rqst); + return -ENOMEM; +} + +/** + * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests + * @xprt: transport associated with these backchannel resources + * @reqs: number of concurrent incoming requests to expect + * + * Returns 0 on success; otherwise a negative errno + */ +int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int rc; + + /* The backchannel reply path returns each rpc_rqst to the + * bc_pa_list _after_ the reply is sent. If the server is + * faster than the client, it can send another backward + * direction request before the rpc_rqst is returned to the + * list. The client rejects the request in this case. + * + * Twice as many rpc_rqsts are prepared to ensure there is + * always an rpc_rqst available as soon as a reply is sent. 
+ */ + if (reqs > RPCRDMA_BACKWARD_WRS >> 1) + goto out_err; + + rc = rpcrdma_bc_setup_reqs(r_xprt, reqs); + if (rc) + goto out_free; + + r_xprt->rx_buf.rb_bc_srv_max_requests = reqs; + request_module("svcrdma"); + trace_xprtrdma_cb_setup(r_xprt, reqs); + return 0; + +out_free: + xprt_rdma_bc_destroy(xprt, reqs); + +out_err: + pr_err("RPC: %s: setup backchannel transport failed\n", __func__); + return -ENOMEM; +} + +/** + * xprt_rdma_bc_up - Create transport endpoint for backchannel service + * @serv: server endpoint + * @net: network namespace + * + * The "xprt" is an implied argument: it supplies the name of the + * backchannel transport class. + * + * Returns zero on success, negative errno on failure + */ +int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net) +{ + int ret; + + ret = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0); + if (ret < 0) + return ret; + return 0; +} + +/** + * xprt_rdma_bc_maxpayload - Return maximum backchannel message size + * @xprt: transport + * + * Returns maximum size, in bytes, of a backchannel message + */ +size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + size_t maxmsg; + + maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize); + maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE); + return maxmsg - RPCRDMA_HDRLEN_MIN; +} + +static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + __be32 *p; + + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); + xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, + req->rl_rdmabuf->rg_base); + + p = xdr_reserve_space(&req->rl_stream, 28); + if (unlikely(!p)) + return -EIO; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; + + if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, + &rqst->rq_snd_buf, rpcrdma_noch)) + return -EIO; + + trace_xprtrdma_cb_reply(rqst); + return 0; +} + +/** + * xprt_rdma_bc_send_reply - marshal and send a backchannel reply + * @rqst: RPC rqst with a backchannel RPC reply in rq_snd_buf + * + * Caller holds the transport's write lock. + * + * Returns: + * %0 if the RPC message has been sent + * %-ENOTCONN if the caller should reconnect and call again + * %-EIO if a permanent error occurred and the request was not + * sent. Do not try to send this message again. 
+ */ +int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + int rc; + + if (!xprt_connected(rqst->rq_xprt)) + goto drop_connection; + + rc = rpcrdma_bc_marshal_reply(rqst); + if (rc < 0) + goto failed_marshal; + + rpcrdma_post_recvs(r_xprt, true); + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + return 0; + +failed_marshal: + if (rc != -ENOTCONN) + return rc; +drop_connection: + xprt_disconnect_done(rqst->rq_xprt); + return -ENOTCONN; +} + +/** + * xprt_rdma_bc_destroy - Release resources for handling backchannel requests + * @xprt: transport associated with these backchannel resources + * @reqs: number of incoming requests to destroy; ignored + */ +void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpc_rqst *rqst, *tmp; + + spin_lock_bh(&xprt->bc_pa_lock); + list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { + list_del(&rqst->rq_bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); + + rpcrdma_bc_free_rqst(r_xprt, rqst); + + spin_lock_bh(&xprt->bc_pa_lock); + } + spin_unlock_bh(&xprt->bc_pa_lock); +} + +/** + * xprt_rdma_bc_free_rqst - Release a backchannel rqst + * @rqst: request to release + */ +void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpc_xprt *xprt = rqst->rq_xprt; + + dprintk("RPC: %s: freeing rqst %p (req %p)\n", + __func__, rqst, req); + + rpcrdma_recv_buffer_put(req->rl_reply); + req->rl_reply = NULL; + + spin_lock_bh(&xprt->bc_pa_lock); + list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); +} + +/** + * rpcrdma_bc_receive_call - Handle a backward direction call + * @r_xprt: transport receiving the call + * @rep: receive buffer containing the call + * + * Operational assumptions: + * o Backchannel credits are ignored, just as the NFS server + * forechannel currently does + * o The ULP manages a replay cache (eg, NFSv4.1 sessions). + * No replay detection is done at the transport level + */ +void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_rep *rep) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct svc_serv *bc_serv; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + struct xdr_buf *buf; + size_t size; + __be32 *p; + + p = xdr_inline_decode(&rep->rr_stream, 0); + size = xdr_stream_remaining(&rep->rr_stream); + +#ifdef RPCRDMA_BACKCHANNEL_DEBUG + pr_info("RPC: %s: callback XID %08x, length=%u\n", + __func__, be32_to_cpup(p), size); + pr_info("RPC: %s: %*ph\n", __func__, size, p); +#endif + + /* Grab a free bc rqst */ + spin_lock(&xprt->bc_pa_lock); + if (list_empty(&xprt->bc_pa_list)) { + spin_unlock(&xprt->bc_pa_lock); + goto out_overflow; + } + rqst = list_first_entry(&xprt->bc_pa_list, + struct rpc_rqst, rq_bc_pa_list); + list_del(&rqst->rq_bc_pa_list); + spin_unlock(&xprt->bc_pa_lock); + + /* Prepare rqst */ + rqst->rq_reply_bytes_recvd = 0; + rqst->rq_bytes_sent = 0; + rqst->rq_xid = *p; + + rqst->rq_private_buf.len = size; + + buf = &rqst->rq_rcv_buf; + memset(buf, 0, sizeof(*buf)); + buf->head[0].iov_base = p; + buf->head[0].iov_len = size; + buf->len = size; + + /* The receive buffer has to be hooked to the rpcrdma_req + * so that it is not released while the req is pointing + * to its buffer, and so that it can be reposted after + * the Upper Layer is done decoding it. 
+ */ + req = rpcr_to_rdmar(rqst); + req->rl_reply = rep; + trace_xprtrdma_cb_call(rqst); + + /* Queue rqst for ULP's callback service */ + bc_serv = xprt->bc_serv; + spin_lock(&bc_serv->sv_cb_lock); + list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list); + spin_unlock(&bc_serv->sv_cb_lock); + + wake_up(&bc_serv->sv_cb_waitq); + + r_xprt->rx_stats.bcall_count++; + return; + +out_overflow: + pr_warn("RPC/RDMA backchannel overflow\n"); + xprt_disconnect_done(xprt); + /* This receive buffer gets reposted automatically + * when the connection is re-established. + */ + return; +} diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c new file mode 100644 index 000000000..0f7c465d9 --- /dev/null +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -0,0 +1,348 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015, 2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Memory Regions (FMR). + * Referred to sometimes as MTHCAFMR mode. + * + * FMR uses synchronous memory registration and deregistration. + * FMR registration is known to be fast, but FMR deregistration + * can take tens of usecs to complete. + */ + +/* Normal operation + * + * A Memory Region is prepared for RDMA READ or WRITE using the + * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is + * finished, the Memory Region is unmapped using the ib_unmap_fmr + * verb (fmr_op_unmap). + */ + +#include <linux/sunrpc/svc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* Maximum scatter/gather per FMR */ +#define RPCRDMA_MAX_FMR_SGES (64) + +/* Access mode of externally registered pages */ +enum { + RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ, +}; + +bool +fmr_is_supported(struct rpcrdma_ia *ia) +{ + if (!ia->ri_device->alloc_fmr) { + pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", + ia->ri_device->name); + return false; + } + return true; +} + +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mr->fmr.fm_physaddrs) + goto out_free; + + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) + goto out_free; + + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); + + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mr->fmr.fm_mr)) + goto out_fmr_err; + + INIT_LIST_HEAD(&mr->mr_list); + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mr->fmr.fm_mr)); + +out_free: + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); + return -ENOMEM; +} + +static int +__fmr_unmap(struct rpcrdma_mr *mr) +{ + LIST_HEAD(l); + int rc; + + list_add(&mr->fmr.fm_mr->list, &l); + rc = ib_unmap_fmr(&l); + list_del(&mr->fmr.fm_mr->list); + return rc; +} + +static void +fmr_op_release_mr(struct rpcrdma_mr *mr) +{ + LIST_HEAD(unmap_list); + int rc; + + kfree(mr->fmr.fm_physaddrs); + kfree(mr->mr_sg); + + /* In case this one was left mapped, try to unmap it + * to prevent dealloc_fmr from failing with EBUSY + */ + rc = __fmr_unmap(mr); + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + mr, rc); + + rc = 
ib_dealloc_fmr(mr->fmr.fm_mr); + if (rc) + pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", + mr, rc); + + kfree(mr); +} + +/* Reset of a single FMR. + */ +static void +fmr_op_recover_mr(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + int rc; + + /* ORDER: invalidate first */ + rc = __fmr_unmap(mr); + if (rc) + goto out_release; + + /* ORDER: then DMA unmap */ + rpcrdma_mr_unmap_and_put(mr); + + r_xprt->rx_stats.mrs_recovered++; + return; + +out_release: + pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); + r_xprt->rx_stats.mrs_orphaned++; + + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + spin_unlock(&r_xprt->rx_buf.rb_mrlock); + + fmr_op_release_mr(mr); +} + +/* On success, sets: + * ep->rep_attr.cap.max_send_wr + * ep->rep_attr.cap.max_recv_wr + * cdata->max_requests + * ia->ri_max_segs + */ +static int +fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + int max_qp_wr; + + max_qp_wr = ia->ri_device->attrs.max_qp_wr; + max_qp_wr -= RPCRDMA_BACKWARD_WRS; + max_qp_wr -= 1; + if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) + return -ENOMEM; + if (cdata->max_requests > max_qp_wr) + cdata->max_requests = max_qp_wr; + ep->rep_attr.cap.max_send_wr = cdata->max_requests; + ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ + + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + RPCRDMA_MAX_FMR_SGES); + return 0; +} + +/* FMR mode conveys up to 64 pages of payload per chunk segment. + */ +static size_t +fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); +} + +/* Use the ib_map_phys_fmr() verb to register a memory region + * for remote access via RDMA READ or RDMA WRITE. 
+ */ +static struct rpcrdma_mr_seg * +fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, struct rpcrdma_mr **out) +{ + struct rpcrdma_mr_seg *seg1 = seg; + int len, pageoff, i, rc; + struct rpcrdma_mr *mr; + u64 *dma_pages; + + mr = rpcrdma_mr_get(r_xprt); + if (!mr) + return ERR_PTR(-EAGAIN); + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > RPCRDMA_MAX_FMR_SGES) + nsegs = RPCRDMA_MAX_FMR_SGES; + for (i = 0; i < nsegs;) { + if (seg->mr_page) + sg_set_page(&mr->mr_sg[i], + seg->mr_page, + seg->mr_len, + offset_in_page(seg->mr_offset)); + else + sg_set_buf(&mr->mr_sg[i], seg->mr_offset, + seg->mr_len); + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + mr->mr_dir = rpcrdma_data_dir(writing); + + mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, i, mr->mr_dir); + if (!mr->mr_nents) + goto out_dmamap_err; + trace_xprtrdma_dma_map(mr); + + for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) + dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); + rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents, + dma_pages[0]); + if (rc) + goto out_maperr; + + mr->mr_handle = mr->fmr.fm_mr->rkey; + mr->mr_length = len; + mr->mr_offset = dma_pages[0] + pageoff; + + *out = mr; + return seg; + +out_dmamap_err: + pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", + mr->mr_sg, i); + rpcrdma_mr_put(mr); + return ERR_PTR(-EIO); + +out_maperr: + pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + len, (unsigned long long)dma_pages[0], + pageoff, mr->mr_nents, rc); + rpcrdma_mr_unmap_and_put(mr); + return ERR_PTR(-EIO); +} + +/* Post Send WR containing the RPC Call message. + */ +static int +fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, NULL); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + * + * Caller ensures that @mrs is not empty before the call. This + * function empties the list. + */ +static void +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) +{ + struct rpcrdma_mr *mr; + LIST_HEAD(unmap_list); + int rc; + + /* ORDER: Invalidate all of the req's MRs first + * + * ib_unmap_fmr() is slow, so use a single call instead + * of one call per mapped FMR. + */ + list_for_each_entry(mr, mrs, mr_list) { + dprintk("RPC: %s: unmapping fmr %p\n", + __func__, &mr->fmr); + trace_xprtrdma_localinv(mr); + list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); + } + r_xprt->rx_stats.local_inv_needed++; + rc = ib_unmap_fmr(&unmap_list); + if (rc) + goto out_reset; + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. 
+ */ + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + list_del(&mr->fmr.fm_mr->list); + rpcrdma_mr_unmap_and_put(mr); + } + + return; + +out_reset: + pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); + + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + list_del(&mr->fmr.fm_mr->list); + fmr_op_recover_mr(mr); + } +} + +const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { + .ro_map = fmr_op_map, + .ro_send = fmr_op_send, + .ro_unmap_sync = fmr_op_unmap_sync, + .ro_recover_mr = fmr_op_recover_mr, + .ro_open = fmr_op_open, + .ro_maxpages = fmr_op_maxpages, + .ro_init_mr = fmr_op_init_mr, + .ro_release_mr = fmr_op_release_mr, + .ro_displayname = "fmr", + .ro_send_w_inv_ok = 0, +}; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c new file mode 100644 index 000000000..1bb00dd6c --- /dev/null +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -0,0 +1,615 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015, 2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Registration Work + * Requests (FRWR). + * + * FRWR features ordered asynchronous registration and deregistration + * of arbitrarily sized memory regions. This is the fastest and safest + * but most complex memory registration mode. + */ + +/* Normal operation + * + * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG + * Work Request (frwr_op_map). When the RDMA operation is finished, this + * Memory Region is invalidated using a LOCAL_INV Work Request + * (frwr_op_unmap_sync). + * + * Typically these Work Requests are not signaled, and neither are RDMA + * SEND Work Requests (with the exception of signaling occasionally to + * prevent provider work queue overflows). This greatly reduces HCA + * interrupt workload. + * + * As an optimization, frwr_op_unmap marks MRs INVALID before the + * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on + * rb_mrs immediately so that no work (like managing a linked list + * under a spinlock) is needed in the completion upcall. + * + * But this means that frwr_op_map() can occasionally encounter an MR + * that is INVALID but the LOCAL_INV WR has not completed. Work Queue + * ordering prevents a subsequent FAST_REG WR from executing against + * that MR while it is still being invalidated. + */ + +/* Transport recovery + * + * ->op_map and the transport connect worker cannot run at the same + * time, but ->op_unmap can fire while the transport connect worker + * is running. Thus MR recovery is handled in ->op_map, to guarantee + * that recovered MRs are owned by a sending RPC, and not one where + * ->op_unmap could fire at the same time transport reconnect is + * being done. + * + * When the underlying transport disconnects, MRs are left in one of + * four states: + * + * INVALID: The MR was not in use before the QP entered ERROR state. + * + * VALID: The MR was registered before the QP entered ERROR state. + * + * FLUSHED_FR: The MR was being registered when the QP entered ERROR + * state, and the pending WR was flushed. + * + * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR + * state, and the pending WR was flushed. + * + * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered + * with ib_dereg_mr and then are re-initialized. 
Because MR recovery + * allocates fresh resources, it is deferred to a workqueue, and the + * recovered MRs are placed back on the rb_mrs list when recovery is + * complete. frwr_op_map allocates another MR for the current RPC while + * the broken MR is reset. + * + * To ensure that frwr_op_map doesn't encounter an MR that is marked + * INVALID but that is about to be flushed due to a previous transport + * disconnect, the transport connect worker attempts to drain all + * pending send queue WRs before the transport is reconnected. + */ + +#include <linux/sunrpc/rpc_rdma.h> +#include <linux/sunrpc/svc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +bool +frwr_is_supported(struct rpcrdma_ia *ia) +{ + struct ib_device_attr *attrs = &ia->ri_device->attrs; + + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + goto out_not_supported; + if (attrs->max_fast_reg_page_list_len == 0) + goto out_not_supported; + return true; + +out_not_supported: + pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", + ia->ri_device->name); + return false; +} + +static int +frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + unsigned int depth = ia->ri_max_frwr_depth; + struct rpcrdma_frwr *frwr = &mr->frwr; + int rc; + + frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + if (IS_ERR(frwr->fr_mr)) + goto out_mr_err; + + mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) + goto out_list_err; + + INIT_LIST_HEAD(&mr->mr_list); + sg_init_table(mr->mr_sg, depth); + init_completion(&frwr->fr_linv_done); + return 0; + +out_mr_err: + rc = PTR_ERR(frwr->fr_mr); + dprintk("RPC: %s: ib_alloc_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = -ENOMEM; + dprintk("RPC: %s: sg allocation failure\n", + __func__); + ib_dereg_mr(frwr->fr_mr); + return rc; +} + +static void +frwr_op_release_mr(struct rpcrdma_mr *mr) +{ + int rc; + + rc = ib_dereg_mr(mr->frwr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + mr, rc); + kfree(mr->mr_sg); + kfree(mr); +} + +static int +__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + struct rpcrdma_frwr *frwr = &mr->frwr; + int rc; + + rc = ib_dereg_mr(frwr->fr_mr); + if (rc) { + pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", + rc, mr); + return rc; + } + + frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, + ia->ri_max_frwr_depth); + if (IS_ERR(frwr->fr_mr)) { + pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", + PTR_ERR(frwr->fr_mr), mr); + return PTR_ERR(frwr->fr_mr); + } + + dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); + frwr->fr_state = FRWR_IS_INVALID; + return 0; +} + +/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. 
+ */ +static void +frwr_op_recover_mr(struct rpcrdma_mr *mr) +{ + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + int rc; + + rc = __frwr_mr_reset(ia, mr); + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(ia->ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + } + if (rc) + goto out_release; + + rpcrdma_mr_put(mr); + r_xprt->rx_stats.mrs_recovered++; + return; + +out_release: + pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr); + r_xprt->rx_stats.mrs_orphaned++; + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + spin_unlock(&r_xprt->rx_buf.rb_mrlock); + + frwr_op_release_mr(mr); +} + +/* On success, sets: + * ep->rep_attr.cap.max_send_wr + * ep->rep_attr.cap.max_recv_wr + * cdata->max_requests + * ia->ri_max_segs + * + * And these FRWR-related fields: + * ia->ri_max_frwr_depth + * ia->ri_mrtype + */ +static int +frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr *attrs = &ia->ri_device->attrs; + int max_qp_wr, depth, delta; + + ia->ri_mrtype = IB_MR_TYPE_MEM_REG; + if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; + + ia->ri_max_frwr_depth = + min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + attrs->max_fast_reg_page_list_len); + dprintk("RPC: %s: device's max FR page list len = %u\n", + __func__, ia->ri_max_frwr_depth); + + /* Add room for frwr register and invalidate WRs. + * 1. FRWR reg WR for head + * 2. FRWR invalidate WR for head + * 3. N FRWR reg WRs for pagelist + * 4. N FRWR invalidate WRs for pagelist + * 5. FRWR reg WR for tail + * 6. FRWR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + depth = 7; + + /* Calculate N if the device max FRWR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. + */ + if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; + do { + depth += 2; /* FRWR reg + invalidate */ + delta -= ia->ri_max_frwr_depth; + } while (delta > 0); + } + + max_qp_wr = ia->ri_device->attrs.max_qp_wr; + max_qp_wr -= RPCRDMA_BACKWARD_WRS; + max_qp_wr -= 1; + if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) + return -ENOMEM; + if (cdata->max_requests > max_qp_wr) + cdata->max_requests = max_qp_wr; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * depth; + if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { + cdata->max_requests = max_qp_wr / depth; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; + } + ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ + + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / + ia->ri_max_frwr_depth); + return 0; +} + +/* FRWR mode conveys a list of pages per chunk segment. The + * maximum length of that list is the FRWR page list depth. 
+ */ +static size_t +frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth); +} + +static void +__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) +{ + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("rpcrdma: %s: %s (%u/0x%x)\n", + wr, ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); +} + +/** + * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + */ +static void +frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = + container_of(cqe, struct rpcrdma_frwr, fr_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ + if (wc->status != IB_WC_SUCCESS) { + frwr->fr_state = FRWR_FLUSHED_FR; + __frwr_sendcompletion_flush(wc, "fastreg"); + } + trace_xprtrdma_wc_fastreg(wc, frwr); +} + +/** + * frwr_wc_localinv - Invoked by RDMA provider for a flushed LocalInv WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + */ +static void +frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, + fr_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ + if (wc->status != IB_WC_SUCCESS) { + frwr->fr_state = FRWR_FLUSHED_LI; + __frwr_sendcompletion_flush(wc, "localinv"); + } + trace_xprtrdma_wc_li(wc, frwr); +} + +/** + * frwr_wc_localinv_wake - Invoked by RDMA provider for a signaled LocalInv WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + * Awaken anyone waiting for an MR to finish being fenced. + */ +static void +frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, + fr_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ + if (wc->status != IB_WC_SUCCESS) { + frwr->fr_state = FRWR_FLUSHED_LI; + __frwr_sendcompletion_flush(wc, "localinv"); + } + complete(&frwr->fr_linv_done); + trace_xprtrdma_wc_li_wake(wc, frwr); +} + +/* Post a REG_MR Work Request to register a memory region + * for remote access via RDMA READ or RDMA WRITE. 
+ */ +static struct rpcrdma_mr_seg * +frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, struct rpcrdma_mr **out) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; + struct rpcrdma_frwr *frwr; + struct rpcrdma_mr *mr; + struct ib_mr *ibmr; + struct ib_reg_wr *reg_wr; + int i, n; + u8 key; + + mr = NULL; + do { + if (mr) + rpcrdma_mr_defer_recovery(mr); + mr = rpcrdma_mr_get(r_xprt); + if (!mr) + return ERR_PTR(-EAGAIN); + } while (mr->frwr.fr_state != FRWR_IS_INVALID); + frwr = &mr->frwr; + frwr->fr_state = FRWR_IS_VALID; + + if (nsegs > ia->ri_max_frwr_depth) + nsegs = ia->ri_max_frwr_depth; + for (i = 0; i < nsegs;) { + if (seg->mr_page) + sg_set_page(&mr->mr_sg[i], + seg->mr_page, + seg->mr_len, + offset_in_page(seg->mr_offset)); + else + sg_set_buf(&mr->mr_sg[i], seg->mr_offset, + seg->mr_len); + + ++seg; + ++i; + if (holes_ok) + continue; + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + mr->mr_dir = rpcrdma_data_dir(writing); + + mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); + if (!mr->mr_nents) + goto out_dmamap_err; + trace_xprtrdma_dma_map(mr); + + ibmr = frwr->fr_mr; + n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); + if (unlikely(n != mr->mr_nents)) + goto out_mapmr_err; + + key = (u8)(ibmr->rkey & 0x000000FF); + ib_update_fast_reg_key(ibmr, ++key); + + reg_wr = &frwr->fr_regwr; + reg_wr->mr = ibmr; + reg_wr->key = ibmr->rkey; + reg_wr->access = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; + + mr->mr_handle = ibmr->rkey; + mr->mr_length = ibmr->length; + mr->mr_offset = ibmr->iova; + + *out = mr; + return seg; + +out_dmamap_err: + pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", + mr->mr_sg, i); + frwr->fr_state = FRWR_IS_INVALID; + rpcrdma_mr_put(mr); + return ERR_PTR(-EIO); + +out_mapmr_err: + pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", + frwr->fr_mr, n, mr->mr_nents); + rpcrdma_mr_defer_recovery(mr); + return ERR_PTR(-EIO); +} + +/* Post Send WR containing the RPC Call message. + * + * For FRMR, chain any FastReg WRs to the Send WR. Only a + * single ib_post_send call is needed to register memory + * and then post the Send WR. + */ +static int +frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_send_wr *post_wr; + struct rpcrdma_mr *mr; + + post_wr = &req->rl_sendctx->sc_wr; + list_for_each_entry(mr, &req->rl_registered, mr_list) { + struct rpcrdma_frwr *frwr; + + frwr = &mr->frwr; + + frwr->fr_cqe.done = frwr_wc_fastreg; + frwr->fr_regwr.wr.next = post_wr; + frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; + frwr->fr_regwr.wr.num_sge = 0; + frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; + frwr->fr_regwr.wr.send_flags = 0; + + post_wr = &frwr->fr_regwr.wr; + } + + /* If ib_post_send fails, the next ->send_request for + * @req will queue these MWs for recovery. 
+ */ + return ib_post_send(ia->ri_id->qp, post_wr, NULL); +} + +/* Handle a remotely invalidated mr on the @mrs list + */ +static void +frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) +{ + struct rpcrdma_mr *mr; + + list_for_each_entry(mr, mrs, mr_list) + if (mr->mr_handle == rep->rr_inv_rkey) { + list_del_init(&mr->mr_list); + trace_xprtrdma_remoteinv(mr); + mr->frwr.fr_state = FRWR_IS_INVALID; + rpcrdma_mr_unmap_and_put(mr); + break; /* only one invalidated MR per RPC */ + } +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + * + * Caller ensures that @mrs is not empty before the call. This + * function empties the list. + */ +static void +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) +{ + struct ib_send_wr *first, **prev, *last; + const struct ib_send_wr *bad_wr; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_frwr *frwr; + struct rpcrdma_mr *mr; + int count, rc; + + /* ORDER: Invalidate all of the MRs first + * + * Chain the LOCAL_INV Work Requests and post them with + * a single ib_post_send() call. + */ + frwr = NULL; + count = 0; + prev = &first; + list_for_each_entry(mr, mrs, mr_list) { + mr->frwr.fr_state = FRWR_IS_INVALID; + + frwr = &mr->frwr; + trace_xprtrdma_localinv(mr); + + frwr->fr_cqe.done = frwr_wc_localinv; + last = &frwr->fr_invwr; + memset(last, 0, sizeof(*last)); + last->wr_cqe = &frwr->fr_cqe; + last->opcode = IB_WR_LOCAL_INV; + last->ex.invalidate_rkey = mr->mr_handle; + count++; + + *prev = last; + prev = &last->next; + } + if (!frwr) + goto unmap; + + /* Strong send queue ordering guarantees that when the + * last WR in the chain completes, all WRs in the chain + * are complete. + */ + last->send_flags = IB_SEND_SIGNALED; + frwr->fr_cqe.done = frwr_wc_localinv_wake; + reinit_completion(&frwr->fr_linv_done); + + /* Transport disconnect drains the receive CQ before it + * replaces the QP. The RPC reply handler won't call us + * unless ri_id->qp is a valid pointer. + */ + r_xprt->rx_stats.local_inv_needed++; + bad_wr = NULL; + rc = ib_post_send(ia->ri_id->qp, first, &bad_wr); + if (bad_wr != first) + wait_for_completion(&frwr->fr_linv_done); + if (rc) + goto reset_mrs; + + /* ORDER: Now DMA unmap all of the MRs, and return + * them to the free MR list. + */ +unmap: + while (!list_empty(mrs)) { + mr = rpcrdma_mr_pop(mrs); + rpcrdma_mr_unmap_and_put(mr); + } + return; + +reset_mrs: + pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); + + /* Find and reset the MRs in the LOCAL_INV WRs that did not + * get posted. 
+ */ + while (bad_wr) { + frwr = container_of(bad_wr, struct rpcrdma_frwr, + fr_invwr); + mr = container_of(frwr, struct rpcrdma_mr, frwr); + + __frwr_mr_reset(ia, mr); + + bad_wr = bad_wr->next; + } + goto unmap; +} + +const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { + .ro_map = frwr_op_map, + .ro_send = frwr_op_send, + .ro_reminv = frwr_op_reminv, + .ro_unmap_sync = frwr_op_unmap_sync, + .ro_recover_mr = frwr_op_recover_mr, + .ro_open = frwr_op_open, + .ro_maxpages = frwr_op_maxpages, + .ro_init_mr = frwr_op_init_mr, + .ro_release_mr = frwr_op_release_mr, + .ro_displayname = "frwr", + .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK, +}; diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c new file mode 100644 index 000000000..45c5b41ac --- /dev/null +++ b/net/sunrpc/xprtrdma/module.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2015, 2017 Oracle. All rights reserved. + */ + +/* rpcrdma.ko module initialization + */ + +#include <linux/types.h> +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/sunrpc/svc_rdma.h> + +#include <asm/swab.h> + +#include "xprt_rdma.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/rpcrdma.h> + +MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc."); +MODULE_DESCRIPTION("RPC/RDMA Transport"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("svcrdma"); +MODULE_ALIAS("xprtrdma"); +MODULE_ALIAS("rpcrdma6"); + +static void __exit rpc_rdma_cleanup(void) +{ + xprt_rdma_cleanup(); + svc_rdma_cleanup(); +} + +static int __init rpc_rdma_init(void) +{ + int rc; + + rc = svc_rdma_init(); + if (rc) + goto out; + + rc = xprt_rdma_init(); + if (rc) + svc_rdma_cleanup(); + +out: + return rc; +} + +module_init(rpc_rdma_init); +module_exit(rpc_rdma_cleanup); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c new file mode 100644 index 000000000..7f9d8365c --- /dev/null +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -0,0 +1,1404 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * rpc_rdma.c + * + * This file contains the guts of the RPC RDMA protocol, and + * does marshaling/unmarshaling, etc. It is also where interfacing + * to the Linux RPC framework lives. + */ + +#include <linux/highmem.h> + +#include <linux/sunrpc/svc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* Returns size of largest RPC-over-RDMA header in a Call message + * + * The largest Call header contains a full-size Read list and a + * minimal Reply chunk. + */ +static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) +{ + unsigned int size; + + /* Fixed header fields and list discriminators */ + size = RPCRDMA_HDRLEN_MIN; + + /* Maximum Read list size */ + maxsegs += 2; /* segment for head and tail buffers */ + size += maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); + + /* Minimal Read chunk size */ + size += sizeof(__be32); /* segment count */ + size += rpcrdma_segment_maxsz * sizeof(__be32); + size += sizeof(__be32); /* list discriminator */ + + dprintk("RPC: %s: max call header size = %u\n", + __func__, size); + return size; +} + +/* Returns size of largest RPC-over-RDMA header in a Reply message + * + * There is only one Write list or one Reply chunk per Reply + * message. The larger list is the Write list. + */ +static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) +{ + unsigned int size; + + /* Fixed header fields and list discriminators */ + size = RPCRDMA_HDRLEN_MIN; + + /* Maximum Write list size */ + maxsegs += 2; /* segment for head and tail buffers */ + size += sizeof(__be32); /* segment count */ + size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); + size += sizeof(__be32); /* list discriminator */ + + dprintk("RPC: %s: max reply header size = %u\n", + __func__, size); + return size; +} + +void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int maxsegs = ia->ri_max_segs; + + ia->ri_max_inline_write = cdata->inline_wsize - + rpcrdma_max_call_header_size(maxsegs); + ia->ri_max_inline_read = cdata->inline_rsize - + rpcrdma_max_reply_header_size(maxsegs); +} + +/* The client can send a request inline as long as the RPCRDMA header + * plus the RPC call fit under the transport's inline limit. If the + * combined call message size exceeds that limit, the client must use + * a Read chunk for this operation. + * + * A Read chunk is also required if sending the RPC call inline would + * exceed this device's max_sge limit. 
+ */ +static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct xdr_buf *xdr = &rqst->rq_snd_buf; + unsigned int count, remaining, offset; + + if (xdr->len > r_xprt->rx_ia.ri_max_inline_write) + return false; + + if (xdr->page_len) { + remaining = xdr->page_len; + offset = offset_in_page(xdr->page_base); + count = RPCRDMA_MIN_SEND_SGES; + while (remaining) { + remaining -= min_t(unsigned int, + PAGE_SIZE - offset, remaining); + offset = 0; + if (++count > r_xprt->rx_ia.ri_max_send_sges) + return false; + } + } + + return true; +} + +/* The client can't know how large the actual reply will be. Thus it + * plans for the largest possible reply for that particular ULP + * operation. If the maximum combined reply message size exceeds that + * limit, the client must provide a write list or a reply chunk for + * this request. + */ +static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; +} + +/* Split @vec on page boundaries into SGEs. FMR registers pages, not + * a byte range. Other modes coalesce these SGEs into a single MR + * when they can. + * + * Returns pointer to next available SGE, and bumps the total number + * of SGEs consumed. + */ +static struct rpcrdma_mr_seg * +rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, + unsigned int *n) +{ + u32 remaining, page_offset; + char *base; + + base = vec->iov_base; + page_offset = offset_in_page(base); + remaining = vec->iov_len; + while (remaining) { + seg->mr_page = NULL; + seg->mr_offset = base; + seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); + remaining -= seg->mr_len; + base += seg->mr_len; + ++seg; + ++(*n); + page_offset = 0; + } + return seg; +} + +/* Convert @xdrbuf into SGEs no larger than a page each. As they + * are registered, these SGEs are then coalesced into RDMA segments + * when the selected memreg mode supports it. + * + * Returns positive number of SGEs consumed, or a negative errno. + */ + +static int +rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, + unsigned int pos, enum rpcrdma_chunktype type, + struct rpcrdma_mr_seg *seg) +{ + unsigned long page_base; + unsigned int len, n; + struct page **ppages; + + n = 0; + if (pos == 0) + seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); + + len = xdrbuf->page_len; + ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdrbuf->page_base); + while (len) { + if (unlikely(!*ppages)) { + /* XXX: Certain upper layer operations do + * not provide receive buffer pages. + */ + *ppages = alloc_page(GFP_ATOMIC); + if (!*ppages) + return -ENOBUFS; + } + seg->mr_page = *ppages; + seg->mr_offset = (char *)page_base; + seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); + len -= seg->mr_len; + ++ppages; + ++seg; + ++n; + page_base = 0; + } + + /* When encoding a Read chunk, the tail iovec contains an + * XDR pad and may be omitted. + */ + if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) + goto out; + + /* When encoding a Write chunk, some servers need to see an + * extra segment for non-XDR-aligned Write chunks. The upper + * layer provides space in the tail iovec that may be used + * for this purpose. 
+ */ + if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) + goto out; + + if (xdrbuf->tail[0].iov_len) + seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); + +out: + if (unlikely(n > RPCRDMA_MAX_SEGS)) + return -EIO; + return n; +} + +static inline int +encode_item_present(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p = xdr_one; + return 0; +} + +static inline int +encode_item_not_present(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p = xdr_zero; + return 0; +} + +static void +xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) +{ + *iptr++ = cpu_to_be32(mr->mr_handle); + *iptr++ = cpu_to_be32(mr->mr_length); + xdr_encode_hyper(iptr, mr->mr_offset); +} + +static int +encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + xdr_encode_rdma_segment(p, mr); + return 0; +} + +static int +encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, + u32 position) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 6 * sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p++ = xdr_one; /* Item present */ + *p++ = cpu_to_be32(position); + xdr_encode_rdma_segment(p, mr); + return 0; +} + +/* Register and XDR encode the Read list. Supports encoding a list of read + * segments that belong to a single read chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Read chunklist (a linked list): + * N elements, position P (same P for all chunks of same arg!): + * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 + * + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + * + * Only a single @pos value is currently supported. + */ +static noinline int +rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) +{ + struct xdr_stream *xdr = &req->rl_stream; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mr *mr; + unsigned int pos; + int nsegs; + + pos = rqst->rq_snd_buf.head[0].iov_len; + if (rtype == rpcrdma_areadch) + pos = 0; + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, + rtype, seg); + if (nsegs < 0) + return nsegs; + + do { + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + false, &mr); + if (IS_ERR(seg)) + return PTR_ERR(seg); + rpcrdma_mr_push(mr, &req->rl_registered); + + if (encode_read_segment(xdr, mr, pos) < 0) + return -EMSGSIZE; + + trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs); + r_xprt->rx_stats.read_chunk_count++; + nsegs -= mr->mr_nents; + } while (nsegs); + + return 0; +} + +/* Register and XDR encode the Write list. Supports encoding a list + * containing one array of plain segments that belong to a single + * write chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Write chunklist (a list of (one) counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO - 0 + * + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + * + * Only a single Write chunk is currently supported. 
+ */ +static noinline int +rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +{ + struct xdr_stream *xdr = &req->rl_stream; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mr *mr; + int nsegs, nchunks; + __be32 *segcount; + + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, + rqst->rq_rcv_buf.head[0].iov_len, + wtype, seg); + if (nsegs < 0) + return nsegs; + + if (encode_item_present(xdr) < 0) + return -EMSGSIZE; + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); + if (unlikely(!segcount)) + return -EMSGSIZE; + /* Actual value encoded below */ + + nchunks = 0; + do { + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mr); + if (IS_ERR(seg)) + return PTR_ERR(seg); + rpcrdma_mr_push(mr, &req->rl_registered); + + if (encode_rdma_segment(xdr, mr) < 0) + return -EMSGSIZE; + + trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs); + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; + nsegs -= mr->mr_nents; + } while (nsegs); + + /* Update count of segments in this Write chunk */ + *segcount = cpu_to_be32(nchunks); + + return 0; +} + +/* Register and XDR encode the Reply chunk. Supports encoding an array + * of plain segments that belong to a single write (reply) chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * + * Reply chunk (a counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO + * + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + */ +static noinline int +rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) +{ + struct xdr_stream *xdr = &req->rl_stream; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mr *mr; + int nsegs, nchunks; + __be32 *segcount; + + seg = req->rl_segments; + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); + if (nsegs < 0) + return nsegs; + + if (encode_item_present(xdr) < 0) + return -EMSGSIZE; + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); + if (unlikely(!segcount)) + return -EMSGSIZE; + /* Actual value encoded below */ + + nchunks = 0; + do { + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mr); + if (IS_ERR(seg)) + return PTR_ERR(seg); + rpcrdma_mr_push(mr, &req->rl_registered); + + if (encode_rdma_segment(xdr, mr) < 0) + return -EMSGSIZE; + + trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs); + r_xprt->rx_stats.reply_chunk_count++; + r_xprt->rx_stats.total_rdma_request += mr->mr_length; + nchunks++; + nsegs -= mr->mr_nents; + } while (nsegs); + + /* Update count of segments in the Reply chunk */ + *segcount = cpu_to_be32(nchunks); + + return 0; +} + +/** + * rpcrdma_unmap_sendctx - DMA-unmap Send buffers + * @sc: sendctx containing SGEs to unmap + * + */ +void +rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_ia *ia = &sc->sc_xprt->rx_ia; + struct ib_sge *sge; + unsigned int count; + + /* The first two SGEs contain the transport header and + * the inline buffer. These are always left mapped so + * they can be cheaply re-used. 
+ */ + sge = &sc->sc_sges[2]; + for (count = sc->sc_unmap_count; count; ++sge, --count) + ib_dma_unmap_page(ia->ri_device, + sge->addr, sge->length, DMA_TO_DEVICE); + + if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &sc->sc_req->rl_flags)) { + smp_mb__after_atomic(); + wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES); + } +} + +/* Prepare an SGE for the RPC-over-RDMA transport header. + */ +static bool +rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, + u32 len) +{ + struct rpcrdma_sendctx *sc = req->rl_sendctx; + struct rpcrdma_regbuf *rb = req->rl_rdmabuf; + struct ib_sge *sge = sc->sc_sges; + + if (!rpcrdma_dma_map_regbuf(ia, rb)) + goto out_regbuf; + sge->addr = rdmab_addr(rb); + sge->length = len; + sge->lkey = rdmab_lkey(rb); + + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, + sge->length, DMA_TO_DEVICE); + sc->sc_wr.num_sge++; + return true; + +out_regbuf: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; +} + +/* Prepare the Send SGEs. The head and tail iovec, and each entry + * in the page list, gets its own SGE. + */ +static bool +rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, + struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +{ + struct rpcrdma_sendctx *sc = req->rl_sendctx; + unsigned int sge_no, page_base, len, remaining; + struct rpcrdma_regbuf *rb = req->rl_sendbuf; + struct ib_device *device = ia->ri_device; + struct ib_sge *sge = sc->sc_sges; + u32 lkey = ia->ri_pd->local_dma_lkey; + struct page *page, **ppages; + + /* The head iovec is straightforward, as it is already + * DMA-mapped. Sync the content that has changed. + */ + if (!rpcrdma_dma_map_regbuf(ia, rb)) + goto out_regbuf; + sge_no = 1; + sge[sge_no].addr = rdmab_addr(rb); + sge[sge_no].length = xdr->head[0].iov_len; + sge[sge_no].lkey = rdmab_lkey(rb); + ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr, + sge[sge_no].length, DMA_TO_DEVICE); + + /* If there is a Read chunk, the page list is being handled + * via explicit RDMA, and thus is skipped here. However, the + * tail iovec may include an XDR pad for the page list, as + * well as additional content, and may not reside in the + * same page as the head iovec. + */ + if (rtype == rpcrdma_readch) { + len = xdr->tail[0].iov_len; + + /* Do not include the tail if it is only an XDR pad */ + if (len < 4) + goto out; + + page = virt_to_page(xdr->tail[0].iov_base); + page_base = offset_in_page(xdr->tail[0].iov_base); + + /* If the content in the page list is an odd length, + * xdr_write_pages() has added a pad at the beginning + * of the tail iovec. Force the tail's non-pad content + * to land at the next XDR position in the Send message. + */ + page_base += len & 3; + len -= len & 3; + goto map_tail; + } + + /* If there is a page list present, temporarily DMA map + * and prepare an SGE for each page to be sent. 
+ */ + if (xdr->page_len) { + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + sge_no++; + if (sge_no > RPCRDMA_MAX_SEND_SGES - 2) + goto out_mapping_overflow; + + len = min_t(u32, PAGE_SIZE - page_base, remaining); + sge[sge_no].addr = ib_dma_map_page(device, *ppages, + page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, sge[sge_no].addr)) + goto out_mapping_err; + sge[sge_no].length = len; + sge[sge_no].lkey = lkey; + + sc->sc_unmap_count++; + ppages++; + remaining -= len; + page_base = 0; + } + } + + /* The tail iovec is not always constructed in the same + * page where the head iovec resides (see, for example, + * gss_wrap_req_priv). To neatly accommodate that case, + * DMA map it separately. + */ + if (xdr->tail[0].iov_len) { + page = virt_to_page(xdr->tail[0].iov_base); + page_base = offset_in_page(xdr->tail[0].iov_base); + len = xdr->tail[0].iov_len; + +map_tail: + sge_no++; + sge[sge_no].addr = ib_dma_map_page(device, page, + page_base, len, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device, sge[sge_no].addr)) + goto out_mapping_err; + sge[sge_no].length = len; + sge[sge_no].lkey = lkey; + sc->sc_unmap_count++; + } + +out: + sc->sc_wr.num_sge += sge_no; + if (sc->sc_unmap_count) + __set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); + return true; + +out_regbuf: + pr_err("rpcrdma: failed to DMA map a Send buffer\n"); + return false; + +out_mapping_overflow: + rpcrdma_unmap_sendctx(sc); + pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no); + return false; + +out_mapping_err: + rpcrdma_unmap_sendctx(sc); + pr_err("rpcrdma: Send mapping error\n"); + return false; +} + +/** + * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR + * @r_xprt: controlling transport + * @req: context of RPC Call being marshalled + * @hdrlen: size of transport header, in bytes + * @xdr: xdr_buf containing RPC Call + * @rtype: chunk type being encoded + * + * Returns 0 on success; otherwise a negative errno is returned. + */ +int +rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, u32 hdrlen, + struct xdr_buf *xdr, enum rpcrdma_chunktype rtype) +{ + req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf); + if (!req->rl_sendctx) + return -EAGAIN; + req->rl_sendctx->sc_wr.num_sge = 0; + req->rl_sendctx->sc_unmap_count = 0; + req->rl_sendctx->sc_req = req; + __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags); + + if (!rpcrdma_prepare_hdr_sge(&r_xprt->rx_ia, req, hdrlen)) + return -EIO; + + if (rtype != rpcrdma_areadch) + if (!rpcrdma_prepare_msg_sges(&r_xprt->rx_ia, req, xdr, rtype)) + return -EIO; + + return 0; +} + +/** + * rpcrdma_marshal_req - Marshal and send one RPC request + * @r_xprt: controlling transport + * @rqst: RPC request to be marshaled + * + * For the RPC in "rqst", this function: + * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) + * - Registers Read, Write, and Reply chunks + * - Constructs the transport header + * - Posts a Send WR to send the transport header and request + * + * Returns: + * %0 if the RPC was sent successfully, + * %-ENOTCONN if the connection was lost, + * %-EAGAIN if the caller should call again with the same arguments, + * %-ENOBUFS if the caller should call again after a delay, + * %-EMSGSIZE if the transport header is too small, + * %-EIO if a permanent problem occurred while marshaling. 
+ */ +int +rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct xdr_stream *xdr = &req->rl_stream; + enum rpcrdma_chunktype rtype, wtype; + bool ddp_allowed; + __be32 *p; + int ret; + + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); + xdr_init_encode(xdr, &req->rl_hdrbuf, + req->rl_rdmabuf->rg_base); + + /* Fixed header fields */ + ret = -EMSGSIZE; + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); + if (!p) + goto out_err; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); + + /* When the ULP employs a GSS flavor that guarantees integrity + * or privacy, direct data placement of individual data items + * is not allowed. + */ + ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & + RPCAUTH_AUTH_DATATOUCH); + + /* + * Chunks needed for results? + * + * o If the expected result is under the inline threshold, all ops + * return as inline. + * o Large read ops return data as write chunk(s), header as + * inline. + * o Large non-read ops return as a single reply chunk. + */ + if (rpcrdma_results_inline(r_xprt, rqst)) + wtype = rpcrdma_noch; + else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) + wtype = rpcrdma_writech; + else + wtype = rpcrdma_replych; + + /* + * Chunks needed for arguments? + * + * o If the total request is under the inline threshold, all ops + * are sent as inline. + * o Large write ops transmit data as read chunk(s), header as + * inline. + * o Large non-write ops are sent with the entire message as a + * single read chunk (protocol 0-position special case). + * + * This assumes that the upper layer does not present a request + * that both has a data payload, and whose non-data arguments + * by themselves are larger than the inline threshold. + */ + if (rpcrdma_args_inline(r_xprt, rqst)) { + *p++ = rdma_msg; + rtype = rpcrdma_noch; + } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + *p++ = rdma_msg; + rtype = rpcrdma_readch; + } else { + r_xprt->rx_stats.nomsg_call_count++; + *p++ = rdma_nomsg; + rtype = rpcrdma_areadch; + } + + /* If this is a retransmit, discard previously registered + * chunks. Very likely the connection has been replaced, + * so these registrations are invalid and unusable. + */ + while (unlikely(!list_empty(&req->rl_registered))) { + struct rpcrdma_mr *mr; + + mr = rpcrdma_mr_pop(&req->rl_registered); + rpcrdma_mr_defer_recovery(mr); + } + + /* This implementation supports the following combinations + * of chunk lists in one RPC-over-RDMA Call message: + * + * - Read list + * - Write list + * - Reply chunk + * - Read list + Reply chunk + * + * It might not yet support the following combinations: + * + * - Read list + Write list + * + * It does not support the following combinations: + * + * - Write list + Reply chunk + * - Read list + Write list + Reply chunk + * + * This implementation supports only a single chunk in each + * Read or Write list. Thus for example the client cannot + * send a Call message with a Position Zero Read chunk and a + * regular Read chunk at the same time. 
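The chunk-type selection above boils down to a small decision function. A hedged user-space restatement (the ex_* enum and names are invented; this is not the kernel's code):

#include <stdbool.h>
#include <stdio.h>

enum ex_chunktype { ex_noch, ex_readch, ex_areadch };

/* Argument-side rules: inline if it fits, Read chunk(s) when direct
 * data placement of the payload is allowed, otherwise the whole
 * message goes as a Position Zero Read chunk (RDMA_NOMSG).
 */
static enum ex_chunktype choose_rtype(bool args_inline, bool ddp_allowed,
                                      bool snd_has_data_payload)
{
        if (args_inline)
                return ex_noch;
        if (ddp_allowed && snd_has_data_payload)
                return ex_readch;
        return ex_areadch;
}

int main(void)
{
        printf("%d\n", choose_rtype(false, true, true));  /* 1 == ex_readch */
        return 0;
}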
+ */ + if (rtype != rpcrdma_noch) { + ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); + if (ret) + goto out_err; + } + ret = encode_item_not_present(xdr); + if (ret) + goto out_err; + + if (wtype == rpcrdma_writech) { + ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); + if (ret) + goto out_err; + } + ret = encode_item_not_present(xdr); + if (ret) + goto out_err; + + if (wtype != rpcrdma_replych) + ret = encode_item_not_present(xdr); + else + ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); + if (ret) + goto out_err; + + trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype); + + ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr), + &rqst->rq_snd_buf, rtype); + if (ret) + goto out_err; + return 0; + +out_err: + switch (ret) { + case -EAGAIN: + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + break; + case -ENOBUFS: + break; + default: + r_xprt->rx_stats.failed_marshal_count++; + } + return ret; +} + +/** + * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs + * @rqst: controlling RPC request + * @srcp: points to RPC message payload in receive buffer + * @copy_len: remaining length of receive buffer content + * @pad: Write chunk pad bytes needed (zero for pure inline) + * + * The upper layer has set the maximum number of bytes it can + * receive in each component of rq_rcv_buf. These values are set in + * the head.iov_len, page_len, tail.iov_len, and buflen fields. + * + * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in + * many cases this function simply updates iov_base pointers in + * rq_rcv_buf to point directly to the received reply data, to + * avoid copying reply data. + * + * Returns the count of bytes which had to be memcopied. + */ +static unsigned long +rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) +{ + unsigned long fixup_copy_count; + int i, npages, curlen; + char *destp; + struct page **ppages; + int page_base; + + /* The head iovec is redirected to the RPC reply message + * in the receive buffer, to avoid a memcopy. + */ + rqst->rq_rcv_buf.head[0].iov_base = srcp; + rqst->rq_private_buf.head[0].iov_base = srcp; + + /* The contents of the receive buffer that follow + * head.iov_len bytes are copied into the page list. + */ + curlen = rqst->rq_rcv_buf.head[0].iov_len; + if (curlen > copy_len) + curlen = copy_len; + trace_xprtrdma_fixup(rqst, copy_len, curlen); + srcp += curlen; + copy_len -= curlen; + + ppages = rqst->rq_rcv_buf.pages + + (rqst->rq_rcv_buf.page_base >> PAGE_SHIFT); + page_base = offset_in_page(rqst->rq_rcv_buf.page_base); + fixup_copy_count = 0; + if (copy_len && rqst->rq_rcv_buf.page_len) { + int pagelist_len; + + pagelist_len = rqst->rq_rcv_buf.page_len; + if (pagelist_len > copy_len) + pagelist_len = copy_len; + npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + curlen = PAGE_SIZE - page_base; + if (curlen > pagelist_len) + curlen = pagelist_len; + + trace_xprtrdma_fixup_pg(rqst, i, srcp, + copy_len, curlen); + destp = kmap_atomic(ppages[i]); + memcpy(destp + page_base, srcp, curlen); + flush_dcache_page(ppages[i]); + kunmap_atomic(destp); + srcp += curlen; + copy_len -= curlen; + fixup_copy_count += curlen; + pagelist_len -= curlen; + if (!pagelist_len) + break; + page_base = 0; + } + + /* Implicit padding for the last segment in a Write + * chunk is inserted inline at the front of the tail + * iovec. The upper layer ignores the content of + * the pad. 
Simply ensure inline content in the tail + * that follows the Write chunk is properly aligned. + */ + if (pad) + srcp -= pad; + } + + /* The tail iovec is redirected to the remaining data + * in the receive buffer, to avoid a memcopy. + */ + if (copy_len || pad) { + rqst->rq_rcv_buf.tail[0].iov_base = srcp; + rqst->rq_private_buf.tail[0].iov_base = srcp; + } + + return fixup_copy_count; +} + +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool +rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; + + if (rep->rr_proc != rdma_msg) + return false; + + /* Peek at stream contents without advancing. */ + p = xdr_inline_decode(xdr, 0); + + /* Chunk lists */ + if (*p++ != xdr_zero) + return false; + if (*p++ != xdr_zero) + return false; + if (*p++ != xdr_zero) + return false; + + /* RPC header */ + if (*p++ != rep->rr_xid) + return false; + if (*p != cpu_to_be32(RPC_CALL)) + return false; + + /* No bc service. */ + if (xprt->bc_serv == NULL) + return false; + + /* Now that we are sure this is a backchannel call, + * advance to the RPC header. + */ + p = xdr_inline_decode(xdr, 3 * sizeof(*p)); + if (unlikely(!p)) + goto out_short; + + rpcrdma_bc_receive_call(r_xprt, rep); + return true; + +out_short: + pr_warn("RPC/RDMA short backward direction call\n"); + return true; +} +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +{ + return false; +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) +{ + u32 handle; + u64 offset; + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + handle = be32_to_cpup(p++); + *length = be32_to_cpup(p++); + xdr_decode_hyper(p, &offset); + + trace_xprtrdma_decode_seg(handle, *length, offset); + return 0; +} + +static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) +{ + u32 segcount, seglength; + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + *length = 0; + segcount = be32_to_cpup(p); + while (segcount--) { + if (decode_rdma_segment(xdr, &seglength)) + return -EIO; + *length += seglength; + } + + return 0; +} + +/* In RPC-over-RDMA Version One replies, a Read list is never + * expected. This decoder is a stub that returns an error if + * a Read list is present. 
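decode_rdma_segment() above pulls one plain RDMA segment off the wire: a 32-bit handle, a 32-bit length, and a 64-bit offset, all big-endian. A self-contained user-space sketch of the same 16-byte layout (struct and function names are invented):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

struct ex_segment { uint32_t handle, length; uint64_t offset; };

static struct ex_segment decode_segment_example(const uint32_t *p)
{
        struct ex_segment seg;

        seg.handle = ntohl(p[0]);
        seg.length = ntohl(p[1]);
        seg.offset = ((uint64_t)ntohl(p[2]) << 32) | ntohl(p[3]);
        return seg;
}

int main(void)
{
        uint32_t wire[4] = { htonl(0x1234), htonl(8192),
                             htonl(0x1), htonl(0x5000) };
        struct ex_segment seg = decode_segment_example(wire);

        printf("handle=0x%x length=%u offset=0x%llx\n", seg.handle,
               seg.length, (unsigned long long)seg.offset);
        return 0;
}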
+ */ +static int decode_read_list(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != xdr_zero)) + return -EIO; + return 0; +} + +/* Supports only one Write chunk in the Write list + */ +static int decode_write_list(struct xdr_stream *xdr, u32 *length) +{ + u32 chunklen; + bool first; + __be32 *p; + + *length = 0; + first = true; + do { + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + if (*p == xdr_zero) + break; + if (!first) + return -EIO; + + if (decode_write_chunk(xdr, &chunklen)) + return -EIO; + *length += chunklen; + first = false; + } while (true); + return 0; +} + +static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + *length = 0; + if (*p != xdr_zero) + if (decode_write_chunk(xdr, length)) + return -EIO; + return 0; +} + +static int +rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + struct rpc_rqst *rqst) +{ + struct xdr_stream *xdr = &rep->rr_stream; + u32 writelist, replychunk, rpclen; + char *base; + + /* Decode the chunk lists */ + if (decode_read_list(xdr)) + return -EIO; + if (decode_write_list(xdr, &writelist)) + return -EIO; + if (decode_reply_chunk(xdr, &replychunk)) + return -EIO; + + /* RDMA_MSG sanity checks */ + if (unlikely(replychunk)) + return -EIO; + + /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ + base = (char *)xdr_inline_decode(xdr, 0); + rpclen = xdr_stream_remaining(xdr); + r_xprt->rx_stats.fixup_copy_count += + rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); + + r_xprt->rx_stats.total_rdma_reply += writelist; + return rpclen + xdr_align_size(writelist); +} + +static noinline int +rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) +{ + struct xdr_stream *xdr = &rep->rr_stream; + u32 writelist, replychunk; + + /* Decode the chunk lists */ + if (decode_read_list(xdr)) + return -EIO; + if (decode_write_list(xdr, &writelist)) + return -EIO; + if (decode_reply_chunk(xdr, &replychunk)) + return -EIO; + + /* RDMA_NOMSG sanity checks */ + if (unlikely(writelist)) + return -EIO; + if (unlikely(!replychunk)) + return -EIO; + + /* Reply chunk buffer already is the reply vector */ + r_xprt->rx_stats.total_rdma_reply += replychunk; + return replychunk; +} + +static noinline int +rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + struct rpc_rqst *rqst) +{ + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + switch (*p) { + case err_vers: + p = xdr_inline_decode(xdr, 2 * sizeof(*p)); + if (!p) + break; + dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n", + rqst->rq_task->tk_pid, __func__, + be32_to_cpup(p), be32_to_cpu(*(p + 1))); + break; + case err_chunk: + dprintk("RPC: %5u: %s: server reports header decoding error\n", + rqst->rq_task->tk_pid, __func__); + break; + default: + dprintk("RPC: %5u: %s: server reports unrecognized error %d\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpup(p)); + } + + r_xprt->rx_stats.bad_reply_count++; + return -EREMOTEIO; +} + +/* Perform XID lookup, reconstruction of the RPC reply, and + * RPC completion while holding the transport lock to ensure + * the rep, rqst, and rq_task pointers remain stable. 
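rpcrdma_decode_msg() and rpcrdma_decode_nomsg() above enforce a small consistency matrix: an RDMA_MSG reply must not carry a Reply chunk, and an RDMA_NOMSG reply must carry one and leave the Write list empty. Restated as a user-space check (names invented):

#include <stdbool.h>
#include <stdio.h>

static bool reply_chunks_valid(bool is_nomsg, unsigned int writelist,
                               unsigned int replychunk)
{
        if (!is_nomsg)
                return replychunk == 0;                 /* RDMA_MSG */
        return writelist == 0 && replychunk != 0;       /* RDMA_NOMSG */
}

int main(void)
{
        printf("%d\n", reply_chunks_valid(false, 4096, 0)); /* 1: valid */
        printf("%d\n", reply_chunks_valid(true, 0, 0));     /* 0: invalid */
        return 0;
}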
+ */ +void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) +{ + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpc_rqst *rqst = rep->rr_rqst; + unsigned long cwnd; + int status; + + xprt->reestablish_timeout = 0; + + switch (rep->rr_proc) { + case rdma_msg: + status = rpcrdma_decode_msg(r_xprt, rep, rqst); + break; + case rdma_nomsg: + status = rpcrdma_decode_nomsg(r_xprt, rep); + break; + case rdma_error: + status = rpcrdma_decode_error(r_xprt, rep, rqst); + break; + default: + status = -EIO; + } + if (status < 0) + goto out_badheader; + +out: + spin_lock(&xprt->recv_lock); + cwnd = xprt->cwnd; + xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(rqst->rq_task); + + xprt_complete_rqst(rqst->rq_task, status); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->recv_lock); + return; + +/* If the incoming reply terminated a pending RPC, the next + * RPC call will post a replacement receive buffer as it is + * being marshaled. + */ +out_badheader: + trace_xprtrdma_reply_hdr(rep); + r_xprt->rx_stats.bad_reply_count++; + status = -EIO; + goto out; +} + +void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + /* Invalidate and unmap the data payloads before waking + * the waiting application. This guarantees the memory + * regions are properly fenced from the server before the + * application accesses the data. It also ensures proper + * send flow control: waking the next RPC waits until this + * RPC has relinquished all its Send Queue entries. + */ + if (!list_empty(&req->rl_registered)) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); + + /* Ensure that any DMA mapped pages associated with + * the Send of the RPC Call have been unmapped before + * allowing the RPC to complete. This protects argument + * memory not controlled by the RPC client from being + * re-used before we're done with it. + */ + if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { + r_xprt->rx_stats.reply_waits_for_send++; + out_of_line_wait_on_bit(&req->rl_flags, + RPCRDMA_REQ_F_TX_RESOURCES, + bit_wait, + TASK_UNINTERRUPTIBLE); + } +} + +/* Reply handling runs in the poll worker thread. Anything that + * might wait is deferred to a separate workqueue. + */ +void rpcrdma_deferred_completion(struct work_struct *work) +{ + struct rpcrdma_rep *rep = + container_of(work, struct rpcrdma_rep, rr_work); + struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + + trace_xprtrdma_defer_cmp(rep); + if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) + r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered); + rpcrdma_release_rqst(r_xprt, req); + rpcrdma_complete_rqst(rep); +} + +/* Process received RPC/RDMA messages. + * + * Errors must result in the RPC task either being awakened, or + * allowed to timeout, to discover the errors at that time. 
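The reply handler that follows starts by decoding four fixed big-endian header words: XID, protocol version, credits granted, and the message type. A user-space sketch of that parse (the kernel actually keeps the XID and proc fields in network order; names here are invented):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

struct ex_rpcrdma_hdr { uint32_t xid, vers, credits, proc; };

static struct ex_rpcrdma_hdr parse_fixed_hdr(const uint32_t *p)
{
        struct ex_rpcrdma_hdr h = {
                .xid     = ntohl(p[0]),
                .vers    = ntohl(p[1]),
                .credits = ntohl(p[2]),
                .proc    = ntohl(p[3]),
        };
        return h;
}

int main(void)
{
        uint32_t wire[4] = { htonl(0xdeadbeef), htonl(1), htonl(32), htonl(0) };
        struct ex_rpcrdma_hdr h = parse_fixed_hdr(wire);

        printf("xid=0x%08x vers=%u credits=%u proc=%u\n",
               h.xid, h.vers, h.credits, h.proc);
        return 0;
}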
+ */ +void rpcrdma_reply_handler(struct rpcrdma_rep *rep) +{ + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + u32 credits; + __be32 *p; + + --buf->rb_posted_receives; + + if (rep->rr_hdrbuf.head[0].iov_len == 0) + goto out_badstatus; + + /* Fixed transport header fields */ + xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, + rep->rr_hdrbuf.head[0].iov_base); + p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); + if (unlikely(!p)) + goto out_shortreply; + rep->rr_xid = *p++; + rep->rr_vers = *p++; + credits = be32_to_cpu(*p++); + rep->rr_proc = *p++; + + if (rep->rr_vers != rpcrdma_version) + goto out_badversion; + + if (rpcrdma_is_bcall(r_xprt, rep)) + return; + + /* Match incoming rpcrdma_rep to an rpcrdma_req to + * get context for handling any incoming chunks. + */ + spin_lock(&xprt->recv_lock); + rqst = xprt_lookup_rqst(xprt, rep->rr_xid); + if (!rqst) + goto out_norqst; + xprt_pin_rqst(rqst); + + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > buf->rb_max_requests) + credits = buf->rb_max_requests; + buf->rb_credits = credits; + + spin_unlock(&xprt->recv_lock); + + req = rpcr_to_rdmar(rqst); + if (req->rl_reply) { + trace_xprtrdma_leaked_rep(rqst, req->rl_reply); + rpcrdma_recv_buffer_put(req->rl_reply); + } + req->rl_reply = rep; + rep->rr_rqst = rqst; + clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); + + trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); + + rpcrdma_post_recvs(r_xprt, false); + queue_work(rpcrdma_receive_wq, &rep->rr_work); + return; + +out_badversion: + trace_xprtrdma_reply_vers(rep); + goto repost; + +/* The RPC transaction has already been terminated, or the header + * is corrupt. + */ +out_norqst: + spin_unlock(&xprt->recv_lock); + trace_xprtrdma_reply_rqst(rep); + goto repost; + +out_shortreply: + trace_xprtrdma_reply_short(rep); + +/* If no pending RPC transaction was matched, post a replacement + * receive buffer before returning. + */ +repost: + rpcrdma_post_recvs(r_xprt, false); +out_badstatus: + rpcrdma_recv_buffer_put(rep); +} diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c new file mode 100644 index 000000000..134bef6a4 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2015-2018 Oracle. All rights reserved. + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. 
nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker <tom@opengridcomputing.com> + */ + +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/sysctl.h> +#include <linux/workqueue.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/sched.h> +#include <linux/sunrpc/svc_rdma.h> + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* RPC/RDMA parameters */ +unsigned int svcrdma_ord = 16; /* historical default */ +static unsigned int min_ord = 1; +static unsigned int max_ord = 255; +unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; +static unsigned int min_max_requests = 4; +static unsigned int max_max_requests = 16384; +unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; + +atomic_t rdma_stat_recv; +atomic_t rdma_stat_read; +atomic_t rdma_stat_write; +atomic_t rdma_stat_sq_starve; +atomic_t rdma_stat_rq_starve; +atomic_t rdma_stat_rq_poll; +atomic_t rdma_stat_rq_prod; +atomic_t rdma_stat_sq_poll; +atomic_t rdma_stat_sq_prod; + +struct workqueue_struct *svc_rdma_wq; + +/* + * This function implements reading and resetting an atomic_t stat + * variable through read/write to a proc file. Any write to the file + * resets the associated statistic to zero. Any read returns it's + * current value. 
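The sysctl tables registered below expose these counters under /proc/sys/sunrpc/svc_rdma/. A hypothetical user-space read-then-reset of one of them, assuming that path:

#include <stdio.h>

int main(void)
{
        const char *path = "/proc/sys/sunrpc/svc_rdma/rdma_stat_recv";
        char buf[32];
        FILE *f;

        f = fopen(path, "r");
        if (!f)
                return 1;
        if (fgets(buf, sizeof(buf), f))
                printf("receives so far: %s", buf);
        fclose(f);

        /* Any write resets the counter to zero. */
        f = fopen(path, "w");
        if (!f)
                return 1;
        fputs("0\n", f);
        fclose(f);
        return 0;
}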
+ */ +static int read_reset_stat(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + atomic_t *stat = (atomic_t *)table->data; + + if (!stat) + return -EINVAL; + + if (write) + atomic_set(stat, 0); + else { + char str_buf[32]; + int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); + if (len >= 32) + return -EFAULT; + len = strlen(str_buf); + if (*ppos > len) { + *lenp = 0; + return 0; + } + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len && copy_to_user(buffer, str_buf, len)) + return -EFAULT; + *lenp = len; + *ppos += len; + } + return 0; +} + +static struct ctl_table_header *svcrdma_table_header; +static struct ctl_table svcrdma_parm_table[] = { + { + .procname = "max_requests", + .data = &svcrdma_max_requests, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_requests, + .extra2 = &max_max_requests + }, + { + .procname = "max_req_size", + .data = &svcrdma_max_req_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_max_inline, + .extra2 = &max_max_inline + }, + { + .procname = "max_outbound_read_requests", + .data = &svcrdma_ord, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_ord, + .extra2 = &max_ord, + }, + + { + .procname = "rdma_stat_read", + .data = &rdma_stat_read, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_recv", + .data = &rdma_stat_recv, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_write", + .data = &rdma_stat_write, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_starve", + .data = &rdma_stat_sq_starve, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_starve", + .data = &rdma_stat_rq_starve, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_poll", + .data = &rdma_stat_rq_poll, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_rq_prod", + .data = &rdma_stat_rq_prod, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_poll", + .data = &rdma_stat_sq_poll, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { + .procname = "rdma_stat_sq_prod", + .data = &rdma_stat_sq_prod, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = read_reset_stat, + }, + { }, +}; + +static struct ctl_table svcrdma_table[] = { + { + .procname = "svc_rdma", + .mode = 0555, + .child = svcrdma_parm_table + }, + { }, +}; + +static struct ctl_table svcrdma_root_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = svcrdma_table + }, + { }, +}; + +void svc_rdma_cleanup(void) +{ + dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); + destroy_workqueue(svc_rdma_wq); + if (svcrdma_table_header) { + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; + } +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + svc_unreg_xprt_class(&svc_rdma_bc_class); +#endif + svc_unreg_xprt_class(&svc_rdma_class); +} + +int svc_rdma_init(void) +{ + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); + dprintk("\tsvcrdma_ord : 
%d\n", svcrdma_ord); + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); + dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); + + svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); + if (!svc_rdma_wq) + return -ENOMEM; + + if (!svcrdma_table_header) + svcrdma_table_header = + register_sysctl_table(svcrdma_root_table); + + /* Register RDMA with the SVC transport switch */ + svc_reg_xprt_class(&svc_rdma_class); +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + svc_reg_xprt_class(&svc_rdma_bc_class); +#endif + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c new file mode 100644 index 000000000..cf2272a90 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2015-2018 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA (server-side). + */ + +#include <linux/module.h> + +#include <linux/sunrpc/svc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +#undef SVCRDMA_BACKCHANNEL_DEBUG + +/** + * svc_rdma_handle_bc_reply - Process incoming backchannel reply + * @xprt: controlling backchannel transport + * @rdma_resp: pointer to incoming transport header + * @rcvbuf: XDR buffer into which to decode the reply + * + * Returns: + * %0 if @rcvbuf is filled in, xprt_complete_rqst called, + * %-EAGAIN if server should call ->recvfrom again. + */ +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, + struct xdr_buf *rcvbuf) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct kvec *dst, *src = &rcvbuf->head[0]; + struct rpc_rqst *req; + unsigned long cwnd; + u32 credits; + size_t len; + __be32 xid; + __be32 *p; + int ret; + + p = (__be32 *)src->iov_base; + len = src->iov_len; + xid = *rdma_resp; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: xid=%08x, length=%zu\n", + __func__, be32_to_cpu(xid), len); + pr_info("%s: RPC/RDMA: %*ph\n", + __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp); + pr_info("%s: RPC: %*ph\n", + __func__, (int)len, p); +#endif + + ret = -EAGAIN; + if (src->iov_len < 24) + goto out_shortreply; + + spin_lock(&xprt->recv_lock); + req = xprt_lookup_rqst(xprt, xid); + if (!req) + goto out_notfound; + + dst = &req->rq_private_buf.head[0]; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + if (dst->iov_len < len) + goto out_unlock; + memcpy(dst->iov_base, p, len); + + credits = be32_to_cpup(rdma_resp + 2); + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > r_xprt->rx_buf.rb_bc_max_requests) + credits = r_xprt->rx_buf.rb_bc_max_requests; + + spin_lock_bh(&xprt->transport_lock); + cwnd = xprt->cwnd; + xprt->cwnd = credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(req->rq_task); + spin_unlock_bh(&xprt->transport_lock); + + + ret = 0; + xprt_complete_rqst(req->rq_task, rcvbuf->len); + rcvbuf->len = 0; + +out_unlock: + spin_unlock(&xprt->recv_lock); +out: + return ret; + +out_shortreply: + dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", + xprt, src->iov_len); + goto out; + +out_notfound: + dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", + xprt, be32_to_cpu(xid)); + goto out_unlock; +} + +/* Send a backwards direction RPC call. + * + * Caller holds the connection's mutex and has already marshaled + * the RPC/RDMA request. 
+ * + * This is similar to svc_rdma_send_reply_msg, but takes a struct + * rpc_rqst instead, does not support chunks, and avoids blocking + * memory allocation. + * + * XXX: There is still an opportunity to block in svc_rdma_send() + * if there are no SQ entries to post the Send. This may occur if + * the adapter has a small maximum SQ depth. + */ +static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, + struct rpc_rqst *rqst, + struct svc_rdma_send_ctxt *ctxt) +{ + int ret; + + ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqst->rq_snd_buf, NULL); + if (ret < 0) + return -EIO; + + /* Bump page refcnt so Send completion doesn't release + * the rq_buffer before all retransmits are complete. + */ + get_page(virt_to_page(rqst->rq_buffer)); + ctxt->sc_send_wr.opcode = IB_WR_SEND; + return svc_rdma_send(rdma, &ctxt->sc_send_wr); +} + +/* Server-side transport endpoint wants a whole page for its send + * buffer. The client RPC code constructs the RPC header in this + * buffer before it invokes ->send_request. + */ +static int +xprt_rdma_bc_allocate(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + size_t size = rqst->rq_callsize; + struct page *page; + + if (size > PAGE_SIZE) { + WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", + size); + return -EINVAL; + } + + page = alloc_page(RPCRDMA_DEF_GFP); + if (!page) + return -ENOMEM; + rqst->rq_buffer = page_address(page); + + rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP); + if (!rqst->rq_rbuffer) { + put_page(page); + return -ENOMEM; + } + return 0; +} + +static void +xprt_rdma_bc_free(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + + put_page(virt_to_page(rqst->rq_buffer)); + kfree(rqst->rq_rbuffer); +} + +static int +rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct svc_rdma_send_ctxt *ctxt; + __be32 *p; + int rc; + + ctxt = svc_rdma_send_ctxt_get(rdma); + if (!ctxt) + goto drop_connection; + + p = ctxt->sc_xprt_buf; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; + svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_MIN); + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); +#endif + + rc = svc_rdma_bc_sendto(rdma, rqst, ctxt); + if (rc) { + svc_rdma_send_ctxt_put(rdma, ctxt); + goto drop_connection; + } + return rc; + +drop_connection: + dprintk("svcrdma: failed to send bc call\n"); + xprt_disconnect_done(xprt); + return -ENOTCONN; +} + +/* Send an RPC call on the passive end of a transport + * connection. 
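rpcrdma_bc_send_request() above emits the smallest possible transport header: XID, version one, the credit grant, rdma_msg, and three empty chunk lists, i.e. seven XDR words (RPCRDMA_HDRLEN_MIN). A user-space sketch of that layout (the numeric value used for rdma_msg is an assumption here; the authoritative definitions live in the RPC-over-RDMA headers):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static size_t build_bc_hdr_example(uint32_t *p, uint32_t xid, uint32_t credits)
{
        p[0] = xid;             /* already big-endian, like rq_xid */
        p[1] = htonl(1);        /* RPC-over-RDMA version one */
        p[2] = htonl(credits);  /* rb_bc_max_requests */
        p[3] = htonl(0);        /* rdma_msg (assumed wire value) */
        p[4] = 0;               /* empty Read list  (xdr_zero) */
        p[5] = 0;               /* empty Write list (xdr_zero) */
        p[6] = 0;               /* empty Reply chunk (xdr_zero) */
        return 7 * sizeof(*p);  /* 28 bytes */
}

int main(void)
{
        uint32_t hdr[7];

        printf("header length: %zu bytes\n",
               build_bc_hdr_example(hdr, htonl(0x01020304), 8));
        return 0;
}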
+ */ +static int +xprt_rdma_bc_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + int ret; + + dprintk("svcrdma: sending bc call with xid: %08x\n", + be32_to_cpu(rqst->rq_xid)); + + if (!mutex_trylock(&sxprt->xpt_mutex)) { + rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); + if (!mutex_trylock(&sxprt->xpt_mutex)) + return -EAGAIN; + rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); + } + + ret = -ENOTCONN; + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) + ret = rpcrdma_bc_send_request(rdma, rqst); + + mutex_unlock(&sxprt->xpt_mutex); + + if (ret < 0) + return ret; + return 0; +} + +static void +xprt_rdma_bc_close(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + xprt->cwnd = RPC_CWNDSHIFT; +} + +static void +xprt_rdma_bc_put(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + module_put(THIS_MODULE); +} + +static const struct rpc_xprt_ops xprt_rdma_bc_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, + .free_slot = xprt_free_slot, + .release_request = xprt_release_rqst_cong, + .buf_alloc = xprt_rdma_bc_allocate, + .buf_free = xprt_rdma_bc_free, + .send_request = xprt_rdma_bc_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xprt_rdma_bc_close, + .destroy = xprt_rdma_bc_put, + .print_stats = xprt_rdma_print_stats +}; + +static const struct rpc_timeout xprt_rdma_bc_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/* It shouldn't matter if the number of backchannel session slots + * doesn't match the number of RPC/RDMA credits. That just means + * one or the other will have extra slots that aren't used. 
+ */ +static struct rpc_xprt * +xprt_setup_rdma_bc(struct xprt_create *args) +{ + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(*new_xprt), + RPCRDMA_MAX_BC_REQUESTS, + RPCRDMA_MAX_BC_REQUESTS); + if (!xprt) { + dprintk("RPC: %s: couldn't allocate rpc_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + xprt->timeout = &xprt_rdma_bc_timeout; + xprt_set_bound(xprt); + xprt_set_connected(xprt); + xprt->bind_timeout = 0; + xprt->reestablish_timeout = 0; + xprt->idle_timeout = 0; + + xprt->prot = XPRT_TRANSPORT_BC_RDMA; + xprt->tsh_size = 0; + xprt->ops = &xprt_rdma_bc_procs; + + memcpy(&xprt->addr, args->dstaddr, args->addrlen); + xprt->addrlen = args->addrlen; + xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr); + xprt->resvport = 0; + + xprt->max_payload = xprt_rdma_max_inline_read; + + new_xprt = rpcx_to_rdmax(xprt); + new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs; + + xprt_get(xprt); + args->bc_xprt->xpt_bc_xprt = xprt; + xprt->bc_xprt = args->bc_xprt; + + if (!try_module_get(THIS_MODULE)) + goto out_fail; + + /* Final put for backchannel xprt is in __svc_rdma_free */ + xprt_get(xprt); + return xprt; + +out_fail: + xprt_rdma_free_addresses(xprt); + args->bc_xprt->xpt_bc_xprt = NULL; + args->bc_xprt->xpt_bc_xps = NULL; + xprt_put(xprt); + xprt_free(xprt); + return ERR_PTR(-EINVAL); +} + +struct xprt_class xprt_rdma_bc = { + .list = LIST_HEAD_INIT(xprt_rdma_bc.list), + .name = "rdma backchannel", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_BC_RDMA, + .setup = xprt_setup_rdma_bc, +}; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c new file mode 100644 index 000000000..252495ff9 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -0,0 +1,804 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2016-2018 Oracle. All rights reserved. + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker <tom@opengridcomputing.com> + */ + +/* Operation + * + * The main entry point is svc_rdma_recvfrom. This is called from + * svc_recv when the transport indicates there is incoming data to + * be read. "Data Ready" is signaled when an RDMA Receive completes, + * or when a set of RDMA Reads complete. + * + * An svc_rqst is passed in. This structure contains an array of + * free pages (rq_pages) that will contain the incoming RPC message. + * + * Short messages are moved directly into svc_rqst::rq_arg, and + * the RPC Call is ready to be processed by the Upper Layer. + * svc_rdma_recvfrom returns the length of the RPC Call message, + * completing the reception of the RPC Call. + * + * However, when an incoming message has Read chunks, + * svc_rdma_recvfrom must post RDMA Reads to pull the RPC Call's + * data payload from the client. svc_rdma_recvfrom sets up the + * RDMA Reads using pages in svc_rqst::rq_pages, which are + * transferred to an svc_rdma_recv_ctxt for the duration of the + * I/O. svc_rdma_recvfrom then returns zero, since the RPC message + * is still not yet ready. + * + * When the Read chunk payloads have become available on the + * server, "Data Ready" is raised again, and svc_recv calls + * svc_rdma_recvfrom again. This second call may use a different + * svc_rqst than the first one, thus any information that needs + * to be preserved across these two calls is kept in an + * svc_rdma_recv_ctxt. + * + * The second call to svc_rdma_recvfrom performs final assembly + * of the RPC Call message, using the RDMA Read sink pages kept in + * the svc_rdma_recv_ctxt. The xdr_buf is copied from the + * svc_rdma_recv_ctxt to the second svc_rqst. The second call returns + * the length of the completed RPC Call message. + * + * Page Management + * + * Pages under I/O must be transferred from the first svc_rqst to an + * svc_rdma_recv_ctxt before the first svc_rdma_recvfrom call returns. + * + * The first svc_rqst supplies pages for RDMA Reads. These are moved + * from rqstp::rq_pages into ctxt::pages. The consumed elements of + * the rq_pages array are set to NULL and refilled with the first + * svc_rdma_recvfrom call returns. + * + * During the second svc_rdma_recvfrom call, RDMA Read sink pages + * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst + * (see rdma_read_complete() below). 
+ */ + +#include <linux/spinlock.h> +#include <asm/unaligned.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> + +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/debug.h> +#include <linux/sunrpc/rpc_rdma.h> +#include <linux/sunrpc/svc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc); + +static inline struct svc_rdma_recv_ctxt * +svc_rdma_next_recv_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_recv_ctxt, + rc_list); +} + +static struct svc_rdma_recv_ctxt * +svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + dma_addr_t addr; + void *buffer; + + ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + goto fail0; + buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + if (!buffer) + goto fail1; + addr = ib_dma_map_single(rdma->sc_pd->device, buffer, + rdma->sc_max_req_size, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + goto fail2; + + ctxt->rc_recv_wr.next = NULL; + ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; + ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge; + ctxt->rc_recv_wr.num_sge = 1; + ctxt->rc_cqe.done = svc_rdma_wc_receive; + ctxt->rc_recv_sge.addr = addr; + ctxt->rc_recv_sge.length = rdma->sc_max_req_size; + ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey; + ctxt->rc_recv_buf = buffer; + ctxt->rc_temp = false; + return ctxt; + +fail2: + kfree(buffer); +fail1: + kfree(ctxt); +fail0: + return NULL; +} + +static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) +{ + ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr, + ctxt->rc_recv_sge.length, DMA_FROM_DEVICE); + kfree(ctxt->rc_recv_buf); + kfree(ctxt); +} + +/** + * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt + * @rdma: svcxprt_rdma being torn down + * + */ +void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_destroy(rdma, ctxt); + } +} + +static struct svc_rdma_recv_ctxt * +svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + spin_lock(&rdma->sc_recv_lock); + ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts); + if (!ctxt) + goto out_empty; + list_del(&ctxt->rc_list); + spin_unlock(&rdma->sc_recv_lock); + +out: + ctxt->rc_page_count = 0; + return ctxt; + +out_empty: + spin_unlock(&rdma->sc_recv_lock); + + ctxt = svc_rdma_recv_ctxt_alloc(rdma); + if (!ctxt) + return NULL; + goto out; +} + +/** + * svc_rdma_recv_ctxt_put - Return recv_ctxt to free list + * @rdma: controlling svcxprt_rdma + * @ctxt: object to return to the free list + * + */ +void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) +{ + unsigned int i; + + for (i = 0; i < ctxt->rc_page_count; i++) + put_page(ctxt->rc_pages[i]); + + if (!ctxt->rc_temp) { + spin_lock(&rdma->sc_recv_lock); + list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts); + spin_unlock(&rdma->sc_recv_lock); + } else + svc_rdma_recv_ctxt_destroy(rdma, ctxt); +} + +/** + * svc_rdma_release_rqst - Release transport-specific per-rqst resources + * @rqstp: svc_rqst being released + * + * Ensure that the recv_ctxt is released whether or not a Reply + * was sent. 
For example, the client could close the connection, + * or svc_process could drop an RPC, before the Reply is sent. + */ +void svc_rdma_release_rqst(struct svc_rqst *rqstp) +{ + struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt; + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + rqstp->rq_xprt_ctxt = NULL; + if (ctxt) + svc_rdma_recv_ctxt_put(rdma, ctxt); +} + +static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) +{ + int ret; + + svc_xprt_get(&rdma->sc_xprt); + ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); + trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret); + if (ret) + goto err_post; + return 0; + +err_post: + svc_rdma_recv_ctxt_put(rdma, ctxt); + svc_xprt_put(&rdma->sc_xprt); + return ret; +} + +static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + return 0; + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + return -ENOMEM; + return __svc_rdma_post_recv(rdma, ctxt); +} + +/** + * svc_rdma_post_recvs - Post initial set of Recv WRs + * @rdma: fresh svcxprt_rdma + * + * Returns true if successful, otherwise false. + */ +bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + unsigned int i; + int ret; + + for (i = 0; i < rdma->sc_max_requests; i++) { + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + return false; + ctxt->rc_temp = true; + ret = __svc_rdma_post_recv(rdma, ctxt); + if (ret) { + pr_err("svcrdma: failure posting recv buffers: %d\n", + ret); + return false; + } + } + return true; +} + +/** + * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC + * @cq: Completion Queue context + * @wc: Work Completion object + * + * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that + * the Receive completion handler could be running. 
+ */ +static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +{ + struct svcxprt_rdma *rdma = cq->cq_context; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_recv_ctxt *ctxt; + + trace_svcrdma_wc_receive(wc); + + /* WARNING: Only wc->wr_cqe and wc->status are reliable */ + ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); + + if (wc->status != IB_WC_SUCCESS) + goto flushed; + + if (svc_rdma_post_recv(rdma)) + goto post_err; + + /* All wc fields are now known to be valid */ + ctxt->rc_byte_len = wc->byte_len; + ib_dma_sync_single_for_cpu(rdma->sc_pd->device, + ctxt->rc_recv_sge.addr, + wc->byte_len, DMA_FROM_DEVICE); + + spin_lock(&rdma->sc_rq_dto_lock); + list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q); + spin_unlock(&rdma->sc_rq_dto_lock); + set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); + if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) + svc_xprt_enqueue(&rdma->sc_xprt); + goto out; + +flushed: + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: Recv: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); +post_err: + svc_rdma_recv_ctxt_put(rdma, ctxt); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_xprt_enqueue(&rdma->sc_xprt); +out: + svc_xprt_put(&rdma->sc_xprt); +} + +/** + * svc_rdma_flush_recv_queues - Drain pending Receive work + * @rdma: svcxprt_rdma being shut down + * + */ +void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_put(rdma, ctxt); + } + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_put(rdma, ctxt); + } +} + +static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct xdr_buf *arg = &rqstp->rq_arg; + + arg->head[0].iov_base = ctxt->rc_recv_buf; + arg->head[0].iov_len = ctxt->rc_byte_len; + arg->tail[0].iov_base = NULL; + arg->tail[0].iov_len = 0; + arg->page_len = 0; + arg->page_base = 0; + arg->buflen = ctxt->rc_byte_len; + arg->len = ctxt->rc_byte_len; +} + +/* This accommodates the largest possible Write chunk, + * in one segment. + */ +#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) + +/* This accommodates the largest possible Position-Zero + * Read chunk or Reply chunk, in one segment. + */ +#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) + +/* Sanity check the Read list. + * + * Implementation limits: + * - This implementation supports only one Read chunk. + * + * Sanity checks: + * - Read list does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * The segment count is limited to how many segments can + * fit in the transport header without overflowing the + * buffer. That's about 40 Read segments for a 1KB inline + * threshold. + * + * Returns pointer to the following Write list. 
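A back-of-the-envelope check of the "about 40" figure above: each Read list entry is a one-word "item present" discriminator plus a read segment (position, handle, length, and a two-word offset), six XDR words or 24 bytes in all:

#include <stdio.h>

int main(void)
{
        unsigned int inline_threshold = 1024;   /* 1KB, per the comment above */
        unsigned int read_entry_bytes = 6 * 4;  /* discriminator + 5-word segment */

        printf("max Read segments: %u\n",
               inline_threshold / read_entry_bytes);    /* prints 42 */
        return 0;
}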
+ */ +static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) +{ + u32 position; + bool first; + + first = true; + while (*p++ != xdr_zero) { + if (first) { + position = be32_to_cpup(p++); + first = false; + } else if (be32_to_cpup(p++) != position) { + return NULL; + } + p++; /* handle */ + if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG) + return NULL; + p += 2; /* offset */ + + if (p > end) + return NULL; + } + return p; +} + +/* The segment count is limited to how many segments can + * fit in the transport header without overflowing the + * buffer. That's about 60 Write segments for a 1KB inline + * threshold. + */ +static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end, + u32 maxlen) +{ + u32 i, segcount; + + segcount = be32_to_cpup(p++); + for (i = 0; i < segcount; i++) { + p++; /* handle */ + if (be32_to_cpup(p++) > maxlen) + return NULL; + p += 2; /* offset */ + + if (p > end) + return NULL; + } + + return p; +} + +/* Sanity check the Write list. + * + * Implementation limits: + * - This implementation supports only one Write chunk. + * + * Sanity checks: + * - Write list does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * Returns pointer to the following Reply chunk. + */ +static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end) +{ + u32 chcount; + + chcount = 0; + while (*p++ != xdr_zero) { + p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG); + if (!p) + return NULL; + if (chcount++ > 1) + return NULL; + } + return p; +} + +/* Sanity check the Reply chunk. + * + * Sanity checks: + * - Reply chunk does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * Returns pointer to the following RPC header. + */ +static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) +{ + if (*p++ != xdr_zero) { + p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG); + if (!p) + return NULL; + } + return p; +} + +/* On entry, xdr->head[0].iov_base points to first byte in the + * RPC-over-RDMA header. + * + * On successful exit, head[0] points to first byte past the + * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. + * The length of the RPC-over-RDMA header is returned. + * + * Assumptions: + * - The transport header is entirely contained in the head iovec. 
+ */ +static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) +{ + __be32 *p, *end, *rdma_argp; + unsigned int hdr_len; + + /* Verify that there's enough bytes for header + something */ + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) + goto out_short; + + rdma_argp = rq_arg->head[0].iov_base; + if (*(rdma_argp + 1) != rpcrdma_version) + goto out_version; + + switch (*(rdma_argp + 3)) { + case rdma_msg: + break; + case rdma_nomsg: + break; + + case rdma_done: + goto out_drop; + + case rdma_error: + goto out_drop; + + default: + goto out_proc; + } + + end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); + p = xdr_check_read_list(rdma_argp + 4, end); + if (!p) + goto out_inval; + p = xdr_check_write_list(p, end); + if (!p) + goto out_inval; + p = xdr_check_reply_chunk(p, end); + if (!p) + goto out_inval; + if (p > end) + goto out_inval; + + rq_arg->head[0].iov_base = p; + hdr_len = (unsigned long)p - (unsigned long)rdma_argp; + rq_arg->head[0].iov_len -= hdr_len; + rq_arg->len -= hdr_len; + trace_svcrdma_decode_rqst(rdma_argp, hdr_len); + return hdr_len; + +out_short: + trace_svcrdma_decode_short(rq_arg->len); + return -EINVAL; + +out_version: + trace_svcrdma_decode_badvers(rdma_argp); + return -EPROTONOSUPPORT; + +out_drop: + trace_svcrdma_decode_drop(rdma_argp); + return 0; + +out_proc: + trace_svcrdma_decode_badproc(rdma_argp); + return -EINVAL; + +out_inval: + trace_svcrdma_decode_parse(rdma_argp); + return -EINVAL; +} + +static void rdma_read_complete(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head) +{ + int page_no; + + /* Move Read chunk pages to rqstp so that they will be released + * when svc_process is done with them. + */ + for (page_no = 0; page_no < head->rc_page_count; page_no++) { + put_page(rqstp->rq_pages[page_no]); + rqstp->rq_pages[page_no] = head->rc_pages[page_no]; + } + head->rc_page_count = 0; + + /* Point rq_arg.pages past header */ + rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count]; + rqstp->rq_arg.page_len = head->rc_arg.page_len; + + /* rq_respages starts after the last arg page */ + rqstp->rq_respages = &rqstp->rq_pages[page_no]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* Rebuild rq_arg head and tail. */ + rqstp->rq_arg.head[0] = head->rc_arg.head[0]; + rqstp->rq_arg.tail[0] = head->rc_arg.tail[0]; + rqstp->rq_arg.len = head->rc_arg.len; + rqstp->rq_arg.buflen = head->rc_arg.buflen; +} + +static void svc_rdma_send_error(struct svcxprt_rdma *xprt, + __be32 *rdma_argp, int status) +{ + struct svc_rdma_send_ctxt *ctxt; + unsigned int length; + __be32 *p; + int ret; + + ctxt = svc_rdma_send_ctxt_get(xprt); + if (!ctxt) + return; + + p = ctxt->sc_xprt_buf; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = xprt->sc_fc_credits; + *p++ = rdma_error; + switch (status) { + case -EPROTONOSUPPORT: + *p++ = err_vers; + *p++ = rpcrdma_version; + *p++ = rpcrdma_version; + trace_svcrdma_err_vers(*rdma_argp); + break; + default: + *p++ = err_chunk; + trace_svcrdma_err_chunk(*rdma_argp); + } + length = (unsigned long)p - (unsigned long)ctxt->sc_xprt_buf; + svc_rdma_sync_reply_hdr(xprt, ctxt, length); + + ctxt->sc_send_wr.opcode = IB_WR_SEND; + ret = svc_rdma_send(xprt, &ctxt->sc_send_wr); + if (ret) + svc_rdma_send_ctxt_put(xprt, ctxt); +} + +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. 
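For reference, svc_rdma_send_error() above builds its ERR_VERS reply as the fixed header with proc set to rdma_error, followed by err_vers and the lowest and highest versions supported (both 1). A sketch of those seven words, using placeholder numeric values for rdma_error and err_vers rather than the protocol's definitions:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define EX_RDMA_ERROR   4       /* placeholder value */
#define EX_ERR_VERS     1       /* placeholder value */

static size_t build_err_vers_example(uint32_t *p, uint32_t xid, uint32_t credits)
{
        p[0] = xid;
        p[1] = htonl(1);                /* version of this header */
        p[2] = htonl(credits);          /* sc_fc_credits */
        p[3] = htonl(EX_RDMA_ERROR);
        p[4] = htonl(EX_ERR_VERS);
        p[5] = htonl(1);                /* lowest version supported */
        p[6] = htonl(1);                /* highest version supported */
        return 7 * sizeof(*p);
}

int main(void)
{
        uint32_t hdr[7];

        printf("%zu bytes\n", build_err_vers_example(hdr, htonl(0x11223344), 32));
        return 0;
}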
+ */ +static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, + __be32 *rdma_resp) +{ + __be32 *p; + + if (!xprt->xpt_bc_xprt) + return false; + + p = rdma_resp + 3; + if (*p++ != rdma_msg) + return false; + + if (*p++ != xdr_zero) + return false; + if (*p++ != xdr_zero) + return false; + if (*p++ != xdr_zero) + return false; + + /* XID sanity */ + if (*p++ != *rdma_resp) + return false; + /* call direction */ + if (*p == cpu_to_be32(RPC_CALL)) + return false; + + return true; +} + +/** + * svc_rdma_recvfrom - Receive an RPC call + * @rqstp: request structure into which to receive an RPC Call + * + * Returns: + * The positive number of bytes in the RPC Call message, + * %0 if there were no Calls ready to return, + * %-EINVAL if the Read chunk data is too large, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * + * Called in a loop when XPT_DATA is set. XPT_DATA is cleared only + * when there are no remaining ctxt's to process. + * + * The next ctxt is removed from the "receive" lists. + * + * - If the ctxt completes a Read, then finish assembling the Call + * message and return the number of bytes in the message. + * + * - If the ctxt completes a Receive, then construct the Call + * message from the contents of the Receive buffer. + * + * - If there are no Read chunks in this message, then finish + * assembling the Call message and return the number of bytes + * in the message. + * + * - If there are Read chunks in this message, post Read WRs to + * pull that payload and return 0. + */ +int svc_rdma_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma_xprt = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct svc_rdma_recv_ctxt *ctxt; + __be32 *p; + int ret; + + rqstp->rq_xprt_ctxt = NULL; + + spin_lock(&rdma_xprt->sc_rq_dto_lock); + ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q); + if (ctxt) { + list_del(&ctxt->rc_list); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + rdma_read_complete(rqstp, ctxt); + goto complete; + } + ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q); + if (!ctxt) { + /* No new incoming requests, terminate the loop */ + clear_bit(XPT_DATA, &xprt->xpt_flags); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + return 0; + } + list_del(&ctxt->rc_list); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + + atomic_inc(&rdma_stat_recv); + + svc_rdma_build_arg_xdr(rqstp, ctxt); + + /* Prevent svc_xprt_release from releasing pages in rq_pages + * if we return 0 or an error. 
+ */ + rqstp->rq_respages = rqstp->rq_pages; + rqstp->rq_next_page = rqstp->rq_respages; + + p = (__be32 *)rqstp->rq_arg.head[0].iov_base; + ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); + if (ret < 0) + goto out_err; + if (ret == 0) + goto out_drop; + rqstp->rq_xprt_hlen = ret; + + if (svc_rdma_is_backchannel_reply(xprt, p)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, + &rqstp->rq_arg); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); + return ret; + } + + p += rpcrdma_fixed_maxsz; + if (*p != xdr_zero) + goto out_readchunk; + +complete: + rqstp->rq_xprt_ctxt = ctxt; + rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, xprt); + return rqstp->rq_arg.len; + +out_readchunk: + ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p); + if (ret < 0) + goto out_postfail; + return 0; + +out_err: + svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); + return 0; + +out_postfail: + if (ret == -EINVAL) + svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); + return ret; + +out_drop: + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c new file mode 100644 index 000000000..22f135263 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -0,0 +1,877 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016-2018 Oracle. All rights reserved. + * + * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. + */ + +#include <rdma/rw.h> + +#include <linux/sunrpc/rpc_rdma.h> +#include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/debug.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc); +static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); + +/* Each R/W context contains state for one chain of RDMA Read or + * Write Work Requests. + * + * Each WR chain handles a single contiguous server-side buffer, + * because scatterlist entries after the first have to start on + * page alignment. xdr_buf iovecs cannot guarantee alignment. + * + * Each WR chain handles only one R_key. Each RPC-over-RDMA segment + * from a client may contain a unique R_key, so each WR chain moves + * up to one segment at a time. + * + * The scatterlist makes this data structure over 4KB in size. To + * make it less likely to fail, and to handle the allocation for + * smaller I/O requests without disabling bottom-halves, these + * contexts are created on demand, but cached and reused until the + * controlling svcxprt_rdma is destroyed. 
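The caching discipline described above reduces to a free-list get-or-allocate pattern; a single-threaded user-space sketch (the kernel version serializes list access with sc_rw_ctxt_lock; all names here are invented):

#include <stdio.h>
#include <stdlib.h>

struct ex_ctxt {
        struct ex_ctxt *next;
        /* ... per-I/O state ... */
};

static struct ex_ctxt *free_list;

static struct ex_ctxt *ctxt_get(void)
{
        struct ex_ctxt *c = free_list;

        if (c)
                free_list = c->next;            /* reuse a cached context */
        else
                c = calloc(1, sizeof(*c));      /* allocate on demand */
        return c;
}

static void ctxt_put(struct ex_ctxt *c)
{
        c->next = free_list;                    /* cache instead of freeing */
        free_list = c;
}

int main(void)
{
        struct ex_ctxt *c = ctxt_get();

        ctxt_put(c);
        printf("cached: %s\n", ctxt_get() == c ? "yes" : "no");
        return 0;
}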
+ */ +struct svc_rdma_rw_ctxt { + struct list_head rw_list; + struct rdma_rw_ctx rw_ctx; + int rw_nents; + struct sg_table rw_sg_table; + struct scatterlist rw_first_sgl[0]; +}; + +static inline struct svc_rdma_rw_ctxt * +svc_rdma_next_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt, + rw_list); +} + +static struct svc_rdma_rw_ctxt * +svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) +{ + struct svc_rdma_rw_ctxt *ctxt; + + spin_lock(&rdma->sc_rw_ctxt_lock); + + ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); + if (ctxt) { + list_del(&ctxt->rw_list); + spin_unlock(&rdma->sc_rw_ctxt_lock); + } else { + spin_unlock(&rdma->sc_rw_ctxt_lock); + ctxt = kmalloc(sizeof(*ctxt) + + SG_CHUNK_SIZE * sizeof(struct scatterlist), + GFP_KERNEL); + if (!ctxt) + goto out; + INIT_LIST_HEAD(&ctxt->rw_list); + } + + ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; + if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, + ctxt->rw_sg_table.sgl)) { + kfree(ctxt); + ctxt = NULL; + } +out: + return ctxt; +} + +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt) +{ + sg_free_table_chained(&ctxt->rw_sg_table, true); + + spin_lock(&rdma->sc_rw_ctxt_lock); + list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); +} + +/** + * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts + * @rdma: transport about to be destroyed + * + */ +void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { + list_del(&ctxt->rw_list); + kfree(ctxt); + } +} + +/* A chunk context tracks all I/O for moving one Read or Write + * chunk. This is a a set of rdma_rw's that handle data movement + * for all segments of one chunk. + * + * These are small, acquired with a single allocator call, and + * no more than one is needed per chunk. They are allocated on + * demand, and not cached. + */ +struct svc_rdma_chunk_ctxt { + struct ib_cqe cc_cqe; + struct svcxprt_rdma *cc_rdma; + struct list_head cc_rwctxts; + int cc_sqecount; +}; + +static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc) +{ + cc->cc_rdma = rdma; + svc_xprt_get(&rdma->sc_xprt); + + INIT_LIST_HEAD(&cc->cc_rwctxts); + cc->cc_sqecount = 0; +} + +static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { + list_del(&ctxt->rw_list); + + rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, dir); + svc_rdma_put_rw_ctxt(rdma, ctxt); + } + svc_xprt_put(&rdma->sc_xprt); +} + +/* State for sending a Write or Reply chunk. 
+ * - Tracks progress of writing one chunk over all its segments + * - Stores arguments for the SGL constructor functions + */ +struct svc_rdma_write_info { + /* write state of this chunk */ + unsigned int wi_seg_off; + unsigned int wi_seg_no; + unsigned int wi_nsegs; + __be32 *wi_segs; + + /* SGL constructor arguments */ + struct xdr_buf *wi_xdr; + unsigned char *wi_base; + unsigned int wi_next_off; + + struct svc_rdma_chunk_ctxt wi_cc; +}; + +static struct svc_rdma_write_info * +svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) +{ + struct svc_rdma_write_info *info; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return info; + + info->wi_seg_off = 0; + info->wi_seg_no = 0; + info->wi_nsegs = be32_to_cpup(++chunk); + info->wi_segs = ++chunk; + svc_rdma_cc_init(rdma, &info->wi_cc); + info->wi_cc.cc_cqe.done = svc_rdma_write_done; + return info; +} + +static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +{ + svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE); + kfree(info); +} + +/** + * svc_rdma_write_done - Write chunk completion + * @cq: controlling Completion Queue + * @wc: Work Completion + * + * Pages under I/O are freed by a subsequent Send completion. + */ +static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_write_info *info = + container_of(cc, struct svc_rdma_write_info, wi_cc); + + trace_svcrdma_wc_write(wc); + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: write ctx: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + } + + svc_rdma_write_info_free(info); +} + +/* State for pulling a Read chunk. 
+ */ +struct svc_rdma_read_info { + struct svc_rdma_recv_ctxt *ri_readctxt; + unsigned int ri_position; + unsigned int ri_pageno; + unsigned int ri_pageoff; + unsigned int ri_chunklen; + + struct svc_rdma_chunk_ctxt ri_cc; +}; + +static struct svc_rdma_read_info * +svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_read_info *info; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return info; + + svc_rdma_cc_init(rdma, &info->ri_cc); + info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done; + return info; +} + +static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) +{ + svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE); + kfree(info); +} + +/** + * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx + * @cq: controlling Completion Queue + * @wc: Work Completion + * + */ +static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_read_info *info = + container_of(cc, struct svc_rdma_read_info, ri_cc); + + trace_svcrdma_wc_read(wc); + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: read ctx: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt); + } else { + spin_lock(&rdma->sc_rq_dto_lock); + list_add_tail(&info->ri_readctxt->rc_list, + &rdma->sc_read_complete_q); + spin_unlock(&rdma->sc_rq_dto_lock); + + set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); + svc_xprt_enqueue(&rdma->sc_xprt); + } + + svc_rdma_read_info_free(info); +} + +/* This function sleeps when the transport's Send Queue is congested. + * + * Assumptions: + * - If ib_post_send() succeeds, only one completion is expected, + * even if one or more WRs are flushed. This is true when posting + * an rdma_rw_ctx or when posting a single signaled WR. + */ +static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_xprt *xprt = &rdma->sc_xprt; + struct ib_send_wr *first_wr; + const struct ib_send_wr *bad_wr; + struct list_head *tmp; + struct ib_cqe *cqe; + int ret; + + if (cc->cc_sqecount > rdma->sc_sq_depth) + return -EINVAL; + + first_wr = NULL; + cqe = &cc->cc_cqe; + list_for_each(tmp, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *ctxt; + + ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + + do { + if (atomic_sub_return(cc->cc_sqecount, + &rdma->sc_sq_avail) > 0) { + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + break; + return 0; + } + + trace_svcrdma_sq_full(rdma); + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); + trace_svcrdma_sq_retry(rdma); + } while (1); + + trace_svcrdma_sq_post_err(rdma, ret); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + + /* If even one was posted, there will be a completion. 
*/ + if (bad_wr != first_wr) + return 0; + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + return -ENOTCONN; +} + +/* Build and DMA-map an SGL that covers one kvec in an xdr_buf + */ +static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt) +{ + struct scatterlist *sg = ctxt->rw_sg_table.sgl; + + sg_set_buf(&sg[0], info->wi_base, len); + info->wi_base += len; + + ctxt->rw_nents = 1; +} + +/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. + */ +static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, + unsigned int remaining, + struct svc_rdma_rw_ctxt *ctxt) +{ + unsigned int sge_no, sge_bytes, page_off, page_no; + struct xdr_buf *xdr = info->wi_xdr; + struct scatterlist *sg; + struct page **page; + + page_off = info->wi_next_off + xdr->page_base; + page_no = page_off >> PAGE_SHIFT; + page_off = offset_in_page(page_off); + page = xdr->pages + page_no; + info->wi_next_off += remaining; + sg = ctxt->rw_sg_table.sgl; + sge_no = 0; + do { + sge_bytes = min_t(unsigned int, remaining, + PAGE_SIZE - page_off); + sg_set_page(sg, *page, sge_bytes, page_off); + + remaining -= sge_bytes; + sg = sg_next(sg); + page_off = 0; + sge_no++; + page++; + } while (remaining); + + ctxt->rw_nents = sge_no; +} + +/* Construct RDMA Write WRs to send a portion of an xdr_buf containing + * an RPC Reply. + */ +static int +svc_rdma_build_writes(struct svc_rdma_write_info *info, + void (*constructor)(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt), + unsigned int remaining) +{ + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + __be32 *seg; + int ret; + + seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; + do { + unsigned int write_len; + u32 seg_length, seg_handle; + u64 seg_offset; + + if (info->wi_seg_no >= info->wi_nsegs) + goto out_overflow; + + seg_handle = be32_to_cpup(seg); + seg_length = be32_to_cpup(seg + 1); + xdr_decode_hyper(seg + 2, &seg_offset); + seg_offset += info->wi_seg_off; + + write_len = min(remaining, seg_length - info->wi_seg_off); + ctxt = svc_rdma_get_rw_ctxt(rdma, + (write_len >> PAGE_SHIFT) + 2); + if (!ctxt) + goto out_noctx; + + constructor(info, write_len, ctxt); + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, 0, seg_offset, + seg_handle, DMA_TO_DEVICE); + if (ret < 0) + goto out_initerr; + + trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset); + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + if (write_len == seg_length - info->wi_seg_off) { + seg += 4; + info->wi_seg_no++; + info->wi_seg_off = 0; + } else { + info->wi_seg_off += write_len; + } + remaining -= write_len; + } while (remaining); + + return 0; + +out_overflow: + dprintk("svcrdma: inadequate space in Write chunk (%u)\n", + info->wi_nsegs); + return -E2BIG; + +out_noctx: + dprintk("svcrdma: no R/W ctxs available\n"); + return -ENOMEM; + +out_initerr: + svc_rdma_put_rw_ctxt(rdma, ctxt); + trace_svcrdma_dma_map_rwctx(rdma, ret); + return -EIO; +} + +/* Send one of an xdr_buf's kvecs by itself. To send a Reply + * chunk, the whole RPC Reply is written back to the client. + * This function writes either the head or tail of the xdr_buf + * containing the Reply. 
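For reference alongside svc_rdma_build_writes() above: each Write segment decodes as a 32-bit handle, a 32-bit length, and a 64-bit offset, and the code asks for (write_len >> PAGE_SHIFT) + 2 scatterlist entries because a byte range of that length can touch at most that many pages. A standalone sketch of the page-count bound; the ex_* names and the 4KB page size are assumptions, not taken from the patch.

#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SHIFT 12
#define EX_PAGE_SIZE  (1u << EX_PAGE_SHIFT)

/* Number of pages the byte range [off, off + len) can touch. */
static unsigned int ex_pages_spanned(uint64_t off, uint32_t len)
{
	uint64_t first = off >> EX_PAGE_SHIFT;
	uint64_t last = (off + len - 1) >> EX_PAGE_SHIFT;

	return (unsigned int)(last - first + 1);
}

int main(void)
{
	/* 8000 bytes starting 100 bytes before a page boundary touch three
	 * pages; (8000 >> PAGE_SHIFT) + 2 == 3, so the reservation is enough.
	 */
	printf("%u <= %u\n", ex_pages_spanned(EX_PAGE_SIZE - 100, 8000),
	       (8000u >> EX_PAGE_SHIFT) + 2);
	return 0;
}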
+ */ +static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, + struct kvec *vec) +{ + info->wi_base = vec->iov_base; + return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, + vec->iov_len); +} + +/* Send an xdr_buf's page list by itself. A Write chunk is + * just the page list. a Reply chunk is the head, page list, + * and tail. This function is shared between the two types + * of chunk. + */ +static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, + struct xdr_buf *xdr) +{ + info->wi_xdr = xdr; + info->wi_next_off = 0; + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, + xdr->page_len); +} + +/** + * svc_rdma_send_write_chunk - Write all segments in a Write chunk + * @rdma: controlling RDMA transport + * @wr_ch: Write chunk provided by client + * @xdr: xdr_buf containing the data payload + * + * Returns a non-negative number of bytes the chunk consumed, or + * %-E2BIG if the payload was larger than the Write chunk, + * %-EINVAL if client provided too many segments, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + */ +int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, + struct xdr_buf *xdr) +{ + struct svc_rdma_write_info *info; + int ret; + + if (!xdr->page_len) + return 0; + + info = svc_rdma_write_info_alloc(rdma, wr_ch); + if (!info) + return -ENOMEM; + + ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (ret < 0) + goto out_err; + + ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + if (ret < 0) + goto out_err; + + trace_svcrdma_encode_write(xdr->page_len); + return xdr->page_len; + +out_err: + svc_rdma_write_info_free(info); + return ret; +} + +/** + * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk + * @rdma: controlling RDMA transport + * @rp_ch: Reply chunk provided by client + * @writelist: true if client provided a Write list + * @xdr: xdr_buf containing an RPC Reply + * + * Returns a non-negative number of bytes the chunk consumed, or + * %-E2BIG if the payload was larger than the Reply chunk, + * %-EINVAL if client provided too many segments, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + */ +int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, + bool writelist, struct xdr_buf *xdr) +{ + struct svc_rdma_write_info *info; + int consumed, ret; + + info = svc_rdma_write_info_alloc(rdma, rp_ch); + if (!info) + return -ENOMEM; + + ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]); + if (ret < 0) + goto out_err; + consumed = xdr->head[0].iov_len; + + /* Send the page list in the Reply chunk only if the + * client did not provide Write chunks. 
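A small worked sketch of the byte accounting in svc_rdma_send_reply_chunk() above: the head kvec always travels in the Reply chunk, the page list goes there only when no Write chunk already carried it, and a non-empty tail is appended last. The ex_* names and the example sizes below are invented for illustration, not taken from the patch.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical reply shape; in the patch this is a struct xdr_buf. */
struct ex_xdr_buf {
	unsigned int head_len;
	unsigned int page_len;
	unsigned int tail_len;
};

/* Bytes that land in the Reply chunk, mirroring the logic above. */
static unsigned int ex_reply_chunk_bytes(const struct ex_xdr_buf *x, bool writelist)
{
	unsigned int consumed = x->head_len;

	if (!writelist && x->page_len)	/* page list not already sent by RDMA Write */
		consumed += x->page_len;
	if (x->tail_len)
		consumed += x->tail_len;
	return consumed;
}

int main(void)
{
	struct ex_xdr_buf x = { .head_len = 200, .page_len = 8192, .tail_len = 4 };

	printf("with a Write list: %u\n", ex_reply_chunk_bytes(&x, true));	/* 204 */
	printf("without:           %u\n", ex_reply_chunk_bytes(&x, false));	/* 8396 */
	return 0;
}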
+ */ + if (!writelist && xdr->page_len) { + ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (ret < 0) + goto out_err; + consumed += xdr->page_len; + } + + if (xdr->tail[0].iov_len) { + ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]); + if (ret < 0) + goto out_err; + consumed += xdr->tail[0].iov_len; + } + + ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + if (ret < 0) + goto out_err; + + trace_svcrdma_encode_reply(consumed); + return consumed; + +out_err: + svc_rdma_write_info_free(info); + return ret; +} + +static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, + struct svc_rqst *rqstp, + u32 rkey, u32 len, u64 offset) +{ + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; + struct svc_rdma_rw_ctxt *ctxt; + unsigned int sge_no, seg_len; + struct scatterlist *sg; + int ret; + + sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; + ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); + if (!ctxt) + goto out_noctx; + ctxt->rw_nents = sge_no; + + sg = ctxt->rw_sg_table.sgl; + for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { + seg_len = min_t(unsigned int, len, + PAGE_SIZE - info->ri_pageoff); + + head->rc_arg.pages[info->ri_pageno] = + rqstp->rq_pages[info->ri_pageno]; + if (!info->ri_pageoff) + head->rc_page_count++; + + sg_set_page(sg, rqstp->rq_pages[info->ri_pageno], + seg_len, info->ri_pageoff); + sg = sg_next(sg); + + info->ri_pageoff += seg_len; + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + len -= seg_len; + + /* Safety check */ + if (len && + &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end) + goto out_overrun; + } + + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp, + cc->cc_rdma->sc_port_num, + ctxt->rw_sg_table.sgl, ctxt->rw_nents, + 0, offset, rkey, DMA_FROM_DEVICE); + if (ret < 0) + goto out_initerr; + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + return 0; + +out_noctx: + dprintk("svcrdma: no R/W ctxs available\n"); + return -ENOMEM; + +out_overrun: + dprintk("svcrdma: request overruns rq_pages\n"); + return -EINVAL; + +out_initerr: + trace_svcrdma_dma_map_rwctx(cc->cc_rdma, ret); + svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt); + return -EIO; +} + +/* Walk the segments in the Read chunk starting at @p and construct + * RDMA Read operations to pull the chunk to the server. + */ +static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + int ret; + + ret = -EINVAL; + info->ri_chunklen = 0; + while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { + u32 rs_handle, rs_length; + u64 rs_offset; + + rs_handle = be32_to_cpup(p++); + rs_length = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &rs_offset); + + ret = svc_rdma_build_read_segment(info, rqstp, + rs_handle, rs_length, + rs_offset); + if (ret < 0) + break; + + trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset); + info->ri_chunklen += rs_length; + } + + return ret; +} + +/* Construct RDMA Reads to pull over a normal Read chunk. The chunk + * data lands in the page list of head->rc_arg.pages. + * + * Currently NFSD does not look at the head->rc_arg.tail[0] iovec. + * Therefore, XDR round-up of the Read chunk and trailing + * inline content must both be added at the end of the pagelist. 
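The Read-list walk in svc_rdma_build_read_chunk() above consumes, per entry, a one-word discriminator, a Position, and an RDMA segment (handle, length, 64-bit offset). A standalone decoder sketch under those assumptions; the ex_* names are illustrative and not kernel API.

#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>	/* ntohl() */

/* One decoded Read-list entry; field names are illustrative only. */
struct ex_read_seg {
	uint32_t position;
	uint32_t handle;
	uint32_t length;
	uint64_t offset;
};

/* Decode one list entry: discriminator, Position, then an RDMA segment
 * (handle, length, 64-bit big-endian offset).  Returns the advanced
 * cursor, or NULL at the end-of-list discriminator (xdr_zero).
 */
static const uint32_t *ex_decode_read_seg(const uint32_t *p,
					  struct ex_read_seg *seg)
{
	if (*p++ == 0)
		return NULL;
	seg->position = ntohl(*p++);
	seg->handle = ntohl(*p++);
	seg->length = ntohl(*p++);
	seg->offset = (uint64_t)ntohl(*p++) << 32;
	seg->offset |= ntohl(*p++);
	return p;
}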
+ */ +static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + int ret; + + ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (ret < 0) + goto out; + + trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position); + + head->rc_hdr_count = 0; + + /* Split the Receive buffer between the head and tail + * buffers at Read chunk's position. XDR roundup of the + * chunk is not included in either the pagelist or in + * the tail. + */ + head->rc_arg.tail[0].iov_base = + head->rc_arg.head[0].iov_base + info->ri_position; + head->rc_arg.tail[0].iov_len = + head->rc_arg.head[0].iov_len - info->ri_position; + head->rc_arg.head[0].iov_len = info->ri_position; + + /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). + * + * If the client already rounded up the chunk length, the + * length does not change. Otherwise, the length of the page + * list is increased to include XDR round-up. + * + * Currently these chunks always start at page offset 0, + * thus the rounded-up length never crosses a page boundary. + */ + info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2; + + head->rc_arg.page_len = info->ri_chunklen; + head->rc_arg.len += info->ri_chunklen; + head->rc_arg.buflen += info->ri_chunklen; + +out: + return ret; +} + +/* Construct RDMA Reads to pull over a Position Zero Read chunk. + * The start of the data lands in the first page just after + * the Transport header, and the rest lands in the page list of + * head->rc_arg.pages. + * + * Assumptions: + * - A PZRC has an XDR-aligned length (no implicit round-up). + * - There can be no trailing inline content (IOW, we assume + * a PZRC is never sent in an RDMA_MSG message, though it's + * allowed by spec). + */ +static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; + int ret; + + ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (ret < 0) + goto out; + + trace_svcrdma_encode_pzr(info->ri_chunklen); + + head->rc_arg.len += info->ri_chunklen; + head->rc_arg.buflen += info->ri_chunklen; + + head->rc_hdr_count = 1; + head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]); + head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE, + info->ri_chunklen); + + head->rc_arg.page_len = info->ri_chunklen - + head->rc_arg.head[0].iov_len; + +out: + return ret; +} + +/* Pages under I/O have been copied to head->rc_pages. Ensure they + * are not released by svc_xprt_release() until the I/O is complete. + * + * This has to be done after all Read WRs are constructed to properly + * handle a page that is part of I/O on behalf of two different RDMA + * segments. + * + * Do this only if I/O has been posted. Otherwise, we do indeed want + * svc_xprt_release() to clean things up properly. 
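The round-up in svc_rdma_build_normal_read_chunk() above, info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2, pads the chunk length to the next 4-byte XDR boundary. A minimal sketch of that arithmetic, assuming XDR_QUADLEN rounds a byte count up to quad (4-byte) units; ex_xdr_roundup is a made-up name.

#include <stdio.h>

/* Round a byte count up to the next XDR quad boundary. */
static unsigned int ex_xdr_roundup(unsigned int len)
{
	return ((len + 3) >> 2) << 2;
}

int main(void)
{
	printf("%u %u %u\n",
	       ex_xdr_roundup(13),	/* 16 */
	       ex_xdr_roundup(16),	/* 16: already aligned, unchanged */
	       ex_xdr_roundup(0));	/* 0 */
	return 0;
}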
+ */ +static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, + const unsigned int start, + const unsigned int num_pages) +{ + unsigned int i; + + for (i = start; i < num_pages + start; i++) + rqstp->rq_pages[i] = NULL; +} + +/** + * svc_rdma_recv_read_chunk - Pull a Read chunk from the client + * @rdma: controlling RDMA transport + * @rqstp: set of pages to use as Read sink buffers + * @head: pages under I/O collect here + * @p: pointer to start of Read chunk + * + * Returns: + * %0 if all needed RDMA Reads were posted successfully, + * %-EINVAL if client provided too many segments, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * + * Assumptions: + * - All Read segments in @p have the same Position value. + */ +int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *head, __be32 *p) +{ + struct svc_rdma_read_info *info; + int ret; + + /* The request (with page list) is constructed in + * head->rc_arg. Pages involved with RDMA Read I/O are + * transferred there. + */ + head->rc_arg.head[0] = rqstp->rq_arg.head[0]; + head->rc_arg.tail[0] = rqstp->rq_arg.tail[0]; + head->rc_arg.pages = head->rc_pages; + head->rc_arg.page_base = 0; + head->rc_arg.page_len = 0; + head->rc_arg.len = rqstp->rq_arg.len; + head->rc_arg.buflen = rqstp->rq_arg.buflen; + + info = svc_rdma_read_info_alloc(rdma); + if (!info) + return -ENOMEM; + info->ri_readctxt = head; + info->ri_pageno = 0; + info->ri_pageoff = 0; + + info->ri_position = be32_to_cpup(p + 1); + if (info->ri_position) + ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); + else + ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); + if (ret < 0) + goto out_err; + + ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); + if (ret < 0) + goto out_err; + svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count); + return 0; + +out_err: + svc_rdma_read_info_free(info); + return ret; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c new file mode 100644 index 000000000..4062cd624 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -0,0 +1,929 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2016-2018 Oracle. All rights reserved. + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+/* Operation
+ *
+ * The main entry point is svc_rdma_sendto. This is called by the
+ * RPC server when an RPC Reply is ready to be transmitted to a client.
+ *
+ * The passed-in svc_rqst contains a struct xdr_buf which holds an
+ * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
+ * transport header, post all Write WRs needed for this Reply, then post
+ * a Send WR conveying the transport header and the RPC message itself to
+ * the client.
+ *
+ * svc_rdma_sendto must fully transmit the Reply before returning, as
+ * the svc_rqst will be recycled as soon as sendto returns. Remaining
+ * resources referred to by the svc_rqst are also recycled at that time.
+ * Therefore any resources that must remain longer must be detached
+ * from the svc_rqst and released later.
+ *
+ * Page Management
+ *
+ * The I/O that performs Reply transmission is asynchronous, and may
+ * complete well after sendto returns. Thus pages under I/O must be
+ * removed from the svc_rqst before sendto returns.
+ *
+ * The logic here depends on Send Queue and completion ordering. Since
+ * the Send WR is always posted last, it will always complete last. Thus
+ * when it completes, it is guaranteed that all previous Write WRs have
+ * also completed.
+ *
+ * Write WRs are constructed and posted. Each Write segment gets its own
+ * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
+ * DMA-unmap the pages under I/O for that Write segment. The Write
+ * completion handler does not release any pages.
+ *
+ * When the Send WR is constructed, it also gets its own svc_rdma_send_ctxt.
+ * The ownership of all of the Reply's pages is transferred into that
+ * ctxt, the Send WR is posted, and sendto returns.
+ *
+ * The svc_rdma_send_ctxt is presented when the Send WR completes. The
+ * Send completion handler finally releases the Reply's pages.
+ *
+ * This mechanism also assumes that completions on the transport's Send
+ * Completion Queue do not run in parallel. Otherwise a Write completion
+ * and Send completion running at the same time could release pages that
+ * are still DMA-mapped.
+ *
+ * Error Handling
+ *
+ * - If the Send WR is posted successfully, it will either complete
+ *   successfully, or get flushed. Either way, the Send completion
+ *   handler releases the Reply's pages.
+ * - If the Send WR cannot be posted, the forward path releases
+ *   the Reply's pages.
+ *
+ * This handles the case, without the use of page reference counting,
+ * where two different Write segments send portions of the same page.
+ */ + +#include <linux/spinlock.h> +#include <asm/unaligned.h> + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> + +#include <linux/sunrpc/debug.h> +#include <linux/sunrpc/rpc_rdma.h> +#include <linux/sunrpc/svc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); + +static inline struct svc_rdma_send_ctxt * +svc_rdma_next_send_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_send_ctxt, + sc_list); +} + +static struct svc_rdma_send_ctxt * +svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_send_ctxt *ctxt; + dma_addr_t addr; + void *buffer; + size_t size; + int i; + + size = sizeof(*ctxt); + size += rdma->sc_max_send_sges * sizeof(struct ib_sge); + ctxt = kmalloc(size, GFP_KERNEL); + if (!ctxt) + goto fail0; + buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + if (!buffer) + goto fail1; + addr = ib_dma_map_single(rdma->sc_pd->device, buffer, + rdma->sc_max_req_size, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + goto fail2; + + ctxt->sc_send_wr.next = NULL; + ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; + ctxt->sc_send_wr.sg_list = ctxt->sc_sges; + ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; + ctxt->sc_cqe.done = svc_rdma_wc_send; + ctxt->sc_xprt_buf = buffer; + ctxt->sc_sges[0].addr = addr; + + for (i = 0; i < rdma->sc_max_send_sges; i++) + ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey; + return ctxt; + +fail2: + kfree(buffer); +fail1: + kfree(ctxt); +fail0: + return NULL; +} + +/** + * svc_rdma_send_ctxts_destroy - Release all send_ctxt's for an xprt + * @rdma: svcxprt_rdma being torn down + * + */ +void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_send_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) { + list_del(&ctxt->sc_list); + ib_dma_unmap_single(rdma->sc_pd->device, + ctxt->sc_sges[0].addr, + rdma->sc_max_req_size, + DMA_TO_DEVICE); + kfree(ctxt->sc_xprt_buf); + kfree(ctxt); + } +} + +/** + * svc_rdma_send_ctxt_get - Get a free send_ctxt + * @rdma: controlling svcxprt_rdma + * + * Returns a ready-to-use send_ctxt, or NULL if none are + * available and a fresh one cannot be allocated. + */ +struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_send_ctxt *ctxt; + + spin_lock(&rdma->sc_send_lock); + ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts); + if (!ctxt) + goto out_empty; + list_del(&ctxt->sc_list); + spin_unlock(&rdma->sc_send_lock); + +out: + ctxt->sc_send_wr.num_sge = 0; + ctxt->sc_cur_sge_no = 0; + ctxt->sc_page_count = 0; + return ctxt; + +out_empty: + spin_unlock(&rdma->sc_send_lock); + ctxt = svc_rdma_send_ctxt_alloc(rdma); + if (!ctxt) + return NULL; + goto out; +} + +/** + * svc_rdma_send_ctxt_put - Return send_ctxt to free list + * @rdma: controlling svcxprt_rdma + * @ctxt: object to return to the free list + * + * Pages left in sc_pages are DMA unmapped and released. + */ +void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct ib_device *device = rdma->sc_cm_id->device; + unsigned int i; + + /* The first SGE contains the transport header, which + * remains mapped until @ctxt is destroyed. 
+ */ + for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) + ib_dma_unmap_page(device, + ctxt->sc_sges[i].addr, + ctxt->sc_sges[i].length, + DMA_TO_DEVICE); + + for (i = 0; i < ctxt->sc_page_count; ++i) + put_page(ctxt->sc_pages[i]); + + spin_lock(&rdma->sc_send_lock); + list_add(&ctxt->sc_list, &rdma->sc_send_ctxts); + spin_unlock(&rdma->sc_send_lock); +} + +/** + * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC + * @cq: Completion Queue context + * @wc: Work Completion object + * + * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that + * the Send completion handler could be running. + */ +static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +{ + struct svcxprt_rdma *rdma = cq->cq_context; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_send_ctxt *ctxt; + + trace_svcrdma_wc_send(wc); + + atomic_inc(&rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); + svc_rdma_send_ctxt_put(rdma, ctxt); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_xprt_enqueue(&rdma->sc_xprt); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: Send: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + } + + svc_xprt_put(&rdma->sc_xprt); +} + +/** + * svc_rdma_send - Post a single Send WR + * @rdma: transport on which to post the WR + * @wr: prepared Send WR to post + * + * Returns zero the Send WR was posted successfully. Otherwise, a + * negative errno is returned. + */ +int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) +{ + int ret; + + might_sleep(); + + /* If the SQ is full, wait until an SQ entry is available */ + while (1) { + if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { + atomic_inc(&rdma_stat_sq_starve); + trace_svcrdma_sq_full(rdma); + atomic_inc(&rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > 1); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + return -ENOTCONN; + trace_svcrdma_sq_retry(rdma); + continue; + } + + svc_xprt_get(&rdma->sc_xprt); + trace_svcrdma_post_send(wr); + ret = ib_post_send(rdma->sc_qp, wr, NULL); + if (ret) + break; + return 0; + } + + trace_svcrdma_sq_post_err(rdma, ret); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_xprt_put(&rdma->sc_xprt); + wake_up(&rdma->sc_send_wait); + return ret; +} + +static u32 xdr_padsize(u32 len) +{ + return (len & 3) ? (4 - (len & 3)) : 0; +} + +/* Returns length of transport header, in bytes. + */ +static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp) +{ + unsigned int nsegs; + __be32 *p; + + p = rdma_resp; + + /* RPC-over-RDMA V1 replies never have a Read list. */ + p += rpcrdma_fixed_maxsz + 1; + + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + return (unsigned long)p - (unsigned long)rdma_resp; +} + +/* One Write chunk is copied from Call transport header to Reply + * transport header. Each segment's length field is updated to + * reflect number of bytes consumed in the segment. + * + * Returns number of segments in this chunk. 
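svc_rdma_send() above, like svc_rdma_post_chunk_ctxt() earlier in svc_rdma_rw.c, reserves Send Queue entries by decrementing a shared counter and gives the reservation back before sleeping when the queue is congested. A userspace C11-atomics sketch of that reserve/release pattern; the wait_event()/wake_up() pairing is elided and the ex_* names are illustrative only.

#include <stdatomic.h>
#include <stdbool.h>

/* Hypothetical SQ credit counter; in the patch this is rdma->sc_sq_avail. */
static atomic_int ex_sq_avail;

/* Try to reserve 'needed' SQ entries.  On failure the caller rolls the
 * reservation back and waits before retrying, as the posting paths above do.
 */
static bool ex_sq_reserve(int needed)
{
	if (atomic_fetch_sub(&ex_sq_avail, needed) - needed > 0)
		return true;
	atomic_fetch_add(&ex_sq_avail, needed);	/* roll back; caller sleeps and retries */
	return false;
}

/* Completion handlers return the credits and would wake any waiter. */
static void ex_sq_release(int used)
{
	atomic_fetch_add(&ex_sq_avail, used);
}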
+ */ +static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src, + unsigned int remaining) +{ + unsigned int i, nsegs; + u32 seg_len; + + /* Write list discriminator */ + *dst++ = *src++; + + /* number of segments in this chunk */ + nsegs = be32_to_cpup(src); + *dst++ = *src++; + + for (i = nsegs; i; i--) { + /* segment's RDMA handle */ + *dst++ = *src++; + + /* bytes returned in this segment */ + seg_len = be32_to_cpu(*src); + if (remaining >= seg_len) { + /* entire segment was consumed */ + *dst = *src; + remaining -= seg_len; + } else { + /* segment only partly filled */ + *dst = cpu_to_be32(remaining); + remaining = 0; + } + dst++; src++; + + /* segment's RDMA offset */ + *dst++ = *src++; + *dst++ = *src++; + } + + return nsegs; +} + +/* The client provided a Write list in the Call message. Fill in + * the segments in the first Write chunk in the Reply's transport + * header with the number of bytes consumed in each segment. + * Remaining chunks are returned unused. + * + * Assumptions: + * - Client has provided only one Write chunk + */ +static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch, + unsigned int consumed) +{ + unsigned int nsegs; + __be32 *p, *q; + + /* RPC-over-RDMA V1 replies never have a Read list. */ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + q = wr_ch; + while (*q != xdr_zero) { + nsegs = xdr_encode_write_chunk(p, q, consumed); + q += 2 + nsegs * rpcrdma_segment_maxsz; + p += 2 + nsegs * rpcrdma_segment_maxsz; + consumed = 0; + } + + /* Terminate Write list */ + *p++ = xdr_zero; + + /* Reply chunk discriminator; may be replaced later */ + *p = xdr_zero; +} + +/* The client provided a Reply chunk in the Call message. Fill in + * the segments in the Reply chunk in the Reply message with the + * number of bytes consumed in each segment. + * + * Assumptions: + * - Reply can always fit in the provided Reply chunk + */ +static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, + unsigned int consumed) +{ + __be32 *p; + + /* Find the Reply chunk in the Reply's xprt header. + * RPC-over-RDMA V1 replies never have a Read list. + */ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + /* Skip past Write list */ + while (*p++ != xdr_zero) + p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + + xdr_encode_write_chunk(p, rp_ch, consumed); +} + +/* Parse the RPC Call's transport header. + */ +static void svc_rdma_get_write_arrays(__be32 *rdma_argp, + __be32 **write, __be32 **reply) +{ + __be32 *p; + + p = rdma_argp + rpcrdma_fixed_maxsz; + + /* Read list */ + while (*p++ != xdr_zero) + p += 5; + + /* Write list */ + if (*p != xdr_zero) { + *write = p; + while (*p++ != xdr_zero) + p += 1 + be32_to_cpu(*p) * 4; + } else { + *write = NULL; + p++; + } + + /* Reply chunk */ + if (*p != xdr_zero) + *reply = p; + else + *reply = NULL; +} + +/* RPC-over-RDMA Version One private extension: Remote Invalidation. + * Responder's choice: requester signals it can handle Send With + * Invalidate, and responder chooses one rkey to invalidate. + * + * Find a candidate rkey to invalidate when sending a reply. Picks the + * first R_key it finds in the chunk lists. + * + * Returns zero if RPC's chunk lists are empty. 
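A worked example of the length rewriting done by xdr_encode_write_chunk() above: each segment's advertised length is capped at whatever portion of the consumed byte count it absorbed, and trailing segments report zero. The sketch uses plain host-endian integers and invented ex_* names rather than the on-the-wire XDR encoding.

#include <stdio.h>

/* Cap each segment's advertised length at the bytes actually written. */
static void ex_cap_segment_lengths(unsigned int *seg_len, unsigned int nsegs,
				   unsigned int consumed)
{
	unsigned int i;

	for (i = 0; i < nsegs; i++) {
		if (consumed >= seg_len[i]) {
			consumed -= seg_len[i];	/* segment fully used */
		} else {
			seg_len[i] = consumed;	/* partly used, or not at all */
			consumed = 0;
		}
	}
}

int main(void)
{
	unsigned int lens[3] = { 8192, 8192, 8192 };

	/* 10000 bytes written into a three-segment chunk. */
	ex_cap_segment_lengths(lens, 3, 10000);
	printf("%u %u %u\n", lens[0], lens[1], lens[2]);	/* 8192 1808 0 */
	return 0;
}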
+ */ +static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, + __be32 *wr_lst, __be32 *rp_ch) +{ + __be32 *p; + + p = rdma_argp + rpcrdma_fixed_maxsz; + if (*p != xdr_zero) + p += 2; + else if (wr_lst && be32_to_cpup(wr_lst + 1)) + p = wr_lst + 2; + else if (rp_ch && be32_to_cpup(rp_ch + 1)) + p = rp_ch + 2; + else + return 0; + return be32_to_cpup(p); +} + +static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + struct page *page, + unsigned long offset, + unsigned int len) +{ + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; + + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) + goto out_maperr; + + ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; + ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; + ctxt->sc_send_wr.num_sge++; + return 0; + +out_maperr: + trace_svcrdma_dma_map_page(rdma, page); + return -EIO; +} + +/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() + * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively. + */ +static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + unsigned char *base, + unsigned int len) +{ + return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base), + offset_in_page(base), len); +} + +/** + * svc_rdma_sync_reply_hdr - DMA sync the transport header buffer + * @rdma: controlling transport + * @ctxt: send_ctxt for the Send WR + * @len: length of transport header + * + */ +void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + unsigned int len) +{ + ctxt->sc_sges[0].length = len; + ctxt->sc_send_wr.num_sge++; + ib_dma_sync_single_for_device(rdma->sc_pd->device, + ctxt->sc_sges[0].addr, len, + DMA_TO_DEVICE); +} + +/* If the xdr_buf has more elements than the device can + * transmit in a single RDMA Send, then the reply will + * have to be copied into a bounce buffer. + */ +static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, + struct xdr_buf *xdr, + __be32 *wr_lst) +{ + int elements; + + /* xdr->head */ + elements = 1; + + /* xdr->pages */ + if (!wr_lst) { + unsigned int remaining; + unsigned long pageoff; + + pageoff = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + ++elements; + remaining -= min_t(u32, PAGE_SIZE - pageoff, + remaining); + pageoff = 0; + } + } + + /* xdr->tail */ + if (xdr->tail[0].iov_len) + ++elements; + + /* assume 1 SGE is needed for the transport header */ + return elements >= rdma->sc_max_send_sges; +} + +/* The device is not capable of sending the reply directly. + * Assemble the elements of @xdr into the transport header + * buffer. 
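svc_rdma_pull_up_needed() above estimates how many Send SGEs the reply would require: one for the head kvec, one per page touched by the page list (unless a Write chunk carries it), one for a non-empty tail, with one more implicitly reserved for the transport header. A standalone sketch of that count; the ex_* names, the struct layout, and the 4KB page size are assumptions for illustration.

#include <stdbool.h>

#define EX_PAGE_SIZE 4096u

/* Hypothetical reply shape; in the patch this is a struct xdr_buf. */
struct ex_reply {
	unsigned int page_base;		/* byte offset into the first page */
	unsigned int page_len;
	unsigned int tail_len;
	bool has_write_chunk;		/* page list goes out via RDMA Write */
};

/* Count SGEs for head, page list, and tail; one further SGE is reserved
 * for the transport header, hence the >= comparison against the limit.
 */
static bool ex_pull_up_needed(const struct ex_reply *r,
			      unsigned int max_send_sges)
{
	unsigned int elements = 1;	/* head kvec */
	unsigned int off = r->page_base % EX_PAGE_SIZE;
	unsigned int remaining = r->has_write_chunk ? 0 : r->page_len;

	while (remaining) {
		unsigned int chunk = EX_PAGE_SIZE - off;

		if (chunk > remaining)
			chunk = remaining;
		remaining -= chunk;
		off = 0;
		elements++;
	}
	if (r->tail_len)
		elements++;
	return elements >= max_send_sges;
}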
+ */ +static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + struct xdr_buf *xdr, __be32 *wr_lst) +{ + unsigned char *dst, *tailbase; + unsigned int taillen; + + dst = ctxt->sc_xprt_buf; + dst += ctxt->sc_sges[0].length; + + memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len); + dst += xdr->head[0].iov_len; + + tailbase = xdr->tail[0].iov_base; + taillen = xdr->tail[0].iov_len; + if (wr_lst) { + u32 xdrpad; + + xdrpad = xdr_padsize(xdr->page_len); + if (taillen && xdrpad) { + tailbase += xdrpad; + taillen -= xdrpad; + } + } else { + unsigned int len, remaining; + unsigned long pageoff; + struct page **ppages; + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + pageoff = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - pageoff, remaining); + + memcpy(dst, page_address(*ppages) + pageoff, len); + remaining -= len; + dst += len; + pageoff = 0; + ppages++; + } + } + + if (taillen) + memcpy(dst, tailbase, taillen); + + ctxt->sc_sges[0].length += xdr->len; + ib_dma_sync_single_for_device(rdma->sc_pd->device, + ctxt->sc_sges[0].addr, + ctxt->sc_sges[0].length, + DMA_TO_DEVICE); + + return 0; +} + +/* svc_rdma_map_reply_msg - Map the buffer holding RPC message + * @rdma: controlling transport + * @ctxt: send_ctxt for the Send WR + * @xdr: prepared xdr_buf containing RPC message + * @wr_lst: pointer to Call header's Write list, or NULL + * + * Load the xdr_buf into the ctxt's sge array, and DMA map each + * element as it is added. + * + * Returns zero on success, or a negative errno on failure. + */ +int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + struct xdr_buf *xdr, __be32 *wr_lst) +{ + unsigned int len, remaining; + unsigned long page_off; + struct page **ppages; + unsigned char *base; + u32 xdr_pad; + int ret; + + if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst)) + return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst); + + ++ctxt->sc_cur_sge_no; + ret = svc_rdma_dma_map_buf(rdma, ctxt, + xdr->head[0].iov_base, + xdr->head[0].iov_len); + if (ret < 0) + return ret; + + /* If a Write chunk is present, the xdr_buf's page list + * is not included inline. However the Upper Layer may + * have added XDR padding in the tail buffer, and that + * should not be included inline. + */ + if (wr_lst) { + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; + xdr_pad = xdr_padsize(xdr->page_len); + + if (len && xdr_pad) { + base += xdr_pad; + len -= xdr_pad; + } + + goto tail; + } + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_off = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - page_off, remaining); + + ++ctxt->sc_cur_sge_no; + ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++, + page_off, len); + if (ret < 0) + return ret; + + remaining -= len; + page_off = 0; + } + + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; +tail: + if (len) { + ++ctxt->sc_cur_sge_no; + ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len); + if (ret < 0) + return ret; + } + + return 0; +} + +/* The svc_rqst and all resources it owns are released as soon as + * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt + * so they are released by the Send completion handler. 
+ */ +static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, + struct svc_rdma_send_ctxt *ctxt) +{ + int i, pages = rqstp->rq_next_page - rqstp->rq_respages; + + ctxt->sc_page_count += pages; + for (i = 0; i < pages; i++) { + ctxt->sc_pages[i] = rqstp->rq_respages[i]; + rqstp->rq_respages[i] = NULL; + } + + /* Prevent svc_xprt_release from releasing pages in rq_pages */ + rqstp->rq_next_page = rqstp->rq_respages; +} + +/* Prepare the portion of the RPC Reply that will be transmitted + * via RDMA Send. The RPC-over-RDMA transport header is prepared + * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. + * + * Depending on whether a Write list or Reply chunk is present, + * the server may send all, a portion of, or none of the xdr_buf. + * In the latter case, only the transport header (sc_sges[0]) is + * transmitted. + * + * RDMA Send is the last step of transmitting an RPC reply. Pages + * involved in the earlier RDMA Writes are here transferred out + * of the rqstp and into the ctxt's page array. These pages are + * DMA unmapped by each Write completion, but the subsequent Send + * completion finally releases these pages. + * + * Assumptions: + * - The Reply's transport header will never be larger than a page. + */ +static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + __be32 *rdma_argp, + struct svc_rqst *rqstp, + __be32 *wr_lst, __be32 *rp_ch) +{ + int ret; + + if (!rp_ch) { + ret = svc_rdma_map_reply_msg(rdma, ctxt, + &rqstp->rq_res, wr_lst); + if (ret < 0) + return ret; + } + + svc_rdma_save_io_pages(rqstp, ctxt); + + ctxt->sc_send_wr.opcode = IB_WR_SEND; + if (rdma->sc_snd_w_inv) { + ctxt->sc_send_wr.ex.invalidate_rkey = + svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); + if (ctxt->sc_send_wr.ex.invalidate_rkey) + ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; + } + dprintk("svcrdma: posting Send WR with %u sge(s)\n", + ctxt->sc_send_wr.num_sge); + return svc_rdma_send(rdma, &ctxt->sc_send_wr); +} + +/* Given the client-provided Write and Reply chunks, the server was not + * able to form a complete reply. Return an RDMA_ERROR message so the + * client can retire this RPC transaction. As above, the Send completion + * routine releases payload pages that were part of a previous RDMA Write. + * + * Remote Invalidation is skipped for simplicity. + */ +static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + struct svc_rqst *rqstp) +{ + __be32 *p; + int ret; + + p = ctxt->sc_xprt_buf; + trace_svcrdma_err_chunk(*p); + p += 3; + *p++ = rdma_error; + *p = err_chunk; + svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_ERR); + + svc_rdma_save_io_pages(rqstp, ctxt); + + ctxt->sc_send_wr.opcode = IB_WR_SEND; + ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); + if (ret) { + svc_rdma_send_ctxt_put(rdma, ctxt); + return ret; + } + + return 0; +} + +void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) +{ +} + +/** + * svc_rdma_sendto - Transmit an RPC reply + * @rqstp: processed RPC request, reply XDR already in ::rq_res + * + * Any resources still associated with @rqstp are released upon return. + * If no reply message was possible, the connection is closed. + * + * Returns: + * %0 if an RPC reply has been successfully posted, + * %-ENOMEM if a resource shortage occurred (connection is lost), + * %-ENOTCONN if posting failed (connection is lost). 
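For the error path in svc_rdma_send_error_msg() above: the RDMA_ERROR/ERR_CHUNK reply is a five-word header of XID, version, credit grant, rdma_error, and err_chunk, and the first three words are already in sc_xprt_buf by the time the error path runs. The sketch below builds all five for completeness; the EX_* constants and ex_* names are assumptions for illustration, not the kernel's definitions.

#include <stdint.h>
#include <arpa/inet.h>	/* htonl() */

enum {
	EX_RDMA_ERROR = 4,	/* assumed rdma_error procedure value */
	EX_ERR_CHUNK  = 2,	/* assumed err_chunk error code */
};

/* Build a five-word RDMA_ERROR/ERR_CHUNK header.  Returns its length in
 * bytes (RPCRDMA_HDRLEN_ERR plays that role in the patch).
 */
static unsigned int ex_encode_err_chunk(uint32_t *buf, uint32_t xid_be,
					uint32_t credits_be)
{
	uint32_t *p = buf;

	*p++ = xid_be;			/* copied from the failing Call */
	*p++ = htonl(1);		/* RPC-over-RDMA version one */
	*p++ = credits_be;		/* responder's credit grant */
	*p++ = htonl(EX_RDMA_ERROR);
	*p++ = htonl(EX_ERR_CHUNK);
	return (unsigned int)((p - buf) * sizeof(*buf));
}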
+ */ +int svc_rdma_sendto(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; + __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; + struct xdr_buf *xdr = &rqstp->rq_res; + struct svc_rdma_send_ctxt *sctxt; + int ret; + + rdma_argp = rctxt->rc_recv_buf; + svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); + + /* Create the RDMA response header. xprt->xpt_mutex, + * acquired in svc_send(), serializes RPC replies. The + * code path below that inserts the credit grant value + * into each transport header runs only inside this + * critical section. + */ + ret = -ENOMEM; + sctxt = svc_rdma_send_ctxt_get(rdma); + if (!sctxt) + goto err0; + rdma_resp = sctxt->sc_xprt_buf; + + p = rdma_resp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = rdma->sc_fc_credits; + *p++ = rp_ch ? rdma_nomsg : rdma_msg; + + /* Start with empty chunks */ + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; + + if (wr_lst) { + /* XXX: Presume the client sent only one Write chunk */ + ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr); + if (ret < 0) + goto err2; + svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret); + } + if (rp_ch) { + ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr); + if (ret < 0) + goto err2; + svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); + } + + svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp)); + ret = svc_rdma_send_reply_msg(rdma, sctxt, rdma_argp, rqstp, + wr_lst, rp_ch); + if (ret < 0) + goto err1; + return 0; + + err2: + if (ret != -E2BIG && ret != -EINVAL) + goto err1; + + ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp); + if (ret < 0) + goto err1; + return 0; + + err1: + svc_rdma_send_ctxt_put(rdma, sctxt); + err0: + trace_svcrdma_send_failed(rqstp, ret); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + return -ENOTCONN; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c new file mode 100644 index 000000000..f1824303e --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2015-2018 Oracle. All rights reserved. + * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker <tom@opengridcomputing.com> + */ + +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> +#include <linux/export.h> + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <rdma/rw.h> + +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/debug.h> +#include <linux/sunrpc/rpc_rdma.h> +#include <linux/sunrpc/svc_xprt.h> +#include <linux/sunrpc/svc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, + struct net *net); +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags); +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); +static void svc_rdma_detach(struct svc_xprt *xprt); +static void svc_rdma_free(struct svc_xprt *xprt); +static int svc_rdma_has_wspace(struct svc_xprt *xprt); +static void svc_rdma_secure_port(struct svc_rqst *); +static void svc_rdma_kill_temp_xprt(struct svc_xprt *); + +static const struct svc_xprt_ops svc_rdma_ops = { + .xpo_create = svc_rdma_create, + .xpo_recvfrom = svc_rdma_recvfrom, + .xpo_sendto = svc_rdma_sendto, + .xpo_release_rqst = svc_rdma_release_rqst, + .xpo_detach = svc_rdma_detach, + .xpo_free = svc_rdma_free, + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, + .xpo_has_wspace = svc_rdma_has_wspace, + .xpo_accept = svc_rdma_accept, + .xpo_secure_port = svc_rdma_secure_port, + .xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt, +}; + +struct svc_xprt_class svc_rdma_class = { + .xcl_name = "rdma", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_rdma_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA, + .xcl_ident = XPRT_TRANSPORT_RDMA, +}; + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *, + struct sockaddr *, int, int); +static void svc_rdma_bc_detach(struct svc_xprt *); +static void svc_rdma_bc_free(struct svc_xprt *); + +static const struct svc_xprt_ops svc_rdma_bc_ops = { + .xpo_create = svc_rdma_bc_create, + .xpo_detach = svc_rdma_bc_detach, + .xpo_free = svc_rdma_bc_free, + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, + .xpo_secure_port = svc_rdma_secure_port, +}; + +struct svc_xprt_class svc_rdma_bc_class = { + .xcl_name = "rdma-bc", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_rdma_bc_ops, + .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN) +}; + +static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + struct svcxprt_rdma *cma_xprt; + struct svc_xprt *xprt; + + cma_xprt = 
svc_rdma_create_xprt(serv, net); + if (!cma_xprt) + return ERR_PTR(-ENOMEM); + xprt = &cma_xprt->sc_xprt; + + svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv); + set_bit(XPT_CONG_CTRL, &xprt->xpt_flags); + serv->sv_bc_xprt = xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, xprt); + return xprt; +} + +static void svc_rdma_bc_detach(struct svc_xprt *xprt) +{ + dprintk("svcrdma: %s(%p)\n", __func__, xprt); +} + +static void svc_rdma_bc_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + dprintk("svcrdma: %s(%p)\n", __func__, xprt); + if (xprt) + kfree(rdma); +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +/* QP event handler */ +static void qp_event_handler(struct ib_event *event, void *context) +{ + struct svc_xprt *xprt = context; + + trace_svcrdma_qp_error(event, (struct sockaddr *)&xprt->xpt_remote); + switch (event->event) { + /* These are considered benign events */ + case IB_EVENT_PATH_MIG: + case IB_EVENT_COMM_EST: + case IB_EVENT_SQ_DRAINED: + case IB_EVENT_QP_LAST_WQE_REACHED: + break; + + /* These are considered fatal events */ + case IB_EVENT_PATH_MIG_ERR: + case IB_EVENT_QP_FATAL: + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_ACCESS_ERR: + case IB_EVENT_DEVICE_FATAL: + default: + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + break; + } +} + +static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, + struct net *net) +{ + struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); + + if (!cma_xprt) { + dprintk("svcrdma: failed to create new transport\n"); + return NULL; + } + svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); + INIT_LIST_HEAD(&cma_xprt->sc_accept_q); + INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); + INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); + init_waitqueue_head(&cma_xprt->sc_send_wait); + + spin_lock_init(&cma_xprt->sc_lock); + spin_lock_init(&cma_xprt->sc_rq_dto_lock); + spin_lock_init(&cma_xprt->sc_send_lock); + spin_lock_init(&cma_xprt->sc_recv_lock); + spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); + + /* + * Note that this implies that the underlying transport support + * has some form of congestion control (see RFC 7530 section 3.1 + * paragraph 2). For now, we assume that all supported RDMA + * transports are suitable here. + */ + set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags); + + return cma_xprt; +} + +static void +svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt, + struct rdma_conn_param *param) +{ + const struct rpcrdma_connect_private *pmsg = param->private_data; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + newxprt->sc_snd_w_inv = pmsg->cp_flags & + RPCRDMA_CMP_F_SND_W_INV_OK; + + dprintk("svcrdma: client send_size %u, recv_size %u " + "remote inv %ssupported\n", + rpcrdma_decode_buffer_size(pmsg->cp_send_size), + rpcrdma_decode_buffer_size(pmsg->cp_recv_size), + newxprt->sc_snd_w_inv ? "" : "un"); + } +} + +/* + * This function handles the CONNECT_REQUEST event on a listening + * endpoint. It is passed the cma_id for the _new_ connection. The context in + * this cma_id is inherited from the listening cma_id and is the svc_xprt + * structure for the listening endpoint. + * + * This function creates a new xprt for the new connection and enqueues it on + * the accept queue for the listent xprt. 
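svc_rdma_parse_connect_private() above accepts the client's CM private data only when the magic and version match, and then records whether Send With Invalidate may be used. A hedged sketch of that shape of test; the struct layout, field names, and EX_CMP_* values below are placeholders, not the real rpcrdma_connect_private definitions.

#include <stdint.h>
#include <stdbool.h>

/* Hypothetical connection-private message layout, for illustration only. */
struct ex_cm_private {
	uint32_t magic;		/* fixed magic value */
	uint8_t version;
	uint8_t flags;
	uint8_t send_size;
	uint8_t recv_size;
};

#define EX_CMP_MAGIC		0x12345678u	/* placeholder, not the real value */
#define EX_CMP_VERSION		1
#define EX_CMP_F_SND_W_INV_OK	0x01

/* Enable Remote Invalidation only when the private message is present,
 * well formed, and advertises the capability.
 */
static bool ex_client_supports_remote_inv(const struct ex_cm_private *msg)
{
	return msg &&
	       msg->magic == EX_CMP_MAGIC &&
	       msg->version == EX_CMP_VERSION &&
	       (msg->flags & EX_CMP_F_SND_W_INV_OK);
}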
When the listen thread is kicked, it + * will call the recvfrom method on the listen xprt which will accept the new + * connection. + */ +static void handle_connect_req(struct rdma_cm_id *new_cma_id, + struct rdma_conn_param *param) +{ + struct svcxprt_rdma *listen_xprt = new_cma_id->context; + struct svcxprt_rdma *newxprt; + struct sockaddr *sa; + + /* Create a new transport */ + newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, + listen_xprt->sc_xprt.xpt_net); + if (!newxprt) + return; + newxprt->sc_cm_id = new_cma_id; + new_cma_id->context = newxprt; + svc_rdma_parse_connect_private(newxprt, param); + + /* Save client advertised inbound read limit for use later in accept. */ + newxprt->sc_ord = param->initiator_depth; + + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + /* The remote port is arbitrary and not under the control of the + * client ULP. Set it to a fixed value so that the DRC continues + * to be effective after a reconnect. + */ + rpc_set_port((struct sockaddr *)&newxprt->sc_xprt.xpt_remote, 0); + + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + + /* + * Enqueue the new transport on the accept queue of the listening + * transport + */ + spin_lock_bh(&listen_xprt->sc_lock); + list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q); + spin_unlock_bh(&listen_xprt->sc_lock); + + set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&listen_xprt->sc_xprt); +} + +/* + * Handles events generated on the listening endpoint. These events will be + * either be incoming connect requests or adapter removal events. + */ +static int rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr; + + trace_svcrdma_cm_event(event, sap); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " + "event = %s (%d)\n", cma_id, cma_id->context, + rdma_event_msg(event->event), event->event); + handle_connect_req(cma_id, &event->param.conn); + break; + default: + /* NB: No device removal upcall for INADDR_ANY listeners */ + dprintk("svcrdma: Unexpected event on listening endpoint %p, " + "event = %s (%d)\n", cma_id, + rdma_event_msg(event->event), event->event); + break; + } + + return 0; +} + +static int rdma_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.dst_addr; + struct svcxprt_rdma *rdma = cma_id->context; + struct svc_xprt *xprt = &rdma->sc_xprt; + + trace_svcrdma_cm_event(event, sap); + + switch (event->event) { + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + svc_xprt_get(xprt); + dprintk("svcrdma: Connection completed on DTO xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); + svc_xprt_enqueue(xprt); + break; + case RDMA_CM_EVENT_DISCONNECTED: + dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", + xprt, cma_id); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " + "event = %s (%d)\n", cma_id, xprt, + rdma_event_msg(event->event), event->event); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + break; 
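+		/* Any other CM event on a connected endpoint is
+		 * unexpected; it is logged below and otherwise ignored.
+		 */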
+ default: + dprintk("svcrdma: Unexpected event on DTO endpoint %p, " + "event = %s (%d)\n", cma_id, + rdma_event_msg(event->event), event->event); + break; + } + return 0; +} + +/* + * Create a listening RDMA service endpoint. + */ +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + struct rdma_cm_id *listen_id; + struct svcxprt_rdma *cma_xprt; + int ret; + + dprintk("svcrdma: Creating RDMA listener\n"); + if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) { + dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family); + return ERR_PTR(-EAFNOSUPPORT); + } + cma_xprt = svc_rdma_create_xprt(serv, net); + if (!cma_xprt) + return ERR_PTR(-ENOMEM); + set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); + strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); + + listen_id = rdma_create_id(net, rdma_listen_handler, cma_xprt, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(listen_id)) { + ret = PTR_ERR(listen_id); + dprintk("svcrdma: rdma_create_id failed = %d\n", ret); + goto err0; + } + + /* Allow both IPv4 and IPv6 sockets to bind a single port + * at the same time. + */ +#if IS_ENABLED(CONFIG_IPV6) + ret = rdma_set_afonly(listen_id, 1); + if (ret) { + dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret); + goto err1; + } +#endif + ret = rdma_bind_addr(listen_id, sa); + if (ret) { + dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); + goto err1; + } + cma_xprt->sc_cm_id = listen_id; + + ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); + if (ret) { + dprintk("svcrdma: rdma_listen failed = %d\n", ret); + goto err1; + } + + /* + * We need to use the address from the cm_id in case the + * caller specified 0 for the port number. + */ + sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); + + return &cma_xprt->sc_xprt; + + err1: + rdma_destroy_id(listen_id); + err0: + kfree(cma_xprt); + return ERR_PTR(ret); +} + +/* + * This is the xpo_recvfrom function for listening endpoints. Its + * purpose is to accept incoming connections. The CMA callback handler + * has already created a new transport and attached it to the new CMA + * ID. + * + * There is a queue of pending connections hung on the listening + * transport. This queue contains the new svc_xprt structure. This + * function takes svc_xprt structures off the accept_q and completes + * the connection. 
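+ *
+ * Roughly, for each dequeued xprt this function sizes the Send and
+ * Receive queues against the device's limits, allocates a PD and the
+ * SQ/RQ completion queues, creates the RC QP, posts the initial
+ * Receives, switches the cm_id over to the connected-endpoint event
+ * handler, and finally calls rdma_accept() with a private message
+ * advertising the server's inline buffer sizes.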
+ */ +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *listen_rdma; + struct svcxprt_rdma *newxprt = NULL; + struct rdma_conn_param conn_param; + struct rpcrdma_connect_private pmsg; + struct ib_qp_init_attr qp_attr; + unsigned int ctxts, rq_depth; + struct ib_device *dev; + struct sockaddr *sap; + int ret = 0; + + listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); + clear_bit(XPT_CONN, &xprt->xpt_flags); + /* Get the next entry off the accept list */ + spin_lock_bh(&listen_rdma->sc_lock); + if (!list_empty(&listen_rdma->sc_accept_q)) { + newxprt = list_entry(listen_rdma->sc_accept_q.next, + struct svcxprt_rdma, sc_accept_q); + list_del_init(&newxprt->sc_accept_q); + } + if (!list_empty(&listen_rdma->sc_accept_q)) + set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags); + spin_unlock_bh(&listen_rdma->sc_lock); + if (!newxprt) + return NULL; + + dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", + newxprt, newxprt->sc_cm_id); + + dev = newxprt->sc_cm_id->device; + newxprt->sc_port_num = newxprt->sc_cm_id->port_num; + + /* Qualify the transport resource defaults with the + * capabilities of this particular device */ + /* Transport header, head iovec, tail iovec */ + newxprt->sc_max_send_sges = 3; + /* Add one SGE per page list entry */ + newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1; + if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) + newxprt->sc_max_send_sges = dev->attrs.max_send_sge; + newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = svcrdma_max_requests; + newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; + rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; + if (rq_depth > dev->attrs.max_qp_wr) { + pr_warn("svcrdma: reducing receive depth to %d\n", + dev->attrs.max_qp_wr); + rq_depth = dev->attrs.max_qp_wr; + newxprt->sc_max_requests = rq_depth - 2; + newxprt->sc_max_bc_requests = 2; + } + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); + ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); + ctxts *= newxprt->sc_max_requests; + newxprt->sc_sq_depth = rq_depth + ctxts; + if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) { + pr_warn("svcrdma: reducing send depth to %d\n", + dev->attrs.max_qp_wr); + newxprt->sc_sq_depth = dev->attrs.max_qp_wr; + } + atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); + + newxprt->sc_pd = ib_alloc_pd(dev, 0); + if (IS_ERR(newxprt->sc_pd)) { + dprintk("svcrdma: error creating PD for connect request\n"); + goto errout; + } + newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth, + 0, IB_POLL_WORKQUEUE); + if (IS_ERR(newxprt->sc_sq_cq)) { + dprintk("svcrdma: error creating SQ CQ for connect request\n"); + goto errout; + } + newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, rq_depth, + 0, IB_POLL_WORKQUEUE); + if (IS_ERR(newxprt->sc_rq_cq)) { + dprintk("svcrdma: error creating RQ CQ for connect request\n"); + goto errout; + } + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.event_handler = qp_event_handler; + qp_attr.qp_context = &newxprt->sc_xprt; + qp_attr.port_num = newxprt->sc_port_num; + qp_attr.cap.max_rdma_ctxs = ctxts; + qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts; + qp_attr.cap.max_recv_wr = rq_depth; + qp_attr.cap.max_send_sge = newxprt->sc_max_send_sges; + qp_attr.cap.max_recv_sge = 1; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = newxprt->sc_sq_cq; + qp_attr.recv_cq = newxprt->sc_rq_cq; + 
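+	/* The capacities chosen above are already clamped to the
+	 * device's max_qp_wr: max_send_wr excludes the rdma_rw
+	 * contexts (accounted via max_rdma_ctxs), and max_recv_wr
+	 * covers both forward and backchannel requests.
+	 */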
dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n", + newxprt->sc_cm_id, newxprt->sc_pd); + dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n", + qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); + dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", + qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge); + + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); + if (ret) { + dprintk("svcrdma: failed to create QP, ret=%d\n", ret); + goto errout; + } + newxprt->sc_qp = newxprt->sc_cm_id->qp; + + if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + newxprt->sc_snd_w_inv = false; + if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) && + !rdma_ib_or_roce(dev, newxprt->sc_port_num)) + goto errout; + + if (!svc_rdma_post_recvs(newxprt)) + goto errout; + + /* Swap out the handler */ + newxprt->sc_cm_id->event_handler = rdma_cma_handler; + + /* Construct RDMA-CM private message */ + pmsg.cp_magic = rpcrdma_cmp_magic; + pmsg.cp_version = RPCRDMA_CMP_VERSION; + pmsg.cp_flags = 0; + pmsg.cp_send_size = pmsg.cp_recv_size = + rpcrdma_encode_buffer_size(newxprt->sc_max_req_size); + + /* Accept Connection */ + set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 0; + conn_param.initiator_depth = min_t(int, newxprt->sc_ord, + dev->attrs.max_qp_init_rd_atom); + if (!conn_param.initiator_depth) { + dprintk("svcrdma: invalid ORD setting\n"); + ret = -EINVAL; + goto errout; + } + conn_param.private_data = &pmsg; + conn_param.private_data_len = sizeof(pmsg); + ret = rdma_accept(newxprt->sc_cm_id, &conn_param); + if (ret) + goto errout; + + dprintk("svcrdma: new connection %p accepted:\n", newxprt); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); + dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); + dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); + dprintk(" rdma_rw_ctxs : %d\n", ctxts); + dprintk(" max_requests : %d\n", newxprt->sc_max_requests); + dprintk(" ord : %d\n", conn_param.initiator_depth); + + trace_svcrdma_xprt_accept(&newxprt->sc_xprt); + return &newxprt->sc_xprt; + + errout: + dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); + trace_svcrdma_xprt_fail(&newxprt->sc_xprt); + /* Take a reference in case the DTO handler runs */ + svc_xprt_get(&newxprt->sc_xprt); + if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) + ib_destroy_qp(newxprt->sc_qp); + rdma_destroy_id(newxprt->sc_cm_id); + /* This call to put will destroy the transport */ + svc_xprt_put(&newxprt->sc_xprt); + return NULL; +} + +/* + * When connected, an svc_xprt has at least two references: + * + * - A reference held by the cm_id between the ESTABLISHED and + * DISCONNECTED events. If the remote peer disconnected first, this + * reference could be gone. + * + * - A reference held by the svc_recv code that called this function + * as part of close processing. + * + * At a minimum one references should still be held. 
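+ *
+ * (__svc_rdma_free later sanity-checks this: it warns via kref_read()
+ * if the transport still appears to be in use when it is destroyed.)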
+ */ +static void svc_rdma_detach(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + /* Disconnect and flush posted WQE */ + rdma_disconnect(rdma->sc_cm_id); +} + +static void __svc_rdma_free(struct work_struct *work) +{ + struct svcxprt_rdma *rdma = + container_of(work, struct svcxprt_rdma, sc_work); + struct svc_xprt *xprt = &rdma->sc_xprt; + + trace_svcrdma_xprt_free(xprt); + + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) + ib_drain_qp(rdma->sc_qp); + + /* We should only be called from kref_put */ + if (kref_read(&xprt->xpt_ref) != 0) + pr_err("svcrdma: sc_xprt still in use? (%d)\n", + kref_read(&xprt->xpt_ref)); + + svc_rdma_flush_recv_queues(rdma); + + /* Final put of backchannel client transport */ + if (xprt->xpt_bc_xprt) { + xprt_put(xprt->xpt_bc_xprt); + xprt->xpt_bc_xprt = NULL; + } + + svc_rdma_destroy_rw_ctxts(rdma); + svc_rdma_send_ctxts_destroy(rdma); + svc_rdma_recv_ctxts_destroy(rdma); + + /* Destroy the QP if present (not a listener) */ + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) + ib_destroy_qp(rdma->sc_qp); + + if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) + ib_free_cq(rdma->sc_sq_cq); + + if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) + ib_free_cq(rdma->sc_rq_cq); + + if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) + ib_dealloc_pd(rdma->sc_pd); + + /* Destroy the CM ID */ + rdma_destroy_id(rdma->sc_cm_id); + + kfree(rdma); +} + +static void svc_rdma_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + INIT_WORK(&rdma->sc_work, __svc_rdma_free); + queue_work(svc_rdma_wq, &rdma->sc_work); +} + +static int svc_rdma_has_wspace(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + /* + * If there are already waiters on the SQ, + * return false. + */ + if (waitqueue_active(&rdma->sc_send_wait)) + return 0; + + /* Otherwise return true. */ + return 1; +} + +static void svc_rdma_secure_port(struct svc_rqst *rqstp) +{ + set_bit(RQ_SECURE, &rqstp->rq_flags); +} + +static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) +{ +} diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c new file mode 100644 index 000000000..e87a79be7 --- /dev/null +++ b/net/sunrpc/xprtrdma/transport.c @@ -0,0 +1,921 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. 
nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * transport.c + * + * This file contains the top-level implementation of an RPC RDMA + * transport. + * + * Naming convention: functions beginning with xprt_ are part of the + * transport switch. All others are RPC RDMA internal. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/seq_file.h> +#include <linux/smp.h> + +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/svc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* + * tunables + */ + +static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; +unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; +unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; +int xprt_rdma_pad_optimize; + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + +static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; +static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; +static unsigned int min_inline_size = RPCRDMA_MIN_INLINE; +static unsigned int max_inline_size = RPCRDMA_MAX_INLINE; +static unsigned int zero; +static unsigned int max_padding = PAGE_SIZE; +static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; +static unsigned int max_memreg = RPCRDMA_LAST - 1; +static unsigned int dummy; + +static struct ctl_table_header *sunrpc_table_header; + +static struct ctl_table xr_tunables_table[] = { + { + .procname = "rdma_slot_table_entries", + .data = &xprt_rdma_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { + .procname = "rdma_max_inline_read", + .data = &xprt_rdma_max_inline_read, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_inline_size, + .extra2 = &max_inline_size, + }, + { + .procname = "rdma_max_inline_write", + .data = &xprt_rdma_max_inline_write, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_inline_size, + .extra2 = &max_inline_size, + }, + { + .procname = "rdma_inline_write_padding", + .data = &dummy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &max_padding, + }, + { + .procname = "rdma_memreg_strategy", + .data = &xprt_rdma_memreg_strategy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = 
proc_dointvec_minmax, + .extra1 = &min_memreg, + .extra2 = &max_memreg, + }, + { + .procname = "rdma_pad_optimize", + .data = &xprt_rdma_pad_optimize, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { }, +}; + +static struct ctl_table sunrpc_table[] = { + { + .procname = "sunrpc", + .mode = 0555, + .child = xr_tunables_table + }, + { }, +}; + +#endif + +static const struct rpc_xprt_ops xprt_rdma_procs; + +static void +xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + char buf[20]; + + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA; +} + +static void +xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + char buf[40]; + + snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; +} + +void +xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + char buf[128]; + + switch (sap->sa_family) { + case AF_INET: + xprt_rdma_format_addresses4(xprt, sap); + break; + case AF_INET6: + xprt_rdma_format_addresses6(xprt, sap); + break; + default: + pr_err("rpcrdma: Unrecognized address family\n"); + return; + } + + (void)rpc_ntop(sap, buf, sizeof(buf)); + xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); + + snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); + xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); + + snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; +} + +void +xprt_rdma_free_addresses(struct rpc_xprt *xprt) +{ + unsigned int i; + + for (i = 0; i < RPC_DISPLAY_MAX; i++) + switch (i) { + case RPC_DISPLAY_PROTO: + case RPC_DISPLAY_NETID: + continue; + default: + kfree(xprt->address_strings[i]); + } +} + +void +rpcrdma_conn_func(struct rpcrdma_ep *ep) +{ + schedule_delayed_work(&ep->rep_connect_worker, 0); +} + +void +rpcrdma_connect_worker(struct work_struct *work) +{ + struct rpcrdma_ep *ep = + container_of(work, struct rpcrdma_ep, rep_connect_worker.work); + struct rpcrdma_xprt *r_xprt = + container_of(ep, struct rpcrdma_xprt, rx_ep); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + spin_lock_bh(&xprt->transport_lock); + if (ep->rep_connected > 0) { + if (!xprt_test_and_set_connected(xprt)) { + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; + xprt_wake_pending_tasks(xprt, 0); + } + } else { + if (xprt_test_and_clear_connected(xprt)) + xprt_wake_pending_tasks(xprt, -ENOTCONN); + } + spin_unlock_bh(&xprt->transport_lock); +} + +static void +xprt_rdma_connect_worker(struct work_struct *work) +{ + struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, + rx_connect_worker.work); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + int rc = 0; + + xprt_clear_connected(xprt); + + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (rc) + xprt_wake_pending_tasks(xprt, rc); + + xprt_clear_connecting(xprt); +} + +static void +xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, 
+ rx_xprt); + + trace_xprtrdma_inject_dsc(r_xprt); + rdma_disconnect(r_xprt->rx_ia.ri_id); +} + +/* + * xprt_rdma_destroy + * + * Destroy the xprt. + * Free all memory associated with the object, including its own. + * NOTE: none of the *destroy methods free memory for their top-level + * objects, even though they may have allocated it (they do free + * private memory). It's up to the caller to handle it. In this + * case (RDMA transport), all structure memory is inlined with the + * struct rpcrdma_xprt. + */ +static void +xprt_rdma_destroy(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + trace_xprtrdma_destroy(r_xprt); + + cancel_delayed_work_sync(&r_xprt->rx_connect_worker); + + xprt_clear_connected(xprt); + + rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_buffer_destroy(&r_xprt->rx_buf); + rpcrdma_ia_close(&r_xprt->rx_ia); + + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + + module_put(THIS_MODULE); +} + +static const struct rpc_timeout xprt_rdma_default_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/** + * xprt_setup_rdma - Set up transport to use RDMA + * + * @args: rpc transport arguments + */ +static struct rpc_xprt * +xprt_setup_rdma(struct xprt_create *args) +{ + struct rpcrdma_create_data_internal cdata; + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + struct rpcrdma_ep *new_ep; + struct sockaddr *sap; + int rc; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, 0); + if (xprt == NULL) { + dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + /* 60 second timeout, no retries */ + xprt->timeout = &xprt_rdma_default_timeout; + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->resvport = 0; /* privileged port not needed */ + xprt->tsh_size = 0; /* RPC-RDMA handles framing */ + xprt->ops = &xprt_rdma_procs; + + /* + * Set up RDMA-specific connect data. 
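+ *
+ * The inline sizes below come from the module tunables and are
+ * clamped so that inline_wsize <= wsize and inline_rsize <= rsize,
+ * both of which are RPCRDMA_MAX_SEGS * PAGE_SIZE.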
+ */ + sap = args->dstaddr; + + /* Ensure xprt->addr holds valid server TCP (not RDMA) + * address, for any side protocols which peek at it */ + xprt->prot = IPPROTO_TCP; + xprt->addrlen = args->addrlen; + memcpy(&xprt->addr, sap, xprt->addrlen); + + if (rpc_get_port(sap)) + xprt_set_bound(xprt); + xprt_rdma_format_addresses(xprt, sap); + + cdata.max_requests = xprt_rdma_slot_table_entries; + + cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ + cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ + + cdata.inline_wsize = xprt_rdma_max_inline_write; + if (cdata.inline_wsize > cdata.wsize) + cdata.inline_wsize = cdata.wsize; + + cdata.inline_rsize = xprt_rdma_max_inline_read; + if (cdata.inline_rsize > cdata.rsize) + cdata.inline_rsize = cdata.rsize; + + /* + * Create new transport instance, which includes initialized + * o ia + * o endpoint + * o buffers + */ + + new_xprt = rpcx_to_rdmax(xprt); + + rc = rpcrdma_ia_open(new_xprt); + if (rc) + goto out1; + + /* + * initialize and create ep + */ + new_xprt->rx_data = cdata; + new_ep = &new_xprt->rx_ep; + + rc = rpcrdma_ep_create(&new_xprt->rx_ep, + &new_xprt->rx_ia, &new_xprt->rx_data); + if (rc) + goto out2; + + rc = rpcrdma_buffer_create(new_xprt); + if (rc) + goto out3; + + INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, + xprt_rdma_connect_worker); + + xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); + if (xprt->max_payload == 0) + goto out4; + xprt->max_payload <<= PAGE_SHIFT; + dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", + __func__, xprt->max_payload); + + if (!try_module_get(THIS_MODULE)) + goto out4; + + dprintk("RPC: %s: %s:%s\n", __func__, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT]); + trace_xprtrdma_create(new_xprt); + return xprt; + +out4: + rpcrdma_buffer_destroy(&new_xprt->rx_buf); + rc = -ENODEV; +out3: + rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); +out2: + rpcrdma_ia_close(&new_xprt->rx_ia); +out1: + trace_xprtrdma_destroy(new_xprt); + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + return ERR_PTR(rc); +} + +/** + * xprt_rdma_close - Close down RDMA connection + * @xprt: generic transport to be closed + * + * Called during transport shutdown reconnect, or device + * removal. Caller holds the transport's write lock. + */ +static void +xprt_rdma_close(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + dprintk("RPC: %s: closing xprt %p\n", __func__, xprt); + + if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { + xprt_clear_connected(xprt); + rpcrdma_ia_remove(ia); + return; + } + if (ep->rep_connected == -ENODEV) + return; + if (ep->rep_connected > 0) + xprt->reestablish_timeout = 0; + xprt_disconnect_done(xprt); + rpcrdma_ep_disconnect(ep, ia); + + /* Prepare @xprt for the next connection by reinitializing + * its credit grant to one (see RFC 8166, Section 3.3.3). + */ + r_xprt->rx_buf.rb_credits = 1; + xprt->cwnd = RPC_CWNDSHIFT; +} + +/** + * xprt_rdma_set_port - update server port with rpcbind result + * @xprt: controlling RPC transport + * @port: new port value + * + * Transport connect status is unchanged. 
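+ *
+ * Both the printable and the hexadecimal port strings are regenerated
+ * below, not just the sockaddr itself.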
+ */ +static void +xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) +{ + struct sockaddr *sap = (struct sockaddr *)&xprt->addr; + char buf[8]; + + dprintk("RPC: %s: setting port for xprt %p (%s:%s) to %u\n", + __func__, xprt, + xprt->address_strings[RPC_DISPLAY_ADDR], + xprt->address_strings[RPC_DISPLAY_PORT], + port); + + rpc_set_port(sap, port); + + kfree(xprt->address_strings[RPC_DISPLAY_PORT]); + snprintf(buf, sizeof(buf), "%u", port); + xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); + + kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); + snprintf(buf, sizeof(buf), "%4hx", port); + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); +} + +/** + * xprt_rdma_timer - invoked when an RPC times out + * @xprt: controlling RPC transport + * @task: RPC task that timed out + * + * Invoked when the transport is still connected, but an RPC + * retransmit timeout occurs. + * + * Since RDMA connections don't have a keep-alive, forcibly + * disconnect and retry to connect. This drives full + * detection of the network path, and retransmissions of + * all pending RPCs. + */ +static void +xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) +{ + xprt_force_disconnect(xprt); +} + +static void +xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + + if (r_xprt->rx_ep.rep_connected != 0) { + /* Reconnect */ + schedule_delayed_work(&r_xprt->rx_connect_worker, + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; + else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + } else { + schedule_delayed_work(&r_xprt->rx_connect_worker, 0); + if (!RPC_IS_ASYNC(task)) + flush_delayed_work(&r_xprt->rx_connect_worker); + } +} + +/** + * xprt_rdma_alloc_slot - allocate an rpc_rqst + * @xprt: controlling RPC transport + * @task: RPC task requesting a fresh rpc_rqst + * + * tk_status values: + * %0 if task->tk_rqstp points to a fresh rpc_rqst + * %-EAGAIN if no rpc_rqst is available; queued on backlog + */ +static void +xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_req *req; + + req = rpcrdma_buffer_get(&r_xprt->rx_buf); + if (!req) + goto out_sleep; + task->tk_rqstp = &req->rl_slot; + task->tk_status = 0; + return; + +out_sleep: + rpc_sleep_on(&xprt->backlog, task, NULL); + task->tk_status = -EAGAIN; +} + +/** + * xprt_rdma_free_slot - release an rpc_rqst + * @xprt: controlling RPC transport + * @rqst: rpc_rqst to release + * + */ +static void +xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) +{ + memset(rqst, 0, sizeof(*rqst)); + rpcrdma_buffer_put(rpcr_to_rdmar(rqst)); + rpc_wake_up_next(&xprt->backlog); +} + +static bool +rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + size_t size, gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + + if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size) + return true; + + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); + if (IS_ERR(rb)) + return false; + + rpcrdma_free_regbuf(req->rl_sendbuf); + r_xprt->rx_stats.hardway_register_count += size; + req->rl_sendbuf = rb; + return true; +} + +/* The rq_rcv_buf is used only if a Reply chunk is necessary. + * The decision to use a Reply chunk is made later in + * rpcrdma_marshal_req. 
This buffer is registered at that time. + * + * Otherwise, the associated RPC Reply arrives in a separate + * Receive buffer, arbitrarily chosen by the HCA. The buffer + * allocated here for the RPC Reply is not utilized in that + * case. See rpcrdma_inline_fixup. + * + * A regbuf is used here to remember the buffer size. + */ +static bool +rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + size_t size, gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + + if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size) + return true; + + rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags); + if (IS_ERR(rb)) + return false; + + rpcrdma_free_regbuf(req->rl_recvbuf); + r_xprt->rx_stats.hardway_register_count += size; + req->rl_recvbuf = rb; + return true; +} + +/** + * xprt_rdma_allocate - allocate transport resources for an RPC + * @task: RPC task + * + * Return values: + * 0: Success; rq_buffer points to RPC buffer to use + * ENOMEM: Out of memory, call again later + * EIO: A permanent error occurred, do not retry + * + * The RDMA allocate/free functions need the task structure as a place + * to hide the struct rpcrdma_req, which is necessary for the actual + * send/recv sequence. + * + * xprt_rdma_allocate provides buffers that are already mapped for + * DMA, and a local DMA lkey is provided for each. + */ +static int +xprt_rdma_allocate(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + gfp_t flags; + + flags = RPCRDMA_DEF_GFP; + if (RPC_IS_ASYNC(task)) + flags = GFP_NOWAIT | __GFP_NOWARN; + if (RPC_IS_SWAPPER(task)) + flags |= __GFP_MEMALLOC; + + if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags)) + goto out_fail; + if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) + goto out_fail; + + rqst->rq_buffer = req->rl_sendbuf->rg_base; + rqst->rq_rbuffer = req->rl_recvbuf->rg_base; + trace_xprtrdma_allocate(task, req); + return 0; + +out_fail: + trace_xprtrdma_allocate(task, NULL); + return -ENOMEM; +} + +/** + * xprt_rdma_free - release resources allocated by xprt_rdma_allocate + * @task: RPC task + * + * Caller guarantees rqst->rq_buffer is non-NULL. + */ +static void +xprt_rdma_free(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) + rpcrdma_release_rqst(r_xprt, req); + trace_xprtrdma_rpc_done(task, req); +} + +/** + * xprt_rdma_send_request - marshal and send an RPC request + * @task: RPC task with an RPC message in rq_snd_buf + * + * Caller holds the transport's write lock. + * + * Returns: + * %0 if the RPC message has been sent + * %-ENOTCONN if the caller should reconnect and call again + * %-EAGAIN if the caller should call again + * %-ENOBUFS if the caller should call again after a delay + * %-EIO if a permanent error occurred and the request was not + * sent. Do not try to send this message again. 
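+ *
+ * Note that a retransmission on the same connection (detected via the
+ * connect cookie) and an RPC that expects no reply both force a
+ * disconnect below, so that the flow-control credit grant stays
+ * accurate.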
+ */ +static int +xprt_rdma_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + int rc = 0; + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (unlikely(!rqst->rq_buffer)) + return xprt_rdma_bc_send_reply(rqst); +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + + if (!xprt_connected(xprt)) + goto drop_connection; + + rc = rpcrdma_marshal_req(r_xprt, rqst); + if (rc < 0) + goto failed_marshal; + + /* Must suppress retransmit to maintain credits */ + if (rqst->rq_connect_cookie == xprt->connect_cookie) + goto drop_connection; + rqst->rq_xtime = ktime_get(); + + __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + goto drop_connection; + + rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; + rqst->rq_bytes_sent = 0; + + /* An RPC with no reply will throw off credit accounting, + * so drop the connection to reset the credit grant. + */ + if (!rpc_reply_expected(task)) + goto drop_connection; + return 0; + +failed_marshal: + if (rc != -ENOTCONN) + return rc; +drop_connection: + xprt_disconnect_done(xprt); + return -ENOTCONN; /* implies disconnect */ +} + +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + long idle_time = 0; + + if (xprt_connected(xprt)) + idle_time = (long)(jiffies - xprt->last_used) / HZ; + + seq_puts(seq, "\txprt:\trdma "); + seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ", + 0, /* need a local port? */ + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ", + r_xprt->rx_stats.read_chunk_count, + r_xprt->rx_stats.write_chunk_count, + r_xprt->rx_stats.reply_chunk_count, + r_xprt->rx_stats.total_rdma_request, + r_xprt->rx_stats.total_rdma_reply, + r_xprt->rx_stats.pullup_copy_count, + r_xprt->rx_stats.fixup_copy_count, + r_xprt->rx_stats.hardway_register_count, + r_xprt->rx_stats.failed_marshal_count, + r_xprt->rx_stats.bad_reply_count, + r_xprt->rx_stats.nomsg_call_count); + seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", + r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_orphaned, + r_xprt->rx_stats.mrs_allocated, + r_xprt->rx_stats.local_inv_needed, + r_xprt->rx_stats.empty_sendctx_q, + r_xprt->rx_stats.reply_waits_for_send); +} + +static int +xprt_rdma_enable_swap(struct rpc_xprt *xprt) +{ + return 0; +} + +static void +xprt_rdma_disable_swap(struct rpc_xprt *xprt) +{ +} + +/* + * Plumbing for rpc transport switch and kernel module + */ + +static const struct rpc_xprt_ops xprt_rdma_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ + .alloc_slot = xprt_rdma_alloc_slot, + .free_slot = xprt_rdma_free_slot, + .release_request = xprt_release_rqst_cong, /* ditto */ + .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .timer = xprt_rdma_timer, + .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ + .set_port = xprt_rdma_set_port, + .connect = xprt_rdma_connect, + .buf_alloc = xprt_rdma_allocate, + .buf_free = xprt_rdma_free, + .send_request = xprt_rdma_send_request, + .close = xprt_rdma_close, + .destroy = xprt_rdma_destroy, + .print_stats = xprt_rdma_print_stats, + .enable_swap = 
xprt_rdma_enable_swap, + .disable_swap = xprt_rdma_disable_swap, + .inject_disconnect = xprt_rdma_inject_disconnect, +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + .bc_setup = xprt_rdma_bc_setup, + .bc_up = xprt_rdma_bc_up, + .bc_maxpayload = xprt_rdma_bc_maxpayload, + .bc_free_rqst = xprt_rdma_bc_free_rqst, + .bc_destroy = xprt_rdma_bc_destroy, +#endif +}; + +static struct xprt_class xprt_rdma = { + .list = LIST_HEAD_INIT(xprt_rdma.list), + .name = "rdma", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_RDMA, + .setup = xprt_setup_rdma, + .netid = { "rdma", "rdma6", "" }, +}; + +void xprt_rdma_cleanup(void) +{ + int rc; + + dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } +#endif + rc = xprt_unregister_transport(&xprt_rdma); + if (rc) + dprintk("RPC: %s: xprt_unregister returned %i\n", + __func__, rc); + + rpcrdma_destroy_wq(); + + rc = xprt_unregister_transport(&xprt_rdma_bc); + if (rc) + dprintk("RPC: %s: xprt_unregister(bc) returned %i\n", + __func__, rc); +} + +int xprt_rdma_init(void) +{ + int rc; + + rc = rpcrdma_alloc_wq(); + if (rc) + return rc; + + rc = xprt_register_transport(&xprt_rdma); + if (rc) { + rpcrdma_destroy_wq(); + return rc; + } + + rc = xprt_register_transport(&xprt_rdma_bc); + if (rc) { + xprt_unregister_transport(&xprt_rdma); + rpcrdma_destroy_wq(); + return rc; + } + + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); + + dprintk("Defaults:\n"); + dprintk("\tSlots %d\n" + "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", + xprt_rdma_slot_table_entries, + xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); + dprintk("\tPadding 0\n\tMemreg %d\n", xprt_rdma_memreg_strategy); + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + if (!sunrpc_table_header) + sunrpc_table_header = register_sysctl_table(sunrpc_table); +#endif + return 0; +} diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c new file mode 100644 index 000000000..ef1f3d076 --- /dev/null +++ b/net/sunrpc/xprtrdma/verbs.c @@ -0,0 +1,1572 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * verbs.c + * + * Encapsulates the major functions managing: + * o adapters + * o endpoints + * o connections + * o buffer memory + */ + +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/svc_rdma.h> + +#include <asm-generic/barrier.h> +#include <asm/bitops.h> + +#include <rdma/ib_cm.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + +/* + * Globals/Macros + */ + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* + * internal functions + */ +static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); +static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); +static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp); +static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); + +struct workqueue_struct *rpcrdma_receive_wq __read_mostly; + +int +rpcrdma_alloc_wq(void) +{ + struct workqueue_struct *recv_wq; + + recv_wq = alloc_workqueue("xprtrdma_receive", + WQ_MEM_RECLAIM | WQ_HIGHPRI, + 0); + if (!recv_wq) + return -ENOMEM; + + rpcrdma_receive_wq = recv_wq; + return 0; +} + +void +rpcrdma_destroy_wq(void) +{ + struct workqueue_struct *wq; + + if (rpcrdma_receive_wq) { + wq = rpcrdma_receive_wq; + rpcrdma_receive_wq = NULL; + destroy_workqueue(wq); + } +} + +static void +rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) +{ + struct rpcrdma_ep *ep = context; + struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, + rx_ep); + + trace_xprtrdma_qp_error(r_xprt, event); + pr_err("rpcrdma: %s on device %s ep %p\n", + ib_event_msg(event->event), event->device->name, context); + + if (ep->rep_connected == 1) { + ep->rep_connected = -EIO; + rpcrdma_conn_func(ep); + wake_up_all(&ep->rep_connect_wait); + } +} + +/** + * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + */ +static void +rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_sendctx *sc = + container_of(cqe, struct rpcrdma_sendctx, sc_cqe); + + /* WARNING: Only wr_cqe and status are reliable at this point */ + trace_xprtrdma_wc_send(sc, wc); + if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("rpcrdma: Send: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + + rpcrdma_sendctx_put_locked(sc); +} + +/** + * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC + * @cq: completion queue (ignored) + * @wc: completed WR + * + */ +static void +rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +{ + 
struct ib_cqe *cqe = wc->wr_cqe; + struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, + rr_cqe); + + /* WARNING: Only wr_id and status are reliable at this point */ + trace_xprtrdma_wc_receive(wc); + if (wc->status != IB_WC_SUCCESS) + goto out_fail; + + /* status == SUCCESS means all fields in wc are trustworthy */ + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); + rep->rr_wc_flags = wc->wc_flags; + rep->rr_inv_rkey = wc->ex.invalidate_rkey; + + ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), + rdmab_addr(rep->rr_rdmabuf), + wc->byte_len, DMA_FROM_DEVICE); + +out_schedule: + rpcrdma_reply_handler(rep); + return; + +out_fail: + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0); + goto out_schedule; +} + +static void +rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, + struct rdma_conn_param *param) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + const struct rpcrdma_connect_private *pmsg = param->private_data; + unsigned int rsize, wsize; + + /* Default settings for RPC-over-RDMA Version One */ + r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; + rsize = RPCRDMA_V1_DEF_INLINE_SIZE; + wsize = RPCRDMA_V1_DEF_INLINE_SIZE; + + if (pmsg && + pmsg->cp_magic == rpcrdma_cmp_magic && + pmsg->cp_version == RPCRDMA_CMP_VERSION) { + r_xprt->rx_ia.ri_implicit_roundup = true; + rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); + wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); + } + + if (rsize < cdata->inline_rsize) + cdata->inline_rsize = rsize; + if (wsize < cdata->inline_wsize) + cdata->inline_wsize = wsize; + dprintk("RPC: %s: max send %u, max recv %u\n", + __func__, cdata->inline_wsize, cdata->inline_rsize); + rpcrdma_set_max_header_sizes(r_xprt); +} + +static int +rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct rpcrdma_xprt *xprt = id->context; + struct rpcrdma_ia *ia = &xprt->rx_ia; + struct rpcrdma_ep *ep = &xprt->rx_ep; + int connstate = 0; + + trace_xprtrdma_conn_upcall(xprt, event); + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ia->ri_async_rc = 0; + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + ia->ri_async_rc = -EPROTO; + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_ROUTE_ERROR: + ia->ri_async_rc = -ENETUNREACH; + complete(&ia->ri_done); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + pr_info("rpcrdma: removing device %s for %s:%s\n", + ia->ri_device->name, + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); +#endif + init_completion(&ia->ri_remove_done); + set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); + ep->rep_connected = -ENODEV; + xprt_force_disconnect(&xprt->rx_xprt); + wait_for_completion(&ia->ri_remove_done); + + ia->ri_id = NULL; + ia->ri_device = NULL; + /* Return 1 to ensure the core destroys the id. 
*/ + return 1; + case RDMA_CM_EVENT_ESTABLISHED: + ++xprt->rx_xprt.connect_cookie; + connstate = 1; + rpcrdma_update_connect_private(xprt, &event->param.conn); + goto connected; + case RDMA_CM_EVENT_CONNECT_ERROR: + connstate = -ENOTCONN; + goto connected; + case RDMA_CM_EVENT_UNREACHABLE: + connstate = -ENETUNREACH; + goto connected; + case RDMA_CM_EVENT_REJECTED: + dprintk("rpcrdma: connection to %s:%s rejected: %s\n", + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), + rdma_reject_msg(id, event->status)); + connstate = -ECONNREFUSED; + if (event->status == IB_CM_REJ_STALE_CONN) + connstate = -EAGAIN; + goto connected; + case RDMA_CM_EVENT_DISCONNECTED: + ++xprt->rx_xprt.connect_cookie; + connstate = -ECONNABORTED; +connected: + ep->rep_connected = connstate; + rpcrdma_conn_func(ep); + wake_up_all(&ep->rep_connect_wait); + /*FALLTHROUGH*/ + default: + dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", + __func__, + rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), + ia->ri_device->name, ia->ri_ops->ro_displayname, + ep, rdma_event_msg(event->event)); + break; + } + + return 0; +} + +static struct rdma_cm_id * +rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) +{ + unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; + struct rdma_cm_id *id; + int rc; + + trace_xprtrdma_conn_start(xprt); + + init_completion(&ia->ri_done); + + id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_conn_upcall, + xprt, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + dprintk("RPC: %s: rdma_create_id() failed %i\n", + __func__, rc); + return id; + } + + ia->ri_async_rc = -ETIMEDOUT; + rc = rdma_resolve_addr(id, NULL, + (struct sockaddr *)&xprt->rx_xprt.addr, + RDMA_RESOLVE_TIMEOUT); + if (rc) { + dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", + __func__, rc); + goto out; + } + rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + if (rc < 0) { + trace_xprtrdma_conn_tout(xprt); + goto out; + } + + rc = ia->ri_async_rc; + if (rc) + goto out; + + ia->ri_async_rc = -ETIMEDOUT; + rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); + if (rc) { + dprintk("RPC: %s: rdma_resolve_route() failed %i\n", + __func__, rc); + goto out; + } + rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + if (rc < 0) { + trace_xprtrdma_conn_tout(xprt); + goto out; + } + rc = ia->ri_async_rc; + if (rc) + goto out; + + return id; + +out: + rdma_destroy_id(id); + return ERR_PTR(rc); +} + +/* + * Exported functions. + */ + +/** + * rpcrdma_ia_open - Open and initialize an Interface Adapter. + * @xprt: transport with IA to (re)initialize + * + * Returns 0 on success, negative errno if an appropriate + * Interface Adapter could not be found and opened. 
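+ *
+ * The memory registration strategy is also selected here: FRWR is
+ * preferred, with a fallback to FMR when the device cannot support
+ * it (see the xprt_rdma_memreg_strategy switch below).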
+ */ +int +rpcrdma_ia_open(struct rpcrdma_xprt *xprt) +{ + struct rpcrdma_ia *ia = &xprt->rx_ia; + int rc; + + ia->ri_id = rpcrdma_create_id(xprt, ia); + if (IS_ERR(ia->ri_id)) { + rc = PTR_ERR(ia->ri_id); + goto out_err; + } + ia->ri_device = ia->ri_id->device; + + ia->ri_pd = ib_alloc_pd(ia->ri_device, 0); + if (IS_ERR(ia->ri_pd)) { + rc = PTR_ERR(ia->ri_pd); + pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); + goto out_err; + } + + switch (xprt_rdma_memreg_strategy) { + case RPCRDMA_FRWR: + if (frwr_is_supported(ia)) { + ia->ri_ops = &rpcrdma_frwr_memreg_ops; + break; + } + /*FALLTHROUGH*/ + case RPCRDMA_MTHCAFMR: + if (fmr_is_supported(ia)) { + ia->ri_ops = &rpcrdma_fmr_memreg_ops; + break; + } + /*FALLTHROUGH*/ + default: + pr_err("rpcrdma: Device %s does not support memreg mode %d\n", + ia->ri_device->name, xprt_rdma_memreg_strategy); + rc = -EINVAL; + goto out_err; + } + + return 0; + +out_err: + rpcrdma_ia_close(ia); + return rc; +} + +/** + * rpcrdma_ia_remove - Handle device driver unload + * @ia: interface adapter being removed + * + * Divest transport H/W resources associated with this adapter, + * but allow it to be restored later. + */ +void +rpcrdma_ia_remove(struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, + rx_ia); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req; + struct rpcrdma_rep *rep; + + cancel_delayed_work_sync(&buf->rb_refresh_worker); + + /* This is similar to rpcrdma_ep_destroy, but: + * - Don't cancel the connect worker. + * - Don't call rpcrdma_ep_disconnect, which waits + * for another conn upcall, which will deadlock. + * - rdma_disconnect is unneeded, the underlying + * connection is already gone. + */ + if (ia->ri_id->qp) { + ib_drain_qp(ia->ri_id->qp); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; + } + ib_free_cq(ep->rep_attr.recv_cq); + ep->rep_attr.recv_cq = NULL; + ib_free_cq(ep->rep_attr.send_cq); + ep->rep_attr.send_cq = NULL; + + /* The ULP is responsible for ensuring all DMA + * mappings and MRs are gone. + */ + list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) + rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf); + list_for_each_entry(req, &buf->rb_allreqs, rl_all) { + rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf); + rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); + rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); + } + rpcrdma_mrs_destroy(buf); + ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; + + /* Allow waiters to continue */ + complete(&ia->ri_remove_done); + + trace_xprtrdma_remove(r_xprt); +} + +/** + * rpcrdma_ia_close - Clean up/close an IA. + * @ia: interface adapter to close + * + */ +void +rpcrdma_ia_close(struct rpcrdma_ia *ia) +{ + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { + if (ia->ri_id->qp) + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + } + ia->ri_id = NULL; + ia->ri_device = NULL; + + /* If the pd is still busy, xprtrdma missed freeing a resource */ + if (ia->ri_pd && !IS_ERR(ia->ri_pd)) + ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; +} + +/* + * Create unconnected endpoint. 
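+ *
+ * This allocates the Send and Receive completion queues, fills in the
+ * QP attributes and the RDMA-CM private message (which advertises the
+ * client's inline buffer sizes), and sets the CM retry parameters.
+ * The QP itself is created later, at connect time.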
+ */ +int +rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + struct rpcrdma_create_data_internal *cdata) +{ + struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; + struct ib_cq *sendcq, *recvcq; + unsigned int max_sge; + int rc; + + max_sge = min_t(unsigned int, ia->ri_device->attrs.max_send_sge, + RPCRDMA_MAX_SEND_SGES); + if (max_sge < RPCRDMA_MIN_SEND_SGES) { + pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge); + return -ENOMEM; + } + ia->ri_max_send_sges = max_sge; + + rc = ia->ri_ops->ro_open(ia, ep, cdata); + if (rc) + return rc; + + ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; + ep->rep_attr.qp_context = ep; + ep->rep_attr.srq = NULL; + ep->rep_attr.cap.max_send_sge = max_sge; + ep->rep_attr.cap.max_recv_sge = 1; + ep->rep_attr.cap.max_inline_data = 0; + ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + ep->rep_attr.qp_type = IB_QPT_RC; + ep->rep_attr.port_num = ~0; + + dprintk("RPC: %s: requested max: dtos: send %d recv %d; " + "iovs: send %d recv %d\n", + __func__, + ep->rep_attr.cap.max_send_wr, + ep->rep_attr.cap.max_recv_wr, + ep->rep_attr.cap.max_send_sge, + ep->rep_attr.cap.max_recv_sge); + + /* set trigger for requesting send completion */ + ep->rep_send_batch = min_t(unsigned int, RPCRDMA_MAX_SEND_BATCH, + cdata->max_requests >> 2); + ep->rep_send_count = ep->rep_send_batch; + init_waitqueue_head(&ep->rep_connect_wait); + INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + + sendcq = ib_alloc_cq(ia->ri_device, NULL, + ep->rep_attr.cap.max_send_wr + 1, + ia->ri_device->num_comp_vectors > 1 ? 1 : 0, + IB_POLL_WORKQUEUE); + if (IS_ERR(sendcq)) { + rc = PTR_ERR(sendcq); + dprintk("RPC: %s: failed to create send CQ: %i\n", + __func__, rc); + goto out1; + } + + recvcq = ib_alloc_cq(ia->ri_device, NULL, + ep->rep_attr.cap.max_recv_wr + 1, + 0, IB_POLL_WORKQUEUE); + if (IS_ERR(recvcq)) { + rc = PTR_ERR(recvcq); + dprintk("RPC: %s: failed to create recv CQ: %i\n", + __func__, rc); + goto out2; + } + + ep->rep_attr.send_cq = sendcq; + ep->rep_attr.recv_cq = recvcq; + + /* Initialize cma parameters */ + memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); + + /* Prepare RDMA-CM private message */ + pmsg->cp_magic = rpcrdma_cmp_magic; + pmsg->cp_version = RPCRDMA_CMP_VERSION; + pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok; + pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize); + ep->rep_remote_cma.private_data = pmsg; + ep->rep_remote_cma.private_data_len = sizeof(*pmsg); + + /* Client offers RDMA Read but does not initiate */ + ep->rep_remote_cma.initiator_depth = 0; + ep->rep_remote_cma.responder_resources = + min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom); + + /* Limit transport retries so client can detect server + * GID changes quickly. RPC layer handles re-establishing + * transport connection and retransmission. + */ + ep->rep_remote_cma.retry_count = 6; + + /* RPC-over-RDMA handles its own flow control. In addition, + * make all RNR NAKs visible so we know that RPC-over-RDMA + * flow control is working correctly (no NAKs should be seen). + */ + ep->rep_remote_cma.flow_control = 0; + ep->rep_remote_cma.rnr_retry_count = 0; + + return 0; + +out2: + ib_free_cq(sendcq); +out1: + return rc; +} + +/* + * rpcrdma_ep_destroy + * + * Disconnect and destroy endpoint. After this, the only + * valid operations on the ep are to free it (if dynamically + * allocated) or re-create it. 
+ */ +void +rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + cancel_delayed_work_sync(&ep->rep_connect_worker); + + if (ia->ri_id && ia->ri_id->qp) { + rpcrdma_ep_disconnect(ep, ia); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; + } + + if (ep->rep_attr.recv_cq) + ib_free_cq(ep->rep_attr.recv_cq); + if (ep->rep_attr.send_cq) + ib_free_cq(ep->rep_attr.send_cq); +} + +/* Re-establish a connection after a device removal event. + * Unlike a normal reconnection, a fresh PD and a new set + * of MRs and buffers is needed. + */ +static int +rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + int rc, err; + + trace_xprtrdma_reinsert(r_xprt); + + rc = -EHOSTUNREACH; + if (rpcrdma_ia_open(r_xprt)) + goto out1; + + rc = -ENOMEM; + err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data); + if (err) { + pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); + goto out2; + } + + rc = -ENETUNREACH; + err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (err) { + pr_err("rpcrdma: rdma_create_qp returned %d\n", err); + goto out3; + } + + rpcrdma_mrs_create(r_xprt); + return 0; + +out3: + rpcrdma_ep_destroy(ep, ia); +out2: + rpcrdma_ia_close(ia); +out1: + return rc; +} + +static int +rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, + struct rpcrdma_ia *ia) +{ + struct rdma_cm_id *id, *old; + int err, rc; + + trace_xprtrdma_reconnect(r_xprt); + + rpcrdma_ep_disconnect(ep, ia); + + rc = -EHOSTUNREACH; + id = rpcrdma_create_id(r_xprt, ia); + if (IS_ERR(id)) + goto out; + + /* As long as the new ID points to the same device as the + * old ID, we can reuse the transport's existing PD and all + * previously allocated MRs. Also, the same device means + * the transport's previous DMA mappings are still valid. + * + * This is a sanity check only. There should be no way these + * point to two different devices here. + */ + old = id; + rc = -ENETUNREACH; + if (ia->ri_device != id->device) { + pr_err("rpcrdma: can't reconnect on different device!\n"); + goto out_destroy; + } + + err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + if (err) { + dprintk("RPC: %s: rdma_create_qp returned %d\n", + __func__, err); + goto out_destroy; + } + + /* Atomically replace the transport's ID and QP. */ + rc = 0; + old = ia->ri_id; + ia->ri_id = id; + rdma_destroy_qp(old); + +out_destroy: + rdma_destroy_id(old); +out: + return rc; +} + +/* + * Connect unconnected endpoint. 
+ */ +int +rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, + rx_ia); + int rc; + +retry: + switch (ep->rep_connected) { + case 0: + dprintk("RPC: %s: connecting...\n", __func__); + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (rc) { + dprintk("RPC: %s: rdma_create_qp failed %i\n", + __func__, rc); + rc = -ENETUNREACH; + goto out_noupdate; + } + break; + case -ENODEV: + rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia); + if (rc) + goto out_noupdate; + break; + default: + rc = rpcrdma_ep_reconnect(r_xprt, ep, ia); + if (rc) + goto out; + } + + ep->rep_connected = 0; + rpcrdma_post_recvs(r_xprt, true); + + rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); + if (rc) { + dprintk("RPC: %s: rdma_connect() failed with %i\n", + __func__, rc); + goto out; + } + + wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); + if (ep->rep_connected <= 0) { + if (ep->rep_connected == -EAGAIN) + goto retry; + rc = ep->rep_connected; + goto out; + } + + dprintk("RPC: %s: connected\n", __func__); + +out: + if (rc) + ep->rep_connected = rc; + +out_noupdate: + return rc; +} + +/* + * rpcrdma_ep_disconnect + * + * This is separate from destroy to facilitate the ability + * to reconnect without recreating the endpoint. + * + * This call is not reentrant, and must not be made in parallel + * on the same endpoint. + */ +void +rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + int rc; + + rc = rdma_disconnect(ia->ri_id); + if (!rc) + /* returns without wait if not connected */ + wait_event_interruptible(ep->rep_connect_wait, + ep->rep_connected != 1); + else + ep->rep_connected = rc; + trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt, + rx_ep), rc); + + ib_drain_qp(ia->ri_id->qp); +} + +/* Fixed-size circular FIFO queue. This implementation is wait-free and + * lock-free. + * + * Consumer is the code path that posts Sends. This path dequeues a + * sendctx for use by a Send operation. Multiple consumer threads + * are serialized by the RPC transport lock, which allows only one + * ->send_request call at a time. + * + * Producer is the code path that handles Send completions. This path + * enqueues a sendctx that has been completed. Multiple producer + * threads are serialized by the ib_poll_cq() function. + */ + +/* rpcrdma_sendctxs_destroy() assumes caller has already quiesced + * queue activity, and ib_drain_qp has flushed all remaining Send + * requests. + */ +static void rpcrdma_sendctxs_destroy(struct rpcrdma_buffer *buf) +{ + unsigned long i; + + for (i = 0; i <= buf->rb_sc_last; i++) + kfree(buf->rb_sc_ctxs[i]); + kfree(buf->rb_sc_ctxs); +} + +static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ia *ia) +{ + struct rpcrdma_sendctx *sc; + + sc = kzalloc(sizeof(*sc) + + ia->ri_max_send_sges * sizeof(struct ib_sge), + GFP_KERNEL); + if (!sc) + return NULL; + + sc->sc_wr.wr_cqe = &sc->sc_cqe; + sc->sc_wr.sg_list = sc->sc_sges; + sc->sc_wr.opcode = IB_WR_SEND; + sc->sc_cqe.done = rpcrdma_wc_send; + return sc; +} + +static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_sendctx *sc; + unsigned long i; + + /* Maximum number of concurrent outstanding Send WRs. Capping + * the circular queue size stops Send Queue overflow by causing + * the ->send_request call to fail temporarily before too many + * Sends are posted. 
+ */ + i = buf->rb_max_requests + RPCRDMA_MAX_BC_REQUESTS; + dprintk("RPC: %s: allocating %lu send_ctxs\n", __func__, i); + buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); + if (!buf->rb_sc_ctxs) + return -ENOMEM; + + buf->rb_sc_last = i - 1; + for (i = 0; i <= buf->rb_sc_last; i++) { + sc = rpcrdma_sendctx_create(&r_xprt->rx_ia); + if (!sc) + return -ENOMEM; + + sc->sc_xprt = r_xprt; + buf->rb_sc_ctxs[i] = sc; + } + buf->rb_flags = 0; + + return 0; +} + +/* The sendctx queue is not guaranteed to have a size that is a + * power of two, thus the helpers in circ_buf.h cannot be used. + * The other option is to use modulus (%), which can be expensive. + */ +static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, + unsigned long item) +{ + return likely(item < buf->rb_sc_last) ? item + 1 : 0; +} + +/** + * rpcrdma_sendctx_get_locked - Acquire a send context + * @buf: transport buffers from which to acquire an unused context + * + * Returns pointer to a free send completion context; or NULL if + * the queue is empty. + * + * Usage: Called to acquire an SGE array before preparing a Send WR. + * + * The caller serializes calls to this function (per rpcrdma_buffer), + * and provides an effective memory barrier that flushes the new value + * of rb_sc_head. + */ +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_xprt *r_xprt; + struct rpcrdma_sendctx *sc; + unsigned long next_head; + + next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head); + + if (next_head == READ_ONCE(buf->rb_sc_tail)) + goto out_emptyq; + + /* ORDER: item must be accessed _before_ head is updated */ + sc = buf->rb_sc_ctxs[next_head]; + + /* Releasing the lock in the caller acts as a memory + * barrier that flushes rb_sc_head. + */ + buf->rb_sc_head = next_head; + + return sc; + +out_emptyq: + /* The queue is "empty" if there have not been enough Send + * completions recently. This is a sign the Send Queue is + * backing up. Cause the caller to pause and try again. + */ + set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags); + r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); + r_xprt->rx_stats.empty_sendctx_q++; + return NULL; +} + +/** + * rpcrdma_sendctx_put_locked - Release a send context + * @sc: send context to release + * + * Usage: Called from Send completion to return a sendctxt + * to the queue. + * + * The caller serializes calls to this function (per rpcrdma_buffer). + */ +static void +rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) +{ + struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf; + unsigned long next_tail; + + /* Unmap SGEs of previously completed by unsignaled + * Sends by walking up the queue until @sc is found. 
+ */ + next_tail = buf->rb_sc_tail; + do { + next_tail = rpcrdma_sendctx_next(buf, next_tail); + + /* ORDER: item must be accessed _before_ tail is updated */ + rpcrdma_unmap_sendctx(buf->rb_sc_ctxs[next_tail]); + + } while (buf->rb_sc_ctxs[next_tail] != sc); + + /* Paired with READ_ONCE */ + smp_store_release(&buf->rb_sc_tail, next_tail); + + if (test_and_clear_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags)) { + smp_mb__after_atomic(); + xprt_write_space(&sc->sc_xprt->rx_xprt); + } +} + +static void +rpcrdma_mr_recovery_worker(struct work_struct *work) +{ + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, + rb_recovery_worker.work); + struct rpcrdma_mr *mr; + + spin_lock(&buf->rb_recovery_lock); + while (!list_empty(&buf->rb_stale_mrs)) { + mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); + spin_unlock(&buf->rb_recovery_lock); + + trace_xprtrdma_recover_mr(mr); + mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); + + spin_lock(&buf->rb_recovery_lock); + } + spin_unlock(&buf->rb_recovery_lock); +} + +void +rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + + spin_lock(&buf->rb_recovery_lock); + rpcrdma_mr_push(mr, &buf->rb_stale_mrs); + spin_unlock(&buf->rb_recovery_lock); + + schedule_delayed_work(&buf->rb_recovery_worker, 0); +} + +static void +rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + unsigned int count; + LIST_HEAD(free); + LIST_HEAD(all); + + for (count = 0; count < 3; count++) { + struct rpcrdma_mr *mr; + int rc; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + break; + + rc = ia->ri_ops->ro_init_mr(ia, mr); + if (rc) { + kfree(mr); + break; + } + + mr->mr_xprt = r_xprt; + + list_add(&mr->mr_list, &free); + list_add(&mr->mr_all, &all); + } + + spin_lock(&buf->rb_mrlock); + list_splice(&free, &buf->rb_mrs); + list_splice(&all, &buf->rb_all); + r_xprt->rx_stats.mrs_allocated += count; + spin_unlock(&buf->rb_mrlock); + trace_xprtrdma_createmrs(r_xprt, count); + + xprt_write_space(&r_xprt->rx_xprt); +} + +static void +rpcrdma_mr_refresh_worker(struct work_struct *work) +{ + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, + rb_refresh_worker.work); + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, + rx_buf); + + rpcrdma_mrs_create(r_xprt); +} + +struct rpcrdma_req * +rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; + struct rpcrdma_regbuf *rb; + struct rpcrdma_req *req; + + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (req == NULL) + return ERR_PTR(-ENOMEM); + + rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, + DMA_TO_DEVICE, GFP_KERNEL); + if (IS_ERR(rb)) { + kfree(req); + return ERR_PTR(-ENOMEM); + } + req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); + req->rl_buffer = buffer; + INIT_LIST_HEAD(&req->rl_registered); + + spin_lock(&buffer->rb_reqslock); + list_add(&req->rl_all, &buffer->rb_allreqs); + spin_unlock(&buffer->rb_reqslock); + return req; +} + +static int +rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_rep *rep; + int rc; + + rc = -ENOMEM; + rep = kzalloc(sizeof(*rep), GFP_KERNEL); + if (rep == NULL) + goto out; + + rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize, + DMA_FROM_DEVICE, 
GFP_KERNEL); + if (IS_ERR(rep->rr_rdmabuf)) { + rc = PTR_ERR(rep->rr_rdmabuf); + goto out_free; + } + xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base, + rdmab_length(rep->rr_rdmabuf)); + + rep->rr_cqe.done = rpcrdma_wc_receive; + rep->rr_rxprt = r_xprt; + INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion); + rep->rr_recv_wr.next = NULL; + rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; + rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; + rep->rr_recv_wr.num_sge = 1; + rep->rr_temp = temp; + + spin_lock(&buf->rb_lock); + list_add(&rep->rr_list, &buf->rb_recv_bufs); + spin_unlock(&buf->rb_lock); + return 0; + +out_free: + kfree(rep); +out: + dprintk("RPC: %s: reply buffer %d alloc failed\n", + __func__, rc); + return rc; +} + +int +rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + int i, rc; + + buf->rb_max_requests = r_xprt->rx_data.max_requests; + buf->rb_bc_srv_max_requests = 0; + spin_lock_init(&buf->rb_mrlock); + spin_lock_init(&buf->rb_lock); + spin_lock_init(&buf->rb_recovery_lock); + INIT_LIST_HEAD(&buf->rb_mrs); + INIT_LIST_HEAD(&buf->rb_all); + INIT_LIST_HEAD(&buf->rb_stale_mrs); + INIT_DELAYED_WORK(&buf->rb_refresh_worker, + rpcrdma_mr_refresh_worker); + INIT_DELAYED_WORK(&buf->rb_recovery_worker, + rpcrdma_mr_recovery_worker); + + rpcrdma_mrs_create(r_xprt); + + INIT_LIST_HEAD(&buf->rb_send_bufs); + INIT_LIST_HEAD(&buf->rb_allreqs); + spin_lock_init(&buf->rb_reqslock); + for (i = 0; i < buf->rb_max_requests; i++) { + struct rpcrdma_req *req; + + req = rpcrdma_create_req(r_xprt); + if (IS_ERR(req)) { + dprintk("RPC: %s: request buffer %d alloc" + " failed\n", __func__, i); + rc = PTR_ERR(req); + goto out; + } + list_add(&req->rl_list, &buf->rb_send_bufs); + } + + buf->rb_credits = 1; + buf->rb_posted_receives = 0; + INIT_LIST_HEAD(&buf->rb_recv_bufs); + + rc = rpcrdma_sendctxs_create(r_xprt); + if (rc) + goto out; + + return 0; +out: + rpcrdma_buffer_destroy(buf); + return rc; +} + +static void +rpcrdma_destroy_rep(struct rpcrdma_rep *rep) +{ + rpcrdma_free_regbuf(rep->rr_rdmabuf); + kfree(rep); +} + +void +rpcrdma_destroy_req(struct rpcrdma_req *req) +{ + rpcrdma_free_regbuf(req->rl_recvbuf); + rpcrdma_free_regbuf(req->rl_sendbuf); + rpcrdma_free_regbuf(req->rl_rdmabuf); + kfree(req); +} + +static void +rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, + rx_buf); + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct rpcrdma_mr *mr; + unsigned int count; + + count = 0; + spin_lock(&buf->rb_mrlock); + while (!list_empty(&buf->rb_all)) { + mr = list_entry(buf->rb_all.next, struct rpcrdma_mr, mr_all); + list_del(&mr->mr_all); + + spin_unlock(&buf->rb_mrlock); + + /* Ensure MW is not on any rl_registered list */ + if (!list_empty(&mr->mr_list)) + list_del(&mr->mr_list); + + ia->ri_ops->ro_release_mr(mr); + count++; + spin_lock(&buf->rb_mrlock); + } + spin_unlock(&buf->rb_mrlock); + r_xprt->rx_stats.mrs_allocated = 0; + + dprintk("RPC: %s: released %u MRs\n", __func__, count); +} + +void +rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) +{ + cancel_delayed_work_sync(&buf->rb_recovery_worker); + cancel_delayed_work_sync(&buf->rb_refresh_worker); + + rpcrdma_sendctxs_destroy(buf); + + while (!list_empty(&buf->rb_recv_bufs)) { + struct rpcrdma_rep *rep; + + rep = list_first_entry(&buf->rb_recv_bufs, + struct rpcrdma_rep, rr_list); + list_del(&rep->rr_list); + rpcrdma_destroy_rep(rep); + } + + spin_lock(&buf->rb_reqslock); + while 
(!list_empty(&buf->rb_allreqs)) { + struct rpcrdma_req *req; + + req = list_first_entry(&buf->rb_allreqs, + struct rpcrdma_req, rl_all); + list_del(&req->rl_all); + + spin_unlock(&buf->rb_reqslock); + rpcrdma_destroy_req(req); + spin_lock(&buf->rb_reqslock); + } + spin_unlock(&buf->rb_reqslock); + + rpcrdma_mrs_destroy(buf); +} + +/** + * rpcrdma_mr_get - Allocate an rpcrdma_mr object + * @r_xprt: controlling transport + * + * Returns an initialized rpcrdma_mr or NULL if no free + * rpcrdma_mr objects are available. + */ +struct rpcrdma_mr * +rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_mr *mr = NULL; + + spin_lock(&buf->rb_mrlock); + if (!list_empty(&buf->rb_mrs)) + mr = rpcrdma_mr_pop(&buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); + + if (!mr) + goto out_nomrs; + return mr; + +out_nomrs: + trace_xprtrdma_nomrs(r_xprt); + if (r_xprt->rx_ep.rep_connected != -ENODEV) + schedule_delayed_work(&buf->rb_refresh_worker, 0); + + /* Allow the reply handler and refresh worker to run */ + cond_resched(); + + return NULL; +} + +static void +__rpcrdma_mr_put(struct rpcrdma_buffer *buf, struct rpcrdma_mr *mr) +{ + spin_lock(&buf->rb_mrlock); + rpcrdma_mr_push(mr, &buf->rb_mrs); + spin_unlock(&buf->rb_mrlock); +} + +/** + * rpcrdma_mr_put - Release an rpcrdma_mr object + * @mr: object to release + * + */ +void +rpcrdma_mr_put(struct rpcrdma_mr *mr) +{ + __rpcrdma_mr_put(&mr->mr_xprt->rx_buf, mr); +} + +/** + * rpcrdma_mr_unmap_and_put - DMA unmap an MR and release it + * @mr: object to release + * + */ +void +rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) +{ + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + + trace_xprtrdma_dma_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + __rpcrdma_mr_put(&r_xprt->rx_buf, mr); +} + +/** + * rpcrdma_buffer_get - Get a request buffer + * @buffers: Buffer pool from which to obtain a buffer + * + * Returns a fresh rpcrdma_req, or NULL if none are available. + */ +struct rpcrdma_req * +rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) +{ + struct rpcrdma_req *req; + + spin_lock(&buffers->rb_lock); + req = list_first_entry_or_null(&buffers->rb_send_bufs, + struct rpcrdma_req, rl_list); + if (req) + list_del_init(&req->rl_list); + spin_unlock(&buffers->rb_lock); + return req; +} + +/** + * rpcrdma_buffer_put - Put request/reply buffers back into pool + * @req: object to return + * + */ +void +rpcrdma_buffer_put(struct rpcrdma_req *req) +{ + struct rpcrdma_buffer *buffers = req->rl_buffer; + struct rpcrdma_rep *rep = req->rl_reply; + + req->rl_reply = NULL; + + spin_lock(&buffers->rb_lock); + list_add(&req->rl_list, &buffers->rb_send_bufs); + if (rep) { + if (!rep->rr_temp) { + list_add(&rep->rr_list, &buffers->rb_recv_bufs); + rep = NULL; + } + } + spin_unlock(&buffers->rb_lock); + if (rep) + rpcrdma_destroy_rep(rep); +} + +/* + * Put reply buffers back into pool when not attached to + * request. This happens in error conditions. 
+ */ +void +rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) +{ + struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; + + if (!rep->rr_temp) { + spin_lock(&buffers->rb_lock); + list_add(&rep->rr_list, &buffers->rb_recv_bufs); + spin_unlock(&buffers->rb_lock); + } else { + rpcrdma_destroy_rep(rep); + } +} + +/** + * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers + * @size: size of buffer to be allocated, in bytes + * @direction: direction of data movement + * @flags: GFP flags + * + * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that + * can be persistently DMA-mapped for I/O. + * + * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for + * receiving the payload of RDMA RECV operations. During Long Calls + * or Replies they may be registered externally via ro_map. + */ +struct rpcrdma_regbuf * +rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, + gfp_t flags) +{ + struct rpcrdma_regbuf *rb; + + rb = kmalloc(sizeof(*rb) + size, flags); + if (rb == NULL) + return ERR_PTR(-ENOMEM); + + rb->rg_device = NULL; + rb->rg_direction = direction; + rb->rg_iov.length = size; + + return rb; +} + +/** + * __rpcrdma_map_regbuf - DMA-map a regbuf + * @ia: controlling rpcrdma_ia + * @rb: regbuf to be mapped + */ +bool +__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +{ + struct ib_device *device = ia->ri_device; + + if (rb->rg_direction == DMA_NONE) + return false; + + rb->rg_iov.addr = ib_dma_map_single(device, + (void *)rb->rg_base, + rdmab_length(rb), + rb->rg_direction); + if (ib_dma_mapping_error(device, rdmab_addr(rb))) + return false; + + rb->rg_device = device; + rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; + return true; +} + +static void +rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb) +{ + if (!rb) + return; + + if (!rpcrdma_regbuf_is_mapped(rb)) + return; + + ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), + rdmab_length(rb), rb->rg_direction); + rb->rg_device = NULL; +} + +/** + * rpcrdma_free_regbuf - deregister and free registered buffer + * @rb: regbuf to be deregistered and freed + */ +void +rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb) +{ + rpcrdma_dma_unmap_regbuf(rb); + kfree(rb); +} + +/* + * Prepost any receive buffer, then post send. + * + * Receive buffer is donated to hardware, reclaimed upon recv completion. 
+ */ +int +rpcrdma_ep_post(struct rpcrdma_ia *ia, + struct rpcrdma_ep *ep, + struct rpcrdma_req *req) +{ + struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; + int rc; + + if (!ep->rep_send_count || + test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { + send_wr->send_flags |= IB_SEND_SIGNALED; + ep->rep_send_count = ep->rep_send_batch; + } else { + send_wr->send_flags &= ~IB_SEND_SIGNALED; + --ep->rep_send_count; + } + + rc = ia->ri_ops->ro_send(ia, req); + trace_xprtrdma_post_send(req, rc); + if (rc) + return -ENOTCONN; + return 0; +} + +/** + * rpcrdma_post_recvs - Maybe post some Receive buffers + * @r_xprt: controlling transport + * @temp: when true, allocate temp rpcrdma_rep objects + * + */ +void +rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_recv_wr *wr, *bad_wr; + int needed, count, rc; + + needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); + if (buf->rb_posted_receives > needed) + return; + needed -= buf->rb_posted_receives; + + count = 0; + wr = NULL; + while (needed) { + struct rpcrdma_regbuf *rb; + struct rpcrdma_rep *rep; + + spin_lock(&buf->rb_lock); + rep = list_first_entry_or_null(&buf->rb_recv_bufs, + struct rpcrdma_rep, rr_list); + if (likely(rep)) + list_del(&rep->rr_list); + spin_unlock(&buf->rb_lock); + if (!rep) { + if (rpcrdma_create_rep(r_xprt, temp)) + break; + continue; + } + + rb = rep->rr_rdmabuf; + if (!rpcrdma_regbuf_is_mapped(rb)) { + if (!__rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, rb)) { + rpcrdma_recv_buffer_put(rep); + break; + } + } + + trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe); + rep->rr_recv_wr.next = wr; + wr = &rep->rr_recv_wr; + ++count; + --needed; + } + if (!count) + return; + + rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, + (const struct ib_recv_wr **)&bad_wr); + if (rc) { + for (wr = bad_wr; wr;) { + struct rpcrdma_rep *rep; + + rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr); + wr = wr->next; + rpcrdma_recv_buffer_put(rep); + --count; + } + } + buf->rb_posted_receives += count; + trace_xprtrdma_post_recvs(r_xprt, count, rc); +} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h new file mode 100644 index 000000000..2ca14f7c2 --- /dev/null +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -0,0 +1,675 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_SUNRPC_XPRT_RDMA_H +#define _LINUX_SUNRPC_XPRT_RDMA_H + +#include <linux/wait.h> /* wait_queue_head_t, etc */ +#include <linux/spinlock.h> /* spinlock_t, etc */ +#include <linux/atomic.h> /* atomic_t, etc */ +#include <linux/workqueue.h> /* struct work_struct */ + +#include <rdma/rdma_cm.h> /* RDMA connection api */ +#include <rdma/ib_verbs.h> /* RDMA verbs api */ + +#include <linux/sunrpc/clnt.h> /* rpc_xprt */ +#include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ +#include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ + +#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ +#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ + +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + +/* + * Interface Adapter -- one per transport instance + */ +struct rpcrdma_ia { + const struct rpcrdma_memreg_ops *ri_ops; + struct ib_device *ri_device; + struct rdma_cm_id *ri_id; + struct ib_pd *ri_pd; + struct completion ri_done; + struct completion ri_remove_done; + int ri_async_rc; + unsigned int ri_max_segs; + unsigned int ri_max_frwr_depth; + unsigned int ri_max_inline_write; + unsigned int ri_max_inline_read; + unsigned int ri_max_send_sges; + bool ri_implicit_roundup; + enum ib_mr_type ri_mrtype; + unsigned long ri_flags; + struct ib_qp_attr ri_qp_attr; + struct ib_qp_init_attr ri_qp_init_attr; +}; + +enum { + RPCRDMA_IAF_REMOVING = 0, +}; + +/* + * RDMA Endpoint -- one per transport instance + */ + +struct rpcrdma_ep { + unsigned int rep_send_count; + unsigned int rep_send_batch; + int rep_connected; + struct ib_qp_init_attr rep_attr; + wait_queue_head_t rep_connect_wait; + struct rpcrdma_connect_private rep_cm_private; + struct rdma_conn_param rep_remote_cma; + struct delayed_work rep_connect_worker; +}; + +/* Pre-allocate extra Work Requests for handling backward receives + * and sends. This is a fixed value because the Work Queues are + * allocated when the forward channel is set up. + */ +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +#define RPCRDMA_BACKWARD_WRS (8) +#else +#define RPCRDMA_BACKWARD_WRS (0) +#endif + +/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV + * + * The below structure appears at the front of a large region of kmalloc'd + * memory, which always starts on a good alignment boundary. 
+ */ + +struct rpcrdma_regbuf { + struct ib_sge rg_iov; + struct ib_device *rg_device; + enum dma_data_direction rg_direction; + __be32 rg_base[0] __attribute__ ((aligned(256))); +}; + +static inline u64 +rdmab_addr(struct rpcrdma_regbuf *rb) +{ + return rb->rg_iov.addr; +} + +static inline u32 +rdmab_length(struct rpcrdma_regbuf *rb) +{ + return rb->rg_iov.length; +} + +static inline u32 +rdmab_lkey(struct rpcrdma_regbuf *rb) +{ + return rb->rg_iov.lkey; +} + +static inline struct ib_device * +rdmab_device(struct rpcrdma_regbuf *rb) +{ + return rb->rg_device; +} + +#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) + +/* To ensure a transport can always make forward progress, + * the number of RDMA segments allowed in header chunk lists + * is capped at 8. This prevents less-capable devices and + * memory registrations from overrunning the Send buffer + * while building chunk lists. + * + * Elements of the Read list take up more room than the + * Write list or Reply chunk. 8 read segments means the Read + * list (or Write list or Reply chunk) cannot consume more + * than + * + * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes. + * + * And the fixed part of the header is another 24 bytes. + * + * The smallest inline threshold is 1024 bytes, ensuring that + * at least 750 bytes are available for RPC messages. + */ +enum { + RPCRDMA_MAX_HDR_SEGS = 8, + RPCRDMA_HDRBUF_SIZE = 256, +}; + +/* + * struct rpcrdma_rep -- this structure encapsulates state required + * to receive and complete an RPC Reply, asychronously. It needs + * several pieces of state: + * + * o receive buffer and ib_sge (donated to provider) + * o status of receive (success or not, length, inv rkey) + * o bookkeeping state to get run by reply handler (XDR stream) + * + * These structures are allocated during transport initialization. + * N of these are associated with a transport instance, managed by + * struct rpcrdma_buffer. N is the max number of outstanding RPCs. + */ + +struct rpcrdma_rep { + struct ib_cqe rr_cqe; + __be32 rr_xid; + __be32 rr_vers; + __be32 rr_proc; + int rr_wc_flags; + u32 rr_inv_rkey; + bool rr_temp; + struct rpcrdma_regbuf *rr_rdmabuf; + struct rpcrdma_xprt *rr_rxprt; + struct work_struct rr_work; + struct xdr_buf rr_hdrbuf; + struct xdr_stream rr_stream; + struct rpc_rqst *rr_rqst; + struct list_head rr_list; + struct ib_recv_wr rr_recv_wr; +}; + +/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes + */ +struct rpcrdma_req; +struct rpcrdma_xprt; +struct rpcrdma_sendctx { + struct ib_send_wr sc_wr; + struct ib_cqe sc_cqe; + struct rpcrdma_xprt *sc_xprt; + struct rpcrdma_req *sc_req; + unsigned int sc_unmap_count; + struct ib_sge sc_sges[]; +}; + +/* Limit the number of SGEs that can be unmapped during one + * Send completion. This caps the amount of work a single + * completion can do before returning to the provider. + * + * Setting this to zero disables Send completion batching. + */ +enum { + RPCRDMA_MAX_SEND_BATCH = 7, +}; + +/* + * struct rpcrdma_mr - external memory region metadata + * + * An external memory region is any buffer or page that is registered + * on the fly (ie, not pre-registered). + * + * Each rpcrdma_buffer has a list of free MWs anchored in rb_mrs. During + * call_allocate, rpcrdma_buffer_get() assigns one to each segment in + * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep + * track of registration metadata while each RPC is pending. 
+ * rpcrdma_deregister_external() uses this metadata to unmap and + * release these resources when an RPC is complete. + */ +enum rpcrdma_frwr_state { + FRWR_IS_INVALID, /* ready to be used */ + FRWR_IS_VALID, /* in use */ + FRWR_FLUSHED_FR, /* flushed FASTREG WR */ + FRWR_FLUSHED_LI, /* flushed LOCALINV WR */ +}; + +struct rpcrdma_frwr { + struct ib_mr *fr_mr; + struct ib_cqe fr_cqe; + enum rpcrdma_frwr_state fr_state; + struct completion fr_linv_done; + union { + struct ib_reg_wr fr_regwr; + struct ib_send_wr fr_invwr; + }; +}; + +struct rpcrdma_fmr { + struct ib_fmr *fm_mr; + u64 *fm_physaddrs; +}; + +struct rpcrdma_mr { + struct list_head mr_list; + struct scatterlist *mr_sg; + int mr_nents; + enum dma_data_direction mr_dir; + union { + struct rpcrdma_fmr fmr; + struct rpcrdma_frwr frwr; + }; + struct rpcrdma_xprt *mr_xprt; + u32 mr_handle; + u32 mr_length; + u64 mr_offset; + struct list_head mr_all; +}; + +/* + * struct rpcrdma_req -- structure central to the request/reply sequence. + * + * N of these are associated with a transport instance, and stored in + * struct rpcrdma_buffer. N is the max number of outstanding requests. + * + * It includes pre-registered buffer memory for send AND recv. + * The recv buffer, however, is not owned by this structure, and + * is "donated" to the hardware when a recv is posted. When a + * reply is handled, the recv buffer used is given back to the + * struct rpcrdma_req associated with the request. + * + * In addition to the basic memory, this structure includes an array + * of iovs for send operations. The reason is that the iovs passed to + * ib_post_{send,recv} must not be modified until the work request + * completes. + */ + +/* Maximum number of page-sized "segments" per chunk list to be + * registered or invalidated. Must handle a Reply chunk: + */ +enum { + RPCRDMA_MAX_IOV_SEGS = 3, + RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, + RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + + RPCRDMA_MAX_IOV_SEGS, +}; + +struct rpcrdma_mr_seg { /* chunk descriptors */ + u32 mr_len; /* length of chunk or segment */ + struct page *mr_page; /* owning page, if any */ + char *mr_offset; /* kva if no page, else offset */ +}; + +/* The Send SGE array is provisioned to send a maximum size + * inline request: + * - RPC-over-RDMA header + * - xdr_buf head iovec + * - RPCRDMA_MAX_INLINE bytes, in pages + * - xdr_buf tail iovec + * + * The actual number of array elements consumed by each RPC + * depends on the device's max_sge limit. 
+ */ +enum { + RPCRDMA_MIN_SEND_SGES = 3, + RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT, + RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1, +}; + +struct rpcrdma_buffer; +struct rpcrdma_req { + struct list_head rl_list; + struct rpc_rqst rl_slot; + struct rpcrdma_buffer *rl_buffer; + struct rpcrdma_rep *rl_reply; + struct xdr_stream rl_stream; + struct xdr_buf rl_hdrbuf; + struct rpcrdma_sendctx *rl_sendctx; + struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ + struct rpcrdma_regbuf *rl_sendbuf; /* rq_snd_buf */ + struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */ + + struct list_head rl_all; + unsigned long rl_flags; + + struct list_head rl_registered; /* registered segments */ + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; +}; + +/* rl_flags */ +enum { + RPCRDMA_REQ_F_PENDING = 0, + RPCRDMA_REQ_F_TX_RESOURCES, +}; + +static inline struct rpcrdma_req * +rpcr_to_rdmar(const struct rpc_rqst *rqst) +{ + return container_of(rqst, struct rpcrdma_req, rl_slot); +} + +static inline void +rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list) +{ + list_add_tail(&mr->mr_list, list); +} + +static inline struct rpcrdma_mr * +rpcrdma_mr_pop(struct list_head *list) +{ + struct rpcrdma_mr *mr; + + mr = list_first_entry(list, struct rpcrdma_mr, mr_list); + list_del_init(&mr->mr_list); + return mr; +} + +/* + * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for + * inline requests/replies, and client/server credits. + * + * One of these is associated with a transport instance + */ +struct rpcrdma_buffer { + spinlock_t rb_mrlock; /* protect rb_mrs list */ + struct list_head rb_mrs; + struct list_head rb_all; + + unsigned long rb_sc_head; + unsigned long rb_sc_tail; + unsigned long rb_sc_last; + struct rpcrdma_sendctx **rb_sc_ctxs; + + spinlock_t rb_lock; /* protect buf lists */ + struct list_head rb_send_bufs; + struct list_head rb_recv_bufs; + unsigned long rb_flags; + u32 rb_max_requests; + u32 rb_credits; /* most recent credit grant */ + int rb_posted_receives; + + u32 rb_bc_srv_max_requests; + spinlock_t rb_reqslock; /* protect rb_allreqs */ + struct list_head rb_allreqs; + + u32 rb_bc_max_requests; + + spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ + struct list_head rb_stale_mrs; + struct delayed_work rb_recovery_worker; + struct delayed_work rb_refresh_worker; +}; +#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) + +/* rb_flags */ +enum { + RPCRDMA_BUF_F_EMPTY_SCQ = 0, +}; + +/* + * Internal structure for transport instance creation. This + * exists primarily for modularity. 
+ * + * This data should be set with mount options + */ +struct rpcrdma_create_data_internal { + unsigned int max_requests; /* max requests (slots) in flight */ + unsigned int rsize; /* mount rsize - max read hdr+data */ + unsigned int wsize; /* mount wsize - max write hdr+data */ + unsigned int inline_rsize; /* max non-rdma read data payload */ + unsigned int inline_wsize; /* max non-rdma write data payload */ +}; + +/* + * Statistics for RPCRDMA + */ +struct rpcrdma_stats { + /* accessed when sending a call */ + unsigned long read_chunk_count; + unsigned long write_chunk_count; + unsigned long reply_chunk_count; + unsigned long long total_rdma_request; + + /* rarely accessed error counters */ + unsigned long long pullup_copy_count; + unsigned long hardway_register_count; + unsigned long failed_marshal_count; + unsigned long bad_reply_count; + unsigned long mrs_recovered; + unsigned long mrs_orphaned; + unsigned long mrs_allocated; + unsigned long empty_sendctx_q; + + /* accessed when receiving a reply */ + unsigned long long total_rdma_reply; + unsigned long long fixup_copy_count; + unsigned long reply_waits_for_send; + unsigned long local_inv_needed; + unsigned long nomsg_call_count; + unsigned long bcall_count; +}; + +/* + * Per-registration mode operations + */ +struct rpcrdma_xprt; +struct rpcrdma_memreg_ops { + struct rpcrdma_mr_seg * + (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *, int, bool, + struct rpcrdma_mr **); + int (*ro_send)(struct rpcrdma_ia *ia, + struct rpcrdma_req *req); + void (*ro_reminv)(struct rpcrdma_rep *rep, + struct list_head *mrs); + void (*ro_unmap_sync)(struct rpcrdma_xprt *, + struct list_head *); + void (*ro_recover_mr)(struct rpcrdma_mr *mr); + int (*ro_open)(struct rpcrdma_ia *, + struct rpcrdma_ep *, + struct rpcrdma_create_data_internal *); + size_t (*ro_maxpages)(struct rpcrdma_xprt *); + int (*ro_init_mr)(struct rpcrdma_ia *, + struct rpcrdma_mr *); + void (*ro_release_mr)(struct rpcrdma_mr *mr); + const char *ro_displayname; + const int ro_send_w_inv_ok; +}; + +extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; + +/* + * RPCRDMA transport -- encapsulates the structures above for + * integration with RPC. + * + * The contained structures are embedded, not pointers, + * for convenience. This structure need not be visible externally. + * + * It is allocated and initialized during mount, and released + * during unmount. + */ +struct rpcrdma_xprt { + struct rpc_xprt rx_xprt; + struct rpcrdma_ia rx_ia; + struct rpcrdma_ep rx_ep; + struct rpcrdma_buffer rx_buf; + struct rpcrdma_create_data_internal rx_data; + struct delayed_work rx_connect_worker; + struct rpcrdma_stats rx_stats; +}; + +#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt) +#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data) + +static inline const char * +rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt) +{ + return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]; +} + +static inline const char * +rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt) +{ + return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_PORT]; +} + +/* Setting this to 0 ensures interoperability with early servers. + * Setting this to 1 enhances certain unaligned read/write performance. + * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ +extern int xprt_rdma_pad_optimize; + +/* This setting controls the hunt for a supported memory + * registration strategy. 
+ */ +extern unsigned int xprt_rdma_memreg_strategy; + +/* + * Interface Adapter calls - xprtrdma/verbs.c + */ +int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); +void rpcrdma_ia_remove(struct rpcrdma_ia *ia); +void rpcrdma_ia_close(struct rpcrdma_ia *); +bool frwr_is_supported(struct rpcrdma_ia *); +bool fmr_is_supported(struct rpcrdma_ia *); + +extern struct workqueue_struct *rpcrdma_receive_wq; + +/* + * Endpoint calls - xprtrdma/verbs.c + */ +int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *); +void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); +int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_conn_func(struct rpcrdma_ep *ep); +void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); + +int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, + struct rpcrdma_req *); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); + +/* + * Buffer calls - xprtrdma/verbs.c + */ +struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); +void rpcrdma_destroy_req(struct rpcrdma_req *); +int rpcrdma_buffer_create(struct rpcrdma_xprt *); +void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); +struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); + +struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); +void rpcrdma_mr_put(struct rpcrdma_mr *mr); +void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); +void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + +struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); +void rpcrdma_buffer_put(struct rpcrdma_req *); +void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); + +struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, + gfp_t); +bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); +void rpcrdma_free_regbuf(struct rpcrdma_regbuf *); + +static inline bool +rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb) +{ + return rb->rg_device != NULL; +} + +static inline bool +rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) +{ + if (likely(rpcrdma_regbuf_is_mapped(rb))) + return true; + return __rpcrdma_dma_map_regbuf(ia, rb); +} + +int rpcrdma_alloc_wq(void); +void rpcrdma_destroy_wq(void); + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +static inline enum dma_data_direction +rpcrdma_data_dir(bool writing) +{ + return writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE;
+}
+
+/*
+ * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
+ */
+
+enum rpcrdma_chunktype {
+ rpcrdma_noch = 0,
+ rpcrdma_readch,
+ rpcrdma_areadch,
+ rpcrdma_writech,
+ rpcrdma_replych
+};
+
+int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req, u32 hdrlen,
+ struct xdr_buf *xdr,
+ enum rpcrdma_chunktype rtype);
+void rpcrdma_unmap_sendctx(struct rpcrdma_sendctx *sc);
+int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
+void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
+void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_req *req);
+void rpcrdma_deferred_completion(struct work_struct *work);
+
+static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
+{
+ xdr->head[0].iov_len = len;
+ xdr->len = len;
+}
+
+/* RPC/RDMA module init - xprtrdma/transport.c
+ */
+extern unsigned int xprt_rdma_max_inline_read;
+void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
+void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
+void rpcrdma_connect_worker(struct work_struct *work);
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
+int xprt_rdma_init(void);
+void xprt_rdma_cleanup(void);
+
+/* Backchannel calls - xprtrdma/backchannel.c
+ */
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
+int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
+int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
+void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
+int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst);
+void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
+void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+extern struct xprt_class xprt_rdma_bc;
+
+#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */
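
The verbs.c hunks above describe the sendctx queue as a wait-free, fixed-size circular FIFO whose size is not necessarily a power of two: the consumer (->send_request) advances rb_sc_head, the producer (Send completion) advances rb_sc_tail, and rpcrdma_sendctx_next() wraps by comparison rather than masking. The standalone sketch below models only that indexing scheme in plain userspace C; the names (struct ring, ring_get, ring_put, RING_SLOTS) are illustrative, and the kernel's transport lock, READ_ONCE, and smp_store_release are noted only in comments.

/* Userspace model of the sendctx circular FIFO; not kernel code and
 * not part of this patch. Single-threaded for clarity.
 */
#include <stdio.h>
#include <stdlib.h>

#define RING_SLOTS 5 /* arbitrary; the transport sizes this from rb_max_requests */

struct ring {
 unsigned long head; /* last slot handed to a sender */
 unsigned long tail; /* last slot returned by a completion */
 unsigned long last; /* highest valid index (RING_SLOTS - 1) */
 void *ctxs[RING_SLOTS];
};

/* Mirrors rpcrdma_sendctx_next(): wrap without %, and without
 * requiring a power-of-two queue size.
 */
static unsigned long ring_next(struct ring *r, unsigned long item)
{
 return (item < r->last) ? item + 1 : 0;
}

/* Consumer side: returns NULL when the queue is "empty", i.e. the
 * next head would collide with the tail (no completed slots free).
 */
static void *ring_get(struct ring *r)
{
 unsigned long next_head = ring_next(r, r->head);

 if (next_head == r->tail)
  return NULL; /* Send Queue is backing up; caller pauses and retries */
 r->head = next_head; /* kernel publishes this under the transport lock */
 return r->ctxs[next_head];
}

/* Producer side: a Send completion returns every slot up to and
 * including @upto in one pass.
 */
static void ring_put(struct ring *r, unsigned long upto)
{
 unsigned long next_tail = r->tail;

 do {
  next_tail = ring_next(r, next_tail);
  /* the kernel unmaps each walked slot's SGEs here */
 } while (next_tail != upto);
 r->tail = next_tail; /* kernel uses smp_store_release() here */
}

int main(void)
{
 struct ring r = { .head = 0, .tail = 0, .last = RING_SLOTS - 1 };
 unsigned long i;

 for (i = 0; i < RING_SLOTS; i++)
  r.ctxs[i] = malloc(16);

 while (ring_get(&r))
  printf("acquired a send context\n");
 printf("queue reported empty\n");

 ring_put(&r, 2); /* pretend the Sends using slots 1..2 completed */
 printf("after completion: %s\n",
        ring_get(&r) ? "acquired again" : "still empty");

 for (i = 0; i < RING_SLOTS; i++)
  free(r.ctxs[i]);
 return 0;
}

With this scheme one slot always stays unused, which is how the next_head == tail test distinguishes "no free contexts" from the initial state without a separate element counter.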
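
rpcrdma_ep_post() above requests a Send completion only on every rep_send_batch-th Send, or when a Send must release TX resources; the unsignaled Sends in between are cleaned up in a batch when rpcrdma_sendctx_put_locked() walks the ring. A minimal userspace model of that counter policy, with illustrative names (ep_model, post_send) that are not kernel API, might look like this:

/* Model of the Send-signaling policy, assuming a batch of 7 as in
 * RPCRDMA_MAX_SEND_BATCH. Not part of this patch.
 */
#include <stdbool.h>
#include <stdio.h>

struct ep_model {
 unsigned int send_count; /* Sends left before the next signaled one */
 unsigned int send_batch; /* signaling interval */
};

static bool post_send(struct ep_model *ep, bool needs_resources)
{
 if (!ep->send_count || needs_resources) {
  ep->send_count = ep->send_batch;
  return true; /* the kernel sets IB_SEND_SIGNALED here */
 }
 --ep->send_count;
 return false; /* unsignaled Send */
}

int main(void)
{
 struct ep_model ep = { .send_count = 7, .send_batch = 7 };
 unsigned int i;

 for (i = 0; i < 16; i++)
  printf("send %2u: %s\n", i,
         post_send(&ep, false) ? "signaled" : "unsignaled");
 return 0;
}

This bounds completion traffic to roughly one Send completion per batch while still letting the sendctx ring advance, since each signaled completion releases all of the unsignaled contexts behind it.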
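
rpcrdma_post_recvs() above keeps enough Receives posted to cover the current credit grant plus two per expected backchannel request, and posts only the shortfall. The arithmetic can be checked in isolation; recvs_to_post() and its parameter names are illustrative, not kernel API:

/* Model of the Receive top-up calculation in rpcrdma_post_recvs();
 * not part of this patch.
 */
#include <stdio.h>

static unsigned int recvs_to_post(unsigned int credits,
                                  unsigned int bc_max_requests,
                                  unsigned int already_posted)
{
 unsigned int needed = credits + (bc_max_requests << 1);

 if (already_posted > needed)
  return 0; /* enough Receives are already outstanding */
 return needed - already_posted;
}

int main(void)
{
 /* e.g. a credit grant of 32, 2 backchannel slots, 20 already posted */
 printf("post %u more Receives\n", recvs_to_post(32, 2, 20));
 return 0;
}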