Adding upstream version 5.10.209.upstream/5.10.209 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 10:05:51 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 10:05:51 +0000
commit: 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch)
tree: a94efe259b9009378be6d90eb30d2b019d95c194 /net/sunrpc/xprtrdma
parent: Initial commit. (diff)
download: linux-upstream.tar.xz
linux-upstream.zip
14 files changed, 9290 insertions, 0 deletions
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
new file mode 100644
index 000000000..8ed0377d7
--- /dev/null
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
+
+rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \
+	svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
+	svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \
+	module.o
+rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
new file mode 100644
index 000000000..c92c1aac2
--- /dev/null
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015 Oracle.  All rights reserved.
+ *
+ * Support for backward direction RPCs on RPC/RDMA.
+ */
+
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+#undef RPCRDMA_BACKCHANNEL_DEBUG
+
+/**
+ * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests
+ * @xprt: transport associated with these backchannel resources
+ * @reqs: number of concurrent incoming requests to expect
+ *
+ * Returns 0 on success; otherwise a negative errno
+ */
+int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	r_xprt->rx_buf.rb_bc_srv_max_requests = RPCRDMA_BACKWARD_WRS >> 1;
+	trace_xprtrdma_cb_setup(r_xprt, reqs);
+	return 0;
+}
+
+/**
+ * xprt_rdma_bc_maxpayload - Return maximum backchannel message size
+ * @xprt: transport
+ *
+ * Returns maximum size, in bytes, of a backchannel message
+ */
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	size_t maxmsg;
+
+	maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv);
+	maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
+	return maxmsg - RPCRDMA_HDRLEN_MIN;
+}
+
+unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt)
+{
+	return RPCRDMA_BACKWARD_WRS >> 1;
+}
+
+static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	__be32 *p;
+
+	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
+	xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf,
+			rdmab_data(req->rl_rdmabuf), rqst);
+
+	p = xdr_reserve_space(&req->rl_stream, 28);
+	if (unlikely(!p))
+		return -EIO;
+	*p++ = rqst->rq_xid;
+	*p++ = rpcrdma_version;
+	*p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests);
+	*p++ = rdma_msg;
+	*p++ = xdr_zero;
+	*p++ = xdr_zero;
+	*p = xdr_zero;
+
+	if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN,
+				      &rqst->rq_snd_buf, rpcrdma_noch_pullup))
+		return -EIO;
+
+	trace_xprtrdma_cb_reply(rqst);
+	return 0;
+}
+
+/**
+ * xprt_rdma_bc_send_reply - marshal and send a backchannel reply
+ * @rqst: RPC rqst with a backchannel RPC reply in rq_snd_buf
+ *
+ * Caller holds the transport's write lock.
+ *
+ * Returns:
+ *	%0 if the RPC message has been sent
+ *	%-ENOTCONN if the caller should reconnect and call again
+ *	%-EIO if a permanent error occurred and the request was not
+ *		sent. Do not try to send this message again.
+ */
+int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
+{
+	struct rpc_xprt *xprt = rqst->rq_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	int rc;
+
+	if (!xprt_connected(xprt))
+		return -ENOTCONN;
+
+	if (!xprt_request_get_cong(xprt, rqst))
+		return -EBADSLT;
+
+	rc = rpcrdma_bc_marshal_reply(rqst);
+	if (rc < 0)
+		goto failed_marshal;
+
+	if (rpcrdma_post_sends(r_xprt, req))
+		goto drop_connection;
+	return 0;
+
+failed_marshal:
+	if (rc != -ENOTCONN)
+		return rc;
+drop_connection:
+	xprt_rdma_close(xprt);
+	return -ENOTCONN;
+}
+
+/**
+ * xprt_rdma_bc_destroy - Release resources for handling backchannel requests
+ * @xprt: transport associated with these backchannel resources
+ * @reqs: number of incoming requests to destroy; ignored
+ */
+void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs)
+{
+	struct rpc_rqst *rqst, *tmp;
+
+	spin_lock(&xprt->bc_pa_lock);
+	list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
+		list_del(&rqst->rq_bc_pa_list);
+		spin_unlock(&xprt->bc_pa_lock);
+
+		rpcrdma_req_destroy(rpcr_to_rdmar(rqst));
+
+		spin_lock(&xprt->bc_pa_lock);
+	}
+	spin_unlock(&xprt->bc_pa_lock);
+}
+
+/**
+ * xprt_rdma_bc_free_rqst - Release a backchannel rqst
+ * @rqst: request to release
+ */
+void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
+{
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpc_xprt *xprt = rqst->rq_xprt;
+
+	rpcrdma_recv_buffer_put(req->rl_reply);
+	req->rl_reply = NULL;
+
+	spin_lock(&xprt->bc_pa_lock);
+	list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
+	spin_unlock(&xprt->bc_pa_lock);
+	xprt_put(xprt);
+}
+
+static struct rpc_rqst *rpcrdma_bc_rqst_get(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	struct rpcrdma_req *req;
+	struct rpc_rqst *rqst;
+	size_t size;
+
+	spin_lock(&xprt->bc_pa_lock);
+	rqst = list_first_entry_or_null(&xprt->bc_pa_list, struct rpc_rqst,
+					rq_bc_pa_list);
+	if (!rqst)
+		goto create_req;
+	list_del(&rqst->rq_bc_pa_list);
+	spin_unlock(&xprt->bc_pa_lock);
+	return rqst;
+
+create_req:
+	spin_unlock(&xprt->bc_pa_lock);
+
+	/* Set a limit to prevent a remote from overrunning our resources.
+	 */
+	if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS)
+		return NULL;
+
+	size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE);
+	req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL);
+	if (!req)
+		return NULL;
+	if (rpcrdma_req_setup(r_xprt, req)) {
+		rpcrdma_req_destroy(req);
+		return NULL;
+	}
+
+	xprt->bc_alloc_count++;
+	rqst = &req->rl_slot;
+	rqst->rq_xprt = xprt;
+	__set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
+	xdr_buf_init(&rqst->rq_snd_buf, rdmab_data(req->rl_sendbuf), size);
+	return rqst;
+}
+
+/**
+ * rpcrdma_bc_receive_call - Handle a backward direction call
+ * @r_xprt: transport receiving the call
+ * @rep: receive buffer containing the call
+ *
+ * Operational assumptions:
+ *    o Backchannel credits are ignored, just as the NFS server
+ *      forechannel currently does
+ *    o The ULP manages a replay cache (eg, NFSv4.1 sessions).
+ *      No replay detection is done at the transport level
+ */
+void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
+			     struct rpcrdma_rep *rep)
+{
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	struct svc_serv *bc_serv;
+	struct rpcrdma_req *req;
+	struct rpc_rqst *rqst;
+	struct xdr_buf *buf;
+	size_t size;
+	__be32 *p;
+
+	p = xdr_inline_decode(&rep->rr_stream, 0);
+	size = xdr_stream_remaining(&rep->rr_stream);
+
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
+	pr_info("RPC:       %s: callback XID %08x, length=%u\n",
+		__func__, be32_to_cpup(p), size);
+	pr_info("RPC:       %s: %*ph\n", __func__, size, p);
+#endif
+
+	rqst = rpcrdma_bc_rqst_get(r_xprt);
+	if (!rqst)
+		goto out_overflow;
+
+	rqst->rq_reply_bytes_recvd = 0;
+	rqst->rq_xid = *p;
+
+	rqst->rq_private_buf.len = size;
+
+	buf = &rqst->rq_rcv_buf;
+	memset(buf, 0, sizeof(*buf));
+	buf->head[0].iov_base = p;
+	buf->head[0].iov_len = size;
+	buf->len = size;
+
+	/* The receive buffer has to be hooked to the rpcrdma_req
+	 * so that it is not released while the req is pointing
+	 * to its buffer, and so that it can be reposted after
+	 * the Upper Layer is done decoding it.
+	 */
+	req = rpcr_to_rdmar(rqst);
+	req->rl_reply = rep;
+	trace_xprtrdma_cb_call(rqst);
+
+	/* Queue rqst for ULP's callback service */
+	bc_serv = xprt->bc_serv;
+	xprt_get(xprt);
+	spin_lock(&bc_serv->sv_cb_lock);
+	list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
+	spin_unlock(&bc_serv->sv_cb_lock);
+
+	wake_up(&bc_serv->sv_cb_waitq);
+
+	r_xprt->rx_stats.bcall_count++;
+	return;
+
+out_overflow:
+	pr_warn("RPC/RDMA backchannel overflow\n");
+	xprt_force_disconnect(xprt);
+	/* This receive buffer gets reposted automatically
+	 * when the connection is re-established.
+	 */
+	return;
+}
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
new file mode 100644
index 000000000..bf3627dce
--- /dev/null
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -0,0 +1,662 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015, 2017 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ */
+
+/* Lightweight memory registration using Fast Registration Work
+ * Requests (FRWR).
+ *
+ * FRWR features ordered asynchronous registration and invalidation
+ * of arbitrarily-sized memory regions. This is the fastest and safest
+ * but most complex memory registration mode.
+ */
+
+/* Normal operation
+ *
+ * A Memory Region is prepared for RDMA Read or Write using a FAST_REG
+ * Work Request (frwr_map). When the RDMA operation is finished, this
+ * Memory Region is invalidated using a LOCAL_INV Work Request
+ * (frwr_unmap_async and frwr_unmap_sync).
+ *
+ * Typically FAST_REG Work Requests are not signaled, and neither are
+ * RDMA Send Work Requests (with the exception of signaling occasionally
+ * to prevent provider work queue overflows). This greatly reduces HCA
+ * interrupt workload.
+ */
+
+/* Transport recovery
+ *
+ * frwr_map and frwr_unmap_* cannot run at the same time the transport
+ * connect worker is running. The connect worker holds the transport
+ * send lock, just as ->send_request does. This prevents frwr_map and
+ * the connect worker from running concurrently. When a connection is
+ * closed, the Receive completion queue is drained before the allowing
+ * the connect worker to get control. This prevents frwr_unmap and the
+ * connect worker from running concurrently.
+ *
+ * When the underlying transport disconnects, MRs that are in flight
+ * are flushed and are likely unusable. Thus all MRs are destroyed.
+ * New MRs are created on demand.
+ */
+
+#include <linux/sunrpc/svc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+/**
+ * frwr_release_mr - Destroy one MR
+ * @mr: MR allocated by frwr_mr_init
+ *
+ */
+void frwr_release_mr(struct rpcrdma_mr *mr)
+{
+	int rc;
+
+	rc = ib_dereg_mr(mr->frwr.fr_mr);
+	if (rc)
+		trace_xprtrdma_frwr_dereg(mr, rc);
+	kfree(mr->mr_sg);
+	kfree(mr);
+}
+
+static void frwr_mr_recycle(struct rpcrdma_mr *mr)
+{
+	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
+
+	trace_xprtrdma_mr_recycle(mr);
+
+	if (mr->mr_dir != DMA_NONE) {
+		trace_xprtrdma_mr_unmap(mr);
+		ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
+				mr->mr_sg, mr->mr_nents, mr->mr_dir);
+		mr->mr_dir = DMA_NONE;
+	}
+
+	spin_lock(&r_xprt->rx_buf.rb_lock);
+	list_del(&mr->mr_all);
+	r_xprt->rx_stats.mrs_recycled++;
+	spin_unlock(&r_xprt->rx_buf.rb_lock);
+
+	frwr_release_mr(mr);
+}
+
+/* frwr_reset - Place MRs back on the free list
+ * @req: request to reset
+ *
+ * Used after a failed marshal. For FRWR, this means the MRs
+ * don't have to be fully released and recreated.
+ *
+ * NB: This is safe only as long as none of @req's MRs are
+ * involved with an ongoing asynchronous FAST_REG or LOCAL_INV
+ * Work Request.
+ */
+void frwr_reset(struct rpcrdma_req *req)
+{
+	struct rpcrdma_mr *mr;
+
+	while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
+		rpcrdma_mr_put(mr);
+}
+
+/**
+ * frwr_mr_init - Initialize one MR
+ * @r_xprt: controlling transport instance
+ * @mr: generic MR to prepare for FRWR
+ *
+ * Returns zero if successful. Otherwise a negative errno
+ * is returned.
+ */
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
+{
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	unsigned int depth = ep->re_max_fr_depth;
+	struct scatterlist *sg;
+	struct ib_mr *frmr;
+	int rc;
+
+	frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth);
+	if (IS_ERR(frmr))
+		goto out_mr_err;
+
+	sg = kmalloc_array(depth, sizeof(*sg), GFP_NOFS);
+	if (!sg)
+		goto out_list_err;
+
+	mr->mr_xprt = r_xprt;
+	mr->frwr.fr_mr = frmr;
+	mr->mr_dir = DMA_NONE;
+	INIT_LIST_HEAD(&mr->mr_list);
+	init_completion(&mr->frwr.fr_linv_done);
+
+	sg_init_table(sg, depth);
+	mr->mr_sg = sg;
+	return 0;
+
+out_mr_err:
+	rc = PTR_ERR(frmr);
+	trace_xprtrdma_frwr_alloc(mr, rc);
+	return rc;
+
+out_list_err:
+	ib_dereg_mr(frmr);
+	return -ENOMEM;
+}
+
+/**
+ * frwr_query_device - Prepare a transport for use with FRWR
+ * @ep: endpoint to fill in
+ * @device: RDMA device to query
+ *
+ * On success, sets:
+ *	ep->re_attr
+ *	ep->re_max_requests
+ *	ep->re_max_rdma_segs
+ *	ep->re_max_fr_depth
+ *	ep->re_mrtype
+ *
+ * Return values:
+ *   On success, returns zero.
+ *   %-EINVAL - the device does not support FRWR memory registration
+ *   %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
+ */
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
+{
+	const struct ib_device_attr *attrs = &device->attrs;
+	int max_qp_wr, depth, delta;
+	unsigned int max_sge;
+
+	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
+	    attrs->max_fast_reg_page_list_len == 0) {
+		pr_err("rpcrdma: 'frwr' mode is not supported by device %s\n",
+		       device->name);
+		return -EINVAL;
+	}
+
+	max_sge = min_t(unsigned int, attrs->max_send_sge,
+			RPCRDMA_MAX_SEND_SGES);
+	if (max_sge < RPCRDMA_MIN_SEND_SGES) {
+		pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
+		return -ENOMEM;
+	}
+	ep->re_attr.cap.max_send_sge = max_sge;
+	ep->re_attr.cap.max_recv_sge = 1;
+
+	ep->re_mrtype = IB_MR_TYPE_MEM_REG;
+	if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
+		ep->re_mrtype = IB_MR_TYPE_SG_GAPS;
+
+	/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
+	 * capability, but perform optimally when the MRs are not larger
+	 * than a page.
+	 */
+	if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
+		ep->re_max_fr_depth = attrs->max_sge_rd;
+	else
+		ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len;
+	if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS)
+		ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS;
+
+	/* Add room for frwr register and invalidate WRs.
+	 * 1. FRWR reg WR for head
+	 * 2. FRWR invalidate WR for head
+	 * 3. N FRWR reg WRs for pagelist
+	 * 4. N FRWR invalidate WRs for pagelist
+	 * 5. FRWR reg WR for tail
+	 * 6. FRWR invalidate WR for tail
+	 * 7. The RDMA_SEND WR
+	 */
+	depth = 7;
+
+	/* Calculate N if the device max FRWR depth is smaller than
+	 * RPCRDMA_MAX_DATA_SEGS.
+	 */
+	if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) {
+		delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth;
+		do {
+			depth += 2; /* FRWR reg + invalidate */
+			delta -= ep->re_max_fr_depth;
+		} while (delta > 0);
+	}
+
+	max_qp_wr = attrs->max_qp_wr;
+	max_qp_wr -= RPCRDMA_BACKWARD_WRS;
+	max_qp_wr -= 1;
+	if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
+		return -ENOMEM;
+	if (ep->re_max_requests > max_qp_wr)
+		ep->re_max_requests = max_qp_wr;
+	ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
+	if (ep->re_attr.cap.max_send_wr > max_qp_wr) {
+		ep->re_max_requests = max_qp_wr / depth;
+		if (!ep->re_max_requests)
+			return -ENOMEM;
+		ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
+	}
+	ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+	ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
+	ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
+	ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+	ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH;
+	ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
+
+	ep->re_max_rdma_segs =
+		DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth);
+	/* Reply chunks require segments for head and tail buffers */
+	ep->re_max_rdma_segs += 2;
+	if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
+		ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
+
+	/* Ensure the underlying device is capable of conveying the
+	 * largest r/wsize NFS will ask for. This guarantees that
+	 * failing over from one RDMA device to another will not
+	 * break NFS I/O.
+	 */
+	if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * frwr_map - Register a memory region
+ * @r_xprt: controlling transport
+ * @seg: memory region co-ordinates
+ * @nsegs: number of segments remaining
+ * @writing: true when RDMA Write will be used
+ * @xid: XID of RPC using the registered memory
+ * @mr: MR to fill in
+ *
+ * Prepare a REG_MR Work Request to register a memory region
+ * for remote access via RDMA READ or RDMA WRITE.
+ *
+ * Returns the next segment or a negative errno pointer.
+ * On success, @mr is filled in.
+ */
+struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
+				struct rpcrdma_mr_seg *seg,
+				int nsegs, bool writing, __be32 xid,
+				struct rpcrdma_mr *mr)
+{
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	struct ib_reg_wr *reg_wr;
+	int i, n, dma_nents;
+	struct ib_mr *ibmr;
+	u8 key;
+
+	if (nsegs > ep->re_max_fr_depth)
+		nsegs = ep->re_max_fr_depth;
+	for (i = 0; i < nsegs;) {
+		if (seg->mr_page)
+			sg_set_page(&mr->mr_sg[i],
+				    seg->mr_page,
+				    seg->mr_len,
+				    offset_in_page(seg->mr_offset));
+		else
+			sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
+				   seg->mr_len);
+
+		++seg;
+		++i;
+		if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
+			continue;
+		if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
+		    offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
+			break;
+	}
+	mr->mr_dir = rpcrdma_data_dir(writing);
+	mr->mr_nents = i;
+
+	dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents,
+				  mr->mr_dir);
+	if (!dma_nents)
+		goto out_dmamap_err;
+
+	ibmr = mr->frwr.fr_mr;
+	n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
+	if (n != dma_nents)
+		goto out_mapmr_err;
+
+	ibmr->iova &= 0x00000000ffffffff;
+	ibmr->iova |= ((u64)be32_to_cpu(xid)) << 32;
+	key = (u8)(ibmr->rkey & 0x000000FF);
+	ib_update_fast_reg_key(ibmr, ++key);
+
+	reg_wr = &mr->frwr.fr_regwr;
+	reg_wr->mr = ibmr;
+	reg_wr->key = ibmr->rkey;
+	reg_wr->access = writing ?
+			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+			 IB_ACCESS_REMOTE_READ;
+
+	mr->mr_handle = ibmr->rkey;
+	mr->mr_length = ibmr->length;
+	mr->mr_offset = ibmr->iova;
+	trace_xprtrdma_mr_map(mr);
+
+	return seg;
+
+out_dmamap_err:
+	mr->mr_dir = DMA_NONE;
+	trace_xprtrdma_frwr_sgerr(mr, i);
+	return ERR_PTR(-EIO);
+
+out_mapmr_err:
+	trace_xprtrdma_frwr_maperr(mr, n);
+	return ERR_PTR(-EIO);
+}
+
+/**
+ * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
+ * @cq: completion queue
+ * @wc: WCE for a completed FastReg WR
+ *
+ */
+static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr =
+		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+
+	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_fastreg(wc, frwr);
+	/* The MR will get recycled when the associated req is retransmitted */
+
+	rpcrdma_flush_disconnect(cq->cq_context, wc);
+}
+
+/**
+ * frwr_send - post Send WRs containing the RPC Call message
+ * @r_xprt: controlling transport instance
+ * @req: prepared RPC Call
+ *
+ * For FRWR, chain any FastReg WRs to the Send WR. Only a
+ * single ib_post_send call is needed to register memory
+ * and then post the Send WR.
+ *
+ * Returns the return code from ib_post_send.
+ *
+ * Caller must hold the transport send lock to ensure that the
+ * pointers to the transport's rdma_cm_id and QP are stable.
+ */
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	struct ib_send_wr *post_wr;
+	struct rpcrdma_mr *mr;
+
+	post_wr = &req->rl_wr;
+	list_for_each_entry(mr, &req->rl_registered, mr_list) {
+		struct rpcrdma_frwr *frwr;
+
+		frwr = &mr->frwr;
+
+		frwr->fr_cqe.done = frwr_wc_fastreg;
+		frwr->fr_regwr.wr.next = post_wr;
+		frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
+		frwr->fr_regwr.wr.num_sge = 0;
+		frwr->fr_regwr.wr.opcode = IB_WR_REG_MR;
+		frwr->fr_regwr.wr.send_flags = 0;
+
+		post_wr = &frwr->fr_regwr.wr;
+	}
+
+	return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL);
+}
+
+/**
+ * frwr_reminv - handle a remotely invalidated mr on the @mrs list
+ * @rep: Received reply
+ * @mrs: list of MRs to check
+ *
+ */
+void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
+{
+	struct rpcrdma_mr *mr;
+
+	list_for_each_entry(mr, mrs, mr_list)
+		if (mr->mr_handle == rep->rr_inv_rkey) {
+			list_del_init(&mr->mr_list);
+			trace_xprtrdma_mr_reminv(mr);
+			rpcrdma_mr_put(mr);
+			break;	/* only one invalidated MR per RPC */
+		}
+}
+
+static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
+{
+	if (wc->status != IB_WC_SUCCESS)
+		frwr_mr_recycle(mr);
+	else
+		rpcrdma_mr_put(mr);
+}
+
+/**
+ * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
+ *
+ */
+static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr =
+		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+
+	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_li(wc, frwr);
+	__frwr_release_mr(wc, mr);
+
+	rpcrdma_flush_disconnect(cq->cq_context, wc);
+}
+
+/**
+ * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
+ *
+ * Awaken anyone waiting for an MR to finish being fenced.
+ */
+static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr =
+		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+
+	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_li_wake(wc, frwr);
+	__frwr_release_mr(wc, mr);
+	complete(&frwr->fr_linv_done);
+
+	rpcrdma_flush_disconnect(cq->cq_context, wc);
+}
+
+/**
+ * frwr_unmap_sync - invalidate memory regions that were registered for @req
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req with a non-empty list of MRs to process
+ *
+ * Sleeps until it is safe for the host CPU to access the previously mapped
+ * memory regions. This guarantees that registered MRs are properly fenced
+ * from the server before the RPC consumer accesses the data in them. It
+ * also ensures proper Send flow control: waking the next RPC waits until
+ * this RPC has relinquished all its Send Queue entries.
+ */
+void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	struct ib_send_wr *first, **prev, *last;
+	const struct ib_send_wr *bad_wr;
+	struct rpcrdma_frwr *frwr;
+	struct rpcrdma_mr *mr;
+	int rc;
+
+	/* ORDER: Invalidate all of the MRs first
+	 *
+	 * Chain the LOCAL_INV Work Requests and post them with
+	 * a single ib_post_send() call.
+	 */
+	frwr = NULL;
+	prev = &first;
+	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
+
+		trace_xprtrdma_mr_localinv(mr);
+		r_xprt->rx_stats.local_inv_needed++;
+
+		frwr = &mr->frwr;
+		frwr->fr_cqe.done = frwr_wc_localinv;
+		last = &frwr->fr_invwr;
+		last->next = NULL;
+		last->wr_cqe = &frwr->fr_cqe;
+		last->sg_list = NULL;
+		last->num_sge = 0;
+		last->opcode = IB_WR_LOCAL_INV;
+		last->send_flags = IB_SEND_SIGNALED;
+		last->ex.invalidate_rkey = mr->mr_handle;
+
+		*prev = last;
+		prev = &last->next;
+	}
+
+	/* Strong send queue ordering guarantees that when the
+	 * last WR in the chain completes, all WRs in the chain
+	 * are complete.
+	 */
+	frwr->fr_cqe.done = frwr_wc_localinv_wake;
+	reinit_completion(&frwr->fr_linv_done);
+
+	/* Transport disconnect drains the receive CQ before it
+	 * replaces the QP. The RPC reply handler won't call us
+	 * unless re_id->qp is a valid pointer.
+	 */
+	bad_wr = NULL;
+	rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
+
+	/* The final LOCAL_INV WR in the chain is supposed to
+	 * do the wake. If it was never posted, the wake will
+	 * not happen, so don't wait in that case.
+	 */
+	if (bad_wr != first)
+		wait_for_completion(&frwr->fr_linv_done);
+	if (!rc)
+		return;
+
+	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
+	 */
+	trace_xprtrdma_post_linv(req, rc);
+	while (bad_wr) {
+		frwr = container_of(bad_wr, struct rpcrdma_frwr,
+				    fr_invwr);
+		mr = container_of(frwr, struct rpcrdma_mr, frwr);
+		bad_wr = bad_wr->next;
+
+		frwr_mr_recycle(mr);
+	}
+}
+
+/**
+ * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
+ * @cq:	completion queue
+ * @wc:	WCE for a completed LocalInv WR
+ *
+ */
+static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_frwr *frwr =
+		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+	struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
+
+	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_li_done(wc, frwr);
+	__frwr_release_mr(wc, mr);
+
+	/* Ensure @rep is generated before __frwr_release_mr */
+	smp_rmb();
+	rpcrdma_complete_rqst(rep);
+
+	rpcrdma_flush_disconnect(cq->cq_context, wc);
+}
+
+/**
+ * frwr_unmap_async - invalidate memory regions that were registered for @req
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req with a non-empty list of MRs to process
+ *
+ * This guarantees that registered MRs are properly fenced from the
+ * server before the RPC consumer accesses the data in them. It also
+ * ensures proper Send flow control: waking the next RPC waits until
+ * this RPC has relinquished all its Send Queue entries.
+ */
+void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	struct ib_send_wr *first, *last, **prev;
+	const struct ib_send_wr *bad_wr;
+	struct rpcrdma_frwr *frwr;
+	struct rpcrdma_mr *mr;
+	int rc;
+
+	/* Chain the LOCAL_INV Work Requests and post them with
+	 * a single ib_post_send() call.
+	 */
+	frwr = NULL;
+	prev = &first;
+	while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
+
+		trace_xprtrdma_mr_localinv(mr);
+		r_xprt->rx_stats.local_inv_needed++;
+
+		frwr = &mr->frwr;
+		frwr->fr_cqe.done = frwr_wc_localinv;
+		last = &frwr->fr_invwr;
+		last->next = NULL;
+		last->wr_cqe = &frwr->fr_cqe;
+		last->sg_list = NULL;
+		last->num_sge = 0;
+		last->opcode = IB_WR_LOCAL_INV;
+		last->send_flags = IB_SEND_SIGNALED;
+		last->ex.invalidate_rkey = mr->mr_handle;
+
+		*prev = last;
+		prev = &last->next;
+	}
+
+	/* Strong send queue ordering guarantees that when the
+	 * last WR in the chain completes, all WRs in the chain
+	 * are complete. The last completion will wake up the
+	 * RPC waiter.
+	 */
+	frwr->fr_cqe.done = frwr_wc_localinv_done;
+
+	/* Transport disconnect drains the receive CQ before it
+	 * replaces the QP. The RPC reply handler won't call us
+	 * unless re_id->qp is a valid pointer.
+	 */
+	bad_wr = NULL;
+	rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
+	if (!rc)
+		return;
+
+	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
+	 */
+	trace_xprtrdma_post_linv(req, rc);
+	while (bad_wr) {
+		frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
+		mr = container_of(frwr, struct rpcrdma_mr, frwr);
+		bad_wr = bad_wr->next;
+
+		frwr_mr_recycle(mr);
+	}
+
+	/* The final LOCAL_INV WR in the chain is supposed to
+	 * do the wake. If it was never posted, the wake will
+	 * not happen, so wake here in that case.
+	 */
+	rpcrdma_complete_rqst(req->rl_reply);
+}
diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c
new file mode 100644
index 000000000..45c5b41ac
--- /dev/null
+++ b/net/sunrpc/xprtrdma/module.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2015, 2017 Oracle.  All rights reserved.
+ */
+
+/* rpcrdma.ko module initialization
+ */
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#include <asm/swab.h>
+
+#include "xprt_rdma.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rpcrdma.h>
+
+MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc.");
+MODULE_DESCRIPTION("RPC/RDMA Transport");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("svcrdma");
+MODULE_ALIAS("xprtrdma");
+MODULE_ALIAS("rpcrdma6");
+
+static void __exit rpc_rdma_cleanup(void)
+{
+	xprt_rdma_cleanup();
+	svc_rdma_cleanup();
+}
+
+static int __init rpc_rdma_init(void)
+{
+	int rc;
+
+	rc = svc_rdma_init();
+	if (rc)
+		goto out;
+
+	rc = xprt_rdma_init();
+	if (rc)
+		svc_rdma_cleanup();
+
+out:
+	return rc;
+}
+
+module_init(rpc_rdma_init);
+module_exit(rpc_rdma_cleanup);
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
new file mode 100644
index 000000000..b8174c77d
--- /dev/null
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -0,0 +1,1513 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2014-2017 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * rpc_rdma.c
+ *
+ * This file contains the guts of the RPC RDMA protocol, and
+ * does marshaling/unmarshaling, etc. It is also where interfacing
+ * to the Linux RPC framework lives.
+ */
+
+#include <linux/highmem.h>
+
+#include <linux/sunrpc/svc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+/* Returns size of largest RPC-over-RDMA header in a Call message
+ *
+ * The largest Call header contains a full-size Read list and a
+ * minimal Reply chunk.
+ */
+static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
+{
+	unsigned int size;
+
+	/* Fixed header fields and list discriminators */
+	size = RPCRDMA_HDRLEN_MIN;
+
+	/* Maximum Read list size */
+	size += maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);
+
+	/* Minimal Read chunk size */
+	size += sizeof(__be32);	/* segment count */
+	size += rpcrdma_segment_maxsz * sizeof(__be32);
+	size += sizeof(__be32);	/* list discriminator */
+
+	return size;
+}
+
+/* Returns size of largest RPC-over-RDMA header in a Reply message
+ *
+ * There is only one Write list or one Reply chunk per Reply
+ * message.  The larger list is the Write list.
+ */
+static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
+{
+	unsigned int size;
+
+	/* Fixed header fields and list discriminators */
+	size = RPCRDMA_HDRLEN_MIN;
+
+	/* Maximum Write list size */
+	size += sizeof(__be32);		/* segment count */
+	size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
+	size += sizeof(__be32);	/* list discriminator */
+
+	return size;
+}
+
+/**
+ * rpcrdma_set_max_header_sizes - Initialize inline payload sizes
+ * @ep: endpoint to initialize
+ *
+ * The max_inline fields contain the maximum size of an RPC message
+ * so the marshaling code doesn't have to repeat this calculation
+ * for every RPC.
+ */
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
+{
+	unsigned int maxsegs = ep->re_max_rdma_segs;
+
+	ep->re_max_inline_send =
+		ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs);
+	ep->re_max_inline_recv =
+		ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
+}
+
+/* The client can send a request inline as long as the RPCRDMA header
+ * plus the RPC call fit under the transport's inline limit. If the
+ * combined call message size exceeds that limit, the client must use
+ * a Read chunk for this operation.
+ *
+ * A Read chunk is also required if sending the RPC call inline would
+ * exceed this device's max_sge limit.
+ */
+static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
+				struct rpc_rqst *rqst)
+{
+	struct xdr_buf *xdr = &rqst->rq_snd_buf;
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	unsigned int count, remaining, offset;
+
+	if (xdr->len > ep->re_max_inline_send)
+		return false;
+
+	if (xdr->page_len) {
+		remaining = xdr->page_len;
+		offset = offset_in_page(xdr->page_base);
+		count = RPCRDMA_MIN_SEND_SGES;
+		while (remaining) {
+			remaining -= min_t(unsigned int,
+					   PAGE_SIZE - offset, remaining);
+			offset = 0;
+			if (++count > ep->re_attr.cap.max_send_sge)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+/* The client can't know how large the actual reply will be. Thus it
+ * plans for the largest possible reply for that particular ULP
+ * operation. If the maximum combined reply message size exceeds that
+ * limit, the client must provide a write list or a reply chunk for
+ * this request.
+ */
+static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
+				   struct rpc_rqst *rqst)
+{
+	return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv;
+}
+
+/* The client is required to provide a Reply chunk if the maximum
+ * size of the non-payload part of the RPC Reply is larger than
+ * the inline threshold.
+ */
+static bool
+rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
+			  const struct rpc_rqst *rqst)
+{
+	const struct xdr_buf *buf = &rqst->rq_rcv_buf;
+
+	return (buf->head[0].iov_len + buf->tail[0].iov_len) <
+		r_xprt->rx_ep->re_max_inline_recv;
+}
+
+/* ACL likes to be lazy in allocating pages. For TCP, these
+ * pages can be allocated during receive processing. Not true
+ * for RDMA, which must always provision receive buffers
+ * up front.
+ */
+static noinline int
+rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
+{
+	struct page **ppages;
+	int len;
+
+	len = buf->page_len;
+	ppages = buf->pages + (buf->page_base >> PAGE_SHIFT);
+	while (len > 0) {
+		if (!*ppages)
+			*ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
+		if (!*ppages)
+			return -ENOBUFS;
+		ppages++;
+		len -= PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+/* Split @vec on page boundaries into SGEs. FMR registers pages, not
+ * a byte range. Other modes coalesce these SGEs into a single MR
+ * when they can.
+ *
+ * Returns pointer to next available SGE, and bumps the total number
+ * of SGEs consumed.
+ */
+static struct rpcrdma_mr_seg *
+rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
+		     unsigned int *n)
+{
+	u32 remaining, page_offset;
+	char *base;
+
+	base = vec->iov_base;
+	page_offset = offset_in_page(base);
+	remaining = vec->iov_len;
+	while (remaining) {
+		seg->mr_page = NULL;
+		seg->mr_offset = base;
+		seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
+		remaining -= seg->mr_len;
+		base += seg->mr_len;
+		++seg;
+		++(*n);
+		page_offset = 0;
+	}
+	return seg;
+}
+
+/* Convert @xdrbuf into SGEs no larger than a page each. As they
+ * are registered, these SGEs are then coalesced into RDMA segments
+ * when the selected memreg mode supports it.
+ *
+ * Returns positive number of SGEs consumed, or a negative errno.
+ */
+
+static int
+rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
+		     unsigned int pos, enum rpcrdma_chunktype type,
+		     struct rpcrdma_mr_seg *seg)
+{
+	unsigned long page_base;
+	unsigned int len, n;
+	struct page **ppages;
+
+	n = 0;
+	if (pos == 0)
+		seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);
+
+	len = xdrbuf->page_len;
+	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
+	page_base = offset_in_page(xdrbuf->page_base);
+	while (len) {
+		seg->mr_page = *ppages;
+		seg->mr_offset = (char *)page_base;
+		seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
+		len -= seg->mr_len;
+		++ppages;
+		++seg;
+		++n;
+		page_base = 0;
+	}
+
+	/* When encoding a Read chunk, the tail iovec contains an
+	 * XDR pad and may be omitted.
+	 */
+	if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup)
+		goto out;
+
+	/* When encoding a Write chunk, some servers need to see an
+	 * extra segment for non-XDR-aligned Write chunks. The upper
+	 * layer provides space in the tail iovec that may be used
+	 * for this purpose.
+	 */
+	if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup)
+		goto out;
+
+	if (xdrbuf->tail[0].iov_len)
+		seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
+
+out:
+	if (unlikely(n > RPCRDMA_MAX_SEGS))
+		return -EIO;
+	return n;
+}
+
+static int
+encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
+	if (unlikely(!p))
+		return -EMSGSIZE;
+
+	xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset);
+	return 0;
+}
+
+static int
+encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
+		    u32 position)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, 6 * sizeof(*p));
+	if (unlikely(!p))
+		return -EMSGSIZE;
+
+	*p++ = xdr_one;			/* Item present */
+	xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length,
+				mr->mr_offset);
+	return 0;
+}
+
+static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
+						 struct rpcrdma_req *req,
+						 struct rpcrdma_mr_seg *seg,
+						 int nsegs, bool writing,
+						 struct rpcrdma_mr **mr)
+{
+	*mr = rpcrdma_mr_pop(&req->rl_free_mrs);
+	if (!*mr) {
+		*mr = rpcrdma_mr_get(r_xprt);
+		if (!*mr)
+			goto out_getmr_err;
+		trace_xprtrdma_mr_get(req);
+		(*mr)->mr_req = req;
+	}
+
+	rpcrdma_mr_push(*mr, &req->rl_registered);
+	return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
+
+out_getmr_err:
+	trace_xprtrdma_nomrs(req);
+	xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
+	rpcrdma_mrs_refresh(r_xprt);
+	return ERR_PTR(-EAGAIN);
+}
+
+/* Register and XDR encode the Read list. Supports encoding a list of read
+ * segments that belong to a single read chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ *  Read chunklist (a linked list):
+ *   N elements, position P (same P for all chunks of same arg!):
+ *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
+ *
+ * Returns zero on success, or a negative errno if a failure occurred.
+ * @xdr is advanced to the next position in the stream.
+ *
+ * Only a single @pos value is currently supported.
+ */
+static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
+				    struct rpcrdma_req *req,
+				    struct rpc_rqst *rqst,
+				    enum rpcrdma_chunktype rtype)
+{
+	struct xdr_stream *xdr = &req->rl_stream;
+	struct rpcrdma_mr_seg *seg;
+	struct rpcrdma_mr *mr;
+	unsigned int pos;
+	int nsegs;
+
+	if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
+		goto done;
+
+	pos = rqst->rq_snd_buf.head[0].iov_len;
+	if (rtype == rpcrdma_areadch)
+		pos = 0;
+	seg = req->rl_segments;
+	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
+				     rtype, seg);
+	if (nsegs < 0)
+		return nsegs;
+
+	do {
+		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
+		if (IS_ERR(seg))
+			return PTR_ERR(seg);
+
+		if (encode_read_segment(xdr, mr, pos) < 0)
+			return -EMSGSIZE;
+
+		trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs);
+		r_xprt->rx_stats.read_chunk_count++;
+		nsegs -= mr->mr_nents;
+	} while (nsegs);
+
+done:
+	if (xdr_stream_encode_item_absent(xdr) < 0)
+		return -EMSGSIZE;
+	return 0;
+}
+
+/* Register and XDR encode the Write list. Supports encoding a list
+ * containing one array of plain segments that belong to a single
+ * write chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ *  Write chunklist (a list of (one) counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO - 0
+ *
+ * Returns zero on success, or a negative errno if a failure occurred.
+ * @xdr is advanced to the next position in the stream.
+ *
+ * Only a single Write chunk is currently supported.
+ */
+static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
+				     struct rpcrdma_req *req,
+				     struct rpc_rqst *rqst,
+				     enum rpcrdma_chunktype wtype)
+{
+	struct xdr_stream *xdr = &req->rl_stream;
+	struct rpcrdma_mr_seg *seg;
+	struct rpcrdma_mr *mr;
+	int nsegs, nchunks;
+	__be32 *segcount;
+
+	if (wtype != rpcrdma_writech)
+		goto done;
+
+	seg = req->rl_segments;
+	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
+				     rqst->rq_rcv_buf.head[0].iov_len,
+				     wtype, seg);
+	if (nsegs < 0)
+		return nsegs;
+
+	if (xdr_stream_encode_item_present(xdr) < 0)
+		return -EMSGSIZE;
+	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
+	if (unlikely(!segcount))
+		return -EMSGSIZE;
+	/* Actual value encoded below */
+
+	nchunks = 0;
+	do {
+		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
+		if (IS_ERR(seg))
+			return PTR_ERR(seg);
+
+		if (encode_rdma_segment(xdr, mr) < 0)
+			return -EMSGSIZE;
+
+		trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs);
+		r_xprt->rx_stats.write_chunk_count++;
+		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
+		nchunks++;
+		nsegs -= mr->mr_nents;
+	} while (nsegs);
+
+	/* Update count of segments in this Write chunk */
+	*segcount = cpu_to_be32(nchunks);
+
+done:
+	if (xdr_stream_encode_item_absent(xdr) < 0)
+		return -EMSGSIZE;
+	return 0;
+}
+
+/* Register and XDR encode the Reply chunk. Supports encoding an array
+ * of plain segments that belong to a single write (reply) chunk.
+ *
+ * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
+ *
+ *  Reply chunk (a counted array):
+ *   N elements:
+ *    1 - N - HLOO - HLOO - ... - HLOO
+ *
+ * Returns zero on success, or a negative errno if a failure occurred.
+ * @xdr is advanced to the next position in the stream.
+ */
+static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
+				      struct rpcrdma_req *req,
+				      struct rpc_rqst *rqst,
+				      enum rpcrdma_chunktype wtype)
+{
+	struct xdr_stream *xdr = &req->rl_stream;
+	struct rpcrdma_mr_seg *seg;
+	struct rpcrdma_mr *mr;
+	int nsegs, nchunks;
+	__be32 *segcount;
+
+	if (wtype != rpcrdma_replych) {
+		if (xdr_stream_encode_item_absent(xdr) < 0)
+			return -EMSGSIZE;
+		return 0;
+	}
+
+	seg = req->rl_segments;
+	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
+	if (nsegs < 0)
+		return nsegs;
+
+	if (xdr_stream_encode_item_present(xdr) < 0)
+		return -EMSGSIZE;
+	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
+	if (unlikely(!segcount))
+		return -EMSGSIZE;
+	/* Actual value encoded below */
+
+	nchunks = 0;
+	do {
+		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
+		if (IS_ERR(seg))
+			return PTR_ERR(seg);
+
+		if (encode_rdma_segment(xdr, mr) < 0)
+			return -EMSGSIZE;
+
+		trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs);
+		r_xprt->rx_stats.reply_chunk_count++;
+		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
+		nchunks++;
+		nsegs -= mr->mr_nents;
+	} while (nsegs);
+
+	/* Update count of segments in the Reply chunk */
+	*segcount = cpu_to_be32(nchunks);
+
+	return 0;
+}
+
+static void rpcrdma_sendctx_done(struct kref *kref)
+{
+	struct rpcrdma_req *req =
+		container_of(kref, struct rpcrdma_req, rl_kref);
+	struct rpcrdma_rep *rep = req->rl_reply;
+
+	rpcrdma_complete_rqst(rep);
+	rep->rr_rxprt->rx_stats.reply_waits_for_send++;
+}
+
+/**
+ * rpcrdma_sendctx_unmap - DMA-unmap Send buffer
+ * @sc: sendctx containing SGEs to unmap
+ *
+ */
+void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
+{
+	struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf;
+	struct ib_sge *sge;
+
+	if (!sc->sc_unmap_count)
+		return;
+
+	/* The first two SGEs contain the transport header and
+	 * the inline buffer. These are always left mapped so
+	 * they can be cheaply re-used.
+	 */
+	for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
+	     ++sge, --sc->sc_unmap_count)
+		ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length,
+				  DMA_TO_DEVICE);
+
+	kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
+}
+
+/* Prepare an SGE for the RPC-over-RDMA transport header.
+ */
+static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
+				    struct rpcrdma_req *req, u32 len)
+{
+	struct rpcrdma_sendctx *sc = req->rl_sendctx;
+	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
+	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
+
+	sge->addr = rdmab_addr(rb);
+	sge->length = len;
+	sge->lkey = rdmab_lkey(rb);
+
+	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
+				      DMA_TO_DEVICE);
+}
+
+/* The head iovec is straightforward, as it is usually already
+ * DMA-mapped. Sync the content that has changed.
+ */
+static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
+				     struct rpcrdma_req *req, unsigned int len)
+{
+	struct rpcrdma_sendctx *sc = req->rl_sendctx;
+	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
+	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+
+	if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
+		return false;
+
+	sge->addr = rdmab_addr(rb);
+	sge->length = len;
+	sge->lkey = rdmab_lkey(rb);
+
+	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
+				      DMA_TO_DEVICE);
+	return true;
+}
+
+/* If there is a page list present, DMA map and prepare an
+ * SGE for each page to be sent.
+ */
+static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
+				     struct xdr_buf *xdr)
+{
+	struct rpcrdma_sendctx *sc = req->rl_sendctx;
+	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+	unsigned int page_base, len, remaining;
+	struct page **ppages;
+	struct ib_sge *sge;
+
+	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+	page_base = offset_in_page(xdr->page_base);
+	remaining = xdr->page_len;
+	while (remaining) {
+		sge = &sc->sc_sges[req->rl_wr.num_sge++];
+		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+		sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages,
+					    page_base, len, DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+			goto out_mapping_err;
+
+		sge->length = len;
+		sge->lkey = rdmab_lkey(rb);
+
+		sc->sc_unmap_count++;
+		ppages++;
+		remaining -= len;
+		page_base = 0;
+	}
+
+	return true;
+
+out_mapping_err:
+	trace_xprtrdma_dma_maperr(sge->addr);
+	return false;
+}
+
+/* The tail iovec may include an XDR pad for the page list,
+ * as well as additional content, and may not reside in the
+ * same page as the head iovec.
+ */
+static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
+				     struct xdr_buf *xdr,
+				     unsigned int page_base, unsigned int len)
+{
+	struct rpcrdma_sendctx *sc = req->rl_sendctx;
+	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
+	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+	struct page *page = virt_to_page(xdr->tail[0].iov_base);
+
+	sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len,
+				    DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
+		goto out_mapping_err;
+
+	sge->length = len;
+	sge->lkey = rdmab_lkey(rb);
+	++sc->sc_unmap_count;
+	return true;
+
+out_mapping_err:
+	trace_xprtrdma_dma_maperr(sge->addr);
+	return false;
+}
+
+/* Copy the tail to the end of the head buffer.
+ */
+static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
+				    struct rpcrdma_req *req,
+				    struct xdr_buf *xdr)
+{
+	unsigned char *dst;
+
+	dst = (unsigned char *)xdr->head[0].iov_base;
+	dst += xdr->head[0].iov_len + xdr->page_len;
+	memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
+	r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
+}
+
+/* Copy pagelist content into the head buffer.
+ */
+static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
+				    struct rpcrdma_req *req,
+				    struct xdr_buf *xdr)
+{
+	unsigned int len, page_base, remaining;
+	struct page **ppages;
+	unsigned char *src, *dst;
+
+	dst = (unsigned char *)xdr->head[0].iov_base;
+	dst += xdr->head[0].iov_len;
+	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+	page_base = offset_in_page(xdr->page_base);
+	remaining = xdr->page_len;
+	while (remaining) {
+		src = page_address(*ppages);
+		src += page_base;
+		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
+		memcpy(dst, src, len);
+		r_xprt->rx_stats.pullup_copy_count += len;
+
+		ppages++;
+		dst += len;
+		remaining -= len;
+		page_base = 0;
+	}
+}
+
+/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
+ * When the head, pagelist, and tail are small, a pull-up copy
+ * is considerably less costly than DMA mapping the components
+ * of @xdr.
+ *
+ * Assumptions:
+ *  - the caller has already verified that the total length
+ *    of the RPC Call body will fit into @rl_sendbuf.
+ */
+static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
+					struct rpcrdma_req *req,
+					struct xdr_buf *xdr)
+{
+	if (unlikely(xdr->tail[0].iov_len))
+		rpcrdma_pullup_tail_iov(r_xprt, req, xdr);
+
+	if (unlikely(xdr->page_len))
+		rpcrdma_pullup_pagelist(r_xprt, req, xdr);
+
+	/* The whole RPC message resides in the head iovec now */
+	return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
+}
+
+static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
+					struct rpcrdma_req *req,
+					struct xdr_buf *xdr)
+{
+	struct kvec *tail = &xdr->tail[0];
+
+	if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
+		return false;
+	if (xdr->page_len)
+		if (!rpcrdma_prepare_pagelist(req, xdr))
+			return false;
+	if (tail->iov_len)
+		if (!rpcrdma_prepare_tail_iov(req, xdr,
+					      offset_in_page(tail->iov_base),
+					      tail->iov_len))
+			return false;
+
+	if (req->rl_sendctx->sc_unmap_count)
+		kref_get(&req->rl_kref);
+	return true;
+}
+
+static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
+				   struct rpcrdma_req *req,
+				   struct xdr_buf *xdr)
+{
+	if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
+		return false;
+
+	/* If there is a Read chunk, the page list is being handled
+	 * via explicit RDMA, and thus is skipped here.
+	 */
+
+	/* Do not include the tail if it is only an XDR pad */
+	if (xdr->tail[0].iov_len > 3) {
+		unsigned int page_base, len;
+
+		/* If the content in the page list is an odd length,
+		 * xdr_write_pages() adds a pad at the beginning of
+		 * the tail iovec. Force the tail's non-pad content to
+		 * land at the next XDR position in the Send message.
+		 */
+		page_base = offset_in_page(xdr->tail[0].iov_base);
+		len = xdr->tail[0].iov_len;
+		page_base += len & 3;
+		len -= len & 3;
+		if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len))
+			return false;
+		kref_get(&req->rl_kref);
+	}
+
+	return true;
+}
+
+/**
+ * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
+ * @r_xprt: controlling transport
+ * @req: context of RPC Call being marshalled
+ * @hdrlen: size of transport header, in bytes
+ * @xdr: xdr_buf containing RPC Call
+ * @rtype: chunk type being encoded
+ *
+ * Returns 0 on success; otherwise a negative errno is returned.
+ */
+inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
+				     struct rpcrdma_req *req, u32 hdrlen,
+				     struct xdr_buf *xdr,
+				     enum rpcrdma_chunktype rtype)
+{
+	int ret;
+
+	ret = -EAGAIN;
+	req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
+	if (!req->rl_sendctx)
+		goto out_nosc;
+	req->rl_sendctx->sc_unmap_count = 0;
+	req->rl_sendctx->sc_req = req;
+	kref_init(&req->rl_kref);
+	req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe;
+	req->rl_wr.sg_list = req->rl_sendctx->sc_sges;
+	req->rl_wr.num_sge = 0;
+	req->rl_wr.opcode = IB_WR_SEND;
+
+	rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen);
+
+	ret = -EIO;
+	switch (rtype) {
+	case rpcrdma_noch_pullup:
+		if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
+			goto out_unmap;
+		break;
+	case rpcrdma_noch_mapped:
+		if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
+			goto out_unmap;
+		break;
+	case rpcrdma_readch:
+		if (!rpcrdma_prepare_readch(r_xprt, req, xdr))
+			goto out_unmap;
+		break;
+	case rpcrdma_areadch:
+		break;
+	default:
+		goto out_unmap;
+	}
+
+	return 0;
+
+out_unmap:
+	rpcrdma_sendctx_unmap(req->rl_sendctx);
+out_nosc:
+	trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
+	return ret;
+}
+
+/**
+ * rpcrdma_marshal_req - Marshal and send one RPC request
+ * @r_xprt: controlling transport
+ * @rqst: RPC request to be marshaled
+ *
+ * For the RPC in "rqst", this function:
+ *  - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG)
+ *  - Registers Read, Write, and Reply chunks
+ *  - Constructs the transport header
+ *  - Posts a Send WR to send the transport header and request
+ *
+ * Returns:
+ *	%0 if the RPC was sent successfully,
+ *	%-ENOTCONN if the connection was lost,
+ *	%-EAGAIN if the caller should call again with the same arguments,
+ *	%-ENOBUFS if the caller should call again after a delay,
+ *	%-EMSGSIZE if the transport header is too small,
+ *	%-EIO if a permanent problem occurred while marshaling.
+ */
+int
+rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
+{
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct xdr_stream *xdr = &req->rl_stream;
+	enum rpcrdma_chunktype rtype, wtype;
+	struct xdr_buf *buf = &rqst->rq_snd_buf;
+	bool ddp_allowed;
+	__be32 *p;
+	int ret;
+
+	if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) {
+		ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf);
+		if (ret)
+			return ret;
+	}
+
+	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
+	xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf),
+			rqst);
+
+	/* Fixed header fields */
+	ret = -EMSGSIZE;
+	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
+	if (!p)
+		goto out_err;
+	*p++ = rqst->rq_xid;
+	*p++ = rpcrdma_version;
+	*p++ = r_xprt->rx_buf.rb_max_requests;
+
+	/* When the ULP employs a GSS flavor that guarantees integrity
+	 * or privacy, direct data placement of individual data items
+	 * is not allowed.
+	 */
+	ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH,
+				&rqst->rq_cred->cr_auth->au_flags);
+
+	/*
+	 * Chunks needed for results?
+	 *
+	 * o If the expected result is under the inline threshold, all ops
+	 *   return as inline.
+	 * o Large read ops return data as write chunk(s), header as
+	 *   inline.
+	 * o Large non-read ops return as a single reply chunk.
+	 */
+	if (rpcrdma_results_inline(r_xprt, rqst))
+		wtype = rpcrdma_noch;
+	else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) &&
+		 rpcrdma_nonpayload_inline(r_xprt, rqst))
+		wtype = rpcrdma_writech;
+	else
+		wtype = rpcrdma_replych;
+
+	/*
+	 * Chunks needed for arguments?
+	 *
+	 * o If the total request is under the inline threshold, all ops
+	 *   are sent as inline.
+	 * o Large write ops transmit data as read chunk(s), header as
+	 *   inline.
+	 * o Large non-write ops are sent with the entire message as a
+	 *   single read chunk (protocol 0-position special case).
+	 *
+	 * This assumes that the upper layer does not present a request
+	 * that both has a data payload, and whose non-data arguments
+	 * by themselves are larger than the inline threshold.
+	 */
+	if (rpcrdma_args_inline(r_xprt, rqst)) {
+		*p++ = rdma_msg;
+		rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
+			rpcrdma_noch_pullup : rpcrdma_noch_mapped;
+	} else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
+		*p++ = rdma_msg;
+		rtype = rpcrdma_readch;
+	} else {
+		r_xprt->rx_stats.nomsg_call_count++;
+		*p++ = rdma_nomsg;
+		rtype = rpcrdma_areadch;
+	}
+
+	/* This implementation supports the following combinations
+	 * of chunk lists in one RPC-over-RDMA Call message:
+	 *
+	 *   - Read list
+	 *   - Write list
+	 *   - Reply chunk
+	 *   - Read list + Reply chunk
+	 *
+	 * It might not yet support the following combinations:
+	 *
+	 *   - Read list + Write list
+	 *
+	 * It does not support the following combinations:
+	 *
+	 *   - Write list + Reply chunk
+	 *   - Read list + Write list + Reply chunk
+	 *
+	 * This implementation supports only a single chunk in each
+	 * Read or Write list. Thus for example the client cannot
+	 * send a Call message with a Position Zero Read chunk and a
+	 * regular Read chunk at the same time.
+	 */
+	ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
+	if (ret)
+		goto out_err;
+	ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
+	if (ret)
+		goto out_err;
+	ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
+	if (ret)
+		goto out_err;
+
+	ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
+					buf, rtype);
+	if (ret)
+		goto out_err;
+
+	trace_xprtrdma_marshal(req, rtype, wtype);
+	return 0;
+
+out_err:
+	trace_xprtrdma_marshal_failed(rqst, ret);
+	r_xprt->rx_stats.failed_marshal_count++;
+	frwr_reset(req);
+	return ret;
+}
+
+static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt,
+					 struct rpcrdma_buffer *buf,
+					 u32 grant)
+{
+	buf->rb_credits = grant;
+	xprt->cwnd = grant << RPC_CWNDSHIFT;
+}
+
+static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant)
+{
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+	spin_lock(&xprt->transport_lock);
+	__rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant);
+	spin_unlock(&xprt->transport_lock);
+}
+
+/**
+ * rpcrdma_reset_cwnd - Reset the xprt's congestion window
+ * @r_xprt: controlling transport instance
+ *
+ * Prepare @r_xprt for the next connection by reinitializing
+ * its credit grant to one (see RFC 8166, Section 3.3.3).
+ */
+void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+	spin_lock(&xprt->transport_lock);
+	xprt->cong = 0;
+	__rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1);
+	spin_unlock(&xprt->transport_lock);
+}
+
+/**
+ * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
+ * @rqst: controlling RPC request
+ * @srcp: points to RPC message payload in receive buffer
+ * @copy_len: remaining length of receive buffer content
+ * @pad: Write chunk pad bytes needed (zero for pure inline)
+ *
+ * The upper layer has set the maximum number of bytes it can
+ * receive in each component of rq_rcv_buf. These values are set in
+ * the head.iov_len, page_len, tail.iov_len, and buflen fields.
+ *
+ * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
+ * many cases this function simply updates iov_base pointers in
+ * rq_rcv_buf to point directly to the received reply data, to
+ * avoid copying reply data.
+ *
+ * Returns the count of bytes which had to be memcopied.
+ */
+static unsigned long
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
+{
+	unsigned long fixup_copy_count;
+	int i, npages, curlen;
+	char *destp;
+	struct page **ppages;
+	int page_base;
+
+	/* The head iovec is redirected to the RPC reply message
+	 * in the receive buffer, to avoid a memcopy.
+	 */
+	rqst->rq_rcv_buf.head[0].iov_base = srcp;
+	rqst->rq_private_buf.head[0].iov_base = srcp;
+
+	/* The contents of the receive buffer that follow
+	 * head.iov_len bytes are copied into the page list.
+	 */
+	curlen = rqst->rq_rcv_buf.head[0].iov_len;
+	if (curlen > copy_len)
+		curlen = copy_len;
+	srcp += curlen;
+	copy_len -= curlen;
+
+	ppages = rqst->rq_rcv_buf.pages +
+		(rqst->rq_rcv_buf.page_base >> PAGE_SHIFT);
+	page_base = offset_in_page(rqst->rq_rcv_buf.page_base);
+	fixup_copy_count = 0;
+	if (copy_len && rqst->rq_rcv_buf.page_len) {
+		int pagelist_len;
+
+		pagelist_len = rqst->rq_rcv_buf.page_len;
+		if (pagelist_len > copy_len)
+			pagelist_len = copy_len;
+		npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
+		for (i = 0; i < npages; i++) {
+			curlen = PAGE_SIZE - page_base;
+			if (curlen > pagelist_len)
+				curlen = pagelist_len;
+
+			destp = kmap_atomic(ppages[i]);
+			memcpy(destp + page_base, srcp, curlen);
+			flush_dcache_page(ppages[i]);
+			kunmap_atomic(destp);
+			srcp += curlen;
+			copy_len -= curlen;
+			fixup_copy_count += curlen;
+			pagelist_len -= curlen;
+			if (!pagelist_len)
+				break;
+			page_base = 0;
+		}
+
+		/* Implicit padding for the last segment in a Write
+		 * chunk is inserted inline at the front of the tail
+		 * iovec. The upper layer ignores the content of
+		 * the pad. Simply ensure inline content in the tail
+		 * that follows the Write chunk is properly aligned.
+		 */
+		if (pad)
+			srcp -= pad;
+	}
+
+	/* The tail iovec is redirected to the remaining data
+	 * in the receive buffer, to avoid a memcopy.
+	 */
+	if (copy_len || pad) {
+		rqst->rq_rcv_buf.tail[0].iov_base = srcp;
+		rqst->rq_private_buf.tail[0].iov_base = srcp;
+	}
+
+	if (fixup_copy_count)
+		trace_xprtrdma_fixup(rqst, fixup_copy_count);
+	return fixup_copy_count;
+}
+
+/* By convention, backchannel calls arrive via rdma_msg type
+ * messages, and never populate the chunk lists. This makes
+ * the RPC/RDMA header small and fixed in size, so it is
+ * straightforward to check the RPC header's direction field.
+ */
+static bool
+rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+{
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	struct xdr_stream *xdr = &rep->rr_stream;
+	__be32 *p;
+
+	if (rep->rr_proc != rdma_msg)
+		return false;
+
+	/* Peek at stream contents without advancing. */
+	p = xdr_inline_decode(xdr, 0);
+
+	/* Chunk lists */
+	if (xdr_item_is_present(p++))
+		return false;
+	if (xdr_item_is_present(p++))
+		return false;
+	if (xdr_item_is_present(p++))
+		return false;
+
+	/* RPC header */
+	if (*p++ != rep->rr_xid)
+		return false;
+	if (*p != cpu_to_be32(RPC_CALL))
+		return false;
+
+	/* No bc service. */
+	if (xprt->bc_serv == NULL)
+		return false;
+
+	/* Now that we are sure this is a backchannel call,
+	 * advance to the RPC header.
+	 */
+	p = xdr_inline_decode(xdr, 3 * sizeof(*p));
+	if (unlikely(!p))
+		goto out_short;
+
+	rpcrdma_bc_receive_call(r_xprt, rep);
+	return true;
+
+out_short:
+	pr_warn("RPC/RDMA short backward direction call\n");
+	return true;
+}
+#else	/* CONFIG_SUNRPC_BACKCHANNEL */
+{
+	return false;
+}
+#endif	/* CONFIG_SUNRPC_BACKCHANNEL */
+
+static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
+{
+	u32 handle;
+	u64 offset;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, 4 * sizeof(*p));
+	if (unlikely(!p))
+		return -EIO;
+
+	xdr_decode_rdma_segment(p, &handle, length, &offset);
+	trace_xprtrdma_decode_seg(handle, *length, offset);
+	return 0;
+}
+
+static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
+{
+	u32 segcount, seglength;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, sizeof(*p));
+	if (unlikely(!p))
+		return -EIO;
+
+	*length = 0;
+	segcount = be32_to_cpup(p);
+	while (segcount--) {
+		if (decode_rdma_segment(xdr, &seglength))
+			return -EIO;
+		*length += seglength;
+	}
+
+	return 0;
+}
+
+/* In RPC-over-RDMA Version One replies, a Read list is never
+ * expected. This decoder is a stub that returns an error if
+ * a Read list is present.
+ */
+static int decode_read_list(struct xdr_stream *xdr)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, sizeof(*p));
+	if (unlikely(!p))
+		return -EIO;
+	if (unlikely(xdr_item_is_present(p)))
+		return -EIO;
+	return 0;
+}
+
+/* Supports only one Write chunk in the Write list
+ */
+static int decode_write_list(struct xdr_stream *xdr, u32 *length)
+{
+	u32 chunklen;
+	bool first;
+	__be32 *p;
+
+	*length = 0;
+	first = true;
+	do {
+		p = xdr_inline_decode(xdr, sizeof(*p));
+		if (unlikely(!p))
+			return -EIO;
+		if (xdr_item_is_absent(p))
+			break;
+		if (!first)
+			return -EIO;
+
+		if (decode_write_chunk(xdr, &chunklen))
+			return -EIO;
+		*length += chunklen;
+		first = false;
+	} while (true);
+	return 0;
+}
+
+static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, sizeof(*p));
+	if (unlikely(!p))
+		return -EIO;
+
+	*length = 0;
+	if (xdr_item_is_present(p))
+		if (decode_write_chunk(xdr, length))
+			return -EIO;
+	return 0;
+}
+
+static int
+rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+		   struct rpc_rqst *rqst)
+{
+	struct xdr_stream *xdr = &rep->rr_stream;
+	u32 writelist, replychunk, rpclen;
+	char *base;
+
+	/* Decode the chunk lists */
+	if (decode_read_list(xdr))
+		return -EIO;
+	if (decode_write_list(xdr, &writelist))
+		return -EIO;
+	if (decode_reply_chunk(xdr, &replychunk))
+		return -EIO;
+
+	/* RDMA_MSG sanity checks */
+	if (unlikely(replychunk))
+		return -EIO;
+
+	/* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */
+	base = (char *)xdr_inline_decode(xdr, 0);
+	rpclen = xdr_stream_remaining(xdr);
+	r_xprt->rx_stats.fixup_copy_count +=
+		rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3);
+
+	r_xprt->rx_stats.total_rdma_reply += writelist;
+	return rpclen + xdr_align_size(writelist);
+}
+
+static noinline int
+rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
+{
+	struct xdr_stream *xdr = &rep->rr_stream;
+	u32 writelist, replychunk;
+
+	/* Decode the chunk lists */
+	if (decode_read_list(xdr))
+		return -EIO;
+	if (decode_write_list(xdr, &writelist))
+		return -EIO;
+	if (decode_reply_chunk(xdr, &replychunk))
+		return -EIO;
+
+	/* RDMA_NOMSG sanity checks */
+	if (unlikely(writelist))
+		return -EIO;
+	if (unlikely(!replychunk))
+		return -EIO;
+
+	/* Reply chunk buffer already is the reply vector */
+	r_xprt->rx_stats.total_rdma_reply += replychunk;
+	return replychunk;
+}
+
+static noinline int
+rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+		     struct rpc_rqst *rqst)
+{
+	struct xdr_stream *xdr = &rep->rr_stream;
+	__be32 *p;
+
+	p = xdr_inline_decode(xdr, sizeof(*p));
+	if (unlikely(!p))
+		return -EIO;
+
+	switch (*p) {
+	case err_vers:
+		p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+		if (!p)
+			break;
+		dprintk("RPC:       %s: server reports "
+			"version error (%u-%u), xid %08x\n", __func__,
+			be32_to_cpup(p), be32_to_cpu(*(p + 1)),
+			be32_to_cpu(rep->rr_xid));
+		break;
+	case err_chunk:
+		dprintk("RPC:       %s: server reports "
+			"header decoding error, xid %08x\n", __func__,
+			be32_to_cpu(rep->rr_xid));
+		break;
+	default:
+		dprintk("RPC:       %s: server reports "
+			"unrecognized error %d, xid %08x\n", __func__,
+			be32_to_cpup(p), be32_to_cpu(rep->rr_xid));
+	}
+
+	return -EIO;
+}
+
+/* Perform XID lookup, reconstruction of the RPC reply, and
+ * RPC completion while holding the transport lock to ensure
+ * the rep, rqst, and rq_task pointers remain stable.
+ */
+void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	struct rpc_rqst *rqst = rep->rr_rqst;
+	int status;
+
+	switch (rep->rr_proc) {
+	case rdma_msg:
+		status = rpcrdma_decode_msg(r_xprt, rep, rqst);
+		break;
+	case rdma_nomsg:
+		status = rpcrdma_decode_nomsg(r_xprt, rep);
+		break;
+	case rdma_error:
+		status = rpcrdma_decode_error(r_xprt, rep, rqst);
+		break;
+	default:
+		status = -EIO;
+	}
+	if (status < 0)
+		goto out_badheader;
+
+out:
+	spin_lock(&xprt->queue_lock);
+	xprt_complete_rqst(rqst->rq_task, status);
+	xprt_unpin_rqst(rqst);
+	spin_unlock(&xprt->queue_lock);
+	return;
+
+out_badheader:
+	trace_xprtrdma_reply_hdr(rep);
+	r_xprt->rx_stats.bad_reply_count++;
+	rqst->rq_task->tk_status = status;
+	status = 0;
+	goto out;
+}
+
+static void rpcrdma_reply_done(struct kref *kref)
+{
+	struct rpcrdma_req *req =
+		container_of(kref, struct rpcrdma_req, rl_kref);
+
+	rpcrdma_complete_rqst(req->rl_reply);
+}
+
+/**
+ * rpcrdma_reply_handler - Process received RPC/RDMA messages
+ * @rep: Incoming rpcrdma_rep object to process
+ *
+ * Errors must result in the RPC task either being awakened, or
+ * allowed to timeout, to discover the errors at that time.
+ */
+void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+{
+	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_req *req;
+	struct rpc_rqst *rqst;
+	u32 credits;
+	__be32 *p;
+
+	/* Any data means we had a useful conversation, so
+	 * then we don't need to delay the next reconnect.
+	 */
+	if (xprt->reestablish_timeout)
+		xprt->reestablish_timeout = 0;
+
+	/* Fixed transport header fields */
+	xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
+			rep->rr_hdrbuf.head[0].iov_base, NULL);
+	p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p));
+	if (unlikely(!p))
+		goto out_shortreply;
+	rep->rr_xid = *p++;
+	rep->rr_vers = *p++;
+	credits = be32_to_cpu(*p++);
+	rep->rr_proc = *p++;
+
+	if (rep->rr_vers != rpcrdma_version)
+		goto out_badversion;
+
+	if (rpcrdma_is_bcall(r_xprt, rep))
+		return;
+
+	/* Match incoming rpcrdma_rep to an rpcrdma_req to
+	 * get context for handling any incoming chunks.
+	 */
+	spin_lock(&xprt->queue_lock);
+	rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
+	if (!rqst)
+		goto out_norqst;
+	xprt_pin_rqst(rqst);
+	spin_unlock(&xprt->queue_lock);
+
+	if (credits == 0)
+		credits = 1;	/* don't deadlock */
+	else if (credits > r_xprt->rx_ep->re_max_requests)
+		credits = r_xprt->rx_ep->re_max_requests;
+	rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1),
+			   false);
+	if (buf->rb_credits != credits)
+		rpcrdma_update_cwnd(r_xprt, credits);
+
+	req = rpcr_to_rdmar(rqst);
+	if (req->rl_reply) {
+		trace_xprtrdma_leaked_rep(rqst, req->rl_reply);
+		rpcrdma_recv_buffer_put(req->rl_reply);
+	}
+	req->rl_reply = rep;
+	rep->rr_rqst = rqst;
+
+	trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
+
+	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
+		frwr_reminv(rep, &req->rl_registered);
+	if (!list_empty(&req->rl_registered))
+		frwr_unmap_async(r_xprt, req);
+		/* LocalInv completion will complete the RPC */
+	else
+		kref_put(&req->rl_kref, rpcrdma_reply_done);
+	return;
+
+out_badversion:
+	trace_xprtrdma_reply_vers(rep);
+	goto out;
+
+out_norqst:
+	spin_unlock(&xprt->queue_lock);
+	trace_xprtrdma_reply_rqst(rep);
+	goto out;
+
+out_shortreply:
+	trace_xprtrdma_reply_short(rep);
+
+out:
+	rpcrdma_recv_buffer_put(rep);
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
new file mode 100644
index 000000000..526da5d47
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -0,0 +1,252 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2015-2018 Oracle.  All rights reserved.
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/workqueue.h>
+#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/sched.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+/* RPC/RDMA parameters */
+unsigned int svcrdma_ord = 16;	/* historical default */
+static unsigned int min_ord = 1;
+static unsigned int max_ord = 255;
+unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
+unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
+static unsigned int min_max_requests = 4;
+static unsigned int max_max_requests = 16384;
+unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
+static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
+static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
+
+atomic_t rdma_stat_recv;
+atomic_t rdma_stat_read;
+atomic_t rdma_stat_write;
+atomic_t rdma_stat_sq_starve;
+atomic_t rdma_stat_rq_starve;
+atomic_t rdma_stat_rq_poll;
+atomic_t rdma_stat_rq_prod;
+atomic_t rdma_stat_sq_poll;
+atomic_t rdma_stat_sq_prod;
+
+/*
+ * This function implements reading and resetting an atomic_t stat
+ * variable through read/write to a proc file. Any write to the file
+ * resets the associated statistic to zero. Any read returns it's
+ * current value.
+ */
+static int read_reset_stat(struct ctl_table *table, int write,
+			   void *buffer, size_t *lenp, loff_t *ppos)
+{
+	atomic_t *stat = (atomic_t *)table->data;
+
+	if (!stat)
+		return -EINVAL;
+
+	if (write)
+		atomic_set(stat, 0);
+	else {
+		char str_buf[32];
+		int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
+		if (len >= 32)
+			return -EFAULT;
+		len = strlen(str_buf);
+		if (*ppos > len) {
+			*lenp = 0;
+			return 0;
+		}
+		len -= *ppos;
+		if (len > *lenp)
+			len = *lenp;
+		if (len)
+			memcpy(buffer, str_buf, len);
+		*lenp = len;
+		*ppos += len;
+	}
+	return 0;
+}
+
+static struct ctl_table_header *svcrdma_table_header;
+static struct ctl_table svcrdma_parm_table[] = {
+	{
+		.procname	= "max_requests",
+		.data		= &svcrdma_max_requests,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_max_requests,
+		.extra2		= &max_max_requests
+	},
+	{
+		.procname	= "max_req_size",
+		.data		= &svcrdma_max_req_size,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_max_inline,
+		.extra2		= &max_max_inline
+	},
+	{
+		.procname	= "max_outbound_read_requests",
+		.data		= &svcrdma_ord,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_ord,
+		.extra2		= &max_ord,
+	},
+
+	{
+		.procname	= "rdma_stat_read",
+		.data		= &rdma_stat_read,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_recv",
+		.data		= &rdma_stat_recv,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_write",
+		.data		= &rdma_stat_write,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_sq_starve",
+		.data		= &rdma_stat_sq_starve,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_rq_starve",
+		.data		= &rdma_stat_rq_starve,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_rq_poll",
+		.data		= &rdma_stat_rq_poll,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_rq_prod",
+		.data		= &rdma_stat_rq_prod,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_sq_poll",
+		.data		= &rdma_stat_sq_poll,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{
+		.procname	= "rdma_stat_sq_prod",
+		.data		= &rdma_stat_sq_prod,
+		.maxlen		= sizeof(atomic_t),
+		.mode		= 0644,
+		.proc_handler	= read_reset_stat,
+	},
+	{ },
+};
+
+static struct ctl_table svcrdma_table[] = {
+	{
+		.procname	= "svc_rdma",
+		.mode		= 0555,
+		.child		= svcrdma_parm_table
+	},
+	{ },
+};
+
+static struct ctl_table svcrdma_root_table[] = {
+	{
+		.procname	= "sunrpc",
+		.mode		= 0555,
+		.child		= svcrdma_table
+	},
+	{ },
+};
+
+void svc_rdma_cleanup(void)
+{
+	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
+	if (svcrdma_table_header) {
+		unregister_sysctl_table(svcrdma_table_header);
+		svcrdma_table_header = NULL;
+	}
+	svc_unreg_xprt_class(&svc_rdma_class);
+}
+
+int svc_rdma_init(void)
+{
+	dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
+	dprintk("\tsvcrdma_ord      : %d\n", svcrdma_ord);
+	dprintk("\tmax_requests     : %u\n", svcrdma_max_requests);
+	dprintk("\tmax_bc_requests  : %u\n", svcrdma_max_bc_requests);
+	dprintk("\tmax_inline       : %d\n", svcrdma_max_req_size);
+
+	if (!svcrdma_table_header)
+		svcrdma_table_header =
+			register_sysctl_table(svcrdma_root_table);
+
+	/* Register RDMA with the SVC transport switch */
+	svc_reg_xprt_class(&svc_rdma_class);
+	return 0;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
new file mode 100644
index 000000000..c5154bc38
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015-2018 Oracle.  All rights reserved.
+ *
+ * Support for backward direction RPCs on RPC/RDMA (server-side).
+ */
+
+#include <linux/sunrpc/svc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+/**
+ * svc_rdma_handle_bc_reply - Process incoming backchannel Reply
+ * @rqstp: resources for handling the Reply
+ * @rctxt: Received message
+ *
+ */
+void svc_rdma_handle_bc_reply(struct svc_rqst *rqstp,
+			      struct svc_rdma_recv_ctxt *rctxt)
+{
+	struct svc_xprt *sxprt = rqstp->rq_xprt;
+	struct rpc_xprt *xprt = sxprt->xpt_bc_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct xdr_buf *rcvbuf = &rqstp->rq_arg;
+	struct kvec *dst, *src = &rcvbuf->head[0];
+	__be32 *rdma_resp = rctxt->rc_recv_buf;
+	struct rpc_rqst *req;
+	u32 credits;
+
+	spin_lock(&xprt->queue_lock);
+	req = xprt_lookup_rqst(xprt, *rdma_resp);
+	if (!req)
+		goto out_unlock;
+
+	dst = &req->rq_private_buf.head[0];
+	memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
+	if (dst->iov_len < src->iov_len)
+		goto out_unlock;
+	memcpy(dst->iov_base, src->iov_base, src->iov_len);
+	xprt_pin_rqst(req);
+	spin_unlock(&xprt->queue_lock);
+
+	credits = be32_to_cpup(rdma_resp + 2);
+	if (credits == 0)
+		credits = 1;	/* don't deadlock */
+	else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
+		credits = r_xprt->rx_buf.rb_bc_max_requests;
+	spin_lock(&xprt->transport_lock);
+	xprt->cwnd = credits << RPC_CWNDSHIFT;
+	spin_unlock(&xprt->transport_lock);
+
+	spin_lock(&xprt->queue_lock);
+	xprt_complete_rqst(req->rq_task, rcvbuf->len);
+	xprt_unpin_rqst(req);
+	rcvbuf->len = 0;
+
+out_unlock:
+	spin_unlock(&xprt->queue_lock);
+}
+
+/* Send a backwards direction RPC call.
+ *
+ * Caller holds the connection's mutex and has already marshaled
+ * the RPC/RDMA request.
+ *
+ * This is similar to svc_rdma_send_reply_msg, but takes a struct
+ * rpc_rqst instead, does not support chunks, and avoids blocking
+ * memory allocation.
+ *
+ * XXX: There is still an opportunity to block in svc_rdma_send()
+ * if there are no SQ entries to post the Send. This may occur if
+ * the adapter has a small maximum SQ depth.
+ */
+static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
+			      struct rpc_rqst *rqst,
+			      struct svc_rdma_send_ctxt *ctxt)
+{
+	int ret;
+
+	ret = svc_rdma_map_reply_msg(rdma, ctxt, NULL, &rqst->rq_snd_buf);
+	if (ret < 0)
+		return -EIO;
+
+	/* Bump page refcnt so Send completion doesn't release
+	 * the rq_buffer before all retransmits are complete.
+	 */
+	get_page(virt_to_page(rqst->rq_buffer));
+	ctxt->sc_send_wr.opcode = IB_WR_SEND;
+	return svc_rdma_send(rdma, ctxt);
+}
+
+/* Server-side transport endpoint wants a whole page for its send
+ * buffer. The client RPC code constructs the RPC header in this
+ * buffer before it invokes ->send_request.
+ */
+static int
+xprt_rdma_bc_allocate(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	size_t size = rqst->rq_callsize;
+	struct page *page;
+
+	if (size > PAGE_SIZE) {
+		WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
+			  size);
+		return -EINVAL;
+	}
+
+	page = alloc_page(RPCRDMA_DEF_GFP);
+	if (!page)
+		return -ENOMEM;
+	rqst->rq_buffer = page_address(page);
+
+	rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP);
+	if (!rqst->rq_rbuffer) {
+		put_page(page);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void
+xprt_rdma_bc_free(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+
+	put_page(virt_to_page(rqst->rq_buffer));
+	kfree(rqst->rq_rbuffer);
+}
+
+static int
+rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
+{
+	struct rpc_xprt *xprt = rqst->rq_xprt;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct svc_rdma_send_ctxt *ctxt;
+	__be32 *p;
+	int rc;
+
+	ctxt = svc_rdma_send_ctxt_get(rdma);
+	if (!ctxt)
+		goto drop_connection;
+
+	p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_MIN);
+	if (!p)
+		goto put_ctxt;
+	*p++ = rqst->rq_xid;
+	*p++ = rpcrdma_version;
+	*p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+	*p++ = rdma_msg;
+	*p++ = xdr_zero;
+	*p++ = xdr_zero;
+	*p   = xdr_zero;
+
+	rqst->rq_xtime = ktime_get();
+	rc = svc_rdma_bc_sendto(rdma, rqst, ctxt);
+	if (rc)
+		goto put_ctxt;
+	return 0;
+
+put_ctxt:
+	svc_rdma_send_ctxt_put(rdma, ctxt);
+
+drop_connection:
+	return -ENOTCONN;
+}
+
+/**
+ * xprt_rdma_bc_send_request - Send a reverse-direction Call
+ * @rqst: rpc_rqst containing Call message to be sent
+ *
+ * Return values:
+ *   %0 if the message was sent successfully
+ *   %ENOTCONN if the message was not sent
+ */
+static int xprt_rdma_bc_send_request(struct rpc_rqst *rqst)
+{
+	struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+	struct svcxprt_rdma *rdma =
+		container_of(sxprt, struct svcxprt_rdma, sc_xprt);
+	int ret;
+
+	if (test_bit(XPT_DEAD, &sxprt->xpt_flags))
+		return -ENOTCONN;
+
+	ret = rpcrdma_bc_send_request(rdma, rqst);
+	if (ret == -ENOTCONN)
+		svc_close_xprt(sxprt);
+	return ret;
+}
+
+static void
+xprt_rdma_bc_close(struct rpc_xprt *xprt)
+{
+	xprt_disconnect_done(xprt);
+	xprt->cwnd = RPC_CWNDSHIFT;
+}
+
+static void
+xprt_rdma_bc_put(struct rpc_xprt *xprt)
+{
+	xprt_rdma_free_addresses(xprt);
+	xprt_free(xprt);
+}
+
+static const struct rpc_xprt_ops xprt_rdma_bc_procs = {
+	.reserve_xprt		= xprt_reserve_xprt_cong,
+	.release_xprt		= xprt_release_xprt_cong,
+	.alloc_slot		= xprt_alloc_slot,
+	.free_slot		= xprt_free_slot,
+	.release_request	= xprt_release_rqst_cong,
+	.buf_alloc		= xprt_rdma_bc_allocate,
+	.buf_free		= xprt_rdma_bc_free,
+	.send_request		= xprt_rdma_bc_send_request,
+	.wait_for_reply_request	= xprt_wait_for_reply_request_def,
+	.close			= xprt_rdma_bc_close,
+	.destroy		= xprt_rdma_bc_put,
+	.print_stats		= xprt_rdma_print_stats
+};
+
+static const struct rpc_timeout xprt_rdma_bc_timeout = {
+	.to_initval = 60 * HZ,
+	.to_maxval = 60 * HZ,
+};
+
+/* It shouldn't matter if the number of backchannel session slots
+ * doesn't match the number of RPC/RDMA credits. That just means
+ * one or the other will have extra slots that aren't used.
+ */
+static struct rpc_xprt *
+xprt_setup_rdma_bc(struct xprt_create *args)
+{
+	struct rpc_xprt *xprt;
+	struct rpcrdma_xprt *new_xprt;
+
+	if (args->addrlen > sizeof(xprt->addr))
+		return ERR_PTR(-EBADF);
+
+	xprt = xprt_alloc(args->net, sizeof(*new_xprt),
+			  RPCRDMA_MAX_BC_REQUESTS,
+			  RPCRDMA_MAX_BC_REQUESTS);
+	if (!xprt)
+		return ERR_PTR(-ENOMEM);
+
+	xprt->timeout = &xprt_rdma_bc_timeout;
+	xprt_set_bound(xprt);
+	xprt_set_connected(xprt);
+	xprt->bind_timeout = 0;
+	xprt->reestablish_timeout = 0;
+	xprt->idle_timeout = 0;
+
+	xprt->prot = XPRT_TRANSPORT_BC_RDMA;
+	xprt->ops = &xprt_rdma_bc_procs;
+
+	memcpy(&xprt->addr, args->dstaddr, args->addrlen);
+	xprt->addrlen = args->addrlen;
+	xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr);
+	xprt->resvport = 0;
+
+	xprt->max_payload = xprt_rdma_max_inline_read;
+
+	new_xprt = rpcx_to_rdmax(xprt);
+	new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs;
+
+	xprt_get(xprt);
+	args->bc_xprt->xpt_bc_xprt = xprt;
+	xprt->bc_xprt = args->bc_xprt;
+
+	/* Final put for backchannel xprt is in __svc_rdma_free */
+	xprt_get(xprt);
+	return xprt;
+}
+
+struct xprt_class xprt_rdma_bc = {
+	.list			= LIST_HEAD_INIT(xprt_rdma_bc.list),
+	.name			= "rdma backchannel",
+	.owner			= THIS_MODULE,
+	.ident			= XPRT_TRANSPORT_BC_RDMA,
+	.setup			= xprt_setup_rdma_bc,
+};
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
new file mode 100644
index 000000000..c6ea2903c
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -0,0 +1,881 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2016-2018 Oracle. All rights reserved.
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+/* Operation
+ *
+ * The main entry point is svc_rdma_recvfrom. This is called from
+ * svc_recv when the transport indicates there is incoming data to
+ * be read. "Data Ready" is signaled when an RDMA Receive completes,
+ * or when a set of RDMA Reads complete.
+ *
+ * An svc_rqst is passed in. This structure contains an array of
+ * free pages (rq_pages) that will contain the incoming RPC message.
+ *
+ * Short messages are moved directly into svc_rqst::rq_arg, and
+ * the RPC Call is ready to be processed by the Upper Layer.
+ * svc_rdma_recvfrom returns the length of the RPC Call message,
+ * completing the reception of the RPC Call.
+ *
+ * However, when an incoming message has Read chunks,
+ * svc_rdma_recvfrom must post RDMA Reads to pull the RPC Call's
+ * data payload from the client. svc_rdma_recvfrom sets up the
+ * RDMA Reads using pages in svc_rqst::rq_pages, which are
+ * transferred to an svc_rdma_recv_ctxt for the duration of the
+ * I/O. svc_rdma_recvfrom then returns zero, since the RPC message
+ * is still not yet ready.
+ *
+ * When the Read chunk payloads have become available on the
+ * server, "Data Ready" is raised again, and svc_recv calls
+ * svc_rdma_recvfrom again. This second call may use a different
+ * svc_rqst than the first one, thus any information that needs
+ * to be preserved across these two calls is kept in an
+ * svc_rdma_recv_ctxt.
+ *
+ * The second call to svc_rdma_recvfrom performs final assembly
+ * of the RPC Call message, using the RDMA Read sink pages kept in
+ * the svc_rdma_recv_ctxt. The xdr_buf is copied from the
+ * svc_rdma_recv_ctxt to the second svc_rqst. The second call returns
+ * the length of the completed RPC Call message.
+ *
+ * Page Management
+ *
+ * Pages under I/O must be transferred from the first svc_rqst to an
+ * svc_rdma_recv_ctxt before the first svc_rdma_recvfrom call returns.
+ *
+ * The first svc_rqst supplies pages for RDMA Reads. These are moved
+ * from rqstp::rq_pages into ctxt::pages. The consumed elements of
+ * the rq_pages array are set to NULL and refilled with the first
+ * svc_rdma_recvfrom call returns.
+ *
+ * During the second svc_rdma_recvfrom call, RDMA Read sink pages
+ * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst
+ * (see rdma_read_complete() below).
+ */
+
+#include <linux/spinlock.h>
+#include <asm/unaligned.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc);
+
+static inline struct svc_rdma_recv_ctxt *
+svc_rdma_next_recv_ctxt(struct list_head *list)
+{
+	return list_first_entry_or_null(list, struct svc_rdma_recv_ctxt,
+					rc_list);
+}
+
+static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma,
+				   struct rpc_rdma_cid *cid)
+{
+	cid->ci_queue_id = rdma->sc_rq_cq->res.id;
+	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
+static struct svc_rdma_recv_ctxt *
+svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_recv_ctxt *ctxt;
+	dma_addr_t addr;
+	void *buffer;
+
+	ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL);
+	if (!ctxt)
+		goto fail0;
+	buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
+	if (!buffer)
+		goto fail1;
+	addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
+				 rdma->sc_max_req_size, DMA_FROM_DEVICE);
+	if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
+		goto fail2;
+
+	svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid);
+
+	ctxt->rc_recv_wr.next = NULL;
+	ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe;
+	ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge;
+	ctxt->rc_recv_wr.num_sge = 1;
+	ctxt->rc_cqe.done = svc_rdma_wc_receive;
+	ctxt->rc_recv_sge.addr = addr;
+	ctxt->rc_recv_sge.length = rdma->sc_max_req_size;
+	ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey;
+	ctxt->rc_recv_buf = buffer;
+	ctxt->rc_temp = false;
+	return ctxt;
+
+fail2:
+	kfree(buffer);
+fail1:
+	kfree(ctxt);
+fail0:
+	return NULL;
+}
+
+static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
+				       struct svc_rdma_recv_ctxt *ctxt)
+{
+	ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr,
+			    ctxt->rc_recv_sge.length, DMA_FROM_DEVICE);
+	kfree(ctxt->rc_recv_buf);
+	kfree(ctxt);
+}
+
+/**
+ * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt
+ * @rdma: svcxprt_rdma being torn down
+ *
+ */
+void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_recv_ctxt *ctxt;
+	struct llist_node *node;
+
+	while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
+		ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
+		svc_rdma_recv_ctxt_destroy(rdma, ctxt);
+	}
+}
+
+static struct svc_rdma_recv_ctxt *
+svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_recv_ctxt *ctxt;
+	struct llist_node *node;
+
+	node = llist_del_first(&rdma->sc_recv_ctxts);
+	if (!node)
+		goto out_empty;
+	ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
+
+out:
+	ctxt->rc_page_count = 0;
+	ctxt->rc_read_payload_length = 0;
+	return ctxt;
+
+out_empty:
+	ctxt = svc_rdma_recv_ctxt_alloc(rdma);
+	if (!ctxt)
+		return NULL;
+	goto out;
+}
+
+/**
+ * svc_rdma_recv_ctxt_put - Return recv_ctxt to free list
+ * @rdma: controlling svcxprt_rdma
+ * @ctxt: object to return to the free list
+ *
+ */
+void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
+			    struct svc_rdma_recv_ctxt *ctxt)
+{
+	unsigned int i;
+
+	for (i = 0; i < ctxt->rc_page_count; i++)
+		put_page(ctxt->rc_pages[i]);
+
+	if (!ctxt->rc_temp)
+		llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
+	else
+		svc_rdma_recv_ctxt_destroy(rdma, ctxt);
+}
+
+/**
+ * svc_rdma_release_rqst - Release transport-specific per-rqst resources
+ * @rqstp: svc_rqst being released
+ *
+ * Ensure that the recv_ctxt is released whether or not a Reply
+ * was sent. For example, the client could close the connection,
+ * or svc_process could drop an RPC, before the Reply is sent.
+ */
+void svc_rdma_release_rqst(struct svc_rqst *rqstp)
+{
+	struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt;
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+	rqstp->rq_xprt_ctxt = NULL;
+	if (ctxt)
+		svc_rdma_recv_ctxt_put(rdma, ctxt);
+}
+
+static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma,
+				struct svc_rdma_recv_ctxt *ctxt)
+{
+	int ret;
+
+	trace_svcrdma_post_recv(ctxt);
+	ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL);
+	if (ret)
+		goto err_post;
+	return 0;
+
+err_post:
+	trace_svcrdma_rq_post_err(rdma, ret);
+	svc_rdma_recv_ctxt_put(rdma, ctxt);
+	return ret;
+}
+
+static int svc_rdma_post_recv(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_recv_ctxt *ctxt;
+
+	if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+		return 0;
+	ctxt = svc_rdma_recv_ctxt_get(rdma);
+	if (!ctxt)
+		return -ENOMEM;
+	return __svc_rdma_post_recv(rdma, ctxt);
+}
+
+/**
+ * svc_rdma_post_recvs - Post initial set of Recv WRs
+ * @rdma: fresh svcxprt_rdma
+ *
+ * Returns true if successful, otherwise false.
+ */
+bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_recv_ctxt *ctxt;
+	unsigned int i;
+	int ret;
+
+	for (i = 0; i < rdma->sc_max_requests; i++) {
+		ctxt = svc_rdma_recv_ctxt_get(rdma);
+		if (!ctxt)
+			return false;
+		ctxt->rc_temp = true;
+		ret = __svc_rdma_post_recv(rdma, ctxt);
+		if (ret)
+			return false;
+	}
+	return true;
+}
+
+/**
+ * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
+ * @cq: Completion Queue context
+ * @wc: Work Completion object
+ *
+ * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that
+ * the Receive completion handler could be running.
+ */
+static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct svcxprt_rdma *rdma = cq->cq_context;
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct svc_rdma_recv_ctxt *ctxt;
+
+	/* WARNING: Only wc->wr_cqe and wc->status are reliable */
+	ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe);
+
+	trace_svcrdma_wc_receive(wc, &ctxt->rc_cid);
+	if (wc->status != IB_WC_SUCCESS)
+		goto flushed;
+
+	if (svc_rdma_post_recv(rdma))
+		goto post_err;
+
+	/* All wc fields are now known to be valid */
+	ctxt->rc_byte_len = wc->byte_len;
+	ib_dma_sync_single_for_cpu(rdma->sc_pd->device,
+				   ctxt->rc_recv_sge.addr,
+				   wc->byte_len, DMA_FROM_DEVICE);
+
+	spin_lock(&rdma->sc_rq_dto_lock);
+	list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q);
+	/* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
+	set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
+	spin_unlock(&rdma->sc_rq_dto_lock);
+	if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags))
+		svc_xprt_enqueue(&rdma->sc_xprt);
+	return;
+
+flushed:
+post_err:
+	svc_rdma_recv_ctxt_put(rdma, ctxt);
+	set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+	svc_xprt_enqueue(&rdma->sc_xprt);
+}
+
+/**
+ * svc_rdma_flush_recv_queues - Drain pending Receive work
+ * @rdma: svcxprt_rdma being shut down
+ *
+ */
+void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_recv_ctxt *ctxt;
+
+	while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) {
+		list_del(&ctxt->rc_list);
+		svc_rdma_recv_ctxt_put(rdma, ctxt);
+	}
+	while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) {
+		list_del(&ctxt->rc_list);
+		svc_rdma_recv_ctxt_put(rdma, ctxt);
+	}
+}
+
+static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
+				   struct svc_rdma_recv_ctxt *ctxt)
+{
+	struct xdr_buf *arg = &rqstp->rq_arg;
+
+	arg->head[0].iov_base = ctxt->rc_recv_buf;
+	arg->head[0].iov_len = ctxt->rc_byte_len;
+	arg->tail[0].iov_base = NULL;
+	arg->tail[0].iov_len = 0;
+	arg->page_len = 0;
+	arg->page_base = 0;
+	arg->buflen = ctxt->rc_byte_len;
+	arg->len = ctxt->rc_byte_len;
+}
+
+/* This accommodates the largest possible Write chunk.
+ */
+#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT))
+
+/* This accommodates the largest possible Position-Zero
+ * Read chunk or Reply chunk.
+ */
+#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT))
+
+/* Sanity check the Read list.
+ *
+ * Implementation limits:
+ * - This implementation supports only one Read chunk.
+ *
+ * Sanity checks:
+ * - Read list does not overflow Receive buffer.
+ * - Segment size limited by largest NFS data payload.
+ *
+ * The segment count is limited to how many segments can
+ * fit in the transport header without overflowing the
+ * buffer. That's about 40 Read segments for a 1KB inline
+ * threshold.
+ *
+ * Return values:
+ *       %true: Read list is valid. @rctxt's xdr_stream is updated
+ *		to point to the first byte past the Read list.
+ *      %false: Read list is corrupt. @rctxt's xdr_stream is left
+ *		in an unknown state.
+ */
+static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
+{
+	u32 position, len;
+	bool first;
+	__be32 *p;
+
+	p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+	if (!p)
+		return false;
+
+	len = 0;
+	first = true;
+	while (xdr_item_is_present(p)) {
+		p = xdr_inline_decode(&rctxt->rc_stream,
+				      rpcrdma_readseg_maxsz * sizeof(*p));
+		if (!p)
+			return false;
+
+		if (first) {
+			position = be32_to_cpup(p);
+			first = false;
+		} else if (be32_to_cpup(p) != position) {
+			return false;
+		}
+		p += 2;
+		len += be32_to_cpup(p);
+
+		p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+		if (!p)
+			return false;
+	}
+	return len <= MAX_BYTES_SPECIAL_CHUNK;
+}
+
+/* The segment count is limited to how many segments can
+ * fit in the transport header without overflowing the
+ * buffer. That's about 60 Write segments for a 1KB inline
+ * threshold.
+ */
+static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen)
+{
+	u32 i, segcount, total;
+	__be32 *p;
+
+	p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+	if (!p)
+		return false;
+	segcount = be32_to_cpup(p);
+
+	total = 0;
+	for (i = 0; i < segcount; i++) {
+		u32 handle, length;
+		u64 offset;
+
+		p = xdr_inline_decode(&rctxt->rc_stream,
+				      rpcrdma_segment_maxsz * sizeof(*p));
+		if (!p)
+			return false;
+
+		xdr_decode_rdma_segment(p, &handle, &length, &offset);
+		trace_svcrdma_decode_wseg(handle, length, offset);
+
+		total += length;
+	}
+	return total <= maxlen;
+}
+
+/* Sanity check the Write list.
+ *
+ * Implementation limits:
+ * - This implementation currently supports only one Write chunk.
+ *
+ * Sanity checks:
+ * - Write list does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
+ *
+ * Return values:
+ *       %true: Write list is valid. @rctxt's xdr_stream is updated
+ *		to point to the first byte past the Write list.
+ *      %false: Write list is corrupt. @rctxt's xdr_stream is left
+ *		in an unknown state.
+ */
+static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt)
+{
+	u32 chcount = 0;
+	__be32 *p;
+
+	p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+	if (!p)
+		return false;
+	rctxt->rc_write_list = p;
+	while (xdr_item_is_present(p)) {
+		if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK))
+			return false;
+		++chcount;
+		p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+		if (!p)
+			return false;
+	}
+	if (!chcount)
+		rctxt->rc_write_list = NULL;
+	return chcount < 2;
+}
+
+/* Sanity check the Reply chunk.
+ *
+ * Sanity checks:
+ * - Reply chunk does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
+ *
+ * Return values:
+ *       %true: Reply chunk is valid. @rctxt's xdr_stream is updated
+ *		to point to the first byte past the Reply chunk.
+ *      %false: Reply chunk is corrupt. @rctxt's xdr_stream is left
+ *		in an unknown state.
+ */
+static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
+{
+	__be32 *p;
+
+	p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+	if (!p)
+		return false;
+	rctxt->rc_reply_chunk = NULL;
+	if (xdr_item_is_present(p)) {
+		if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK))
+			return false;
+		rctxt->rc_reply_chunk = p;
+	}
+	return true;
+}
+
+/* RPC-over-RDMA Version One private extension: Remote Invalidation.
+ * Responder's choice: requester signals it can handle Send With
+ * Invalidate, and responder chooses one R_key to invalidate.
+ *
+ * If there is exactly one distinct R_key in the received transport
+ * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero.
+ *
+ * Perform this operation while the received transport header is
+ * still in the CPU cache.
+ */
+static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma,
+				  struct svc_rdma_recv_ctxt *ctxt)
+{
+	__be32 inv_rkey, *p;
+	u32 i, segcount;
+
+	ctxt->rc_inv_rkey = 0;
+
+	if (!rdma->sc_snd_w_inv)
+		return;
+
+	inv_rkey = xdr_zero;
+	p = ctxt->rc_recv_buf;
+	p += rpcrdma_fixed_maxsz;
+
+	/* Read list */
+	while (xdr_item_is_present(p++)) {
+		p++;	/* position */
+		if (inv_rkey == xdr_zero)
+			inv_rkey = *p;
+		else if (inv_rkey != *p)
+			return;
+		p += 4;
+	}
+
+	/* Write list */
+	while (xdr_item_is_present(p++)) {
+		segcount = be32_to_cpup(p++);
+		for (i = 0; i < segcount; i++) {
+			if (inv_rkey == xdr_zero)
+				inv_rkey = *p;
+			else if (inv_rkey != *p)
+				return;
+			p += 4;
+		}
+	}
+
+	/* Reply chunk */
+	if (xdr_item_is_present(p++)) {
+		segcount = be32_to_cpup(p++);
+		for (i = 0; i < segcount; i++) {
+			if (inv_rkey == xdr_zero)
+				inv_rkey = *p;
+			else if (inv_rkey != *p)
+				return;
+			p += 4;
+		}
+	}
+
+	ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey);
+}
+
+/**
+ * svc_rdma_xdr_decode_req - Decode the transport header
+ * @rq_arg: xdr_buf containing ingress RPC/RDMA message
+ * @rctxt: state of decoding
+ *
+ * On entry, xdr->head[0].iov_base points to first byte of the
+ * RPC-over-RDMA transport header.
+ *
+ * On successful exit, head[0] points to first byte past the
+ * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ *
+ * The length of the RPC-over-RDMA header is returned.
+ *
+ * Assumptions:
+ * - The transport header is entirely contained in the head iovec.
+ */
+static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg,
+				   struct svc_rdma_recv_ctxt *rctxt)
+{
+	__be32 *p, *rdma_argp;
+	unsigned int hdr_len;
+
+	rdma_argp = rq_arg->head[0].iov_base;
+	xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL);
+
+	p = xdr_inline_decode(&rctxt->rc_stream,
+			      rpcrdma_fixed_maxsz * sizeof(*p));
+	if (unlikely(!p))
+		goto out_short;
+	p++;
+	if (*p != rpcrdma_version)
+		goto out_version;
+	p += 2;
+	switch (*p) {
+	case rdma_msg:
+		break;
+	case rdma_nomsg:
+		break;
+	case rdma_done:
+		goto out_drop;
+	case rdma_error:
+		goto out_drop;
+	default:
+		goto out_proc;
+	}
+
+	if (!xdr_check_read_list(rctxt))
+		goto out_inval;
+	if (!xdr_check_write_list(rctxt))
+		goto out_inval;
+	if (!xdr_check_reply_chunk(rctxt))
+		goto out_inval;
+
+	rq_arg->head[0].iov_base = rctxt->rc_stream.p;
+	hdr_len = xdr_stream_pos(&rctxt->rc_stream);
+	rq_arg->head[0].iov_len -= hdr_len;
+	rq_arg->len -= hdr_len;
+	trace_svcrdma_decode_rqst(rctxt, rdma_argp, hdr_len);
+	return hdr_len;
+
+out_short:
+	trace_svcrdma_decode_short_err(rctxt, rq_arg->len);
+	return -EINVAL;
+
+out_version:
+	trace_svcrdma_decode_badvers_err(rctxt, rdma_argp);
+	return -EPROTONOSUPPORT;
+
+out_drop:
+	trace_svcrdma_decode_drop_err(rctxt, rdma_argp);
+	return 0;
+
+out_proc:
+	trace_svcrdma_decode_badproc_err(rctxt, rdma_argp);
+	return -EINVAL;
+
+out_inval:
+	trace_svcrdma_decode_parse_err(rctxt, rdma_argp);
+	return -EINVAL;
+}
+
+static void rdma_read_complete(struct svc_rqst *rqstp,
+			       struct svc_rdma_recv_ctxt *head)
+{
+	int page_no;
+
+	/* Move Read chunk pages to rqstp so that they will be released
+	 * when svc_process is done with them.
+	 */
+	for (page_no = 0; page_no < head->rc_page_count; page_no++) {
+		put_page(rqstp->rq_pages[page_no]);
+		rqstp->rq_pages[page_no] = head->rc_pages[page_no];
+	}
+	head->rc_page_count = 0;
+
+	/* Point rq_arg.pages past header */
+	rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count];
+	rqstp->rq_arg.page_len = head->rc_arg.page_len;
+
+	/* rq_respages starts after the last arg page */
+	rqstp->rq_respages = &rqstp->rq_pages[page_no];
+	rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+	/* Rebuild rq_arg head and tail. */
+	rqstp->rq_arg.head[0] = head->rc_arg.head[0];
+	rqstp->rq_arg.tail[0] = head->rc_arg.tail[0];
+	rqstp->rq_arg.len = head->rc_arg.len;
+	rqstp->rq_arg.buflen = head->rc_arg.buflen;
+}
+
+static void svc_rdma_send_error(struct svcxprt_rdma *rdma,
+				struct svc_rdma_recv_ctxt *rctxt,
+				int status)
+{
+	struct svc_rdma_send_ctxt *sctxt;
+
+	sctxt = svc_rdma_send_ctxt_get(rdma);
+	if (!sctxt)
+		return;
+	svc_rdma_send_error_msg(rdma, sctxt, rctxt, status);
+}
+
+/* By convention, backchannel calls arrive via rdma_msg type
+ * messages, and never populate the chunk lists. This makes
+ * the RPC/RDMA header small and fixed in size, so it is
+ * straightforward to check the RPC header's direction field.
+ */
+static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
+					  __be32 *rdma_resp)
+{
+	__be32 *p;
+
+	if (!xprt->xpt_bc_xprt)
+		return false;
+
+	p = rdma_resp + 3;
+	if (*p++ != rdma_msg)
+		return false;
+
+	if (*p++ != xdr_zero)
+		return false;
+	if (*p++ != xdr_zero)
+		return false;
+	if (*p++ != xdr_zero)
+		return false;
+
+	/* XID sanity */
+	if (*p++ != *rdma_resp)
+		return false;
+	/* call direction */
+	if (*p == cpu_to_be32(RPC_CALL))
+		return false;
+
+	return true;
+}
+
+/**
+ * svc_rdma_recvfrom - Receive an RPC call
+ * @rqstp: request structure into which to receive an RPC Call
+ *
+ * Returns:
+ *	The positive number of bytes in the RPC Call message,
+ *	%0 if there were no Calls ready to return,
+ *	%-EINVAL if the Read chunk data is too large,
+ *	%-ENOMEM if rdma_rw context pool was exhausted,
+ *	%-ENOTCONN if posting failed (connection is lost),
+ *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ *
+ * Called in a loop when XPT_DATA is set. XPT_DATA is cleared only
+ * when there are no remaining ctxt's to process.
+ *
+ * The next ctxt is removed from the "receive" lists.
+ *
+ * - If the ctxt completes a Read, then finish assembling the Call
+ *   message and return the number of bytes in the message.
+ *
+ * - If the ctxt completes a Receive, then construct the Call
+ *   message from the contents of the Receive buffer.
+ *
+ *   - If there are no Read chunks in this message, then finish
+ *     assembling the Call message and return the number of bytes
+ *     in the message.
+ *
+ *   - If there are Read chunks in this message, post Read WRs to
+ *     pull that payload and return 0.
+ */
+int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+	struct svcxprt_rdma *rdma_xprt =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	struct svc_rdma_recv_ctxt *ctxt;
+	__be32 *p;
+	int ret;
+
+	rqstp->rq_xprt_ctxt = NULL;
+
+	spin_lock(&rdma_xprt->sc_rq_dto_lock);
+	ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q);
+	if (ctxt) {
+		list_del(&ctxt->rc_list);
+		spin_unlock(&rdma_xprt->sc_rq_dto_lock);
+		rdma_read_complete(rqstp, ctxt);
+		goto complete;
+	}
+	ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q);
+	if (!ctxt) {
+		/* No new incoming requests, terminate the loop */
+		clear_bit(XPT_DATA, &xprt->xpt_flags);
+		spin_unlock(&rdma_xprt->sc_rq_dto_lock);
+		return 0;
+	}
+	list_del(&ctxt->rc_list);
+	spin_unlock(&rdma_xprt->sc_rq_dto_lock);
+
+	atomic_inc(&rdma_stat_recv);
+
+	svc_rdma_build_arg_xdr(rqstp, ctxt);
+
+	/* Prevent svc_xprt_release from releasing pages in rq_pages
+	 * if we return 0 or an error.
+	 */
+	rqstp->rq_respages = rqstp->rq_pages;
+	rqstp->rq_next_page = rqstp->rq_respages;
+
+	p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
+	ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
+	if (ret < 0)
+		goto out_err;
+	if (ret == 0)
+		goto out_drop;
+	rqstp->rq_xprt_hlen = ret;
+
+	if (svc_rdma_is_backchannel_reply(xprt, p))
+		goto out_backchannel;
+
+	svc_rdma_get_inv_rkey(rdma_xprt, ctxt);
+
+	p += rpcrdma_fixed_maxsz;
+	if (*p != xdr_zero)
+		goto out_readchunk;
+
+complete:
+	rqstp->rq_xprt_ctxt = ctxt;
+	rqstp->rq_prot = IPPROTO_MAX;
+	svc_xprt_copy_addrs(rqstp, xprt);
+	return rqstp->rq_arg.len;
+
+out_readchunk:
+	ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p);
+	if (ret < 0)
+		goto out_postfail;
+	return 0;
+
+out_err:
+	svc_rdma_send_error(rdma_xprt, ctxt, ret);
+	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
+	return 0;
+
+out_postfail:
+	if (ret == -EINVAL)
+		svc_rdma_send_error(rdma_xprt, ctxt, ret);
+	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
+	return ret;
+
+out_backchannel:
+	svc_rdma_handle_bc_reply(rqstp, ctxt);
+out_drop:
+	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
+	return 0;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
new file mode 100644
index 000000000..80a0c0e87
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -0,0 +1,885 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016-2018 Oracle.  All rights reserved.
+ *
+ * Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
+ */
+
+#include <rdma/rw.h>
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc);
+static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc);
+
+/* Each R/W context contains state for one chain of RDMA Read or
+ * Write Work Requests.
+ *
+ * Each WR chain handles a single contiguous server-side buffer,
+ * because scatterlist entries after the first have to start on
+ * page alignment. xdr_buf iovecs cannot guarantee alignment.
+ *
+ * Each WR chain handles only one R_key. Each RPC-over-RDMA segment
+ * from a client may contain a unique R_key, so each WR chain moves
+ * up to one segment at a time.
+ *
+ * The scatterlist makes this data structure over 4KB in size. To
+ * make it less likely to fail, and to handle the allocation for
+ * smaller I/O requests without disabling bottom-halves, these
+ * contexts are created on demand, but cached and reused until the
+ * controlling svcxprt_rdma is destroyed.
+ */
+struct svc_rdma_rw_ctxt {
+	struct list_head	rw_list;
+	struct rdma_rw_ctx	rw_ctx;
+	unsigned int		rw_nents;
+	struct sg_table		rw_sg_table;
+	struct scatterlist	rw_first_sgl[];
+};
+
+static inline struct svc_rdma_rw_ctxt *
+svc_rdma_next_ctxt(struct list_head *list)
+{
+	return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
+					rw_list);
+}
+
+static struct svc_rdma_rw_ctxt *
+svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
+{
+	struct svc_rdma_rw_ctxt *ctxt;
+
+	spin_lock(&rdma->sc_rw_ctxt_lock);
+
+	ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
+	if (ctxt) {
+		list_del(&ctxt->rw_list);
+		spin_unlock(&rdma->sc_rw_ctxt_lock);
+	} else {
+		spin_unlock(&rdma->sc_rw_ctxt_lock);
+		ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE),
+			       GFP_KERNEL);
+		if (!ctxt)
+			goto out_noctx;
+		INIT_LIST_HEAD(&ctxt->rw_list);
+	}
+
+	ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
+	if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
+				   ctxt->rw_sg_table.sgl,
+				   SG_CHUNK_SIZE))
+		goto out_free;
+	return ctxt;
+
+out_free:
+	kfree(ctxt);
+out_noctx:
+	trace_svcrdma_no_rwctx_err(rdma, sges);
+	return NULL;
+}
+
+static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
+				 struct svc_rdma_rw_ctxt *ctxt)
+{
+	sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE);
+
+	spin_lock(&rdma->sc_rw_ctxt_lock);
+	list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
+	spin_unlock(&rdma->sc_rw_ctxt_lock);
+}
+
+/**
+ * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
+ * @rdma: transport about to be destroyed
+ *
+ */
+void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_rw_ctxt *ctxt;
+
+	while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
+		list_del(&ctxt->rw_list);
+		kfree(ctxt);
+	}
+}
+
+/**
+ * svc_rdma_rw_ctx_init - Prepare a R/W context for I/O
+ * @rdma: controlling transport instance
+ * @ctxt: R/W context to prepare
+ * @offset: RDMA offset
+ * @handle: RDMA tag/handle
+ * @direction: I/O direction
+ *
+ * Returns on success, the number of WQEs that will be needed
+ * on the workqueue, or a negative errno.
+ */
+static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma,
+				struct svc_rdma_rw_ctxt *ctxt,
+				u64 offset, u32 handle,
+				enum dma_data_direction direction)
+{
+	int ret;
+
+	ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num,
+			       ctxt->rw_sg_table.sgl, ctxt->rw_nents,
+			       0, offset, handle, direction);
+	if (unlikely(ret < 0)) {
+		svc_rdma_put_rw_ctxt(rdma, ctxt);
+		trace_svcrdma_dma_map_rw_err(rdma, ctxt->rw_nents, ret);
+	}
+	return ret;
+}
+
+/* A chunk context tracks all I/O for moving one Read or Write
+ * chunk. This is a set of rdma_rw's that handle data movement
+ * for all segments of one chunk.
+ *
+ * These are small, acquired with a single allocator call, and
+ * no more than one is needed per chunk. They are allocated on
+ * demand, and not cached.
+ */
+struct svc_rdma_chunk_ctxt {
+	struct rpc_rdma_cid	cc_cid;
+	struct ib_cqe		cc_cqe;
+	struct svcxprt_rdma	*cc_rdma;
+	struct list_head	cc_rwctxts;
+	int			cc_sqecount;
+};
+
+static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
+				 struct rpc_rdma_cid *cid)
+{
+	cid->ci_queue_id = rdma->sc_sq_cq->res.id;
+	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
+static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
+			     struct svc_rdma_chunk_ctxt *cc)
+{
+	svc_rdma_cc_cid_init(rdma, &cc->cc_cid);
+	cc->cc_rdma = rdma;
+
+	INIT_LIST_HEAD(&cc->cc_rwctxts);
+	cc->cc_sqecount = 0;
+}
+
+static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc,
+				enum dma_data_direction dir)
+{
+	struct svcxprt_rdma *rdma = cc->cc_rdma;
+	struct svc_rdma_rw_ctxt *ctxt;
+
+	while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
+		list_del(&ctxt->rw_list);
+
+		rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
+				    rdma->sc_port_num, ctxt->rw_sg_table.sgl,
+				    ctxt->rw_nents, dir);
+		svc_rdma_put_rw_ctxt(rdma, ctxt);
+	}
+}
+
+/* State for sending a Write or Reply chunk.
+ *  - Tracks progress of writing one chunk over all its segments
+ *  - Stores arguments for the SGL constructor functions
+ */
+struct svc_rdma_write_info {
+	/* write state of this chunk */
+	unsigned int		wi_seg_off;
+	unsigned int		wi_seg_no;
+	unsigned int		wi_nsegs;
+	__be32			*wi_segs;
+
+	/* SGL constructor arguments */
+	struct xdr_buf		*wi_xdr;
+	unsigned char		*wi_base;
+	unsigned int		wi_next_off;
+
+	struct svc_rdma_chunk_ctxt	wi_cc;
+};
+
+static struct svc_rdma_write_info *
+svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
+{
+	struct svc_rdma_write_info *info;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return info;
+
+	info->wi_seg_off = 0;
+	info->wi_seg_no = 0;
+	info->wi_nsegs = be32_to_cpup(++chunk);
+	info->wi_segs = ++chunk;
+	svc_rdma_cc_init(rdma, &info->wi_cc);
+	info->wi_cc.cc_cqe.done = svc_rdma_write_done;
+	return info;
+}
+
+static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
+{
+	svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE);
+	kfree(info);
+}
+
+/**
+ * svc_rdma_write_done - Write chunk completion
+ * @cq: controlling Completion Queue
+ * @wc: Work Completion
+ *
+ * Pages under I/O are freed by a subsequent Send completion.
+ */
+static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct svc_rdma_chunk_ctxt *cc =
+			container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
+	struct svcxprt_rdma *rdma = cc->cc_rdma;
+	struct svc_rdma_write_info *info =
+			container_of(cc, struct svc_rdma_write_info, wi_cc);
+
+	trace_svcrdma_wc_write(wc, &cc->cc_cid);
+
+	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+	wake_up(&rdma->sc_send_wait);
+
+	if (unlikely(wc->status != IB_WC_SUCCESS))
+		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+
+	svc_rdma_write_info_free(info);
+}
+
+/* State for pulling a Read chunk.
+ */
+struct svc_rdma_read_info {
+	struct svc_rdma_recv_ctxt	*ri_readctxt;
+	unsigned int			ri_position;
+	unsigned int			ri_pageno;
+	unsigned int			ri_pageoff;
+	unsigned int			ri_chunklen;
+
+	struct svc_rdma_chunk_ctxt	ri_cc;
+};
+
+static struct svc_rdma_read_info *
+svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_read_info *info;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return info;
+
+	svc_rdma_cc_init(rdma, &info->ri_cc);
+	info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done;
+	return info;
+}
+
+static void svc_rdma_read_info_free(struct svc_rdma_read_info *info)
+{
+	svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE);
+	kfree(info);
+}
+
+/**
+ * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx
+ * @cq: controlling Completion Queue
+ * @wc: Work Completion
+ *
+ */
+static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct svc_rdma_chunk_ctxt *cc =
+			container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
+	struct svcxprt_rdma *rdma = cc->cc_rdma;
+	struct svc_rdma_read_info *info =
+			container_of(cc, struct svc_rdma_read_info, ri_cc);
+
+	trace_svcrdma_wc_read(wc, &cc->cc_cid);
+
+	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+	wake_up(&rdma->sc_send_wait);
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+		svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
+	} else {
+		spin_lock(&rdma->sc_rq_dto_lock);
+		list_add_tail(&info->ri_readctxt->rc_list,
+			      &rdma->sc_read_complete_q);
+		/* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
+		set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
+		spin_unlock(&rdma->sc_rq_dto_lock);
+
+		svc_xprt_enqueue(&rdma->sc_xprt);
+	}
+
+	svc_rdma_read_info_free(info);
+}
+
+/* This function sleeps when the transport's Send Queue is congested.
+ *
+ * Assumptions:
+ * - If ib_post_send() succeeds, only one completion is expected,
+ *   even if one or more WRs are flushed. This is true when posting
+ *   an rdma_rw_ctx or when posting a single signaled WR.
+ */
+static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
+{
+	struct svcxprt_rdma *rdma = cc->cc_rdma;
+	struct svc_xprt *xprt = &rdma->sc_xprt;
+	struct ib_send_wr *first_wr;
+	const struct ib_send_wr *bad_wr;
+	struct list_head *tmp;
+	struct ib_cqe *cqe;
+	int ret;
+
+	if (cc->cc_sqecount > rdma->sc_sq_depth)
+		return -EINVAL;
+
+	first_wr = NULL;
+	cqe = &cc->cc_cqe;
+	list_for_each(tmp, &cc->cc_rwctxts) {
+		struct svc_rdma_rw_ctxt *ctxt;
+
+		ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
+		first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
+					   rdma->sc_port_num, cqe, first_wr);
+		cqe = NULL;
+	}
+
+	do {
+		if (atomic_sub_return(cc->cc_sqecount,
+				      &rdma->sc_sq_avail) > 0) {
+			trace_svcrdma_post_chunk(&cc->cc_cid, cc->cc_sqecount);
+			ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+			if (ret)
+				break;
+			return 0;
+		}
+
+		trace_svcrdma_sq_full(rdma);
+		atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+		wait_event(rdma->sc_send_wait,
+			   atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
+		trace_svcrdma_sq_retry(rdma);
+	} while (1);
+
+	trace_svcrdma_sq_post_err(rdma, ret);
+	set_bit(XPT_CLOSE, &xprt->xpt_flags);
+
+	/* If even one was posted, there will be a completion. */
+	if (bad_wr != first_wr)
+		return 0;
+
+	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
+	wake_up(&rdma->sc_send_wait);
+	return -ENOTCONN;
+}
+
+/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
+ */
+static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
+			       unsigned int len,
+			       struct svc_rdma_rw_ctxt *ctxt)
+{
+	struct scatterlist *sg = ctxt->rw_sg_table.sgl;
+
+	sg_set_buf(&sg[0], info->wi_base, len);
+	info->wi_base += len;
+
+	ctxt->rw_nents = 1;
+}
+
+/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
+ */
+static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
+				    unsigned int remaining,
+				    struct svc_rdma_rw_ctxt *ctxt)
+{
+	unsigned int sge_no, sge_bytes, page_off, page_no;
+	struct xdr_buf *xdr = info->wi_xdr;
+	struct scatterlist *sg;
+	struct page **page;
+
+	page_off = info->wi_next_off + xdr->page_base;
+	page_no = page_off >> PAGE_SHIFT;
+	page_off = offset_in_page(page_off);
+	page = xdr->pages + page_no;
+	info->wi_next_off += remaining;
+	sg = ctxt->rw_sg_table.sgl;
+	sge_no = 0;
+	do {
+		sge_bytes = min_t(unsigned int, remaining,
+				  PAGE_SIZE - page_off);
+		sg_set_page(sg, *page, sge_bytes, page_off);
+
+		remaining -= sge_bytes;
+		sg = sg_next(sg);
+		page_off = 0;
+		sge_no++;
+		page++;
+	} while (remaining);
+
+	ctxt->rw_nents = sge_no;
+}
+
+/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
+ * an RPC Reply.
+ */
+static int
+svc_rdma_build_writes(struct svc_rdma_write_info *info,
+		      void (*constructor)(struct svc_rdma_write_info *info,
+					  unsigned int len,
+					  struct svc_rdma_rw_ctxt *ctxt),
+		      unsigned int remaining)
+{
+	struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
+	struct svcxprt_rdma *rdma = cc->cc_rdma;
+	struct svc_rdma_rw_ctxt *ctxt;
+	__be32 *seg;
+	int ret;
+
+	seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
+	do {
+		unsigned int write_len;
+		u32 handle, length;
+		u64 offset;
+
+		if (info->wi_seg_no >= info->wi_nsegs)
+			goto out_overflow;
+
+		xdr_decode_rdma_segment(seg, &handle, &length, &offset);
+		offset += info->wi_seg_off;
+
+		write_len = min(remaining, length - info->wi_seg_off);
+		ctxt = svc_rdma_get_rw_ctxt(rdma,
+					    (write_len >> PAGE_SHIFT) + 2);
+		if (!ctxt)
+			return -ENOMEM;
+
+		constructor(info, write_len, ctxt);
+		ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle,
+					   DMA_TO_DEVICE);
+		if (ret < 0)
+			return -EIO;
+
+		trace_svcrdma_send_wseg(handle, write_len, offset);
+
+		list_add(&ctxt->rw_list, &cc->cc_rwctxts);
+		cc->cc_sqecount += ret;
+		if (write_len == length - info->wi_seg_off) {
+			seg += 4;
+			info->wi_seg_no++;
+			info->wi_seg_off = 0;
+		} else {
+			info->wi_seg_off += write_len;
+		}
+		remaining -= write_len;
+	} while (remaining);
+
+	return 0;
+
+out_overflow:
+	trace_svcrdma_small_wrch_err(rdma, remaining, info->wi_seg_no,
+				     info->wi_nsegs);
+	return -E2BIG;
+}
+
+/* Send one of an xdr_buf's kvecs by itself. To send a Reply
+ * chunk, the whole RPC Reply is written back to the client.
+ * This function writes either the head or tail of the xdr_buf
+ * containing the Reply.
+ */
+static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
+				  struct kvec *vec)
+{
+	info->wi_base = vec->iov_base;
+	return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+				     vec->iov_len);
+}
+
+/* Send an xdr_buf's page list by itself. A Write chunk is just
+ * the page list. A Reply chunk is @xdr's head, page list, and
+ * tail. This function is shared between the two types of chunk.
+ */
+static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
+				      struct xdr_buf *xdr,
+				      unsigned int offset,
+				      unsigned long length)
+{
+	info->wi_xdr = xdr;
+	info->wi_next_off = offset - xdr->head[0].iov_len;
+	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+				     length);
+}
+
+/**
+ * svc_rdma_send_write_chunk - Write all segments in a Write chunk
+ * @rdma: controlling RDMA transport
+ * @wr_ch: Write chunk provided by client
+ * @xdr: xdr_buf containing the data payload
+ * @offset: payload's byte offset in @xdr
+ * @length: size of payload, in bytes
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ *	%-E2BIG if the payload was larger than the Write chunk,
+ *	%-EINVAL if client provided too many segments,
+ *	%-ENOMEM if rdma_rw context pool was exhausted,
+ *	%-ENOTCONN if posting failed (connection is lost),
+ *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
+			      struct xdr_buf *xdr,
+			      unsigned int offset, unsigned long length)
+{
+	struct svc_rdma_write_info *info;
+	int ret;
+
+	if (!length)
+		return 0;
+
+	info = svc_rdma_write_info_alloc(rdma, wr_ch);
+	if (!info)
+		return -ENOMEM;
+
+	ret = svc_rdma_send_xdr_pagelist(info, xdr, offset, length);
+	if (ret < 0)
+		goto out_err;
+
+	ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+	if (ret < 0)
+		goto out_err;
+
+	trace_svcrdma_send_write_chunk(xdr->page_len);
+	return length;
+
+out_err:
+	svc_rdma_write_info_free(info);
+	return ret;
+}
+
+/**
+ * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
+ * @rdma: controlling RDMA transport
+ * @rctxt: Write and Reply chunks from client
+ * @xdr: xdr_buf containing an RPC Reply
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ *	%-E2BIG if the payload was larger than the Reply chunk,
+ *	%-EINVAL if client provided too many segments,
+ *	%-ENOMEM if rdma_rw context pool was exhausted,
+ *	%-ENOTCONN if posting failed (connection is lost),
+ *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
+			      const struct svc_rdma_recv_ctxt *rctxt,
+			      struct xdr_buf *xdr)
+{
+	struct svc_rdma_write_info *info;
+	int consumed, ret;
+
+	info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk);
+	if (!info)
+		return -ENOMEM;
+
+	ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
+	if (ret < 0)
+		goto out_err;
+	consumed = xdr->head[0].iov_len;
+
+	/* Send the page list in the Reply chunk only if the
+	 * client did not provide Write chunks.
+	 */
+	if (!rctxt->rc_write_list && xdr->page_len) {
+		ret = svc_rdma_send_xdr_pagelist(info, xdr,
+						 xdr->head[0].iov_len,
+						 xdr->page_len);
+		if (ret < 0)
+			goto out_err;
+		consumed += xdr->page_len;
+	}
+
+	if (xdr->tail[0].iov_len) {
+		ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
+		if (ret < 0)
+			goto out_err;
+		consumed += xdr->tail[0].iov_len;
+	}
+
+	ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+	if (ret < 0)
+		goto out_err;
+
+	trace_svcrdma_send_reply_chunk(consumed);
+	return consumed;
+
+out_err:
+	svc_rdma_write_info_free(info);
+	return ret;
+}
+
+static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
+				       struct svc_rqst *rqstp,
+				       u32 rkey, u32 len, u64 offset)
+{
+	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+	struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
+	struct svc_rdma_rw_ctxt *ctxt;
+	unsigned int sge_no, seg_len;
+	struct scatterlist *sg;
+	int ret;
+
+	sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT;
+	ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no);
+	if (!ctxt)
+		return -ENOMEM;
+	ctxt->rw_nents = sge_no;
+
+	sg = ctxt->rw_sg_table.sgl;
+	for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) {
+		seg_len = min_t(unsigned int, len,
+				PAGE_SIZE - info->ri_pageoff);
+
+		head->rc_arg.pages[info->ri_pageno] =
+			rqstp->rq_pages[info->ri_pageno];
+		if (!info->ri_pageoff)
+			head->rc_page_count++;
+
+		sg_set_page(sg, rqstp->rq_pages[info->ri_pageno],
+			    seg_len, info->ri_pageoff);
+		sg = sg_next(sg);
+
+		info->ri_pageoff += seg_len;
+		if (info->ri_pageoff == PAGE_SIZE) {
+			info->ri_pageno++;
+			info->ri_pageoff = 0;
+		}
+		len -= seg_len;
+
+		/* Safety check */
+		if (len &&
+		    &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end)
+			goto out_overrun;
+	}
+
+	ret = svc_rdma_rw_ctx_init(cc->cc_rdma, ctxt, offset, rkey,
+				   DMA_FROM_DEVICE);
+	if (ret < 0)
+		return -EIO;
+
+	list_add(&ctxt->rw_list, &cc->cc_rwctxts);
+	cc->cc_sqecount += ret;
+	return 0;
+
+out_overrun:
+	trace_svcrdma_page_overrun_err(cc->cc_rdma, rqstp, info->ri_pageno);
+	return -EINVAL;
+}
+
+/* Walk the segments in the Read chunk starting at @p and construct
+ * RDMA Read operations to pull the chunk to the server.
+ */
+static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
+				     struct svc_rdma_read_info *info,
+				     __be32 *p)
+{
+	int ret;
+
+	ret = -EINVAL;
+	info->ri_chunklen = 0;
+	while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) {
+		u32 handle, length;
+		u64 offset;
+
+		p = xdr_decode_rdma_segment(p, &handle, &length, &offset);
+		ret = svc_rdma_build_read_segment(info, rqstp, handle, length,
+						  offset);
+		if (ret < 0)
+			break;
+
+		trace_svcrdma_send_rseg(handle, length, offset);
+		info->ri_chunklen += length;
+	}
+
+	return ret;
+}
+
+/* Construct RDMA Reads to pull over a normal Read chunk. The chunk
+ * data lands in the page list of head->rc_arg.pages.
+ *
+ * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
+ * Therefore, XDR round-up of the Read chunk and trailing
+ * inline content must both be added at the end of the pagelist.
+ */
+static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
+					    struct svc_rdma_read_info *info,
+					    __be32 *p)
+{
+	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+	int ret;
+
+	ret = svc_rdma_build_read_chunk(rqstp, info, p);
+	if (ret < 0)
+		goto out;
+
+	trace_svcrdma_send_read_chunk(info->ri_chunklen, info->ri_position);
+
+	head->rc_hdr_count = 0;
+
+	/* Split the Receive buffer between the head and tail
+	 * buffers at Read chunk's position. XDR roundup of the
+	 * chunk is not included in either the pagelist or in
+	 * the tail.
+	 */
+	head->rc_arg.tail[0].iov_base =
+		head->rc_arg.head[0].iov_base + info->ri_position;
+	head->rc_arg.tail[0].iov_len =
+		head->rc_arg.head[0].iov_len - info->ri_position;
+	head->rc_arg.head[0].iov_len = info->ri_position;
+
+	/* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2).
+	 *
+	 * If the client already rounded up the chunk length, the
+	 * length does not change. Otherwise, the length of the page
+	 * list is increased to include XDR round-up.
+	 *
+	 * Currently these chunks always start at page offset 0,
+	 * thus the rounded-up length never crosses a page boundary.
+	 */
+	info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2;
+
+	head->rc_arg.page_len = info->ri_chunklen;
+	head->rc_arg.len += info->ri_chunklen;
+	head->rc_arg.buflen += info->ri_chunklen;
+
+out:
+	return ret;
+}
+
+/* Construct RDMA Reads to pull over a Position Zero Read chunk.
+ * The start of the data lands in the first page just after
+ * the Transport header, and the rest lands in the page list of
+ * head->rc_arg.pages.
+ *
+ * Assumptions:
+ *	- A PZRC has an XDR-aligned length (no implicit round-up).
+ *	- There can be no trailing inline content (IOW, we assume
+ *	  a PZRC is never sent in an RDMA_MSG message, though it's
+ *	  allowed by spec).
+ */
+static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp,
+					struct svc_rdma_read_info *info,
+					__be32 *p)
+{
+	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
+	int ret;
+
+	ret = svc_rdma_build_read_chunk(rqstp, info, p);
+	if (ret < 0)
+		goto out;
+
+	trace_svcrdma_send_pzr(info->ri_chunklen);
+
+	head->rc_arg.len += info->ri_chunklen;
+	head->rc_arg.buflen += info->ri_chunklen;
+
+	head->rc_hdr_count = 1;
+	head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]);
+	head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE,
+					     info->ri_chunklen);
+
+	head->rc_arg.page_len = info->ri_chunklen -
+				head->rc_arg.head[0].iov_len;
+
+out:
+	return ret;
+}
+
+/* Pages under I/O have been copied to head->rc_pages. Ensure they
+ * are not released by svc_xprt_release() until the I/O is complete.
+ *
+ * This has to be done after all Read WRs are constructed to properly
+ * handle a page that is part of I/O on behalf of two different RDMA
+ * segments.
+ *
+ * Do this only if I/O has been posted. Otherwise, we do indeed want
+ * svc_xprt_release() to clean things up properly.
+ */
+static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
+				   const unsigned int start,
+				   const unsigned int num_pages)
+{
+	unsigned int i;
+
+	for (i = start; i < num_pages + start; i++)
+		rqstp->rq_pages[i] = NULL;
+}
+
+/**
+ * svc_rdma_recv_read_chunk - Pull a Read chunk from the client
+ * @rdma: controlling RDMA transport
+ * @rqstp: set of pages to use as Read sink buffers
+ * @head: pages under I/O collect here
+ * @p: pointer to start of Read chunk
+ *
+ * Returns:
+ *	%0 if all needed RDMA Reads were posted successfully,
+ *	%-EINVAL if client provided too many segments,
+ *	%-ENOMEM if rdma_rw context pool was exhausted,
+ *	%-ENOTCONN if posting failed (connection is lost),
+ *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ *
+ * Assumptions:
+ * - All Read segments in @p have the same Position value.
+ */
+int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp,
+			     struct svc_rdma_recv_ctxt *head, __be32 *p)
+{
+	struct svc_rdma_read_info *info;
+	int ret;
+
+	/* The request (with page list) is constructed in
+	 * head->rc_arg. Pages involved with RDMA Read I/O are
+	 * transferred there.
+	 */
+	head->rc_arg.head[0] = rqstp->rq_arg.head[0];
+	head->rc_arg.tail[0] = rqstp->rq_arg.tail[0];
+	head->rc_arg.pages = head->rc_pages;
+	head->rc_arg.page_base = 0;
+	head->rc_arg.page_len = 0;
+	head->rc_arg.len = rqstp->rq_arg.len;
+	head->rc_arg.buflen = rqstp->rq_arg.buflen;
+
+	info = svc_rdma_read_info_alloc(rdma);
+	if (!info)
+		return -ENOMEM;
+	info->ri_readctxt = head;
+	info->ri_pageno = 0;
+	info->ri_pageoff = 0;
+
+	info->ri_position = be32_to_cpup(p + 1);
+	if (info->ri_position)
+		ret = svc_rdma_build_normal_read_chunk(rqstp, info, p);
+	else
+		ret = svc_rdma_build_pz_read_chunk(rqstp, info, p);
+	if (ret < 0)
+		goto out_err;
+
+	ret = svc_rdma_post_chunk_ctxt(&info->ri_cc);
+	if (ret < 0)
+		goto out_err;
+	svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
+	return 0;
+
+out_err:
+	svc_rdma_read_info_free(info);
+	return ret;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
new file mode 100644
index 000000000..c3d588b14
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -0,0 +1,1006 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2016-2018 Oracle. All rights reserved.
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+/* Operation
+ *
+ * The main entry point is svc_rdma_sendto. This is called by the
+ * RPC server when an RPC Reply is ready to be transmitted to a client.
+ *
+ * The passed-in svc_rqst contains a struct xdr_buf which holds an
+ * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
+ * transport header, post all Write WRs needed for this Reply, then post
+ * a Send WR conveying the transport header and the RPC message itself to
+ * the client.
+ *
+ * svc_rdma_sendto must fully transmit the Reply before returning, as
+ * the svc_rqst will be recycled as soon as sendto returns. Remaining
+ * resources referred to by the svc_rqst are also recycled at that time.
+ * Therefore any resources that must remain longer must be detached
+ * from the svc_rqst and released later.
+ *
+ * Page Management
+ *
+ * The I/O that performs Reply transmission is asynchronous, and may
+ * complete well after sendto returns. Thus pages under I/O must be
+ * removed from the svc_rqst before sendto returns.
+ *
+ * The logic here depends on Send Queue and completion ordering. Since
+ * the Send WR is always posted last, it will always complete last. Thus
+ * when it completes, it is guaranteed that all previous Write WRs have
+ * also completed.
+ *
+ * Write WRs are constructed and posted. Each Write segment gets its own
+ * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
+ * DMA-unmap the pages under I/O for that Write segment. The Write
+ * completion handler does not release any pages.
+ *
+ * When the Send WR is constructed, it also gets its own svc_rdma_send_ctxt.
+ * The ownership of all of the Reply's pages are transferred into that
+ * ctxt, the Send WR is posted, and sendto returns.
+ *
+ * The svc_rdma_send_ctxt is presented when the Send WR completes. The
+ * Send completion handler finally releases the Reply's pages.
+ *
+ * This mechanism also assumes that completions on the transport's Send
+ * Completion Queue do not run in parallel. Otherwise a Write completion
+ * and Send completion running at the same time could release pages that
+ * are still DMA-mapped.
+ *
+ * Error Handling
+ *
+ * - If the Send WR is posted successfully, it will either complete
+ *   successfully, or get flushed. Either way, the Send completion
+ *   handler releases the Reply's pages.
+ * - If the Send WR cannot be not posted, the forward path releases
+ *   the Reply's pages.
+ *
+ * This handles the case, without the use of page reference counting,
+ * where two different Write segments send portions of the same page.
+ */
+
+#include <linux/spinlock.h>
+#include <asm/unaligned.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc);
+
+static inline struct svc_rdma_send_ctxt *
+svc_rdma_next_send_ctxt(struct list_head *list)
+{
+	return list_first_entry_or_null(list, struct svc_rdma_send_ctxt,
+					sc_list);
+}
+
+static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma,
+				   struct rpc_rdma_cid *cid)
+{
+	cid->ci_queue_id = rdma->sc_sq_cq->res.id;
+	cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids);
+}
+
+static struct svc_rdma_send_ctxt *
+svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_send_ctxt *ctxt;
+	dma_addr_t addr;
+	void *buffer;
+	size_t size;
+	int i;
+
+	size = sizeof(*ctxt);
+	size += rdma->sc_max_send_sges * sizeof(struct ib_sge);
+	ctxt = kmalloc(size, GFP_KERNEL);
+	if (!ctxt)
+		goto fail0;
+	buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL);
+	if (!buffer)
+		goto fail1;
+	addr = ib_dma_map_single(rdma->sc_pd->device, buffer,
+				 rdma->sc_max_req_size, DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(rdma->sc_pd->device, addr))
+		goto fail2;
+
+	svc_rdma_send_cid_init(rdma, &ctxt->sc_cid);
+
+	ctxt->sc_send_wr.next = NULL;
+	ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe;
+	ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
+	ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
+	ctxt->sc_cqe.done = svc_rdma_wc_send;
+	ctxt->sc_xprt_buf = buffer;
+	xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
+		     rdma->sc_max_req_size);
+	ctxt->sc_sges[0].addr = addr;
+
+	for (i = 0; i < rdma->sc_max_send_sges; i++)
+		ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey;
+	return ctxt;
+
+fail2:
+	kfree(buffer);
+fail1:
+	kfree(ctxt);
+fail0:
+	return NULL;
+}
+
+/**
+ * svc_rdma_send_ctxts_destroy - Release all send_ctxt's for an xprt
+ * @rdma: svcxprt_rdma being torn down
+ *
+ */
+void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_send_ctxt *ctxt;
+
+	while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) {
+		list_del(&ctxt->sc_list);
+		ib_dma_unmap_single(rdma->sc_pd->device,
+				    ctxt->sc_sges[0].addr,
+				    rdma->sc_max_req_size,
+				    DMA_TO_DEVICE);
+		kfree(ctxt->sc_xprt_buf);
+		kfree(ctxt);
+	}
+}
+
+/**
+ * svc_rdma_send_ctxt_get - Get a free send_ctxt
+ * @rdma: controlling svcxprt_rdma
+ *
+ * Returns a ready-to-use send_ctxt, or NULL if none are
+ * available and a fresh one cannot be allocated.
+ */
+struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
+{
+	struct svc_rdma_send_ctxt *ctxt;
+
+	spin_lock(&rdma->sc_send_lock);
+	ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts);
+	if (!ctxt)
+		goto out_empty;
+	list_del(&ctxt->sc_list);
+	spin_unlock(&rdma->sc_send_lock);
+
+out:
+	rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0);
+	xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf,
+			ctxt->sc_xprt_buf, NULL);
+
+	ctxt->sc_send_wr.num_sge = 0;
+	ctxt->sc_cur_sge_no = 0;
+	ctxt->sc_page_count = 0;
+	return ctxt;
+
+out_empty:
+	spin_unlock(&rdma->sc_send_lock);
+	ctxt = svc_rdma_send_ctxt_alloc(rdma);
+	if (!ctxt)
+		return NULL;
+	goto out;
+}
+
+/**
+ * svc_rdma_send_ctxt_put - Return send_ctxt to free list
+ * @rdma: controlling svcxprt_rdma
+ * @ctxt: object to return to the free list
+ *
+ * Pages left in sc_pages are DMA unmapped and released.
+ */
+void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma,
+			    struct svc_rdma_send_ctxt *ctxt)
+{
+	struct ib_device *device = rdma->sc_cm_id->device;
+	unsigned int i;
+
+	/* The first SGE contains the transport header, which
+	 * remains mapped until @ctxt is destroyed.
+	 */
+	for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) {
+		ib_dma_unmap_page(device,
+				  ctxt->sc_sges[i].addr,
+				  ctxt->sc_sges[i].length,
+				  DMA_TO_DEVICE);
+		trace_svcrdma_dma_unmap_page(rdma,
+					     ctxt->sc_sges[i].addr,
+					     ctxt->sc_sges[i].length);
+	}
+
+	for (i = 0; i < ctxt->sc_page_count; ++i)
+		put_page(ctxt->sc_pages[i]);
+
+	spin_lock(&rdma->sc_send_lock);
+	list_add(&ctxt->sc_list, &rdma->sc_send_ctxts);
+	spin_unlock(&rdma->sc_send_lock);
+}
+
+/**
+ * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC
+ * @cq: Completion Queue context
+ * @wc: Work Completion object
+ *
+ * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that
+ * the Send completion handler could be running.
+ */
+static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct svcxprt_rdma *rdma = cq->cq_context;
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct svc_rdma_send_ctxt *ctxt =
+		container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe);
+
+	trace_svcrdma_wc_send(wc, &ctxt->sc_cid);
+
+	atomic_inc(&rdma->sc_sq_avail);
+	wake_up(&rdma->sc_send_wait);
+
+	svc_rdma_send_ctxt_put(rdma, ctxt);
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+		svc_xprt_enqueue(&rdma->sc_xprt);
+	}
+}
+
+/**
+ * svc_rdma_send - Post a single Send WR
+ * @rdma: transport on which to post the WR
+ * @ctxt: send ctxt with a Send WR ready to post
+ *
+ * Returns zero the Send WR was posted successfully. Otherwise, a
+ * negative errno is returned.
+ */
+int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt)
+{
+	struct ib_send_wr *wr = &ctxt->sc_send_wr;
+	int ret;
+
+	might_sleep();
+
+	/* Sync the transport header buffer */
+	ib_dma_sync_single_for_device(rdma->sc_pd->device,
+				      wr->sg_list[0].addr,
+				      wr->sg_list[0].length,
+				      DMA_TO_DEVICE);
+
+	/* If the SQ is full, wait until an SQ entry is available */
+	while (1) {
+		if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) {
+			atomic_inc(&rdma_stat_sq_starve);
+			trace_svcrdma_sq_full(rdma);
+			atomic_inc(&rdma->sc_sq_avail);
+			wait_event(rdma->sc_send_wait,
+				   atomic_read(&rdma->sc_sq_avail) > 1);
+			if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+				return -ENOTCONN;
+			trace_svcrdma_sq_retry(rdma);
+			continue;
+		}
+
+		trace_svcrdma_post_send(ctxt);
+		ret = ib_post_send(rdma->sc_qp, wr, NULL);
+		if (ret)
+			break;
+		return 0;
+	}
+
+	trace_svcrdma_sq_post_err(rdma, ret);
+	set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+	wake_up(&rdma->sc_send_wait);
+	return ret;
+}
+
+/**
+ * svc_rdma_encode_read_list - Encode RPC Reply's Read chunk list
+ * @sctxt: Send context for the RPC Reply
+ *
+ * Return values:
+ *   On success, returns length in bytes of the Reply XDR buffer
+ *   that was consumed by the Reply Read list
+ *   %-EMSGSIZE on XDR buffer overflow
+ */
+static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt)
+{
+	/* RPC-over-RDMA version 1 replies never have a Read list. */
+	return xdr_stream_encode_item_absent(&sctxt->sc_stream);
+}
+
+/**
+ * svc_rdma_encode_write_segment - Encode one Write segment
+ * @src: matching Write chunk in the RPC Call header
+ * @sctxt: Send context for the RPC Reply
+ * @remaining: remaining bytes of the payload left in the Write chunk
+ *
+ * Return values:
+ *   On success, returns length in bytes of the Reply XDR buffer
+ *   that was consumed by the Write segment
+ *   %-EMSGSIZE on XDR buffer overflow
+ */
+static ssize_t svc_rdma_encode_write_segment(__be32 *src,
+					     struct svc_rdma_send_ctxt *sctxt,
+					     unsigned int *remaining)
+{
+	__be32 *p;
+	const size_t len = rpcrdma_segment_maxsz * sizeof(*p);
+	u32 handle, length;
+	u64 offset;
+
+	p = xdr_reserve_space(&sctxt->sc_stream, len);
+	if (!p)
+		return -EMSGSIZE;
+
+	xdr_decode_rdma_segment(src, &handle, &length, &offset);
+
+	if (*remaining < length) {
+		/* segment only partly filled */
+		length = *remaining;
+		*remaining = 0;
+	} else {
+		/* entire segment was consumed */
+		*remaining -= length;
+	}
+	xdr_encode_rdma_segment(p, handle, length, offset);
+
+	trace_svcrdma_encode_wseg(handle, length, offset);
+	return len;
+}
+
+/**
+ * svc_rdma_encode_write_chunk - Encode one Write chunk
+ * @src: matching Write chunk in the RPC Call header
+ * @sctxt: Send context for the RPC Reply
+ * @remaining: size in bytes of the payload in the Write chunk
+ *
+ * Copy a Write chunk from the Call transport header to the
+ * Reply transport header. Update each segment's length field
+ * to reflect the number of bytes written in that segment.
+ *
+ * Return values:
+ *   On success, returns length in bytes of the Reply XDR buffer
+ *   that was consumed by the Write chunk
+ *   %-EMSGSIZE on XDR buffer overflow
+ */
+static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
+					   struct svc_rdma_send_ctxt *sctxt,
+					   unsigned int remaining)
+{
+	unsigned int i, nsegs;
+	ssize_t len, ret;
+
+	len = 0;
+	trace_svcrdma_encode_write_chunk(remaining);
+
+	src++;
+	ret = xdr_stream_encode_item_present(&sctxt->sc_stream);
+	if (ret < 0)
+		return -EMSGSIZE;
+	len += ret;
+
+	nsegs = be32_to_cpup(src++);
+	ret = xdr_stream_encode_u32(&sctxt->sc_stream, nsegs);
+	if (ret < 0)
+		return -EMSGSIZE;
+	len += ret;
+
+	for (i = nsegs; i; i--) {
+		ret = svc_rdma_encode_write_segment(src, sctxt, &remaining);
+		if (ret < 0)
+			return -EMSGSIZE;
+		src += rpcrdma_segment_maxsz;
+		len += ret;
+	}
+
+	return len;
+}
+
+/**
+ * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list
+ * @rctxt: Reply context with information about the RPC Call
+ * @sctxt: Send context for the RPC Reply
+ * @length: size in bytes of the payload in the first Write chunk
+ *
+ * The client provides a Write chunk list in the Call message. Fill
+ * in the segments in the first Write chunk in the Reply's transport
+ * header with the number of bytes consumed in each segment.
+ * Remaining chunks are returned unused.
+ *
+ * Assumptions:
+ *  - Client has provided only one Write chunk
+ *
+ * Return values:
+ *   On success, returns length in bytes of the Reply XDR buffer
+ *   that was consumed by the Reply's Write list
+ *   %-EMSGSIZE on XDR buffer overflow
+ */
+static ssize_t
+svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt,
+			   struct svc_rdma_send_ctxt *sctxt,
+			   unsigned int length)
+{
+	ssize_t len, ret;
+
+	ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, length);
+	if (ret < 0)
+		return ret;
+	len = ret;
+
+	/* Terminate the Write list */
+	ret = xdr_stream_encode_item_absent(&sctxt->sc_stream);
+	if (ret < 0)
+		return ret;
+
+	return len + ret;
+}
+
+/**
+ * svc_rdma_encode_reply_chunk - Encode RPC Reply's Reply chunk
+ * @rctxt: Reply context with information about the RPC Call
+ * @sctxt: Send context for the RPC Reply
+ * @length: size in bytes of the payload in the Reply chunk
+ *
+ * Assumptions:
+ * - Reply can always fit in the client-provided Reply chunk
+ *
+ * Return values:
+ *   On success, returns length in bytes of the Reply XDR buffer
+ *   that was consumed by the Reply's Reply chunk
+ *   %-EMSGSIZE on XDR buffer overflow
+ */
+static ssize_t
+svc_rdma_encode_reply_chunk(const struct svc_rdma_recv_ctxt *rctxt,
+			    struct svc_rdma_send_ctxt *sctxt,
+			    unsigned int length)
+{
+	return svc_rdma_encode_write_chunk(rctxt->rc_reply_chunk, sctxt,
+					   length);
+}
+
+static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
+				 struct svc_rdma_send_ctxt *ctxt,
+				 struct page *page,
+				 unsigned long offset,
+				 unsigned int len)
+{
+	struct ib_device *dev = rdma->sc_cm_id->device;
+	dma_addr_t dma_addr;
+
+	dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE);
+	trace_svcrdma_dma_map_page(rdma, dma_addr, len);
+	if (ib_dma_mapping_error(dev, dma_addr))
+		goto out_maperr;
+
+	ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr;
+	ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len;
+	ctxt->sc_send_wr.num_sge++;
+	return 0;
+
+out_maperr:
+	return -EIO;
+}
+
+/* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
+ * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively.
+ */
+static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
+				struct svc_rdma_send_ctxt *ctxt,
+				unsigned char *base,
+				unsigned int len)
+{
+	return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base),
+				     offset_in_page(base), len);
+}
+
+/**
+ * svc_rdma_pull_up_needed - Determine whether to use pull-up
+ * @rdma: controlling transport
+ * @sctxt: send_ctxt for the Send WR
+ * @rctxt: Write and Reply chunks provided by client
+ * @xdr: xdr_buf containing RPC message to transmit
+ *
+ * Returns:
+ *	%true if pull-up must be used
+ *	%false otherwise
+ */
+static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
+				    struct svc_rdma_send_ctxt *sctxt,
+				    const struct svc_rdma_recv_ctxt *rctxt,
+				    struct xdr_buf *xdr)
+{
+	int elements;
+
+	/* For small messages, copying bytes is cheaper than DMA mapping.
+	 */
+	if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH)
+		return true;
+
+	/* Check whether the xdr_buf has more elements than can
+	 * fit in a single RDMA Send.
+	 */
+	/* xdr->head */
+	elements = 1;
+
+	/* xdr->pages */
+	if (!rctxt || !rctxt->rc_write_list) {
+		unsigned int remaining;
+		unsigned long pageoff;
+
+		pageoff = xdr->page_base & ~PAGE_MASK;
+		remaining = xdr->page_len;
+		while (remaining) {
+			++elements;
+			remaining -= min_t(u32, PAGE_SIZE - pageoff,
+					   remaining);
+			pageoff = 0;
+		}
+	}
+
+	/* xdr->tail */
+	if (xdr->tail[0].iov_len)
+		++elements;
+
+	/* assume 1 SGE is needed for the transport header */
+	return elements >= rdma->sc_max_send_sges;
+}
+
+/**
+ * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer
+ * @rdma: controlling transport
+ * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared
+ * @rctxt: Write and Reply chunks provided by client
+ * @xdr: prepared xdr_buf containing RPC message
+ *
+ * The device is not capable of sending the reply directly.
+ * Assemble the elements of @xdr into the transport header buffer.
+ *
+ * Returns zero on success, or a negative errno on failure.
+ */
+static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
+				      struct svc_rdma_send_ctxt *sctxt,
+				      const struct svc_rdma_recv_ctxt *rctxt,
+				      const struct xdr_buf *xdr)
+{
+	unsigned char *dst, *tailbase;
+	unsigned int taillen;
+
+	dst = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len;
+	memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
+	dst += xdr->head[0].iov_len;
+
+	tailbase = xdr->tail[0].iov_base;
+	taillen = xdr->tail[0].iov_len;
+	if (rctxt && rctxt->rc_write_list) {
+		u32 xdrpad;
+
+		xdrpad = xdr_pad_size(xdr->page_len);
+		if (taillen && xdrpad) {
+			tailbase += xdrpad;
+			taillen -= xdrpad;
+		}
+	} else {
+		unsigned int len, remaining;
+		unsigned long pageoff;
+		struct page **ppages;
+
+		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+		pageoff = xdr->page_base & ~PAGE_MASK;
+		remaining = xdr->page_len;
+		while (remaining) {
+			len = min_t(u32, PAGE_SIZE - pageoff, remaining);
+
+			memcpy(dst, page_address(*ppages) + pageoff, len);
+			remaining -= len;
+			dst += len;
+			pageoff = 0;
+			ppages++;
+		}
+	}
+
+	if (taillen)
+		memcpy(dst, tailbase, taillen);
+
+	sctxt->sc_sges[0].length += xdr->len;
+	trace_svcrdma_send_pullup(sctxt->sc_sges[0].length);
+	return 0;
+}
+
+/* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message
+ * @rdma: controlling transport
+ * @sctxt: send_ctxt for the Send WR
+ * @rctxt: Write and Reply chunks provided by client
+ * @xdr: prepared xdr_buf containing RPC message
+ *
+ * Load the xdr_buf into the ctxt's sge array, and DMA map each
+ * element as it is added. The Send WR's num_sge field is set.
+ *
+ * Returns zero on success, or a negative errno on failure.
+ */
+int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
+			   struct svc_rdma_send_ctxt *sctxt,
+			   const struct svc_rdma_recv_ctxt *rctxt,
+			   struct xdr_buf *xdr)
+{
+	unsigned int len, remaining;
+	unsigned long page_off;
+	struct page **ppages;
+	unsigned char *base;
+	u32 xdr_pad;
+	int ret;
+
+	/* Set up the (persistently-mapped) transport header SGE. */
+	sctxt->sc_send_wr.num_sge = 1;
+	sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
+
+	/* If there is a Reply chunk, nothing follows the transport
+	 * header, and we're done here.
+	 */
+	if (rctxt && rctxt->rc_reply_chunk)
+		return 0;
+
+	/* For pull-up, svc_rdma_send() will sync the transport header.
+	 * No additional DMA mapping is necessary.
+	 */
+	if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr))
+		return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr);
+
+	++sctxt->sc_cur_sge_no;
+	ret = svc_rdma_dma_map_buf(rdma, sctxt,
+				   xdr->head[0].iov_base,
+				   xdr->head[0].iov_len);
+	if (ret < 0)
+		return ret;
+
+	/* If a Write chunk is present, the xdr_buf's page list
+	 * is not included inline. However the Upper Layer may
+	 * have added XDR padding in the tail buffer, and that
+	 * should not be included inline.
+	 */
+	if (rctxt && rctxt->rc_write_list) {
+		base = xdr->tail[0].iov_base;
+		len = xdr->tail[0].iov_len;
+		xdr_pad = xdr_pad_size(xdr->page_len);
+
+		if (len && xdr_pad) {
+			base += xdr_pad;
+			len -= xdr_pad;
+		}
+
+		goto tail;
+	}
+
+	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+	page_off = xdr->page_base & ~PAGE_MASK;
+	remaining = xdr->page_len;
+	while (remaining) {
+		len = min_t(u32, PAGE_SIZE - page_off, remaining);
+
+		++sctxt->sc_cur_sge_no;
+		ret = svc_rdma_dma_map_page(rdma, sctxt, *ppages++,
+					    page_off, len);
+		if (ret < 0)
+			return ret;
+
+		remaining -= len;
+		page_off = 0;
+	}
+
+	base = xdr->tail[0].iov_base;
+	len = xdr->tail[0].iov_len;
+tail:
+	if (len) {
+		++sctxt->sc_cur_sge_no;
+		ret = svc_rdma_dma_map_buf(rdma, sctxt, base, len);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/* The svc_rqst and all resources it owns are released as soon as
+ * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
+ * so they are released by the Send completion handler.
+ */
+static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
+				   struct svc_rdma_send_ctxt *ctxt)
+{
+	int i, pages = rqstp->rq_next_page - rqstp->rq_respages;
+
+	ctxt->sc_page_count += pages;
+	for (i = 0; i < pages; i++) {
+		ctxt->sc_pages[i] = rqstp->rq_respages[i];
+		rqstp->rq_respages[i] = NULL;
+	}
+
+	/* Prevent svc_xprt_release from releasing pages in rq_pages */
+	rqstp->rq_next_page = rqstp->rq_respages;
+}
+
+/* Prepare the portion of the RPC Reply that will be transmitted
+ * via RDMA Send. The RPC-over-RDMA transport header is prepared
+ * in sc_sges[0], and the RPC xdr_buf is prepared in following sges.
+ *
+ * Depending on whether a Write list or Reply chunk is present,
+ * the server may send all, a portion of, or none of the xdr_buf.
+ * In the latter case, only the transport header (sc_sges[0]) is
+ * transmitted.
+ *
+ * RDMA Send is the last step of transmitting an RPC reply. Pages
+ * involved in the earlier RDMA Writes are here transferred out
+ * of the rqstp and into the sctxt's page array. These pages are
+ * DMA unmapped by each Write completion, but the subsequent Send
+ * completion finally releases these pages.
+ *
+ * Assumptions:
+ * - The Reply's transport header will never be larger than a page.
+ */
+static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
+				   struct svc_rdma_send_ctxt *sctxt,
+				   const struct svc_rdma_recv_ctxt *rctxt,
+				   struct svc_rqst *rqstp)
+{
+	int ret;
+
+	ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res);
+	if (ret < 0)
+		return ret;
+
+	svc_rdma_save_io_pages(rqstp, sctxt);
+
+	if (rctxt->rc_inv_rkey) {
+		sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV;
+		sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey;
+	} else {
+		sctxt->sc_send_wr.opcode = IB_WR_SEND;
+	}
+	return svc_rdma_send(rdma, sctxt);
+}
+
+/**
+ * svc_rdma_send_error_msg - Send an RPC/RDMA v1 error response
+ * @rdma: controlling transport context
+ * @sctxt: Send context for the response
+ * @rctxt: Receive context for incoming bad message
+ * @status: negative errno indicating error that occurred
+ *
+ * Given the client-provided Read, Write, and Reply chunks, the
+ * server was not able to parse the Call or form a complete Reply.
+ * Return an RDMA_ERROR message so the client can retire the RPC
+ * transaction.
+ *
+ * The caller does not have to release @sctxt. It is released by
+ * Send completion, or by this function on error.
+ */
+void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
+			     struct svc_rdma_send_ctxt *sctxt,
+			     struct svc_rdma_recv_ctxt *rctxt,
+			     int status)
+{
+	__be32 *rdma_argp = rctxt->rc_recv_buf;
+	__be32 *p;
+
+	rpcrdma_set_xdrlen(&sctxt->sc_hdrbuf, 0);
+	xdr_init_encode(&sctxt->sc_stream, &sctxt->sc_hdrbuf,
+			sctxt->sc_xprt_buf, NULL);
+
+	p = xdr_reserve_space(&sctxt->sc_stream,
+			      rpcrdma_fixed_maxsz * sizeof(*p));
+	if (!p)
+		goto put_ctxt;
+
+	*p++ = *rdma_argp;
+	*p++ = *(rdma_argp + 1);
+	*p++ = rdma->sc_fc_credits;
+	*p = rdma_error;
+
+	switch (status) {
+	case -EPROTONOSUPPORT:
+		p = xdr_reserve_space(&sctxt->sc_stream, 3 * sizeof(*p));
+		if (!p)
+			goto put_ctxt;
+
+		*p++ = err_vers;
+		*p++ = rpcrdma_version;
+		*p = rpcrdma_version;
+		trace_svcrdma_err_vers(*rdma_argp);
+		break;
+	default:
+		p = xdr_reserve_space(&sctxt->sc_stream, sizeof(*p));
+		if (!p)
+			goto put_ctxt;
+
+		*p = err_chunk;
+		trace_svcrdma_err_chunk(*rdma_argp);
+	}
+
+	/* Remote Invalidation is skipped for simplicity. */
+	sctxt->sc_send_wr.num_sge = 1;
+	sctxt->sc_send_wr.opcode = IB_WR_SEND;
+	sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
+	if (svc_rdma_send(rdma, sctxt))
+		goto put_ctxt;
+	return;
+
+put_ctxt:
+	svc_rdma_send_ctxt_put(rdma, sctxt);
+}
+
+/**
+ * svc_rdma_sendto - Transmit an RPC reply
+ * @rqstp: processed RPC request, reply XDR already in ::rq_res
+ *
+ * Any resources still associated with @rqstp are released upon return.
+ * If no reply message was possible, the connection is closed.
+ *
+ * Returns:
+ *	%0 if an RPC reply has been successfully posted,
+ *	%-ENOMEM if a resource shortage occurred (connection is lost),
+ *	%-ENOTCONN if posting failed (connection is lost).
+ */
+int svc_rdma_sendto(struct svc_rqst *rqstp)
+{
+	struct svc_xprt *xprt = rqstp->rq_xprt;
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
+	__be32 *rdma_argp = rctxt->rc_recv_buf;
+	__be32 *wr_lst = rctxt->rc_write_list;
+	__be32 *rp_ch = rctxt->rc_reply_chunk;
+	struct xdr_buf *xdr = &rqstp->rq_res;
+	struct svc_rdma_send_ctxt *sctxt;
+	__be32 *p;
+	int ret;
+
+	ret = -ENOTCONN;
+	if (svc_xprt_is_dead(xprt))
+		goto err0;
+
+	ret = -ENOMEM;
+	sctxt = svc_rdma_send_ctxt_get(rdma);
+	if (!sctxt)
+		goto err0;
+
+	p = xdr_reserve_space(&sctxt->sc_stream,
+			      rpcrdma_fixed_maxsz * sizeof(*p));
+	if (!p)
+		goto err0;
+	*p++ = *rdma_argp;
+	*p++ = *(rdma_argp + 1);
+	*p++ = rdma->sc_fc_credits;
+	*p   = rp_ch ? rdma_nomsg : rdma_msg;
+
+	if (svc_rdma_encode_read_list(sctxt) < 0)
+		goto err0;
+	if (wr_lst) {
+		/* XXX: Presume the client sent only one Write chunk */
+		unsigned long offset;
+		unsigned int length;
+
+		if (rctxt->rc_read_payload_length) {
+			offset = rctxt->rc_read_payload_offset;
+			length = rctxt->rc_read_payload_length;
+		} else {
+			offset = xdr->head[0].iov_len;
+			length = xdr->page_len;
+		}
+		ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, offset,
+						length);
+		if (ret < 0)
+			goto err2;
+		if (svc_rdma_encode_write_list(rctxt, sctxt, length) < 0)
+			goto err0;
+	} else {
+		if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
+			goto err0;
+	}
+	if (rp_ch) {
+		ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
+		if (ret < 0)
+			goto err2;
+		if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
+			goto err0;
+	} else {
+		if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
+			goto err0;
+	}
+
+	ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
+	if (ret < 0)
+		goto err1;
+	return 0;
+
+ err2:
+	if (ret != -E2BIG && ret != -EINVAL)
+		goto err1;
+
+	/* Send completion releases payload pages that were part
+	 * of previously posted RDMA Writes.
+	 */
+	svc_rdma_save_io_pages(rqstp, sctxt);
+	svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret);
+	return 0;
+
+ err1:
+	svc_rdma_send_ctxt_put(rdma, sctxt);
+ err0:
+	trace_svcrdma_send_err(rqstp, ret);
+	set_bit(XPT_CLOSE, &xprt->xpt_flags);
+	return -ENOTCONN;
+}
+
+/**
+ * svc_rdma_read_payload - special processing for a READ payload
+ * @rqstp: svc_rqst to operate on
+ * @offset: payload's byte offset in @xdr
+ * @length: size of payload, in bytes
+ *
+ * Returns zero on success.
+ *
+ * For the moment, just record the xdr_buf location of the READ
+ * payload. svc_rdma_sendto will use that location later when
+ * we actually send the payload.
+ */
+int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset,
+			  unsigned int length)
+{
+	struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
+
+	/* XXX: Just one READ payload slot for now, since our
+	 * transport implementation currently supports only one
+	 * Write chunk.
+	 */
+	rctxt->rc_read_payload_offset = offset;
+	rctxt->rc_read_payload_length = length;
+
+	return 0;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
new file mode 100644
index 000000000..5f7e3d125
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -0,0 +1,614 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2015-2018 Oracle. All rights reserved.
+ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/export.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rw.h>
+
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/debug.h>
+#include <linux/sunrpc/svc_xprt.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
+						 struct net *net);
+static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+					struct net *net,
+					struct sockaddr *sa, int salen,
+					int flags);
+static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
+static void svc_rdma_detach(struct svc_xprt *xprt);
+static void svc_rdma_free(struct svc_xprt *xprt);
+static int svc_rdma_has_wspace(struct svc_xprt *xprt);
+static void svc_rdma_secure_port(struct svc_rqst *);
+static void svc_rdma_kill_temp_xprt(struct svc_xprt *);
+
+static const struct svc_xprt_ops svc_rdma_ops = {
+	.xpo_create = svc_rdma_create,
+	.xpo_recvfrom = svc_rdma_recvfrom,
+	.xpo_sendto = svc_rdma_sendto,
+	.xpo_read_payload = svc_rdma_read_payload,
+	.xpo_release_rqst = svc_rdma_release_rqst,
+	.xpo_detach = svc_rdma_detach,
+	.xpo_free = svc_rdma_free,
+	.xpo_has_wspace = svc_rdma_has_wspace,
+	.xpo_accept = svc_rdma_accept,
+	.xpo_secure_port = svc_rdma_secure_port,
+	.xpo_kill_temp_xprt = svc_rdma_kill_temp_xprt,
+};
+
+struct svc_xprt_class svc_rdma_class = {
+	.xcl_name = "rdma",
+	.xcl_owner = THIS_MODULE,
+	.xcl_ops = &svc_rdma_ops,
+	.xcl_max_payload = RPCSVC_MAXPAYLOAD_RDMA,
+	.xcl_ident = XPRT_TRANSPORT_RDMA,
+};
+
+/* QP event handler */
+static void qp_event_handler(struct ib_event *event, void *context)
+{
+	struct svc_xprt *xprt = context;
+
+	trace_svcrdma_qp_error(event, (struct sockaddr *)&xprt->xpt_remote);
+	switch (event->event) {
+	/* These are considered benign events */
+	case IB_EVENT_PATH_MIG:
+	case IB_EVENT_COMM_EST:
+	case IB_EVENT_SQ_DRAINED:
+	case IB_EVENT_QP_LAST_WQE_REACHED:
+		break;
+
+	/* These are considered fatal events */
+	case IB_EVENT_PATH_MIG_ERR:
+	case IB_EVENT_QP_FATAL:
+	case IB_EVENT_QP_REQ_ERR:
+	case IB_EVENT_QP_ACCESS_ERR:
+	case IB_EVENT_DEVICE_FATAL:
+	default:
+		set_bit(XPT_CLOSE, &xprt->xpt_flags);
+		svc_xprt_enqueue(xprt);
+		break;
+	}
+}
+
+static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
+						 struct net *net)
+{
+	struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL);
+
+	if (!cma_xprt) {
+		dprintk("svcrdma: failed to create new transport\n");
+		return NULL;
+	}
+	svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
+	INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
+	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
+	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
+	INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts);
+	init_llist_head(&cma_xprt->sc_recv_ctxts);
+	INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
+	init_waitqueue_head(&cma_xprt->sc_send_wait);
+
+	spin_lock_init(&cma_xprt->sc_lock);
+	spin_lock_init(&cma_xprt->sc_rq_dto_lock);
+	spin_lock_init(&cma_xprt->sc_send_lock);
+	spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
+
+	/*
+	 * Note that this implies that the underlying transport support
+	 * has some form of congestion control (see RFC 7530 section 3.1
+	 * paragraph 2). For now, we assume that all supported RDMA
+	 * transports are suitable here.
+	 */
+	set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
+
+	return cma_xprt;
+}
+
+static void
+svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt,
+			       struct rdma_conn_param *param)
+{
+	const struct rpcrdma_connect_private *pmsg = param->private_data;
+
+	if (pmsg &&
+	    pmsg->cp_magic == rpcrdma_cmp_magic &&
+	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+		newxprt->sc_snd_w_inv = pmsg->cp_flags &
+					RPCRDMA_CMP_F_SND_W_INV_OK;
+
+		dprintk("svcrdma: client send_size %u, recv_size %u "
+			"remote inv %ssupported\n",
+			rpcrdma_decode_buffer_size(pmsg->cp_send_size),
+			rpcrdma_decode_buffer_size(pmsg->cp_recv_size),
+			newxprt->sc_snd_w_inv ? "" : "un");
+	}
+}
+
+/*
+ * This function handles the CONNECT_REQUEST event on a listening
+ * endpoint. It is passed the cma_id for the _new_ connection. The context in
+ * this cma_id is inherited from the listening cma_id and is the svc_xprt
+ * structure for the listening endpoint.
+ *
+ * This function creates a new xprt for the new connection and enqueues it on
+ * the accept queue for the listent xprt. When the listen thread is kicked, it
+ * will call the recvfrom method on the listen xprt which will accept the new
+ * connection.
+ */
+static void handle_connect_req(struct rdma_cm_id *new_cma_id,
+			       struct rdma_conn_param *param)
+{
+	struct svcxprt_rdma *listen_xprt = new_cma_id->context;
+	struct svcxprt_rdma *newxprt;
+	struct sockaddr *sa;
+
+	/* Create a new transport */
+	newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server,
+				       listen_xprt->sc_xprt.xpt_net);
+	if (!newxprt)
+		return;
+	newxprt->sc_cm_id = new_cma_id;
+	new_cma_id->context = newxprt;
+	svc_rdma_parse_connect_private(newxprt, param);
+
+	/* Save client advertised inbound read limit for use later in accept. */
+	newxprt->sc_ord = param->initiator_depth;
+
+	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+	newxprt->sc_xprt.xpt_remotelen = svc_addr_len(sa);
+	memcpy(&newxprt->sc_xprt.xpt_remote, sa,
+	       newxprt->sc_xprt.xpt_remotelen);
+	snprintf(newxprt->sc_xprt.xpt_remotebuf,
+		 sizeof(newxprt->sc_xprt.xpt_remotebuf) - 1, "%pISc", sa);
+
+	/* The remote port is arbitrary and not under the control of the
+	 * client ULP. Set it to a fixed value so that the DRC continues
+	 * to be effective after a reconnect.
+	 */
+	rpc_set_port((struct sockaddr *)&newxprt->sc_xprt.xpt_remote, 0);
+
+	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+	svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa));
+
+	/*
+	 * Enqueue the new transport on the accept queue of the listening
+	 * transport
+	 */
+	spin_lock(&listen_xprt->sc_lock);
+	list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
+	spin_unlock(&listen_xprt->sc_lock);
+
+	set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
+	svc_xprt_enqueue(&listen_xprt->sc_xprt);
+}
+
+/**
+ * svc_rdma_listen_handler - Handle CM events generated on a listening endpoint
+ * @cma_id: the server's listener rdma_cm_id
+ * @event: details of the event
+ *
+ * Return values:
+ *     %0: Do not destroy @cma_id
+ *     %1: Destroy @cma_id (never returned here)
+ *
+ * NB: There is never a DEVICE_REMOVAL event for INADDR_ANY listeners.
+ */
+static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id,
+				   struct rdma_cm_event *event)
+{
+	switch (event->event) {
+	case RDMA_CM_EVENT_CONNECT_REQUEST:
+		handle_connect_req(cma_id, &event->param.conn);
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
+/**
+ * svc_rdma_cma_handler - Handle CM events on client connections
+ * @cma_id: the server's listener rdma_cm_id
+ * @event: details of the event
+ *
+ * Return values:
+ *     %0: Do not destroy @cma_id
+ *     %1: Destroy @cma_id (never returned here)
+ */
+static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id,
+				struct rdma_cm_event *event)
+{
+	struct svcxprt_rdma *rdma = cma_id->context;
+	struct svc_xprt *xprt = &rdma->sc_xprt;
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ESTABLISHED:
+		clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags);
+		svc_xprt_enqueue(xprt);
+		break;
+	case RDMA_CM_EVENT_DISCONNECTED:
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		set_bit(XPT_CLOSE, &xprt->xpt_flags);
+		svc_xprt_enqueue(xprt);
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Create a listening RDMA service endpoint.
+ */
+static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
+					struct net *net,
+					struct sockaddr *sa, int salen,
+					int flags)
+{
+	struct rdma_cm_id *listen_id;
+	struct svcxprt_rdma *cma_xprt;
+	int ret;
+
+	if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6)
+		return ERR_PTR(-EAFNOSUPPORT);
+	cma_xprt = svc_rdma_create_xprt(serv, net);
+	if (!cma_xprt)
+		return ERR_PTR(-ENOMEM);
+	set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
+	strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener");
+
+	listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt,
+				   RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(listen_id)) {
+		ret = PTR_ERR(listen_id);
+		goto err0;
+	}
+
+	/* Allow both IPv4 and IPv6 sockets to bind a single port
+	 * at the same time.
+	 */
+#if IS_ENABLED(CONFIG_IPV6)
+	ret = rdma_set_afonly(listen_id, 1);
+	if (ret)
+		goto err1;
+#endif
+	ret = rdma_bind_addr(listen_id, sa);
+	if (ret)
+		goto err1;
+	cma_xprt->sc_cm_id = listen_id;
+
+	ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG);
+	if (ret)
+		goto err1;
+
+	/*
+	 * We need to use the address from the cm_id in case the
+	 * caller specified 0 for the port number.
+	 */
+	sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr;
+	svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen);
+
+	return &cma_xprt->sc_xprt;
+
+ err1:
+	rdma_destroy_id(listen_id);
+ err0:
+	kfree(cma_xprt);
+	return ERR_PTR(ret);
+}
+
+/*
+ * This is the xpo_recvfrom function for listening endpoints. Its
+ * purpose is to accept incoming connections. The CMA callback handler
+ * has already created a new transport and attached it to the new CMA
+ * ID.
+ *
+ * There is a queue of pending connections hung on the listening
+ * transport. This queue contains the new svc_xprt structure. This
+ * function takes svc_xprt structures off the accept_q and completes
+ * the connection.
+ */
+static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *listen_rdma;
+	struct svcxprt_rdma *newxprt = NULL;
+	struct rdma_conn_param conn_param;
+	struct rpcrdma_connect_private pmsg;
+	struct ib_qp_init_attr qp_attr;
+	unsigned int ctxts, rq_depth;
+	struct ib_device *dev;
+	int ret = 0;
+	RPC_IFDEBUG(struct sockaddr *sap);
+
+	listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
+	clear_bit(XPT_CONN, &xprt->xpt_flags);
+	/* Get the next entry off the accept list */
+	spin_lock(&listen_rdma->sc_lock);
+	if (!list_empty(&listen_rdma->sc_accept_q)) {
+		newxprt = list_entry(listen_rdma->sc_accept_q.next,
+				     struct svcxprt_rdma, sc_accept_q);
+		list_del_init(&newxprt->sc_accept_q);
+	}
+	if (!list_empty(&listen_rdma->sc_accept_q))
+		set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
+	spin_unlock(&listen_rdma->sc_lock);
+	if (!newxprt)
+		return NULL;
+
+	dev = newxprt->sc_cm_id->device;
+	newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
+
+	/* Qualify the transport resource defaults with the
+	 * capabilities of this particular device */
+	/* Transport header, head iovec, tail iovec */
+	newxprt->sc_max_send_sges = 3;
+	/* Add one SGE per page list entry */
+	newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1;
+	if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge)
+		newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
+	newxprt->sc_max_req_size = svcrdma_max_req_size;
+	newxprt->sc_max_requests = svcrdma_max_requests;
+	newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
+	rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests;
+	if (rq_depth > dev->attrs.max_qp_wr) {
+		pr_warn("svcrdma: reducing receive depth to %d\n",
+			dev->attrs.max_qp_wr);
+		rq_depth = dev->attrs.max_qp_wr;
+		newxprt->sc_max_requests = rq_depth - 2;
+		newxprt->sc_max_bc_requests = 2;
+	}
+	newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
+	ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES);
+	ctxts *= newxprt->sc_max_requests;
+	newxprt->sc_sq_depth = rq_depth + ctxts;
+	if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) {
+		pr_warn("svcrdma: reducing send depth to %d\n",
+			dev->attrs.max_qp_wr);
+		newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
+	}
+	atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
+
+	newxprt->sc_pd = ib_alloc_pd(dev, 0);
+	if (IS_ERR(newxprt->sc_pd)) {
+		trace_svcrdma_pd_err(newxprt, PTR_ERR(newxprt->sc_pd));
+		goto errout;
+	}
+	newxprt->sc_sq_cq = ib_alloc_cq_any(dev, newxprt, newxprt->sc_sq_depth,
+					    IB_POLL_WORKQUEUE);
+	if (IS_ERR(newxprt->sc_sq_cq))
+		goto errout;
+	newxprt->sc_rq_cq =
+		ib_alloc_cq_any(dev, newxprt, rq_depth, IB_POLL_WORKQUEUE);
+	if (IS_ERR(newxprt->sc_rq_cq))
+		goto errout;
+
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.event_handler = qp_event_handler;
+	qp_attr.qp_context = &newxprt->sc_xprt;
+	qp_attr.port_num = newxprt->sc_port_num;
+	qp_attr.cap.max_rdma_ctxs = ctxts;
+	qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts;
+	qp_attr.cap.max_recv_wr = rq_depth;
+	qp_attr.cap.max_send_sge = newxprt->sc_max_send_sges;
+	qp_attr.cap.max_recv_sge = 1;
+	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	qp_attr.qp_type = IB_QPT_RC;
+	qp_attr.send_cq = newxprt->sc_sq_cq;
+	qp_attr.recv_cq = newxprt->sc_rq_cq;
+	dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n",
+		newxprt->sc_cm_id, newxprt->sc_pd);
+	dprintk("    cap.max_send_wr = %d, cap.max_recv_wr = %d\n",
+		qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr);
+	dprintk("    cap.max_send_sge = %d, cap.max_recv_sge = %d\n",
+		qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge);
+
+	ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr);
+	if (ret) {
+		trace_svcrdma_qp_err(newxprt, ret);
+		goto errout;
+	}
+	newxprt->sc_qp = newxprt->sc_cm_id->qp;
+
+	if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+		newxprt->sc_snd_w_inv = false;
+	if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) &&
+	    !rdma_ib_or_roce(dev, newxprt->sc_port_num)) {
+		trace_svcrdma_fabric_err(newxprt, -EINVAL);
+		goto errout;
+	}
+
+	if (!svc_rdma_post_recvs(newxprt))
+		goto errout;
+
+	/* Construct RDMA-CM private message */
+	pmsg.cp_magic = rpcrdma_cmp_magic;
+	pmsg.cp_version = RPCRDMA_CMP_VERSION;
+	pmsg.cp_flags = 0;
+	pmsg.cp_send_size = pmsg.cp_recv_size =
+		rpcrdma_encode_buffer_size(newxprt->sc_max_req_size);
+
+	/* Accept Connection */
+	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
+	memset(&conn_param, 0, sizeof conn_param);
+	conn_param.responder_resources = 0;
+	conn_param.initiator_depth = min_t(int, newxprt->sc_ord,
+					   dev->attrs.max_qp_init_rd_atom);
+	if (!conn_param.initiator_depth) {
+		ret = -EINVAL;
+		trace_svcrdma_initdepth_err(newxprt, ret);
+		goto errout;
+	}
+	conn_param.private_data = &pmsg;
+	conn_param.private_data_len = sizeof(pmsg);
+	rdma_lock_handler(newxprt->sc_cm_id);
+	newxprt->sc_cm_id->event_handler = svc_rdma_cma_handler;
+	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
+	rdma_unlock_handler(newxprt->sc_cm_id);
+	if (ret) {
+		trace_svcrdma_accept_err(newxprt, ret);
+		goto errout;
+	}
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+	dprintk("svcrdma: new connection %p accepted:\n", newxprt);
+	sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
+	dprintk("    local address   : %pIS:%u\n", sap, rpc_get_port(sap));
+	sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
+	dprintk("    remote address  : %pIS:%u\n", sap, rpc_get_port(sap));
+	dprintk("    max_sge         : %d\n", newxprt->sc_max_send_sges);
+	dprintk("    sq_depth        : %d\n", newxprt->sc_sq_depth);
+	dprintk("    rdma_rw_ctxs    : %d\n", ctxts);
+	dprintk("    max_requests    : %d\n", newxprt->sc_max_requests);
+	dprintk("    ord             : %d\n", conn_param.initiator_depth);
+#endif
+
+	return &newxprt->sc_xprt;
+
+ errout:
+	/* Take a reference in case the DTO handler runs */
+	svc_xprt_get(&newxprt->sc_xprt);
+	if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp))
+		ib_destroy_qp(newxprt->sc_qp);
+	rdma_destroy_id(newxprt->sc_cm_id);
+	/* This call to put will destroy the transport */
+	svc_xprt_put(&newxprt->sc_xprt);
+	return NULL;
+}
+
+static void svc_rdma_detach(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+	rdma_disconnect(rdma->sc_cm_id);
+}
+
+static void __svc_rdma_free(struct work_struct *work)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(work, struct svcxprt_rdma, sc_work);
+	struct svc_xprt *xprt = &rdma->sc_xprt;
+
+	/* This blocks until the Completion Queues are empty */
+	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+		ib_drain_qp(rdma->sc_qp);
+
+	svc_rdma_flush_recv_queues(rdma);
+
+	/* Final put of backchannel client transport */
+	if (xprt->xpt_bc_xprt) {
+		xprt_put(xprt->xpt_bc_xprt);
+		xprt->xpt_bc_xprt = NULL;
+	}
+
+	svc_rdma_destroy_rw_ctxts(rdma);
+	svc_rdma_send_ctxts_destroy(rdma);
+	svc_rdma_recv_ctxts_destroy(rdma);
+
+	/* Destroy the QP if present (not a listener) */
+	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
+		ib_destroy_qp(rdma->sc_qp);
+
+	if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
+		ib_free_cq(rdma->sc_sq_cq);
+
+	if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
+		ib_free_cq(rdma->sc_rq_cq);
+
+	if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
+		ib_dealloc_pd(rdma->sc_pd);
+
+	/* Destroy the CM ID */
+	rdma_destroy_id(rdma->sc_cm_id);
+
+	kfree(rdma);
+}
+
+static void svc_rdma_free(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+	INIT_WORK(&rdma->sc_work, __svc_rdma_free);
+	schedule_work(&rdma->sc_work);
+}
+
+static int svc_rdma_has_wspace(struct svc_xprt *xprt)
+{
+	struct svcxprt_rdma *rdma =
+		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+	/*
+	 * If there are already waiters on the SQ,
+	 * return false.
+	 */
+	if (waitqueue_active(&rdma->sc_send_wait))
+		return 0;
+
+	/* Otherwise return true. */
+	return 1;
+}
+
+static void svc_rdma_secure_port(struct svc_rqst *rqstp)
+{
+	set_bit(RQ_SECURE, &rqstp->rq_flags);
+}
+
+static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt)
+{
+}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
new file mode 100644
index 000000000..9cf10cfb8
--- /dev/null
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -0,0 +1,809 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2014-2017 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * transport.c
+ *
+ * This file contains the top-level implementation of an RPC RDMA
+ * transport.
+ *
+ * Naming convention: functions beginning with xprt_ are part of the
+ * transport switch. All others are RPC RDMA internal.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/smp.h>
+
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+/*
+ * tunables
+ */
+
+static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
+unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
+unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
+unsigned int xprt_rdma_memreg_strategy		= RPCRDMA_FRWR;
+int xprt_rdma_pad_optimize;
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+
+static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
+static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
+static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
+static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
+static unsigned int max_padding = PAGE_SIZE;
+static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
+static unsigned int max_memreg = RPCRDMA_LAST - 1;
+static unsigned int dummy;
+
+static struct ctl_table_header *sunrpc_table_header;
+
+static struct ctl_table xr_tunables_table[] = {
+	{
+		.procname	= "rdma_slot_table_entries",
+		.data		= &xprt_rdma_slot_table_entries,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_slot_table_size,
+		.extra2		= &max_slot_table_size
+	},
+	{
+		.procname	= "rdma_max_inline_read",
+		.data		= &xprt_rdma_max_inline_read,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_inline_size,
+		.extra2		= &max_inline_size,
+	},
+	{
+		.procname	= "rdma_max_inline_write",
+		.data		= &xprt_rdma_max_inline_write,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_inline_size,
+		.extra2		= &max_inline_size,
+	},
+	{
+		.procname	= "rdma_inline_write_padding",
+		.data		= &dummy,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &max_padding,
+	},
+	{
+		.procname	= "rdma_memreg_strategy",
+		.data		= &xprt_rdma_memreg_strategy,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &min_memreg,
+		.extra2		= &max_memreg,
+	},
+	{
+		.procname	= "rdma_pad_optimize",
+		.data		= &xprt_rdma_pad_optimize,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{ },
+};
+
+static struct ctl_table sunrpc_table[] = {
+	{
+		.procname	= "sunrpc",
+		.mode		= 0555,
+		.child		= xr_tunables_table
+	},
+	{ },
+};
+
+#endif
+
+static const struct rpc_xprt_ops xprt_rdma_procs;
+
+static void
+xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	char buf[20];
+
+	snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+	xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA;
+}
+
+static void
+xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+	char buf[40];
+
+	snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
+	xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+	xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
+}
+
+void
+xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
+{
+	char buf[128];
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		xprt_rdma_format_addresses4(xprt, sap);
+		break;
+	case AF_INET6:
+		xprt_rdma_format_addresses6(xprt, sap);
+		break;
+	default:
+		pr_err("rpcrdma: Unrecognized address family\n");
+		return;
+	}
+
+	(void)rpc_ntop(sap, buf, sizeof(buf));
+	xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
+
+	snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
+	xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
+
+	snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
+
+	xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
+}
+
+void
+xprt_rdma_free_addresses(struct rpc_xprt *xprt)
+{
+	unsigned int i;
+
+	for (i = 0; i < RPC_DISPLAY_MAX; i++)
+		switch (i) {
+		case RPC_DISPLAY_PROTO:
+		case RPC_DISPLAY_NETID:
+			continue;
+		default:
+			kfree(xprt->address_strings[i]);
+		}
+}
+
+/**
+ * xprt_rdma_connect_worker - establish connection in the background
+ * @work: worker thread context
+ *
+ * Requester holds the xprt's send lock to prevent activity on this
+ * transport while a fresh connection is being established. RPC tasks
+ * sleep on the xprt's pending queue waiting for connect to complete.
+ */
+static void
+xprt_rdma_connect_worker(struct work_struct *work)
+{
+	struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
+						   rx_connect_worker.work);
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	int rc;
+
+	rc = rpcrdma_xprt_connect(r_xprt);
+	xprt_clear_connecting(xprt);
+	if (!rc) {
+		xprt->connect_cookie++;
+		xprt->stat.connect_count++;
+		xprt->stat.connect_time += (long)jiffies -
+					   xprt->stat.connect_start;
+		xprt_set_connected(xprt);
+		rc = -EAGAIN;
+	} else
+		rpcrdma_xprt_disconnect(r_xprt);
+	xprt_unlock_connect(xprt, r_xprt);
+	xprt_wake_pending_tasks(xprt, rc);
+}
+
+/**
+ * xprt_rdma_inject_disconnect - inject a connection fault
+ * @xprt: transport context
+ *
+ * If @xprt is connected, disconnect it to simulate spurious
+ * connection loss. Caller must hold @xprt's send lock to
+ * ensure that data structures and hardware resources are
+ * stable during the rdma_disconnect() call.
+ */
+static void
+xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	trace_xprtrdma_op_inject_dsc(r_xprt);
+	rdma_disconnect(r_xprt->rx_ep->re_id);
+}
+
+/**
+ * xprt_rdma_destroy - Full tear down of transport
+ * @xprt: doomed transport context
+ *
+ * Caller guarantees there will be no more calls to us with
+ * this @xprt.
+ */
+static void
+xprt_rdma_destroy(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
+
+	rpcrdma_xprt_disconnect(r_xprt);
+	rpcrdma_buffer_destroy(&r_xprt->rx_buf);
+
+	xprt_rdma_free_addresses(xprt);
+	xprt_free(xprt);
+
+	module_put(THIS_MODULE);
+}
+
+/* 60 second timeout, no retries */
+static const struct rpc_timeout xprt_rdma_default_timeout = {
+	.to_initval = 60 * HZ,
+	.to_maxval = 60 * HZ,
+};
+
+/**
+ * xprt_setup_rdma - Set up transport to use RDMA
+ *
+ * @args: rpc transport arguments
+ */
+static struct rpc_xprt *
+xprt_setup_rdma(struct xprt_create *args)
+{
+	struct rpc_xprt *xprt;
+	struct rpcrdma_xprt *new_xprt;
+	struct sockaddr *sap;
+	int rc;
+
+	if (args->addrlen > sizeof(xprt->addr))
+		return ERR_PTR(-EBADF);
+
+	if (!try_module_get(THIS_MODULE))
+		return ERR_PTR(-EIO);
+
+	xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0,
+			  xprt_rdma_slot_table_entries);
+	if (!xprt) {
+		module_put(THIS_MODULE);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	xprt->timeout = &xprt_rdma_default_timeout;
+	xprt->connect_timeout = xprt->timeout->to_initval;
+	xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
+	xprt->bind_timeout = RPCRDMA_BIND_TO;
+	xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+	xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+
+	xprt->resvport = 0;		/* privileged port not needed */
+	xprt->ops = &xprt_rdma_procs;
+
+	/*
+	 * Set up RDMA-specific connect data.
+	 */
+	sap = args->dstaddr;
+
+	/* Ensure xprt->addr holds valid server TCP (not RDMA)
+	 * address, for any side protocols which peek at it */
+	xprt->prot = IPPROTO_TCP;
+	xprt->addrlen = args->addrlen;
+	memcpy(&xprt->addr, sap, xprt->addrlen);
+
+	if (rpc_get_port(sap))
+		xprt_set_bound(xprt);
+	xprt_rdma_format_addresses(xprt, sap);
+
+	new_xprt = rpcx_to_rdmax(xprt);
+	rc = rpcrdma_buffer_create(new_xprt);
+	if (rc) {
+		xprt_rdma_free_addresses(xprt);
+		xprt_free(xprt);
+		module_put(THIS_MODULE);
+		return ERR_PTR(rc);
+	}
+
+	INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
+			  xprt_rdma_connect_worker);
+
+	xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
+
+	return xprt;
+}
+
+/**
+ * xprt_rdma_close - close a transport connection
+ * @xprt: transport context
+ *
+ * Called during autoclose or device removal.
+ *
+ * Caller holds @xprt's send lock to prevent activity on this
+ * transport while the connection is torn down.
+ */
+void xprt_rdma_close(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	rpcrdma_xprt_disconnect(r_xprt);
+
+	xprt->reestablish_timeout = 0;
+	++xprt->connect_cookie;
+	xprt_disconnect_done(xprt);
+}
+
+/**
+ * xprt_rdma_set_port - update server port with rpcbind result
+ * @xprt: controlling RPC transport
+ * @port: new port value
+ *
+ * Transport connect status is unchanged.
+ */
+static void
+xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
+{
+	struct sockaddr *sap = (struct sockaddr *)&xprt->addr;
+	char buf[8];
+
+	rpc_set_port(sap, port);
+
+	kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
+	snprintf(buf, sizeof(buf), "%u", port);
+	xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
+
+	kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
+	snprintf(buf, sizeof(buf), "%4hx", port);
+	xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
+}
+
+/**
+ * xprt_rdma_timer - invoked when an RPC times out
+ * @xprt: controlling RPC transport
+ * @task: RPC task that timed out
+ *
+ * Invoked when the transport is still connected, but an RPC
+ * retransmit timeout occurs.
+ *
+ * Since RDMA connections don't have a keep-alive, forcibly
+ * disconnect and retry to connect. This drives full
+ * detection of the network path, and retransmissions of
+ * all pending RPCs.
+ */
+static void
+xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	xprt_force_disconnect(xprt);
+}
+
+/**
+ * xprt_rdma_set_connect_timeout - set timeouts for establishing a connection
+ * @xprt: controlling transport instance
+ * @connect_timeout: reconnect timeout after client disconnects
+ * @reconnect_timeout: reconnect timeout after server disconnects
+ *
+ */
+static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt,
+					  unsigned long connect_timeout,
+					  unsigned long reconnect_timeout)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+	trace_xprtrdma_op_set_cto(r_xprt, connect_timeout, reconnect_timeout);
+
+	spin_lock(&xprt->transport_lock);
+
+	if (connect_timeout < xprt->connect_timeout) {
+		struct rpc_timeout to;
+		unsigned long initval;
+
+		to = *xprt->timeout;
+		initval = connect_timeout;
+		if (initval < RPCRDMA_INIT_REEST_TO << 1)
+			initval = RPCRDMA_INIT_REEST_TO << 1;
+		to.to_initval = initval;
+		to.to_maxval = initval;
+		r_xprt->rx_timeout = to;
+		xprt->timeout = &r_xprt->rx_timeout;
+		xprt->connect_timeout = connect_timeout;
+	}
+
+	if (reconnect_timeout < xprt->max_reconnect_timeout)
+		xprt->max_reconnect_timeout = reconnect_timeout;
+
+	spin_unlock(&xprt->transport_lock);
+}
+
+/**
+ * xprt_rdma_connect - schedule an attempt to reconnect
+ * @xprt: transport state
+ * @task: RPC scheduler context (unused)
+ *
+ */
+static void
+xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	unsigned long delay;
+
+	WARN_ON_ONCE(!xprt_lock_connect(xprt, task, r_xprt));
+
+	delay = 0;
+	if (ep && ep->re_connect_status != 0) {
+		delay = xprt_reconnect_delay(xprt);
+		xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
+	}
+	trace_xprtrdma_op_connect(r_xprt, delay);
+	queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker,
+			   delay);
+}
+
+/**
+ * xprt_rdma_alloc_slot - allocate an rpc_rqst
+ * @xprt: controlling RPC transport
+ * @task: RPC task requesting a fresh rpc_rqst
+ *
+ * tk_status values:
+ *	%0 if task->tk_rqstp points to a fresh rpc_rqst
+ *	%-EAGAIN if no rpc_rqst is available; queued on backlog
+ */
+static void
+xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	struct rpcrdma_req *req;
+
+	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
+	if (!req)
+		goto out_sleep;
+	task->tk_rqstp = &req->rl_slot;
+	task->tk_status = 0;
+	return;
+
+out_sleep:
+	task->tk_status = -ENOMEM;
+	xprt_add_backlog(xprt, task);
+}
+
+/**
+ * xprt_rdma_free_slot - release an rpc_rqst
+ * @xprt: controlling RPC transport
+ * @rqst: rpc_rqst to release
+ *
+ */
+static void
+xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
+{
+	struct rpcrdma_xprt *r_xprt =
+		container_of(xprt, struct rpcrdma_xprt, rx_xprt);
+
+	rpcrdma_reply_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
+	if (!xprt_wake_up_backlog(xprt, rqst)) {
+		memset(rqst, 0, sizeof(*rqst));
+		rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
+	}
+}
+
+static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt,
+				 struct rpcrdma_regbuf *rb, size_t size,
+				 gfp_t flags)
+{
+	if (unlikely(rdmab_length(rb) < size)) {
+		if (!rpcrdma_regbuf_realloc(rb, size, flags))
+			return false;
+		r_xprt->rx_stats.hardway_register_count += size;
+	}
+	return true;
+}
+
+/**
+ * xprt_rdma_allocate - allocate transport resources for an RPC
+ * @task: RPC task
+ *
+ * Return values:
+ *        0:	Success; rq_buffer points to RPC buffer to use
+ *   ENOMEM:	Out of memory, call again later
+ *      EIO:	A permanent error occurred, do not retry
+ */
+static int
+xprt_rdma_allocate(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	gfp_t flags;
+
+	flags = RPCRDMA_DEF_GFP;
+	if (RPC_IS_ASYNC(task))
+		flags = GFP_NOWAIT | __GFP_NOWARN;
+	if (RPC_IS_SWAPPER(task))
+		flags |= __GFP_MEMALLOC;
+
+	if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize,
+				  flags))
+		goto out_fail;
+	if (!rpcrdma_check_regbuf(r_xprt, req->rl_recvbuf, rqst->rq_rcvsize,
+				  flags))
+		goto out_fail;
+
+	rqst->rq_buffer = rdmab_data(req->rl_sendbuf);
+	rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf);
+	return 0;
+
+out_fail:
+	return -ENOMEM;
+}
+
+/**
+ * xprt_rdma_free - release resources allocated by xprt_rdma_allocate
+ * @task: RPC task
+ *
+ * Caller guarantees rqst->rq_buffer is non-NULL.
+ */
+static void
+xprt_rdma_free(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+
+	if (!list_empty(&req->rl_registered))
+		frwr_unmap_sync(r_xprt, req);
+
+	/* XXX: If the RPC is completing because of a signal and
+	 * not because a reply was received, we ought to ensure
+	 * that the Send completion has fired, so that memory
+	 * involved with the Send is not still visible to the NIC.
+	 */
+}
+
+/**
+ * xprt_rdma_send_request - marshal and send an RPC request
+ * @rqst: RPC message in rq_snd_buf
+ *
+ * Caller holds the transport's write lock.
+ *
+ * Returns:
+ *	%0 if the RPC message has been sent
+ *	%-ENOTCONN if the caller should reconnect and call again
+ *	%-EAGAIN if the caller should call again
+ *	%-ENOBUFS if the caller should call again after a delay
+ *	%-EMSGSIZE if encoding ran out of buffer space. The request
+ *		was not sent. Do not try to send this message again.
+ *	%-EIO if an I/O error occurred. The request was not sent.
+ *		Do not try to send this message again.
+ */
+static int
+xprt_rdma_send_request(struct rpc_rqst *rqst)
+{
+	struct rpc_xprt *xprt = rqst->rq_xprt;
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	int rc = 0;
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+	if (unlikely(!rqst->rq_buffer))
+		return xprt_rdma_bc_send_reply(rqst);
+#endif	/* CONFIG_SUNRPC_BACKCHANNEL */
+
+	if (!xprt_connected(xprt))
+		return -ENOTCONN;
+
+	if (!xprt_request_get_cong(xprt, rqst))
+		return -EBADSLT;
+
+	rc = rpcrdma_marshal_req(r_xprt, rqst);
+	if (rc < 0)
+		goto failed_marshal;
+
+	/* Must suppress retransmit to maintain credits */
+	if (rqst->rq_connect_cookie == xprt->connect_cookie)
+		goto drop_connection;
+	rqst->rq_xtime = ktime_get();
+
+	if (rpcrdma_post_sends(r_xprt, req))
+		goto drop_connection;
+
+	rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
+
+	/* An RPC with no reply will throw off credit accounting,
+	 * so drop the connection to reset the credit grant.
+	 */
+	if (!rpc_reply_expected(rqst->rq_task))
+		goto drop_connection;
+	return 0;
+
+failed_marshal:
+	if (rc != -ENOTCONN)
+		return rc;
+drop_connection:
+	xprt_rdma_close(xprt);
+	return -ENOTCONN;
+}
+
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+{
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+	long idle_time = 0;
+
+	if (xprt_connected(xprt))
+		idle_time = (long)(jiffies - xprt->last_used) / HZ;
+
+	seq_puts(seq, "\txprt:\trdma ");
+	seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ",
+		   0,	/* need a local port? */
+		   xprt->stat.bind_count,
+		   xprt->stat.connect_count,
+		   xprt->stat.connect_time / HZ,
+		   idle_time,
+		   xprt->stat.sends,
+		   xprt->stat.recvs,
+		   xprt->stat.bad_xids,
+		   xprt->stat.req_u,
+		   xprt->stat.bklog_u);
+	seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ",
+		   r_xprt->rx_stats.read_chunk_count,
+		   r_xprt->rx_stats.write_chunk_count,
+		   r_xprt->rx_stats.reply_chunk_count,
+		   r_xprt->rx_stats.total_rdma_request,
+		   r_xprt->rx_stats.total_rdma_reply,
+		   r_xprt->rx_stats.pullup_copy_count,
+		   r_xprt->rx_stats.fixup_copy_count,
+		   r_xprt->rx_stats.hardway_register_count,
+		   r_xprt->rx_stats.failed_marshal_count,
+		   r_xprt->rx_stats.bad_reply_count,
+		   r_xprt->rx_stats.nomsg_call_count);
+	seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n",
+		   r_xprt->rx_stats.mrs_recycled,
+		   r_xprt->rx_stats.mrs_orphaned,
+		   r_xprt->rx_stats.mrs_allocated,
+		   r_xprt->rx_stats.local_inv_needed,
+		   r_xprt->rx_stats.empty_sendctx_q,
+		   r_xprt->rx_stats.reply_waits_for_send);
+}
+
+static int
+xprt_rdma_enable_swap(struct rpc_xprt *xprt)
+{
+	return 0;
+}
+
+static void
+xprt_rdma_disable_swap(struct rpc_xprt *xprt)
+{
+}
+
+/*
+ * Plumbing for rpc transport switch and kernel module
+ */
+
+static const struct rpc_xprt_ops xprt_rdma_procs = {
+	.reserve_xprt		= xprt_reserve_xprt_cong,
+	.release_xprt		= xprt_release_xprt_cong, /* sunrpc/xprt.c */
+	.alloc_slot		= xprt_rdma_alloc_slot,
+	.free_slot		= xprt_rdma_free_slot,
+	.release_request	= xprt_release_rqst_cong,       /* ditto */
+	.wait_for_reply_request	= xprt_wait_for_reply_request_def, /* ditto */
+	.timer			= xprt_rdma_timer,
+	.rpcbind		= rpcb_getport_async,	/* sunrpc/rpcb_clnt.c */
+	.set_port		= xprt_rdma_set_port,
+	.connect		= xprt_rdma_connect,
+	.buf_alloc		= xprt_rdma_allocate,
+	.buf_free		= xprt_rdma_free,
+	.send_request		= xprt_rdma_send_request,
+	.close			= xprt_rdma_close,
+	.destroy		= xprt_rdma_destroy,
+	.set_connect_timeout	= xprt_rdma_set_connect_timeout,
+	.print_stats		= xprt_rdma_print_stats,
+	.enable_swap		= xprt_rdma_enable_swap,
+	.disable_swap		= xprt_rdma_disable_swap,
+	.inject_disconnect	= xprt_rdma_inject_disconnect,
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+	.bc_setup		= xprt_rdma_bc_setup,
+	.bc_maxpayload		= xprt_rdma_bc_maxpayload,
+	.bc_num_slots		= xprt_rdma_bc_max_slots,
+	.bc_free_rqst		= xprt_rdma_bc_free_rqst,
+	.bc_destroy		= xprt_rdma_bc_destroy,
+#endif
+};
+
+static struct xprt_class xprt_rdma = {
+	.list			= LIST_HEAD_INIT(xprt_rdma.list),
+	.name			= "rdma",
+	.owner			= THIS_MODULE,
+	.ident			= XPRT_TRANSPORT_RDMA,
+	.setup			= xprt_setup_rdma,
+	.netid			= { "rdma", "rdma6", "" },
+};
+
+void xprt_rdma_cleanup(void)
+{
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+	if (sunrpc_table_header) {
+		unregister_sysctl_table(sunrpc_table_header);
+		sunrpc_table_header = NULL;
+	}
+#endif
+
+	xprt_unregister_transport(&xprt_rdma);
+	xprt_unregister_transport(&xprt_rdma_bc);
+}
+
+int xprt_rdma_init(void)
+{
+	int rc;
+
+	rc = xprt_register_transport(&xprt_rdma);
+	if (rc)
+		return rc;
+
+	rc = xprt_register_transport(&xprt_rdma_bc);
+	if (rc) {
+		xprt_unregister_transport(&xprt_rdma);
+		return rc;
+	}
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+	if (!sunrpc_table_header)
+		sunrpc_table_header = register_sysctl_table(sunrpc_table);
+#endif
+	return 0;
+}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
new file mode 100644
index 000000000..d015576f3
--- /dev/null
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -0,0 +1,1453 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/*
+ * Copyright (c) 2014-2017 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * verbs.c
+ *
+ * Encapsulates the major functions managing:
+ *  o adapters
+ *  o endpoints
+ *  o connections
+ *  o buffer memory
+ */
+
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/sunrpc/svc_rdma.h>
+#include <linux/log2.h>
+
+#include <asm-generic/barrier.h>
+#include <asm/bitops.h>
+
+#include <rdma/ib_cm.h>
+
+#include "xprt_rdma.h"
+#include <trace/events/rpcrdma.h>
+
+/*
+ * Globals/Macros
+ */
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY	RPCDBG_TRANS
+#endif
+
+/*
+ * internal functions
+ */
+static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
+				       struct rpcrdma_sendctx *sc);
+static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
+static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
+static void rpcrdma_ep_get(struct rpcrdma_ep *ep);
+static int rpcrdma_ep_put(struct rpcrdma_ep *ep);
+static struct rpcrdma_regbuf *
+rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
+		     gfp_t flags);
+static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb);
+static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
+
+/* Wait for outstanding transport work to finish. ib_drain_qp
+ * handles the drains in the wrong order for us, so open code
+ * them here.
+ */
+static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	struct rdma_cm_id *id = ep->re_id;
+
+	/* Flush Receives, then wait for deferred Reply work
+	 * to complete.
+	 */
+	ib_drain_rq(id->qp);
+
+	/* Deferred Reply processing might have scheduled
+	 * local invalidations.
+	 */
+	ib_drain_sq(id->qp);
+
+	rpcrdma_ep_put(ep);
+}
+
+/**
+ * rpcrdma_qp_event_handler - Handle one QP event (error notification)
+ * @event: details of the event
+ * @context: ep that owns QP where event occurred
+ *
+ * Called from the RDMA provider (device driver) possibly in an interrupt
+ * context. The QP is always destroyed before the ID, so the ID will be
+ * reliably available when this handler is invoked.
+ */
+static void rpcrdma_qp_event_handler(struct ib_event *event, void *context)
+{
+	struct rpcrdma_ep *ep = context;
+
+	trace_xprtrdma_qp_event(ep, event);
+}
+
+/* Ensure xprt_force_disconnect() is invoked exactly once when a
+ * connection is closed or lost. (The important thing is it needs
+ * to be invoked "at least" once).
+ */
+static void rpcrdma_force_disconnect(struct rpcrdma_ep *ep)
+{
+	if (atomic_add_unless(&ep->re_force_disconnect, 1, 1))
+		xprt_force_disconnect(ep->re_xprt);
+}
+
+/**
+ * rpcrdma_flush_disconnect - Disconnect on flushed completion
+ * @r_xprt: transport to disconnect
+ * @wc: work completion entry
+ *
+ * Must be called in process context.
+ */
+void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc)
+{
+	if (wc->status != IB_WC_SUCCESS)
+		rpcrdma_force_disconnect(r_xprt->rx_ep);
+}
+
+/**
+ * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
+ * @cq:	completion queue
+ * @wc:	WCE for a completed Send WR
+ *
+ */
+static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_sendctx *sc =
+		container_of(cqe, struct rpcrdma_sendctx, sc_cqe);
+	struct rpcrdma_xprt *r_xprt = cq->cq_context;
+
+	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_send(sc, wc);
+	rpcrdma_sendctx_put_locked(r_xprt, sc);
+	rpcrdma_flush_disconnect(r_xprt, wc);
+}
+
+/**
+ * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
+ * @cq:	completion queue
+ * @wc:	WCE for a completed Receive WR
+ *
+ */
+static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct ib_cqe *cqe = wc->wr_cqe;
+	struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
+					       rr_cqe);
+	struct rpcrdma_xprt *r_xprt = cq->cq_context;
+
+	/* WARNING: Only wr_cqe and status are reliable at this point */
+	trace_xprtrdma_wc_receive(wc);
+	--r_xprt->rx_ep->re_receive_count;
+	if (wc->status != IB_WC_SUCCESS)
+		goto out_flushed;
+
+	/* status == SUCCESS means all fields in wc are trustworthy */
+	rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
+	rep->rr_wc_flags = wc->wc_flags;
+	rep->rr_inv_rkey = wc->ex.invalidate_rkey;
+
+	ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
+				   rdmab_addr(rep->rr_rdmabuf),
+				   wc->byte_len, DMA_FROM_DEVICE);
+
+	rpcrdma_reply_handler(rep);
+	return;
+
+out_flushed:
+	rpcrdma_flush_disconnect(r_xprt, wc);
+	rpcrdma_rep_destroy(rep);
+}
+
+static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
+				      struct rdma_conn_param *param)
+{
+	const struct rpcrdma_connect_private *pmsg = param->private_data;
+	unsigned int rsize, wsize;
+
+	/* Default settings for RPC-over-RDMA Version One */
+	ep->re_implicit_roundup = xprt_rdma_pad_optimize;
+	rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+	wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+
+	if (pmsg &&
+	    pmsg->cp_magic == rpcrdma_cmp_magic &&
+	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+		ep->re_implicit_roundup = true;
+		rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
+		wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
+	}
+
+	if (rsize < ep->re_inline_recv)
+		ep->re_inline_recv = rsize;
+	if (wsize < ep->re_inline_send)
+		ep->re_inline_send = wsize;
+
+	rpcrdma_set_max_header_sizes(ep);
+}
+
+/**
+ * rpcrdma_cm_event_handler - Handle RDMA CM events
+ * @id: rdma_cm_id on which an event has occurred
+ * @event: details of the event
+ *
+ * Called with @id's mutex held. Returns 1 if caller should
+ * destroy @id, otherwise 0.
+ */
+static int
+rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
+{
+	struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
+	struct rpcrdma_ep *ep = id->context;
+
+	might_sleep();
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ADDR_RESOLVED:
+	case RDMA_CM_EVENT_ROUTE_RESOLVED:
+		ep->re_async_rc = 0;
+		complete(&ep->re_done);
+		return 0;
+	case RDMA_CM_EVENT_ADDR_ERROR:
+		ep->re_async_rc = -EPROTO;
+		complete(&ep->re_done);
+		return 0;
+	case RDMA_CM_EVENT_ROUTE_ERROR:
+		ep->re_async_rc = -ENETUNREACH;
+		complete(&ep->re_done);
+		return 0;
+	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		pr_info("rpcrdma: removing device %s for %pISpc\n",
+			ep->re_id->device->name, sap);
+		fallthrough;
+	case RDMA_CM_EVENT_ADDR_CHANGE:
+		ep->re_connect_status = -ENODEV;
+		goto disconnected;
+	case RDMA_CM_EVENT_ESTABLISHED:
+		rpcrdma_ep_get(ep);
+		ep->re_connect_status = 1;
+		rpcrdma_update_cm_private(ep, &event->param.conn);
+		trace_xprtrdma_inline_thresh(ep);
+		wake_up_all(&ep->re_connect_wait);
+		break;
+	case RDMA_CM_EVENT_CONNECT_ERROR:
+		ep->re_connect_status = -ENOTCONN;
+		goto wake_connect_worker;
+	case RDMA_CM_EVENT_UNREACHABLE:
+		ep->re_connect_status = -ENETUNREACH;
+		goto wake_connect_worker;
+	case RDMA_CM_EVENT_REJECTED:
+		dprintk("rpcrdma: connection to %pISpc rejected: %s\n",
+			sap, rdma_reject_msg(id, event->status));
+		ep->re_connect_status = -ECONNREFUSED;
+		if (event->status == IB_CM_REJ_STALE_CONN)
+			ep->re_connect_status = -ENOTCONN;
+wake_connect_worker:
+		wake_up_all(&ep->re_connect_wait);
+		return 0;
+	case RDMA_CM_EVENT_DISCONNECTED:
+		ep->re_connect_status = -ECONNABORTED;
+disconnected:
+		rpcrdma_force_disconnect(ep);
+		return rpcrdma_ep_put(ep);
+	default:
+		break;
+	}
+
+	dprintk("RPC:       %s: %pISpc on %s/frwr: %s\n", __func__, sap,
+		ep->re_id->device->name, rdma_event_msg(event->event));
+	return 0;
+}
+
+static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
+					    struct rpcrdma_ep *ep)
+{
+	unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	struct rdma_cm_id *id;
+	int rc;
+
+	init_completion(&ep->re_done);
+
+	id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep,
+			    RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(id))
+		return id;
+
+	ep->re_async_rc = -ETIMEDOUT;
+	rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
+			       RDMA_RESOLVE_TIMEOUT);
+	if (rc)
+		goto out;
+	rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
+	if (rc < 0)
+		goto out;
+
+	rc = ep->re_async_rc;
+	if (rc)
+		goto out;
+
+	ep->re_async_rc = -ETIMEDOUT;
+	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
+	if (rc)
+		goto out;
+	rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
+	if (rc < 0)
+		goto out;
+	rc = ep->re_async_rc;
+	if (rc)
+		goto out;
+
+	return id;
+
+out:
+	rdma_destroy_id(id);
+	return ERR_PTR(rc);
+}
+
+static void rpcrdma_ep_destroy(struct kref *kref)
+{
+	struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
+
+	if (ep->re_id->qp) {
+		rdma_destroy_qp(ep->re_id);
+		ep->re_id->qp = NULL;
+	}
+
+	if (ep->re_attr.recv_cq)
+		ib_free_cq(ep->re_attr.recv_cq);
+	ep->re_attr.recv_cq = NULL;
+	if (ep->re_attr.send_cq)
+		ib_free_cq(ep->re_attr.send_cq);
+	ep->re_attr.send_cq = NULL;
+
+	if (ep->re_pd)
+		ib_dealloc_pd(ep->re_pd);
+	ep->re_pd = NULL;
+
+	kfree(ep);
+	module_put(THIS_MODULE);
+}
+
+static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep)
+{
+	kref_get(&ep->re_kref);
+}
+
+/* Returns:
+ *     %0 if @ep still has a positive kref count, or
+ *     %1 if @ep was destroyed successfully.
+ */
+static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep)
+{
+	return kref_put(&ep->re_kref, rpcrdma_ep_destroy);
+}
+
+static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_connect_private *pmsg;
+	struct ib_device *device;
+	struct rdma_cm_id *id;
+	struct rpcrdma_ep *ep;
+	int rc;
+
+	ep = kzalloc(sizeof(*ep), GFP_NOFS);
+	if (!ep)
+		return -ENOTCONN;
+	ep->re_xprt = &r_xprt->rx_xprt;
+	kref_init(&ep->re_kref);
+
+	id = rpcrdma_create_id(r_xprt, ep);
+	if (IS_ERR(id)) {
+		kfree(ep);
+		return PTR_ERR(id);
+	}
+	__module_get(THIS_MODULE);
+	device = id->device;
+	ep->re_id = id;
+
+	ep->re_max_requests = r_xprt->rx_xprt.max_reqs;
+	ep->re_inline_send = xprt_rdma_max_inline_write;
+	ep->re_inline_recv = xprt_rdma_max_inline_read;
+	rc = frwr_query_device(ep, device);
+	if (rc)
+		goto out_destroy;
+
+	r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests);
+
+	ep->re_attr.event_handler = rpcrdma_qp_event_handler;
+	ep->re_attr.qp_context = ep;
+	ep->re_attr.srq = NULL;
+	ep->re_attr.cap.max_inline_data = 0;
+	ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	ep->re_attr.qp_type = IB_QPT_RC;
+	ep->re_attr.port_num = ~0;
+
+	dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
+		"iovs: send %d recv %d\n",
+		__func__,
+		ep->re_attr.cap.max_send_wr,
+		ep->re_attr.cap.max_recv_wr,
+		ep->re_attr.cap.max_send_sge,
+		ep->re_attr.cap.max_recv_sge);
+
+	ep->re_send_batch = ep->re_max_requests >> 3;
+	ep->re_send_count = ep->re_send_batch;
+	init_waitqueue_head(&ep->re_connect_wait);
+
+	ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt,
+					      ep->re_attr.cap.max_send_wr,
+					      IB_POLL_WORKQUEUE);
+	if (IS_ERR(ep->re_attr.send_cq)) {
+		rc = PTR_ERR(ep->re_attr.send_cq);
+		ep->re_attr.send_cq = NULL;
+		goto out_destroy;
+	}
+
+	ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt,
+					      ep->re_attr.cap.max_recv_wr,
+					      IB_POLL_WORKQUEUE);
+	if (IS_ERR(ep->re_attr.recv_cq)) {
+		rc = PTR_ERR(ep->re_attr.recv_cq);
+		ep->re_attr.recv_cq = NULL;
+		goto out_destroy;
+	}
+	ep->re_receive_count = 0;
+
+	/* Initialize cma parameters */
+	memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma));
+
+	/* Prepare RDMA-CM private message */
+	pmsg = &ep->re_cm_private;
+	pmsg->cp_magic = rpcrdma_cmp_magic;
+	pmsg->cp_version = RPCRDMA_CMP_VERSION;
+	pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK;
+	pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send);
+	pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv);
+	ep->re_remote_cma.private_data = pmsg;
+	ep->re_remote_cma.private_data_len = sizeof(*pmsg);
+
+	/* Client offers RDMA Read but does not initiate */
+	ep->re_remote_cma.initiator_depth = 0;
+	ep->re_remote_cma.responder_resources =
+		min_t(int, U8_MAX, device->attrs.max_qp_rd_atom);
+
+	/* Limit transport retries so client can detect server
+	 * GID changes quickly. RPC layer handles re-establishing
+	 * transport connection and retransmission.
+	 */
+	ep->re_remote_cma.retry_count = 6;
+
+	/* RPC-over-RDMA handles its own flow control. In addition,
+	 * make all RNR NAKs visible so we know that RPC-over-RDMA
+	 * flow control is working correctly (no NAKs should be seen).
+	 */
+	ep->re_remote_cma.flow_control = 0;
+	ep->re_remote_cma.rnr_retry_count = 0;
+
+	ep->re_pd = ib_alloc_pd(device, 0);
+	if (IS_ERR(ep->re_pd)) {
+		rc = PTR_ERR(ep->re_pd);
+		ep->re_pd = NULL;
+		goto out_destroy;
+	}
+
+	rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr);
+	if (rc)
+		goto out_destroy;
+
+	r_xprt->rx_ep = ep;
+	return 0;
+
+out_destroy:
+	rpcrdma_ep_put(ep);
+	rdma_destroy_id(id);
+	return rc;
+}
+
+/**
+ * rpcrdma_xprt_connect - Connect an unconnected transport
+ * @r_xprt: controlling transport instance
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	struct rpcrdma_ep *ep;
+	int rc;
+
+	rc = rpcrdma_ep_create(r_xprt);
+	if (rc)
+		return rc;
+	ep = r_xprt->rx_ep;
+
+	xprt_clear_connected(xprt);
+	rpcrdma_reset_cwnd(r_xprt);
+
+	/* Bump the ep's reference count while there are
+	 * outstanding Receives.
+	 */
+	rpcrdma_ep_get(ep);
+	rpcrdma_post_recvs(r_xprt, 1, true);
+
+	rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
+	if (rc)
+		goto out;
+
+	if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
+		xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+	wait_event_interruptible(ep->re_connect_wait,
+				 ep->re_connect_status != 0);
+	if (ep->re_connect_status <= 0) {
+		rc = ep->re_connect_status;
+		goto out;
+	}
+
+	rc = rpcrdma_sendctxs_create(r_xprt);
+	if (rc) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+
+	rc = rpcrdma_reqs_setup(r_xprt);
+	if (rc) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+	rpcrdma_mrs_create(r_xprt);
+
+out:
+	trace_xprtrdma_connect(r_xprt, rc);
+	return rc;
+}
+
+/**
+ * rpcrdma_xprt_disconnect - Disconnect underlying transport
+ * @r_xprt: controlling transport instance
+ *
+ * Caller serializes. Either the transport send lock is held,
+ * or we're being called to destroy the transport.
+ *
+ * On return, @r_xprt is completely divested of all hardware
+ * resources and prepared for the next ->connect operation.
+ */
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	struct rdma_cm_id *id;
+	int rc;
+
+	if (!ep)
+		return;
+
+	id = ep->re_id;
+	rc = rdma_disconnect(id);
+	trace_xprtrdma_disconnect(r_xprt, rc);
+
+	rpcrdma_xprt_drain(r_xprt);
+	rpcrdma_reps_unmap(r_xprt);
+	rpcrdma_reqs_reset(r_xprt);
+	rpcrdma_mrs_destroy(r_xprt);
+	rpcrdma_sendctxs_destroy(r_xprt);
+
+	if (rpcrdma_ep_put(ep))
+		rdma_destroy_id(id);
+
+	r_xprt->rx_ep = NULL;
+}
+
+/* Fixed-size circular FIFO queue. This implementation is wait-free and
+ * lock-free.
+ *
+ * Consumer is the code path that posts Sends. This path dequeues a
+ * sendctx for use by a Send operation. Multiple consumer threads
+ * are serialized by the RPC transport lock, which allows only one
+ * ->send_request call at a time.
+ *
+ * Producer is the code path that handles Send completions. This path
+ * enqueues a sendctx that has been completed. Multiple producer
+ * threads are serialized by the ib_poll_cq() function.
+ */
+
+/* rpcrdma_sendctxs_destroy() assumes caller has already quiesced
+ * queue activity, and rpcrdma_xprt_drain has flushed all remaining
+ * Send requests.
+ */
+static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	unsigned long i;
+
+	if (!buf->rb_sc_ctxs)
+		return;
+	for (i = 0; i <= buf->rb_sc_last; i++)
+		kfree(buf->rb_sc_ctxs[i]);
+	kfree(buf->rb_sc_ctxs);
+	buf->rb_sc_ctxs = NULL;
+}
+
+static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
+{
+	struct rpcrdma_sendctx *sc;
+
+	sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge),
+		     GFP_KERNEL);
+	if (!sc)
+		return NULL;
+
+	sc->sc_cqe.done = rpcrdma_wc_send;
+	return sc;
+}
+
+static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_sendctx *sc;
+	unsigned long i;
+
+	/* Maximum number of concurrent outstanding Send WRs. Capping
+	 * the circular queue size stops Send Queue overflow by causing
+	 * the ->send_request call to fail temporarily before too many
+	 * Sends are posted.
+	 */
+	i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS;
+	buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
+	if (!buf->rb_sc_ctxs)
+		return -ENOMEM;
+
+	buf->rb_sc_last = i - 1;
+	for (i = 0; i <= buf->rb_sc_last; i++) {
+		sc = rpcrdma_sendctx_create(r_xprt->rx_ep);
+		if (!sc)
+			return -ENOMEM;
+
+		buf->rb_sc_ctxs[i] = sc;
+	}
+
+	buf->rb_sc_head = 0;
+	buf->rb_sc_tail = 0;
+	return 0;
+}
+
+/* The sendctx queue is not guaranteed to have a size that is a
+ * power of two, thus the helpers in circ_buf.h cannot be used.
+ * The other option is to use modulus (%), which can be expensive.
+ */
+static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf,
+					  unsigned long item)
+{
+	return likely(item < buf->rb_sc_last) ? item + 1 : 0;
+}
+
+/**
+ * rpcrdma_sendctx_get_locked - Acquire a send context
+ * @r_xprt: controlling transport instance
+ *
+ * Returns pointer to a free send completion context; or NULL if
+ * the queue is empty.
+ *
+ * Usage: Called to acquire an SGE array before preparing a Send WR.
+ *
+ * The caller serializes calls to this function (per transport), and
+ * provides an effective memory barrier that flushes the new value
+ * of rb_sc_head.
+ */
+struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_sendctx *sc;
+	unsigned long next_head;
+
+	next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head);
+
+	if (next_head == READ_ONCE(buf->rb_sc_tail))
+		goto out_emptyq;
+
+	/* ORDER: item must be accessed _before_ head is updated */
+	sc = buf->rb_sc_ctxs[next_head];
+
+	/* Releasing the lock in the caller acts as a memory
+	 * barrier that flushes rb_sc_head.
+	 */
+	buf->rb_sc_head = next_head;
+
+	return sc;
+
+out_emptyq:
+	/* The queue is "empty" if there have not been enough Send
+	 * completions recently. This is a sign the Send Queue is
+	 * backing up. Cause the caller to pause and try again.
+	 */
+	xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
+	r_xprt->rx_stats.empty_sendctx_q++;
+	return NULL;
+}
+
+/**
+ * rpcrdma_sendctx_put_locked - Release a send context
+ * @r_xprt: controlling transport instance
+ * @sc: send context to release
+ *
+ * Usage: Called from Send completion to return a sendctxt
+ * to the queue.
+ *
+ * The caller serializes calls to this function (per transport).
+ */
+static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt,
+				       struct rpcrdma_sendctx *sc)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	unsigned long next_tail;
+
+	/* Unmap SGEs of previously completed but unsignaled
+	 * Sends by walking up the queue until @sc is found.
+	 */
+	next_tail = buf->rb_sc_tail;
+	do {
+		next_tail = rpcrdma_sendctx_next(buf, next_tail);
+
+		/* ORDER: item must be accessed _before_ tail is updated */
+		rpcrdma_sendctx_unmap(buf->rb_sc_ctxs[next_tail]);
+
+	} while (buf->rb_sc_ctxs[next_tail] != sc);
+
+	/* Paired with READ_ONCE */
+	smp_store_release(&buf->rb_sc_tail, next_tail);
+
+	xprt_write_space(&r_xprt->rx_xprt);
+}
+
+static void
+rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	unsigned int count;
+
+	for (count = 0; count < ep->re_max_rdma_segs; count++) {
+		struct rpcrdma_mr *mr;
+		int rc;
+
+		mr = kzalloc(sizeof(*mr), GFP_NOFS);
+		if (!mr)
+			break;
+
+		rc = frwr_mr_init(r_xprt, mr);
+		if (rc) {
+			kfree(mr);
+			break;
+		}
+
+		spin_lock(&buf->rb_lock);
+		rpcrdma_mr_push(mr, &buf->rb_mrs);
+		list_add(&mr->mr_all, &buf->rb_all_mrs);
+		spin_unlock(&buf->rb_lock);
+	}
+
+	r_xprt->rx_stats.mrs_allocated += count;
+	trace_xprtrdma_createmrs(r_xprt, count);
+}
+
+static void
+rpcrdma_mr_refresh_worker(struct work_struct *work)
+{
+	struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer,
+						  rb_refresh_worker);
+	struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt,
+						   rx_buf);
+
+	rpcrdma_mrs_create(r_xprt);
+	xprt_write_space(&r_xprt->rx_xprt);
+}
+
+/**
+ * rpcrdma_mrs_refresh - Wake the MR refresh worker
+ * @r_xprt: controlling transport instance
+ *
+ */
+void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+
+	/* If there is no underlying connection, it's no use
+	 * to wake the refresh worker.
+	 */
+	if (ep->re_connect_status == 1) {
+		/* The work is scheduled on a WQ_MEM_RECLAIM
+		 * workqueue in order to prevent MR allocation
+		 * from recursing into NFS during direct reclaim.
+		 */
+		queue_work(xprtiod_workqueue, &buf->rb_refresh_worker);
+	}
+}
+
+/**
+ * rpcrdma_req_create - Allocate an rpcrdma_req object
+ * @r_xprt: controlling r_xprt
+ * @size: initial size, in bytes, of send and receive buffers
+ * @flags: GFP flags passed to memory allocators
+ *
+ * Returns an allocated and fully initialized rpcrdma_req or NULL.
+ */
+struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
+				       gfp_t flags)
+{
+	struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
+	struct rpcrdma_req *req;
+
+	req = kzalloc(sizeof(*req), flags);
+	if (req == NULL)
+		goto out1;
+
+	req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags);
+	if (!req->rl_sendbuf)
+		goto out2;
+
+	req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags);
+	if (!req->rl_recvbuf)
+		goto out3;
+
+	INIT_LIST_HEAD(&req->rl_free_mrs);
+	INIT_LIST_HEAD(&req->rl_registered);
+	spin_lock(&buffer->rb_lock);
+	list_add(&req->rl_all, &buffer->rb_allreqs);
+	spin_unlock(&buffer->rb_lock);
+	return req;
+
+out3:
+	rpcrdma_regbuf_free(req->rl_sendbuf);
+out2:
+	kfree(req);
+out1:
+	return NULL;
+}
+
+/**
+ * rpcrdma_req_setup - Per-connection instance setup of an rpcrdma_req object
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req object to set up
+ *
+ * Returns zero on success, and a negative errno on failure.
+ */
+int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	struct rpcrdma_regbuf *rb;
+	size_t maxhdrsize;
+
+	/* Compute maximum header buffer size in bytes */
+	maxhdrsize = rpcrdma_fixed_maxsz + 3 +
+		     r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz;
+	maxhdrsize *= sizeof(__be32);
+	rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
+				  DMA_TO_DEVICE, GFP_KERNEL);
+	if (!rb)
+		goto out;
+
+	if (!__rpcrdma_regbuf_dma_map(r_xprt, rb))
+		goto out_free;
+
+	req->rl_rdmabuf = rb;
+	xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb));
+	return 0;
+
+out_free:
+	rpcrdma_regbuf_free(rb);
+out:
+	return -ENOMEM;
+}
+
+/* ASSUMPTION: the rb_allreqs list is stable for the duration,
+ * and thus can be walked without holding rb_lock. Eg. the
+ * caller is holding the transport send lock to exclude
+ * device removal or disconnection.
+ */
+static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_req *req;
+	int rc;
+
+	list_for_each_entry(req, &buf->rb_allreqs, rl_all) {
+		rc = rpcrdma_req_setup(r_xprt, req);
+		if (rc)
+			return rc;
+	}
+	return 0;
+}
+
+static void rpcrdma_req_reset(struct rpcrdma_req *req)
+{
+	/* Credits are valid for only one connection */
+	req->rl_slot.rq_cong = 0;
+
+	rpcrdma_regbuf_free(req->rl_rdmabuf);
+	req->rl_rdmabuf = NULL;
+
+	rpcrdma_regbuf_dma_unmap(req->rl_sendbuf);
+	rpcrdma_regbuf_dma_unmap(req->rl_recvbuf);
+
+	frwr_reset(req);
+}
+
+/* ASSUMPTION: the rb_allreqs list is stable for the duration,
+ * and thus can be walked without holding rb_lock. Eg. the
+ * caller is holding the transport send lock to exclude
+ * device removal or disconnection.
+ */
+static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_req *req;
+
+	list_for_each_entry(req, &buf->rb_allreqs, rl_all)
+		rpcrdma_req_reset(req);
+}
+
+/* No locking needed here. This function is called only by the
+ * Receive completion handler.
+ */
+static noinline
+struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
+				       bool temp)
+{
+	struct rpcrdma_rep *rep;
+
+	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+	if (rep == NULL)
+		goto out;
+
+	rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv,
+					       DMA_FROM_DEVICE, GFP_KERNEL);
+	if (!rep->rr_rdmabuf)
+		goto out_free;
+
+	xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf),
+		     rdmab_length(rep->rr_rdmabuf));
+	rep->rr_cqe.done = rpcrdma_wc_receive;
+	rep->rr_rxprt = r_xprt;
+	rep->rr_recv_wr.next = NULL;
+	rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
+	rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
+	rep->rr_recv_wr.num_sge = 1;
+	rep->rr_temp = temp;
+	list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps);
+	return rep;
+
+out_free:
+	kfree(rep);
+out:
+	return NULL;
+}
+
+/* No locking needed here. This function is invoked only by the
+ * Receive completion handler, or during transport shutdown.
+ */
+static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep)
+{
+	list_del(&rep->rr_all);
+	rpcrdma_regbuf_free(rep->rr_rdmabuf);
+	kfree(rep);
+}
+
+static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf)
+{
+	struct llist_node *node;
+
+	/* Calls to llist_del_first are required to be serialized */
+	node = llist_del_first(&buf->rb_free_reps);
+	if (!node)
+		return NULL;
+	return llist_entry(node, struct rpcrdma_rep, rr_node);
+}
+
+static void rpcrdma_rep_put(struct rpcrdma_buffer *buf,
+			    struct rpcrdma_rep *rep)
+{
+	llist_add(&rep->rr_node, &buf->rb_free_reps);
+}
+
+static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_rep *rep;
+
+	list_for_each_entry(rep, &buf->rb_all_reps, rr_all) {
+		rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf);
+		rep->rr_temp = true;
+	}
+}
+
+static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf)
+{
+	struct rpcrdma_rep *rep;
+
+	while ((rep = rpcrdma_rep_get_locked(buf)) != NULL)
+		rpcrdma_rep_destroy(rep);
+}
+
+/**
+ * rpcrdma_buffer_create - Create initial set of req/rep objects
+ * @r_xprt: transport instance to (re)initialize
+ *
+ * Returns zero on success, otherwise a negative errno.
+ */
+int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	int i, rc;
+
+	buf->rb_bc_srv_max_requests = 0;
+	spin_lock_init(&buf->rb_lock);
+	INIT_LIST_HEAD(&buf->rb_mrs);
+	INIT_LIST_HEAD(&buf->rb_all_mrs);
+	INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker);
+
+	INIT_LIST_HEAD(&buf->rb_send_bufs);
+	INIT_LIST_HEAD(&buf->rb_allreqs);
+	INIT_LIST_HEAD(&buf->rb_all_reps);
+
+	rc = -ENOMEM;
+	for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) {
+		struct rpcrdma_req *req;
+
+		req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2,
+					 GFP_KERNEL);
+		if (!req)
+			goto out;
+		list_add(&req->rl_list, &buf->rb_send_bufs);
+	}
+
+	init_llist_head(&buf->rb_free_reps);
+
+	return 0;
+out:
+	rpcrdma_buffer_destroy(buf);
+	return rc;
+}
+
+/**
+ * rpcrdma_req_destroy - Destroy an rpcrdma_req object
+ * @req: unused object to be destroyed
+ *
+ * Relies on caller holding the transport send lock to protect
+ * removing req->rl_all from buf->rb_all_reqs safely.
+ */
+void rpcrdma_req_destroy(struct rpcrdma_req *req)
+{
+	struct rpcrdma_mr *mr;
+
+	list_del(&req->rl_all);
+
+	while ((mr = rpcrdma_mr_pop(&req->rl_free_mrs))) {
+		struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf;
+
+		spin_lock(&buf->rb_lock);
+		list_del(&mr->mr_all);
+		spin_unlock(&buf->rb_lock);
+
+		frwr_release_mr(mr);
+	}
+
+	rpcrdma_regbuf_free(req->rl_recvbuf);
+	rpcrdma_regbuf_free(req->rl_sendbuf);
+	rpcrdma_regbuf_free(req->rl_rdmabuf);
+	kfree(req);
+}
+
+/**
+ * rpcrdma_mrs_destroy - Release all of a transport's MRs
+ * @r_xprt: controlling transport instance
+ *
+ * Relies on caller holding the transport send lock to protect
+ * removing mr->mr_list from req->rl_free_mrs safely.
+ */
+static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_mr *mr;
+
+	cancel_work_sync(&buf->rb_refresh_worker);
+
+	spin_lock(&buf->rb_lock);
+	while ((mr = list_first_entry_or_null(&buf->rb_all_mrs,
+					      struct rpcrdma_mr,
+					      mr_all)) != NULL) {
+		list_del(&mr->mr_list);
+		list_del(&mr->mr_all);
+		spin_unlock(&buf->rb_lock);
+
+		frwr_release_mr(mr);
+
+		spin_lock(&buf->rb_lock);
+	}
+	spin_unlock(&buf->rb_lock);
+}
+
+/**
+ * rpcrdma_buffer_destroy - Release all hw resources
+ * @buf: root control block for resources
+ *
+ * ORDERING: relies on a prior rpcrdma_xprt_drain :
+ * - No more Send or Receive completions can occur
+ * - All MRs, reps, and reqs are returned to their free lists
+ */
+void
+rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+{
+	rpcrdma_reps_destroy(buf);
+
+	while (!list_empty(&buf->rb_send_bufs)) {
+		struct rpcrdma_req *req;
+
+		req = list_first_entry(&buf->rb_send_bufs,
+				       struct rpcrdma_req, rl_list);
+		list_del(&req->rl_list);
+		rpcrdma_req_destroy(req);
+	}
+}
+
+/**
+ * rpcrdma_mr_get - Allocate an rpcrdma_mr object
+ * @r_xprt: controlling transport
+ *
+ * Returns an initialized rpcrdma_mr or NULL if no free
+ * rpcrdma_mr objects are available.
+ */
+struct rpcrdma_mr *
+rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_mr *mr;
+
+	spin_lock(&buf->rb_lock);
+	mr = rpcrdma_mr_pop(&buf->rb_mrs);
+	spin_unlock(&buf->rb_lock);
+	return mr;
+}
+
+/**
+ * rpcrdma_mr_put - DMA unmap an MR and release it
+ * @mr: MR to release
+ *
+ */
+void rpcrdma_mr_put(struct rpcrdma_mr *mr)
+{
+	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
+
+	if (mr->mr_dir != DMA_NONE) {
+		trace_xprtrdma_mr_unmap(mr);
+		ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
+				mr->mr_sg, mr->mr_nents, mr->mr_dir);
+		mr->mr_dir = DMA_NONE;
+	}
+
+	rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
+}
+
+/**
+ * rpcrdma_reply_put - Put reply buffers back into pool
+ * @buffers: buffer pool
+ * @req: object to return
+ *
+ */
+void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
+{
+	if (req->rl_reply) {
+		rpcrdma_rep_put(buffers, req->rl_reply);
+		req->rl_reply = NULL;
+	}
+}
+
+/**
+ * rpcrdma_buffer_get - Get a request buffer
+ * @buffers: Buffer pool from which to obtain a buffer
+ *
+ * Returns a fresh rpcrdma_req, or NULL if none are available.
+ */
+struct rpcrdma_req *
+rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
+{
+	struct rpcrdma_req *req;
+
+	spin_lock(&buffers->rb_lock);
+	req = list_first_entry_or_null(&buffers->rb_send_bufs,
+				       struct rpcrdma_req, rl_list);
+	if (req)
+		list_del_init(&req->rl_list);
+	spin_unlock(&buffers->rb_lock);
+	return req;
+}
+
+/**
+ * rpcrdma_buffer_put - Put request/reply buffers back into pool
+ * @buffers: buffer pool
+ * @req: object to return
+ *
+ */
+void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
+{
+	rpcrdma_reply_put(buffers, req);
+
+	spin_lock(&buffers->rb_lock);
+	list_add(&req->rl_list, &buffers->rb_send_bufs);
+	spin_unlock(&buffers->rb_lock);
+}
+
+/**
+ * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list
+ * @rep: rep to release
+ *
+ * Used after error conditions.
+ */
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+{
+	rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep);
+}
+
+/* Returns a pointer to a rpcrdma_regbuf object, or NULL.
+ *
+ * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
+ * receiving the payload of RDMA RECV operations. During Long Calls
+ * or Replies they may be registered externally via frwr_map.
+ */
+static struct rpcrdma_regbuf *
+rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
+		     gfp_t flags)
+{
+	struct rpcrdma_regbuf *rb;
+
+	rb = kmalloc(sizeof(*rb), flags);
+	if (!rb)
+		return NULL;
+	rb->rg_data = kmalloc(size, flags);
+	if (!rb->rg_data) {
+		kfree(rb);
+		return NULL;
+	}
+
+	rb->rg_device = NULL;
+	rb->rg_direction = direction;
+	rb->rg_iov.length = size;
+	return rb;
+}
+
+/**
+ * rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer
+ * @rb: regbuf to reallocate
+ * @size: size of buffer to be allocated, in bytes
+ * @flags: GFP flags
+ *
+ * Returns true if reallocation was successful. If false is
+ * returned, @rb is left untouched.
+ */
+bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags)
+{
+	void *buf;
+
+	buf = kmalloc(size, flags);
+	if (!buf)
+		return false;
+
+	rpcrdma_regbuf_dma_unmap(rb);
+	kfree(rb->rg_data);
+
+	rb->rg_data = buf;
+	rb->rg_iov.length = size;
+	return true;
+}
+
+/**
+ * __rpcrdma_regbuf_dma_map - DMA-map a regbuf
+ * @r_xprt: controlling transport instance
+ * @rb: regbuf to be mapped
+ *
+ * Returns true if the buffer is now DMA mapped to @r_xprt's device
+ */
+bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
+			      struct rpcrdma_regbuf *rb)
+{
+	struct ib_device *device = r_xprt->rx_ep->re_id->device;
+
+	if (rb->rg_direction == DMA_NONE)
+		return false;
+
+	rb->rg_iov.addr = ib_dma_map_single(device, rdmab_data(rb),
+					    rdmab_length(rb), rb->rg_direction);
+	if (ib_dma_mapping_error(device, rdmab_addr(rb))) {
+		trace_xprtrdma_dma_maperr(rdmab_addr(rb));
+		return false;
+	}
+
+	rb->rg_device = device;
+	rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey;
+	return true;
+}
+
+static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb)
+{
+	if (!rb)
+		return;
+
+	if (!rpcrdma_regbuf_is_mapped(rb))
+		return;
+
+	ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), rdmab_length(rb),
+			    rb->rg_direction);
+	rb->rg_device = NULL;
+}
+
+static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
+{
+	rpcrdma_regbuf_dma_unmap(rb);
+	if (rb)
+		kfree(rb->rg_data);
+	kfree(rb);
+}
+
+/**
+ * rpcrdma_post_sends - Post WRs to a transport's Send Queue
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req containing the Send WR to post
+ *
+ * Returns 0 if the post was successful, otherwise -ENOTCONN
+ * is returned.
+ */
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	struct ib_send_wr *send_wr = &req->rl_wr;
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	int rc;
+
+	if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) {
+		send_wr->send_flags |= IB_SEND_SIGNALED;
+		ep->re_send_count = ep->re_send_batch;
+	} else {
+		send_wr->send_flags &= ~IB_SEND_SIGNALED;
+		--ep->re_send_count;
+	}
+
+	trace_xprtrdma_post_send(req);
+	rc = frwr_send(r_xprt, req);
+	if (rc)
+		return -ENOTCONN;
+	return 0;
+}
+
+/**
+ * rpcrdma_post_recvs - Refill the Receive Queue
+ * @r_xprt: controlling transport instance
+ * @needed: current credit grant
+ * @temp: mark Receive buffers to be deleted after one use
+ *
+ */
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp)
+{
+	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
+	struct ib_recv_wr *wr, *bad_wr;
+	struct rpcrdma_rep *rep;
+	int count, rc;
+
+	rc = 0;
+	count = 0;
+
+	if (likely(ep->re_receive_count > needed))
+		goto out;
+	needed -= ep->re_receive_count;
+	if (!temp)
+		needed += RPCRDMA_MAX_RECV_BATCH;
+
+	/* fast path: all needed reps can be found on the free list */
+	wr = NULL;
+	while (needed) {
+		rep = rpcrdma_rep_get_locked(buf);
+		if (rep && rep->rr_temp) {
+			rpcrdma_rep_destroy(rep);
+			continue;
+		}
+		if (!rep)
+			rep = rpcrdma_rep_create(r_xprt, temp);
+		if (!rep)
+			break;
+		if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) {
+			rpcrdma_rep_put(buf, rep);
+			break;
+		}
+
+		trace_xprtrdma_post_recv(rep);
+		rep->rr_recv_wr.next = wr;
+		wr = &rep->rr_recv_wr;
+		--needed;
+		++count;
+	}
+	if (!wr)
+		goto out;
+
+	rc = ib_post_recv(ep->re_id->qp, wr,
+			  (const struct ib_recv_wr **)&bad_wr);
+out:
+	trace_xprtrdma_post_recvs(r_xprt, count, rc);
+	if (rc) {
+		for (wr = bad_wr; wr;) {
+			struct rpcrdma_rep *rep;
+
+			rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr);
+			wr = wr->next;
+			rpcrdma_recv_buffer_put(rep);
+			--count;
+		}
+	}
+	ep->re_receive_count += count;
+	return;
+}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
new file mode 100644
index 000000000..702f03445
--- /dev/null
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -0,0 +1,590 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (c) 2014-2017 Oracle.  All rights reserved.
+ * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
+#define _LINUX_SUNRPC_XPRT_RDMA_H
+
+#include <linux/wait.h> 		/* wait_queue_head_t, etc */
+#include <linux/spinlock.h> 		/* spinlock_t, etc */
+#include <linux/atomic.h>		/* atomic_t, etc */
+#include <linux/kref.h>			/* struct kref */
+#include <linux/workqueue.h>		/* struct work_struct */
+#include <linux/llist.h>
+
+#include <rdma/rdma_cm.h>		/* RDMA connection api */
+#include <rdma/ib_verbs.h>		/* RDMA verbs api */
+
+#include <linux/sunrpc/clnt.h> 		/* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma.h> 	/* RPC/RDMA protocol */
+#include <linux/sunrpc/xprtrdma.h> 	/* xprt parameters */
+
+#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
+#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */
+
+#define RPCRDMA_BIND_TO		(60U * HZ)
+#define RPCRDMA_INIT_REEST_TO	(5U * HZ)
+#define RPCRDMA_MAX_REEST_TO	(30U * HZ)
+#define RPCRDMA_IDLE_DISC_TO	(5U * 60 * HZ)
+
+/*
+ * RDMA Endpoint -- connection endpoint details
+ */
+struct rpcrdma_ep {
+	struct kref		re_kref;
+	struct rdma_cm_id 	*re_id;
+	struct ib_pd		*re_pd;
+	unsigned int		re_max_rdma_segs;
+	unsigned int		re_max_fr_depth;
+	bool			re_implicit_roundup;
+	enum ib_mr_type		re_mrtype;
+	struct completion	re_done;
+	unsigned int		re_send_count;
+	unsigned int		re_send_batch;
+	unsigned int		re_max_inline_send;
+	unsigned int		re_max_inline_recv;
+	int			re_async_rc;
+	int			re_connect_status;
+	atomic_t		re_force_disconnect;
+	struct ib_qp_init_attr	re_attr;
+	wait_queue_head_t       re_connect_wait;
+	struct rpc_xprt		*re_xprt;
+	struct rpcrdma_connect_private
+				re_cm_private;
+	struct rdma_conn_param	re_remote_cma;
+	int			re_receive_count;
+	unsigned int		re_max_requests; /* depends on device */
+	unsigned int		re_inline_send;	/* negotiated */
+	unsigned int		re_inline_recv;	/* negotiated */
+};
+
+/* Pre-allocate extra Work Requests for handling backward receives
+ * and sends. This is a fixed value because the Work Queues are
+ * allocated when the forward channel is set up, long before the
+ * backchannel is provisioned. This value is two times
+ * NFS4_DEF_CB_SLOT_TABLE_SIZE.
+ */
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+#define RPCRDMA_BACKWARD_WRS (32)
+#else
+#define RPCRDMA_BACKWARD_WRS (0)
+#endif
+
+/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
+ */
+
+struct rpcrdma_regbuf {
+	struct ib_sge		rg_iov;
+	struct ib_device	*rg_device;
+	enum dma_data_direction	rg_direction;
+	void			*rg_data;
+};
+
+static inline u64 rdmab_addr(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_iov.addr;
+}
+
+static inline u32 rdmab_length(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_iov.length;
+}
+
+static inline u32 rdmab_lkey(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_iov.lkey;
+}
+
+static inline struct ib_device *rdmab_device(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_device;
+}
+
+static inline void *rdmab_data(const struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_data;
+}
+
+#define RPCRDMA_DEF_GFP		(GFP_NOIO | __GFP_NOWARN)
+
+/* To ensure a transport can always make forward progress,
+ * the number of RDMA segments allowed in header chunk lists
+ * is capped at 16. This prevents less-capable devices from
+ * overrunning the Send buffer while building chunk lists.
+ *
+ * Elements of the Read list take up more room than the
+ * Write list or Reply chunk. 16 read segments means the
+ * chunk lists cannot consume more than
+ *
+ * ((16 + 2) * read segment size) + 1 XDR words,
+ *
+ * or about 400 bytes. The fixed part of the header is
+ * another 24 bytes. Thus when the inline threshold is
+ * 1024 bytes, at least 600 bytes are available for RPC
+ * message bodies.
+ */
+enum {
+	RPCRDMA_MAX_HDR_SEGS = 16,
+};
+
+/*
+ * struct rpcrdma_rep -- this structure encapsulates state required
+ * to receive and complete an RPC Reply, asychronously. It needs
+ * several pieces of state:
+ *
+ *   o receive buffer and ib_sge (donated to provider)
+ *   o status of receive (success or not, length, inv rkey)
+ *   o bookkeeping state to get run by reply handler (XDR stream)
+ *
+ * These structures are allocated during transport initialization.
+ * N of these are associated with a transport instance, managed by
+ * struct rpcrdma_buffer. N is the max number of outstanding RPCs.
+ */
+
+struct rpcrdma_rep {
+	struct ib_cqe		rr_cqe;
+	__be32			rr_xid;
+	__be32			rr_vers;
+	__be32			rr_proc;
+	int			rr_wc_flags;
+	u32			rr_inv_rkey;
+	bool			rr_temp;
+	struct rpcrdma_regbuf	*rr_rdmabuf;
+	struct rpcrdma_xprt	*rr_rxprt;
+	struct rpc_rqst		*rr_rqst;
+	struct xdr_buf		rr_hdrbuf;
+	struct xdr_stream	rr_stream;
+	struct llist_node	rr_node;
+	struct ib_recv_wr	rr_recv_wr;
+	struct list_head	rr_all;
+};
+
+/* To reduce the rate at which a transport invokes ib_post_recv
+ * (and thus the hardware doorbell rate), xprtrdma posts Receive
+ * WRs in batches.
+ *
+ * Setting this to zero disables Receive post batching.
+ */
+enum {
+	RPCRDMA_MAX_RECV_BATCH = 7,
+};
+
+/* struct rpcrdma_sendctx - DMA mapped SGEs to unmap after Send completes
+ */
+struct rpcrdma_req;
+struct rpcrdma_sendctx {
+	struct ib_cqe		sc_cqe;
+	struct rpcrdma_req	*sc_req;
+	unsigned int		sc_unmap_count;
+	struct ib_sge		sc_sges[];
+};
+
+/*
+ * struct rpcrdma_mr - external memory region metadata
+ *
+ * An external memory region is any buffer or page that is registered
+ * on the fly (ie, not pre-registered).
+ */
+struct rpcrdma_frwr {
+	struct ib_mr			*fr_mr;
+	struct ib_cqe			fr_cqe;
+	struct completion		fr_linv_done;
+	union {
+		struct ib_reg_wr	fr_regwr;
+		struct ib_send_wr	fr_invwr;
+	};
+};
+
+struct rpcrdma_req;
+struct rpcrdma_mr {
+	struct list_head	mr_list;
+	struct rpcrdma_req	*mr_req;
+	struct scatterlist	*mr_sg;
+	int			mr_nents;
+	enum dma_data_direction	mr_dir;
+	struct rpcrdma_frwr	frwr;
+	struct rpcrdma_xprt	*mr_xprt;
+	u32			mr_handle;
+	u32			mr_length;
+	u64			mr_offset;
+	struct list_head	mr_all;
+};
+
+/*
+ * struct rpcrdma_req -- structure central to the request/reply sequence.
+ *
+ * N of these are associated with a transport instance, and stored in
+ * struct rpcrdma_buffer. N is the max number of outstanding requests.
+ *
+ * It includes pre-registered buffer memory for send AND recv.
+ * The recv buffer, however, is not owned by this structure, and
+ * is "donated" to the hardware when a recv is posted. When a
+ * reply is handled, the recv buffer used is given back to the
+ * struct rpcrdma_req associated with the request.
+ *
+ * In addition to the basic memory, this structure includes an array
+ * of iovs for send operations. The reason is that the iovs passed to
+ * ib_post_{send,recv} must not be modified until the work request
+ * completes.
+ */
+
+/* Maximum number of page-sized "segments" per chunk list to be
+ * registered or invalidated. Must handle a Reply chunk:
+ */
+enum {
+	RPCRDMA_MAX_IOV_SEGS	= 3,
+	RPCRDMA_MAX_DATA_SEGS	= ((1 * 1024 * 1024) / PAGE_SIZE) + 1,
+	RPCRDMA_MAX_SEGS	= RPCRDMA_MAX_DATA_SEGS +
+				  RPCRDMA_MAX_IOV_SEGS,
+};
+
+struct rpcrdma_mr_seg {		/* chunk descriptors */
+	u32		mr_len;		/* length of chunk or segment */
+	struct page	*mr_page;	/* owning page, if any */
+	char		*mr_offset;	/* kva if no page, else offset */
+};
+
+/* The Send SGE array is provisioned to send a maximum size
+ * inline request:
+ * - RPC-over-RDMA header
+ * - xdr_buf head iovec
+ * - RPCRDMA_MAX_INLINE bytes, in pages
+ * - xdr_buf tail iovec
+ *
+ * The actual number of array elements consumed by each RPC
+ * depends on the device's max_sge limit.
+ */
+enum {
+	RPCRDMA_MIN_SEND_SGES = 3,
+	RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT,
+	RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
+};
+
+struct rpcrdma_buffer;
+struct rpcrdma_req {
+	struct list_head	rl_list;
+	struct rpc_rqst		rl_slot;
+	struct rpcrdma_rep	*rl_reply;
+	struct xdr_stream	rl_stream;
+	struct xdr_buf		rl_hdrbuf;
+	struct ib_send_wr	rl_wr;
+	struct rpcrdma_sendctx	*rl_sendctx;
+	struct rpcrdma_regbuf	*rl_rdmabuf;	/* xprt header */
+	struct rpcrdma_regbuf	*rl_sendbuf;	/* rq_snd_buf */
+	struct rpcrdma_regbuf	*rl_recvbuf;	/* rq_rcv_buf */
+
+	struct list_head	rl_all;
+	struct kref		rl_kref;
+
+	struct list_head	rl_free_mrs;
+	struct list_head	rl_registered;
+	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
+};
+
+static inline struct rpcrdma_req *
+rpcr_to_rdmar(const struct rpc_rqst *rqst)
+{
+	return container_of(rqst, struct rpcrdma_req, rl_slot);
+}
+
+static inline void
+rpcrdma_mr_push(struct rpcrdma_mr *mr, struct list_head *list)
+{
+	list_add(&mr->mr_list, list);
+}
+
+static inline struct rpcrdma_mr *
+rpcrdma_mr_pop(struct list_head *list)
+{
+	struct rpcrdma_mr *mr;
+
+	mr = list_first_entry_or_null(list, struct rpcrdma_mr, mr_list);
+	if (mr)
+		list_del_init(&mr->mr_list);
+	return mr;
+}
+
+/*
+ * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
+ * inline requests/replies, and client/server credits.
+ *
+ * One of these is associated with a transport instance
+ */
+struct rpcrdma_buffer {
+	spinlock_t		rb_lock;
+	struct list_head	rb_send_bufs;
+	struct list_head	rb_mrs;
+
+	unsigned long		rb_sc_head;
+	unsigned long		rb_sc_tail;
+	unsigned long		rb_sc_last;
+	struct rpcrdma_sendctx	**rb_sc_ctxs;
+
+	struct list_head	rb_allreqs;
+	struct list_head	rb_all_mrs;
+	struct list_head	rb_all_reps;
+
+	struct llist_head	rb_free_reps;
+
+	__be32			rb_max_requests;
+	u32			rb_credits;	/* most recent credit grant */
+
+	u32			rb_bc_srv_max_requests;
+	u32			rb_bc_max_requests;
+
+	struct work_struct	rb_refresh_worker;
+};
+
+/*
+ * Statistics for RPCRDMA
+ */
+struct rpcrdma_stats {
+	/* accessed when sending a call */
+	unsigned long		read_chunk_count;
+	unsigned long		write_chunk_count;
+	unsigned long		reply_chunk_count;
+	unsigned long long	total_rdma_request;
+
+	/* rarely accessed error counters */
+	unsigned long long	pullup_copy_count;
+	unsigned long		hardway_register_count;
+	unsigned long		failed_marshal_count;
+	unsigned long		bad_reply_count;
+	unsigned long		mrs_recycled;
+	unsigned long		mrs_orphaned;
+	unsigned long		mrs_allocated;
+	unsigned long		empty_sendctx_q;
+
+	/* accessed when receiving a reply */
+	unsigned long long	total_rdma_reply;
+	unsigned long long	fixup_copy_count;
+	unsigned long		reply_waits_for_send;
+	unsigned long		local_inv_needed;
+	unsigned long		nomsg_call_count;
+	unsigned long		bcall_count;
+};
+
+/*
+ * RPCRDMA transport -- encapsulates the structures above for
+ * integration with RPC.
+ *
+ * The contained structures are embedded, not pointers,
+ * for convenience. This structure need not be visible externally.
+ *
+ * It is allocated and initialized during mount, and released
+ * during unmount.
+ */
+struct rpcrdma_xprt {
+	struct rpc_xprt		rx_xprt;
+	struct rpcrdma_ep	*rx_ep;
+	struct rpcrdma_buffer	rx_buf;
+	struct delayed_work	rx_connect_worker;
+	struct rpc_timeout	rx_timeout;
+	struct rpcrdma_stats	rx_stats;
+};
+
+#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
+
+static inline const char *
+rpcrdma_addrstr(const struct rpcrdma_xprt *r_xprt)
+{
+	return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR];
+}
+
+static inline const char *
+rpcrdma_portstr(const struct rpcrdma_xprt *r_xprt)
+{
+	return r_xprt->rx_xprt.address_strings[RPC_DISPLAY_PORT];
+}
+
+/* Setting this to 0 ensures interoperability with early servers.
+ * Setting this to 1 enhances certain unaligned read/write performance.
+ * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
+extern int xprt_rdma_pad_optimize;
+
+/* This setting controls the hunt for a supported memory
+ * registration strategy.
+ */
+extern unsigned int xprt_rdma_memreg_strategy;
+
+/*
+ * Endpoint calls - xprtrdma/verbs.c
+ */
+void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc);
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
+
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp);
+
+/*
+ * Buffer calls - xprtrdma/verbs.c
+ */
+struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
+				       gfp_t flags);
+int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+void rpcrdma_req_destroy(struct rpcrdma_req *req);
+int rpcrdma_buffer_create(struct rpcrdma_xprt *);
+void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
+struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt);
+
+struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_mr_put(struct rpcrdma_mr *mr);
+void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt);
+
+struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
+void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers,
+			struct rpcrdma_req *req);
+void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req);
+void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
+
+bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size,
+			    gfp_t flags);
+bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
+			      struct rpcrdma_regbuf *rb);
+
+/**
+ * rpcrdma_regbuf_is_mapped - check if buffer is DMA mapped
+ *
+ * Returns true if the buffer is now mapped to rb->rg_device.
+ */
+static inline bool rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_device != NULL;
+}
+
+/**
+ * rpcrdma_regbuf_dma_map - DMA-map a regbuf
+ * @r_xprt: controlling transport instance
+ * @rb: regbuf to be mapped
+ *
+ * Returns true if the buffer is currently DMA mapped.
+ */
+static inline bool rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
+					  struct rpcrdma_regbuf *rb)
+{
+	if (likely(rpcrdma_regbuf_is_mapped(rb)))
+		return true;
+	return __rpcrdma_regbuf_dma_map(r_xprt, rb);
+}
+
+/*
+ * Wrappers for chunk registration, shared by read/write chunk code.
+ */
+
+static inline enum dma_data_direction
+rpcrdma_data_dir(bool writing)
+{
+	return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+}
+
+/* Memory registration calls xprtrdma/frwr_ops.c
+ */
+void frwr_reset(struct rpcrdma_req *req);
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
+void frwr_release_mr(struct rpcrdma_mr *mr);
+struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
+				struct rpcrdma_mr_seg *seg,
+				int nsegs, bool writing, __be32 xid,
+				struct rpcrdma_mr *mr);
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
+void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+
+/*
+ * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
+ */
+
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,
+	rpcrdma_noch_pullup,
+	rpcrdma_noch_mapped,
+	rpcrdma_readch,
+	rpcrdma_areadch,
+	rpcrdma_writech,
+	rpcrdma_replych
+};
+
+int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
+			      struct rpcrdma_req *req, u32 hdrlen,
+			      struct xdr_buf *xdr,
+			      enum rpcrdma_chunktype rtype);
+void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc);
+int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep);
+void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
+void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
+
+static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
+{
+	xdr->head[0].iov_len = len;
+	xdr->len = len;
+}
+
+/* RPC/RDMA module init - xprtrdma/transport.c
+ */
+extern unsigned int xprt_rdma_max_inline_read;
+extern unsigned int xprt_rdma_max_inline_write;
+void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
+void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
+void xprt_rdma_close(struct rpc_xprt *xprt);
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
+int xprt_rdma_init(void);
+void xprt_rdma_cleanup(void);
+
+/* Backchannel calls - xprtrdma/backchannel.c
+ */
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
+size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
+unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *);
+int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
+void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
+int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst);
+void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
+void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
+#endif	/* CONFIG_SUNRPC_BACKCHANNEL */
+
+extern struct xprt_class xprt_rdma_bc;
+
+#endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 10:05:51 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 10:05:51 +0000
commit	5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 (patch)
tree	a94efe259b9009378be6d90eb30d2b019d95c194 /net/sunrpc/xprtrdma
parent	Initial commit. (diff)
download	linux-upstream.tar.xz linux-upstream.zip