summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/sw
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-11 08:27:49 +0000
commitace9429bb58fd418f0c81d4c2835699bddf6bde6 (patch)
treeb2d64bc10158fdd5497876388cd68142ca374ed3 /drivers/infiniband/sw
parentInitial commit. (diff)
downloadlinux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.tar.xz
linux-ace9429bb58fd418f0c81d4c2835699bddf6bde6.zip
Adding upstream version 6.6.15.upstream/6.6.15
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'drivers/infiniband/sw')
-rw-r--r--drivers/infiniband/sw/Makefile4
-rw-r--r--drivers/infiniband/sw/rdmavt/Kconfig8
-rw-r--r--drivers/infiniband/sw/rdmavt/Makefile14
-rw-r--r--drivers/infiniband/sw/rdmavt/ah.c139
-rw-r--r--drivers/infiniband/sw/rdmavt/ah.h17
-rw-r--r--drivers/infiniband/sw/rdmavt/cq.c534
-rw-r--r--drivers/infiniband/sw/rdmavt/cq.h20
-rw-r--r--drivers/infiniband/sw/rdmavt/mad.c130
-rw-r--r--drivers/infiniband/sw/rdmavt/mad.h18
-rw-r--r--drivers/infiniband/sw/rdmavt/mcast.c401
-rw-r--r--drivers/infiniband/sw/rdmavt/mcast.h16
-rw-r--r--drivers/infiniband/sw/rdmavt/mmap.c169
-rw-r--r--drivers/infiniband/sw/rdmavt/mmap.h19
-rw-r--r--drivers/infiniband/sw/rdmavt/mr.c917
-rw-r--r--drivers/infiniband/sw/rdmavt/mr.h36
-rw-r--r--drivers/infiniband/sw/rdmavt/pd.c64
-rw-r--r--drivers/infiniband/sw/rdmavt/pd.h14
-rw-r--r--drivers/infiniband/sw/rdmavt/qp.c3220
-rw-r--r--drivers/infiniband/sw/rdmavt/qp.h30
-rw-r--r--drivers/infiniband/sw/rdmavt/rc.c172
-rw-r--r--drivers/infiniband/sw/rdmavt/srq.c306
-rw-r--r--drivers/infiniband/sw/rdmavt/srq.h19
-rw-r--r--drivers/infiniband/sw/rdmavt/trace.c7
-rw-r--r--drivers/infiniband/sw/rdmavt/trace.h14
-rw-r--r--drivers/infiniband/sw/rdmavt/trace_cq.h124
-rw-r--r--drivers/infiniband/sw/rdmavt/trace_mr.h182
-rw-r--r--drivers/infiniband/sw/rdmavt/trace_qp.h96
-rw-r--r--drivers/infiniband/sw/rdmavt/trace_rc.h67
-rw-r--r--drivers/infiniband/sw/rdmavt/trace_rvt.h39
-rw-r--r--drivers/infiniband/sw/rdmavt/trace_tx.h163
-rw-r--r--drivers/infiniband/sw/rdmavt/vt.c619
-rw-r--r--drivers/infiniband/sw/rdmavt/vt.h62
-rw-r--r--drivers/infiniband/sw/rxe/Kconfig28
-rw-r--r--drivers/infiniband/sw/rxe/Makefile25
-rw-r--r--drivers/infiniband/sw/rxe/rxe.c243
-rw-r--r--drivers/infiniband/sw/rxe/rxe.h161
-rw-r--r--drivers/infiniband/sw/rxe/rxe_av.c173
-rw-r--r--drivers/infiniband/sw/rxe/rxe_comp.c851
-rw-r--r--drivers/infiniband/sw/rxe/rxe_cq.c133
-rw-r--r--drivers/infiniband/sw/rxe/rxe_hdr.h977
-rw-r--r--drivers/infiniband/sw/rxe/rxe_hw_counters.c51
-rw-r--r--drivers/infiniband/sw/rxe/rxe_hw_counters.h37
-rw-r--r--drivers/infiniband/sw/rxe/rxe_icrc.c175
-rw-r--r--drivers/infiniband/sw/rxe/rxe_loc.h183
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mcast.c479
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mmap.c148
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mr.c731
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mw.c338
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.c696
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.h24
-rw-r--r--drivers/infiniband/sw/rxe/rxe_opcode.c975
-rw-r--r--drivers/infiniband/sw/rxe/rxe_opcode.h111
-rw-r--r--drivers/infiniband/sw/rxe/rxe_param.h156
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.c256
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.h82
-rw-r--r--drivers/infiniband/sw/rxe/rxe_qp.c880
-rw-r--r--drivers/infiniband/sw/rxe/rxe_queue.c201
-rw-r--r--drivers/infiniband/sw/rxe/rxe_queue.h284
-rw-r--r--drivers/infiniband/sw/rxe/rxe_recv.c359
-rw-r--r--drivers/infiniband/sw/rxe/rxe_req.c880
-rw-r--r--drivers/infiniband/sw/rxe/rxe_resp.c1691
-rw-r--r--drivers/infiniband/sw/rxe/rxe_srq.c199
-rw-r--r--drivers/infiniband/sw/rxe/rxe_task.c313
-rw-r--r--drivers/infiniband/sw/rxe/rxe_task.h60
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.c1533
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.h475
-rw-r--r--drivers/infiniband/sw/siw/Kconfig21
-rw-r--r--drivers/infiniband/sw/siw/Makefile11
-rw-r--r--drivers/infiniband/sw/siw/iwarp.h367
-rw-r--r--drivers/infiniband/sw/siw/siw.h729
-rw-r--r--drivers/infiniband/sw/siw/siw_cm.c1970
-rw-r--r--drivers/infiniband/sw/siw/siw_cm.h133
-rw-r--r--drivers/infiniband/sw/siw/siw_cq.c122
-rw-r--r--drivers/infiniband/sw/siw/siw_main.c587
-rw-r--r--drivers/infiniband/sw/siw/siw_mem.c449
-rw-r--r--drivers/infiniband/sw/siw/siw_mem.h69
-rw-r--r--drivers/infiniband/sw/siw/siw_qp.c1350
-rw-r--r--drivers/infiniband/sw/siw/siw_qp_rx.c1476
-rw-r--r--drivers/infiniband/sw/siw/siw_qp_tx.c1314
-rw-r--r--drivers/infiniband/sw/siw/siw_verbs.c1889
-rw-r--r--drivers/infiniband/sw/siw/siw_verbs.h90
81 files changed, 32125 insertions, 0 deletions
diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile
new file mode 100644
index 0000000000..68e0230f8f
--- /dev/null
+++ b/drivers/infiniband/sw/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/
+obj-$(CONFIG_RDMA_RXE) += rxe/
+obj-$(CONFIG_RDMA_SIW) += siw/
diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig
new file mode 100644
index 0000000000..0df48b3a6b
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/Kconfig
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config INFINIBAND_RDMAVT
+ tristate "RDMA verbs transport library"
+ depends on INFINIBAND_VIRT_DMA
+ depends on X86_64
+ depends on PCI
+ help
+ This is a common software verbs provider for RDMA networks.
diff --git a/drivers/infiniband/sw/rdmavt/Makefile b/drivers/infiniband/sw/rdmavt/Makefile
new file mode 100644
index 0000000000..b21962dafc
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# rdmavt driver
+#
+#
+#
+# Called from the kernel module build system.
+#
+obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt.o
+
+rdmavt-y := vt.o ah.o cq.o mad.o mcast.o mmap.o mr.o pd.o qp.o \
+ rc.o srq.o trace.o
+
+CFLAGS_trace.o = -I$(src)
diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c
new file mode 100644
index 0000000000..63999239ed
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/ah.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 - 2019 Intel Corporation.
+ */
+
+#include <linux/slab.h>
+#include "ah.h"
+#include "vt.h" /* for prints */
+
+/**
+ * rvt_check_ah - validate the attributes of AH
+ * @ibdev: the ib device
+ * @ah_attr: the attributes of the AH
+ *
+ * If driver supports a more detailed check_ah function call back to it
+ * otherwise just check the basics.
+ *
+ * Return: 0 on success
+ */
+int rvt_check_ah(struct ib_device *ibdev,
+ struct rdma_ah_attr *ah_attr)
+{
+ int err;
+ int port_num = rdma_ah_get_port_num(ah_attr);
+ struct ib_port_attr port_attr;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+ u8 ah_flags = rdma_ah_get_ah_flags(ah_attr);
+ u8 static_rate = rdma_ah_get_static_rate(ah_attr);
+
+ err = ib_query_port(ibdev, port_num, &port_attr);
+ if (err)
+ return -EINVAL;
+ if (port_num < 1 ||
+ port_num > ibdev->phys_port_cnt)
+ return -EINVAL;
+ if (static_rate != IB_RATE_PORT_CURRENT &&
+ ib_rate_to_mbps(static_rate) < 0)
+ return -EINVAL;
+ if ((ah_flags & IB_AH_GRH) &&
+ rdma_ah_read_grh(ah_attr)->sgid_index >= port_attr.gid_tbl_len)
+ return -EINVAL;
+ if (rdi->driver_f.check_ah)
+ return rdi->driver_f.check_ah(ibdev, ah_attr);
+ return 0;
+}
+EXPORT_SYMBOL(rvt_check_ah);
+
+/**
+ * rvt_create_ah - create an address handle
+ * @ibah: the IB address handle
+ * @init_attr: the attributes of the AH
+ * @udata: pointer to user's input output buffer information.
+ *
+ * This may be called from interrupt context.
+ *
+ * Return: 0 on success
+ */
+int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct rvt_ah *ah = ibah_to_rvtah(ibah);
+ struct rvt_dev_info *dev = ib_to_rvt(ibah->device);
+ unsigned long flags;
+
+ if (rvt_check_ah(ibah->device, init_attr->ah_attr))
+ return -EINVAL;
+
+ spin_lock_irqsave(&dev->n_ahs_lock, flags);
+ if (dev->n_ahs_allocated == dev->dparms.props.max_ah) {
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+ return -ENOMEM;
+ }
+
+ dev->n_ahs_allocated++;
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+ rdma_copy_ah_attr(&ah->attr, init_attr->ah_attr);
+
+ if (dev->driver_f.notify_new_ah)
+ dev->driver_f.notify_new_ah(ibah->device,
+ init_attr->ah_attr, ah);
+
+ return 0;
+}
+
+/**
+ * rvt_destroy_ah - Destroy an address handle
+ * @ibah: address handle
+ * @destroy_flags: destroy address handle flags (see enum rdma_destroy_ah_flags)
+ * Return: 0 on success
+ */
+int rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags)
+{
+ struct rvt_dev_info *dev = ib_to_rvt(ibah->device);
+ struct rvt_ah *ah = ibah_to_rvtah(ibah);
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->n_ahs_lock, flags);
+ dev->n_ahs_allocated--;
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+ rdma_destroy_ah_attr(&ah->attr);
+ return 0;
+}
+
+/**
+ * rvt_modify_ah - modify an ah with given attrs
+ * @ibah: address handle to modify
+ * @ah_attr: attrs to apply
+ *
+ * Return: 0 on success
+ */
+int rvt_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
+{
+ struct rvt_ah *ah = ibah_to_rvtah(ibah);
+
+ if (rvt_check_ah(ibah->device, ah_attr))
+ return -EINVAL;
+
+ ah->attr = *ah_attr;
+
+ return 0;
+}
+
+/**
+ * rvt_query_ah - return attrs for ah
+ * @ibah: address handle to query
+ * @ah_attr: return info in this
+ *
+ * Return: always 0
+ */
+int rvt_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
+{
+ struct rvt_ah *ah = ibah_to_rvtah(ibah);
+
+ *ah_attr = ah->attr;
+
+ return 0;
+}
diff --git a/drivers/infiniband/sw/rdmavt/ah.h b/drivers/infiniband/sw/rdmavt/ah.h
new file mode 100644
index 0000000000..c11fdf637d
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/ah.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#ifndef DEF_RVTAH_H
+#define DEF_RVTAH_H
+
+#include <rdma/rdma_vt.h>
+
+int rvt_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr,
+ struct ib_udata *udata);
+int rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags);
+int rvt_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
+int rvt_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
+
+#endif /* DEF_RVTAH_H */
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
new file mode 100644
index 0000000000..9fe4dcaa04
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -0,0 +1,534 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 - 2018 Intel Corporation.
+ */
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include "cq.h"
+#include "vt.h"
+#include "trace.h"
+
+static struct workqueue_struct *comp_vector_wq;
+
+/**
+ * rvt_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @solicited: true if @entry is solicited
+ *
+ * This may be called with qp->s_lock held.
+ *
+ * Return: return true on success, else return
+ * false if cq is full.
+ */
+bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
+{
+ struct ib_uverbs_wc *uqueue = NULL;
+ struct ib_wc *kqueue = NULL;
+ struct rvt_cq_wc *u_wc = NULL;
+ struct rvt_k_cq_wc *k_wc = NULL;
+ unsigned long flags;
+ u32 head;
+ u32 next;
+ u32 tail;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ if (cq->ip) {
+ u_wc = cq->queue;
+ uqueue = &u_wc->uqueue[0];
+ head = RDMA_READ_UAPI_ATOMIC(u_wc->head);
+ tail = RDMA_READ_UAPI_ATOMIC(u_wc->tail);
+ } else {
+ k_wc = cq->kqueue;
+ kqueue = &k_wc->kqueue[0];
+ head = k_wc->head;
+ tail = k_wc->tail;
+ }
+
+ /*
+ * Note that the head pointer might be writable by
+ * user processes.Take care to verify it is a sane value.
+ */
+ if (head >= (unsigned)cq->ibcq.cqe) {
+ head = cq->ibcq.cqe;
+ next = 0;
+ } else {
+ next = head + 1;
+ }
+
+ if (unlikely(next == tail || cq->cq_full)) {
+ struct rvt_dev_info *rdi = cq->rdi;
+
+ if (!cq->cq_full)
+ rvt_pr_err_ratelimited(rdi, "CQ is full!\n");
+ cq->cq_full = true;
+ spin_unlock_irqrestore(&cq->lock, flags);
+ if (cq->ibcq.event_handler) {
+ struct ib_event ev;
+
+ ev.device = cq->ibcq.device;
+ ev.element.cq = &cq->ibcq;
+ ev.event = IB_EVENT_CQ_ERR;
+ cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+ }
+ return false;
+ }
+ trace_rvt_cq_enter(cq, entry, head);
+ if (uqueue) {
+ uqueue[head].wr_id = entry->wr_id;
+ uqueue[head].status = entry->status;
+ uqueue[head].opcode = entry->opcode;
+ uqueue[head].vendor_err = entry->vendor_err;
+ uqueue[head].byte_len = entry->byte_len;
+ uqueue[head].ex.imm_data = entry->ex.imm_data;
+ uqueue[head].qp_num = entry->qp->qp_num;
+ uqueue[head].src_qp = entry->src_qp;
+ uqueue[head].wc_flags = entry->wc_flags;
+ uqueue[head].pkey_index = entry->pkey_index;
+ uqueue[head].slid = ib_lid_cpu16(entry->slid);
+ uqueue[head].sl = entry->sl;
+ uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+ uqueue[head].port_num = entry->port_num;
+ /* Make sure entry is written before the head index. */
+ RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next);
+ } else {
+ kqueue[head] = *entry;
+ k_wc->head = next;
+ }
+
+ if (cq->notify == IB_CQ_NEXT_COMP ||
+ (cq->notify == IB_CQ_SOLICITED &&
+ (solicited || entry->status != IB_WC_SUCCESS))) {
+ /*
+ * This will cause send_complete() to be called in
+ * another thread.
+ */
+ cq->notify = RVT_CQ_NONE;
+ cq->triggered++;
+ queue_work_on(cq->comp_vector_cpu, comp_vector_wq,
+ &cq->comptask);
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+ return true;
+}
+EXPORT_SYMBOL(rvt_cq_enter);
+
+static void send_complete(struct work_struct *work)
+{
+ struct rvt_cq *cq = container_of(work, struct rvt_cq, comptask);
+
+ /*
+ * The completion handler will most likely rearm the notification
+ * and poll for all pending entries. If a new completion entry
+ * is added while we are in this routine, queue_work()
+ * won't call us again until we return so we check triggered to
+ * see if we need to call the handler again.
+ */
+ for (;;) {
+ u8 triggered = cq->triggered;
+
+ /*
+ * IPoIB connected mode assumes the callback is from a
+ * soft IRQ. We simulate this by blocking "bottom halves".
+ * See the implementation for ipoib_cm_handle_tx_wc(),
+ * netif_tx_lock_bh() and netif_tx_lock().
+ */
+ local_bh_disable();
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+ local_bh_enable();
+
+ if (cq->triggered == triggered)
+ return;
+ }
+}
+
+/**
+ * rvt_create_cq - create a completion queue
+ * @ibcq: Allocated CQ
+ * @attr: creation attributes
+ * @udata: user data for libibverbs.so
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ *
+ * Return: 0 on success
+ */
+int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata)
+{
+ struct ib_device *ibdev = ibcq->device;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+ struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
+ struct rvt_cq_wc *u_wc = NULL;
+ struct rvt_k_cq_wc *k_wc = NULL;
+ u32 sz;
+ unsigned int entries = attr->cqe;
+ int comp_vector = attr->comp_vector;
+ int err;
+
+ if (attr->flags)
+ return -EOPNOTSUPP;
+
+ if (entries < 1 || entries > rdi->dparms.props.max_cqe)
+ return -EINVAL;
+
+ if (comp_vector < 0)
+ comp_vector = 0;
+
+ comp_vector = comp_vector % rdi->ibdev.num_comp_vectors;
+
+ /*
+ * Allocate the completion queue entries and head/tail pointers.
+ * This is allocated separately so that it can be resized and
+ * also mapped into user space.
+ * We need to use vmalloc() in order to support mmap and large
+ * numbers of entries.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ sz = sizeof(struct ib_uverbs_wc) * (entries + 1);
+ sz += sizeof(*u_wc);
+ u_wc = vmalloc_user(sz);
+ if (!u_wc)
+ return -ENOMEM;
+ } else {
+ sz = sizeof(struct ib_wc) * (entries + 1);
+ sz += sizeof(*k_wc);
+ k_wc = vzalloc_node(sz, rdi->dparms.node);
+ if (!k_wc)
+ return -ENOMEM;
+ }
+
+ /*
+ * Return the address of the WC as the offset to mmap.
+ * See rvt_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc);
+ if (IS_ERR(cq->ip)) {
+ err = PTR_ERR(cq->ip);
+ goto bail_wc;
+ }
+
+ err = ib_copy_to_udata(udata, &cq->ip->offset,
+ sizeof(cq->ip->offset));
+ if (err)
+ goto bail_ip;
+ }
+
+ spin_lock_irq(&rdi->n_cqs_lock);
+ if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) {
+ spin_unlock_irq(&rdi->n_cqs_lock);
+ err = -ENOMEM;
+ goto bail_ip;
+ }
+
+ rdi->n_cqs_allocated++;
+ spin_unlock_irq(&rdi->n_cqs_lock);
+
+ if (cq->ip) {
+ spin_lock_irq(&rdi->pending_lock);
+ list_add(&cq->ip->pending_mmaps, &rdi->pending_mmaps);
+ spin_unlock_irq(&rdi->pending_lock);
+ }
+
+ /*
+ * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+ * The number of entries should be >= the number requested or return
+ * an error.
+ */
+ cq->rdi = rdi;
+ if (rdi->driver_f.comp_vect_cpu_lookup)
+ cq->comp_vector_cpu =
+ rdi->driver_f.comp_vect_cpu_lookup(rdi, comp_vector);
+ else
+ cq->comp_vector_cpu =
+ cpumask_first(cpumask_of_node(rdi->dparms.node));
+
+ cq->ibcq.cqe = entries;
+ cq->notify = RVT_CQ_NONE;
+ spin_lock_init(&cq->lock);
+ INIT_WORK(&cq->comptask, send_complete);
+ if (u_wc)
+ cq->queue = u_wc;
+ else
+ cq->kqueue = k_wc;
+
+ trace_rvt_create_cq(cq, attr);
+ return 0;
+
+bail_ip:
+ kfree(cq->ip);
+bail_wc:
+ vfree(u_wc);
+ vfree(k_wc);
+ return err;
+}
+
+/**
+ * rvt_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ * @udata: user data or NULL for kernel object
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+{
+ struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
+ struct rvt_dev_info *rdi = cq->rdi;
+
+ flush_work(&cq->comptask);
+ spin_lock_irq(&rdi->n_cqs_lock);
+ rdi->n_cqs_allocated--;
+ spin_unlock_irq(&rdi->n_cqs_lock);
+ if (cq->ip)
+ kref_put(&cq->ip->ref, rvt_release_mmap_info);
+ else
+ vfree(cq->kqueue);
+ return 0;
+}
+
+/**
+ * rvt_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * This may be called from interrupt context. Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ *
+ * Return: 0 for success.
+ */
+int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+ struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&cq->lock, flags);
+ /*
+ * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+ * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+ */
+ if (cq->notify != IB_CQ_NEXT_COMP)
+ cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+ if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+ if (cq->queue) {
+ if (RDMA_READ_UAPI_ATOMIC(cq->queue->head) !=
+ RDMA_READ_UAPI_ATOMIC(cq->queue->tail))
+ ret = 1;
+ } else {
+ if (cq->kqueue->head != cq->kqueue->tail)
+ ret = 1;
+ }
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return ret;
+}
+
+/*
+ * rvt_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ *
+ * Return: 0 for success.
+ */
+int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+ struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
+ u32 head, tail, n;
+ int ret;
+ u32 sz;
+ struct rvt_dev_info *rdi = cq->rdi;
+ struct rvt_cq_wc *u_wc = NULL;
+ struct rvt_cq_wc *old_u_wc = NULL;
+ struct rvt_k_cq_wc *k_wc = NULL;
+ struct rvt_k_cq_wc *old_k_wc = NULL;
+
+ if (cqe < 1 || cqe > rdi->dparms.props.max_cqe)
+ return -EINVAL;
+
+ /*
+ * Need to use vmalloc() if we want to support large #s of entries.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ sz = sizeof(struct ib_uverbs_wc) * (cqe + 1);
+ sz += sizeof(*u_wc);
+ u_wc = vmalloc_user(sz);
+ if (!u_wc)
+ return -ENOMEM;
+ } else {
+ sz = sizeof(struct ib_wc) * (cqe + 1);
+ sz += sizeof(*k_wc);
+ k_wc = vzalloc_node(sz, rdi->dparms.node);
+ if (!k_wc)
+ return -ENOMEM;
+ }
+ /* Check that we can write the offset to mmap. */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ __u64 offset = 0;
+
+ ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+ if (ret)
+ goto bail_free;
+ }
+
+ spin_lock_irq(&cq->lock);
+ /*
+ * Make sure head and tail are sane since they
+ * might be user writable.
+ */
+ if (u_wc) {
+ old_u_wc = cq->queue;
+ head = RDMA_READ_UAPI_ATOMIC(old_u_wc->head);
+ tail = RDMA_READ_UAPI_ATOMIC(old_u_wc->tail);
+ } else {
+ old_k_wc = cq->kqueue;
+ head = old_k_wc->head;
+ tail = old_k_wc->tail;
+ }
+
+ if (head > (u32)cq->ibcq.cqe)
+ head = (u32)cq->ibcq.cqe;
+ if (tail > (u32)cq->ibcq.cqe)
+ tail = (u32)cq->ibcq.cqe;
+ if (head < tail)
+ n = cq->ibcq.cqe + 1 + head - tail;
+ else
+ n = head - tail;
+ if (unlikely((u32)cqe < n)) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ for (n = 0; tail != head; n++) {
+ if (u_wc)
+ u_wc->uqueue[n] = old_u_wc->uqueue[tail];
+ else
+ k_wc->kqueue[n] = old_k_wc->kqueue[tail];
+ if (tail == (u32)cq->ibcq.cqe)
+ tail = 0;
+ else
+ tail++;
+ }
+ cq->ibcq.cqe = cqe;
+ if (u_wc) {
+ RDMA_WRITE_UAPI_ATOMIC(u_wc->head, n);
+ RDMA_WRITE_UAPI_ATOMIC(u_wc->tail, 0);
+ cq->queue = u_wc;
+ } else {
+ k_wc->head = n;
+ k_wc->tail = 0;
+ cq->kqueue = k_wc;
+ }
+ spin_unlock_irq(&cq->lock);
+
+ if (u_wc)
+ vfree(old_u_wc);
+ else
+ vfree(old_k_wc);
+
+ if (cq->ip) {
+ struct rvt_mmap_info *ip = cq->ip;
+
+ rvt_update_mmap_info(rdi, ip, sz, u_wc);
+
+ /*
+ * Return the offset to mmap.
+ * See rvt_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ ret = ib_copy_to_udata(udata, &ip->offset,
+ sizeof(ip->offset));
+ if (ret)
+ return ret;
+ }
+
+ spin_lock_irq(&rdi->pending_lock);
+ if (list_empty(&ip->pending_mmaps))
+ list_add(&ip->pending_mmaps, &rdi->pending_mmaps);
+ spin_unlock_irq(&rdi->pending_lock);
+ }
+
+ return 0;
+
+bail_unlock:
+ spin_unlock_irq(&cq->lock);
+bail_free:
+ vfree(u_wc);
+ vfree(k_wc);
+
+ return ret;
+}
+
+/**
+ * rvt_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * This may be called from interrupt context. Also called by ib_poll_cq()
+ * in the generic verbs code.
+ *
+ * Return: the number of completion entries polled.
+ */
+int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+ struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
+ struct rvt_k_cq_wc *wc;
+ unsigned long flags;
+ int npolled;
+ u32 tail;
+
+ /* The kernel can only poll a kernel completion queue */
+ if (cq->ip)
+ return -EINVAL;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ wc = cq->kqueue;
+ tail = wc->tail;
+ if (tail > (u32)cq->ibcq.cqe)
+ tail = (u32)cq->ibcq.cqe;
+ for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+ if (tail == wc->head)
+ break;
+ /* The kernel doesn't need a RMB since it has the lock. */
+ trace_rvt_cq_poll(cq, &wc->kqueue[tail], npolled);
+ *entry = wc->kqueue[tail];
+ if (tail >= cq->ibcq.cqe)
+ tail = 0;
+ else
+ tail++;
+ }
+ wc->tail = tail;
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return npolled;
+}
+
+/**
+ * rvt_driver_cq_init - Init cq resources on behalf of driver
+ *
+ * Return: 0 on success
+ */
+int rvt_driver_cq_init(void)
+{
+ comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+ 0, "rdmavt_cq");
+ if (!comp_vector_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * rvt_cq_exit - tear down cq reources
+ */
+void rvt_cq_exit(void)
+{
+ destroy_workqueue(comp_vector_wq);
+ comp_vector_wq = NULL;
+}
diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h
new file mode 100644
index 0000000000..b0a948ec76
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/cq.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 - 2018 Intel Corporation.
+ */
+
+#ifndef DEF_RVTCQ_H
+#define DEF_RVTCQ_H
+
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_cq.h>
+
+int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata);
+int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
+int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+int rvt_driver_cq_init(void);
+void rvt_cq_exit(void);
+#endif /* DEF_RVTCQ_H */
diff --git a/drivers/infiniband/sw/rdmavt/mad.c b/drivers/infiniband/sw/rdmavt/mad.c
new file mode 100644
index 0000000000..98a8fe3b04
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/mad.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#include <rdma/ib_mad.h>
+#include "mad.h"
+#include "vt.h"
+
+/**
+ * rvt_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port_num: the port number this packet came in on, 1 based from ib core
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in: the incoming MAD
+ * @in_mad_size: size of the incoming MAD reply
+ * @out: any outgoing MAD reply
+ * @out_mad_size: size of the outgoing MAD reply
+ * @out_mad_pkey_index: unused
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ *
+ * Return: IB_MAD_RESULT_SUCCESS or error
+ */
+int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in, size_t in_mad_size,
+ struct ib_mad_hdr *out, size_t *out_mad_size,
+ u16 *out_mad_pkey_index)
+{
+ /*
+ * MAD processing is quite different between hfi1 and qib. Therefore
+ * this is expected to be provided by the driver. Other drivers in the
+ * future may choose to implement this but it should not be made into a
+ * requirement.
+ */
+ return IB_MAD_RESULT_FAILURE;
+}
+
+static void rvt_send_mad_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+/**
+ * rvt_create_mad_agents - create mad agents
+ * @rdi: rvt dev struct
+ *
+ * If driver needs to be notified of mad agent creation then call back
+ *
+ * Return 0 on success
+ */
+int rvt_create_mad_agents(struct rvt_dev_info *rdi)
+{
+ struct ib_mad_agent *agent;
+ struct rvt_ibport *rvp;
+ int p;
+ int ret;
+
+ for (p = 0; p < rdi->dparms.nports; p++) {
+ rvp = rdi->ports[p];
+ agent = ib_register_mad_agent(&rdi->ibdev, p + 1,
+ IB_QPT_SMI,
+ NULL, 0, rvt_send_mad_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ goto err;
+ }
+
+ rvp->send_agent = agent;
+
+ if (rdi->driver_f.notify_create_mad_agent)
+ rdi->driver_f.notify_create_mad_agent(rdi, p);
+ }
+
+ return 0;
+
+err:
+ for (p = 0; p < rdi->dparms.nports; p++) {
+ rvp = rdi->ports[p];
+ if (rvp->send_agent) {
+ agent = rvp->send_agent;
+ rvp->send_agent = NULL;
+ ib_unregister_mad_agent(agent);
+ if (rdi->driver_f.notify_free_mad_agent)
+ rdi->driver_f.notify_free_mad_agent(rdi, p);
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * rvt_free_mad_agents - free up mad agents
+ * @rdi: rvt dev struct
+ *
+ * If driver needs notification of mad agent removal make the call back
+ */
+void rvt_free_mad_agents(struct rvt_dev_info *rdi)
+{
+ struct ib_mad_agent *agent;
+ struct rvt_ibport *rvp;
+ int p;
+
+ for (p = 0; p < rdi->dparms.nports; p++) {
+ rvp = rdi->ports[p];
+ if (rvp->send_agent) {
+ agent = rvp->send_agent;
+ rvp->send_agent = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+ if (rvp->sm_ah) {
+ rdma_destroy_ah(&rvp->sm_ah->ibah,
+ RDMA_DESTROY_AH_SLEEPABLE);
+ rvp->sm_ah = NULL;
+ }
+
+ if (rdi->driver_f.notify_free_mad_agent)
+ rdi->driver_f.notify_free_mad_agent(rdi, p);
+ }
+}
+
diff --git a/drivers/infiniband/sw/rdmavt/mad.h b/drivers/infiniband/sw/rdmavt/mad.h
new file mode 100644
index 0000000000..368be29eab
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/mad.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#ifndef DEF_RVTMAD_H
+#define DEF_RVTMAD_H
+
+#include <rdma/rdma_vt.h>
+
+int rvt_process_mad(struct ib_device *ibdev, int mad_flags, u32 port_num,
+ const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+ const struct ib_mad_hdr *in, size_t in_mad_size,
+ struct ib_mad_hdr *out, size_t *out_mad_size,
+ u16 *out_mad_pkey_index);
+int rvt_create_mad_agents(struct rvt_dev_info *rdi);
+void rvt_free_mad_agents(struct rvt_dev_info *rdi);
+#endif /* DEF_RVTMAD_H */
diff --git a/drivers/infiniband/sw/rdmavt/mcast.c b/drivers/infiniband/sw/rdmavt/mcast.c
new file mode 100644
index 0000000000..a123874e1c
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/mcast.c
@@ -0,0 +1,401 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/rculist.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
+
+#include "mcast.h"
+
+/**
+ * rvt_driver_mcast_init - init resources for multicast
+ * @rdi: rvt dev struct
+ *
+ * This is per device that registers with rdmavt
+ */
+void rvt_driver_mcast_init(struct rvt_dev_info *rdi)
+{
+ /*
+ * Anything that needs setup for multicast on a per driver or per rdi
+ * basis should be done in here.
+ */
+ spin_lock_init(&rdi->n_mcast_grps_lock);
+}
+
+/**
+ * rvt_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct rvt_mcast_qp *rvt_mcast_qp_alloc(struct rvt_qp *qp)
+{
+ struct rvt_mcast_qp *mqp;
+
+ mqp = kmalloc(sizeof(*mqp), GFP_KERNEL);
+ if (!mqp)
+ goto bail;
+
+ mqp->qp = qp;
+ rvt_get_qp(qp);
+
+bail:
+ return mqp;
+}
+
+static void rvt_mcast_qp_free(struct rvt_mcast_qp *mqp)
+{
+ struct rvt_qp *qp = mqp->qp;
+
+ /* Notify hfi1_destroy_qp() if it is waiting. */
+ rvt_put_qp(qp);
+
+ kfree(mqp);
+}
+
+/**
+ * rvt_mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ * @lid: the muilticast LID (host order)
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct rvt_mcast *rvt_mcast_alloc(union ib_gid *mgid, u16 lid)
+{
+ struct rvt_mcast *mcast;
+
+ mcast = kzalloc(sizeof(*mcast), GFP_KERNEL);
+ if (!mcast)
+ goto bail;
+
+ mcast->mcast_addr.mgid = *mgid;
+ mcast->mcast_addr.lid = lid;
+
+ INIT_LIST_HEAD(&mcast->qp_list);
+ init_waitqueue_head(&mcast->wait);
+ atomic_set(&mcast->refcount, 0);
+
+bail:
+ return mcast;
+}
+
+static void rvt_mcast_free(struct rvt_mcast *mcast)
+{
+ struct rvt_mcast_qp *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+ rvt_mcast_qp_free(p);
+
+ kfree(mcast);
+}
+
+/**
+ * rvt_mcast_find - search the global table for the given multicast GID/LID
+ * NOTE: It is valid to have 1 MLID with multiple MGIDs. It is not valid
+ * to have 1 MGID with multiple MLIDs.
+ * @ibp: the IB port structure
+ * @mgid: the multicast GID to search for
+ * @lid: the multicast LID portion of the multicast address (host order)
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ *
+ * Return: NULL if not found.
+ */
+struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid,
+ u16 lid)
+{
+ struct rb_node *n;
+ unsigned long flags;
+ struct rvt_mcast *found = NULL;
+
+ spin_lock_irqsave(&ibp->lock, flags);
+ n = ibp->mcast_tree.rb_node;
+ while (n) {
+ int ret;
+ struct rvt_mcast *mcast;
+
+ mcast = rb_entry(n, struct rvt_mcast, rb_node);
+
+ ret = memcmp(mgid->raw, mcast->mcast_addr.mgid.raw,
+ sizeof(*mgid));
+ if (ret < 0) {
+ n = n->rb_left;
+ } else if (ret > 0) {
+ n = n->rb_right;
+ } else {
+ /* MGID/MLID must match */
+ if (mcast->mcast_addr.lid == lid) {
+ atomic_inc(&mcast->refcount);
+ found = mcast;
+ }
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&ibp->lock, flags);
+ return found;
+}
+EXPORT_SYMBOL(rvt_mcast_find);
+
+/*
+ * rvt_mcast_add - insert mcast GID into table and attach QP struct
+ * @mcast: the mcast GID table
+ * @mqp: the QP to attach
+ *
+ * Return: zero if both were added. Return EEXIST if the GID was already in
+ * the table but the QP was added. Return ESRCH if the QP was already
+ * attached and neither structure was added. Return EINVAL if the MGID was
+ * found, but the MLID did NOT match.
+ */
+static int rvt_mcast_add(struct rvt_dev_info *rdi, struct rvt_ibport *ibp,
+ struct rvt_mcast *mcast, struct rvt_mcast_qp *mqp)
+{
+ struct rb_node **n = &ibp->mcast_tree.rb_node;
+ struct rb_node *pn = NULL;
+ int ret;
+
+ spin_lock_irq(&ibp->lock);
+
+ while (*n) {
+ struct rvt_mcast *tmcast;
+ struct rvt_mcast_qp *p;
+
+ pn = *n;
+ tmcast = rb_entry(pn, struct rvt_mcast, rb_node);
+
+ ret = memcmp(mcast->mcast_addr.mgid.raw,
+ tmcast->mcast_addr.mgid.raw,
+ sizeof(mcast->mcast_addr.mgid));
+ if (ret < 0) {
+ n = &pn->rb_left;
+ continue;
+ }
+ if (ret > 0) {
+ n = &pn->rb_right;
+ continue;
+ }
+
+ if (tmcast->mcast_addr.lid != mcast->mcast_addr.lid) {
+ ret = EINVAL;
+ goto bail;
+ }
+
+ /* Search the QP list to see if this is already there. */
+ list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+ if (p->qp == mqp->qp) {
+ ret = ESRCH;
+ goto bail;
+ }
+ }
+ if (tmcast->n_attached ==
+ rdi->dparms.props.max_mcast_qp_attach) {
+ ret = ENOMEM;
+ goto bail;
+ }
+
+ tmcast->n_attached++;
+
+ list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+ ret = EEXIST;
+ goto bail;
+ }
+
+ spin_lock(&rdi->n_mcast_grps_lock);
+ if (rdi->n_mcast_grps_allocated == rdi->dparms.props.max_mcast_grp) {
+ spin_unlock(&rdi->n_mcast_grps_lock);
+ ret = ENOMEM;
+ goto bail;
+ }
+
+ rdi->n_mcast_grps_allocated++;
+ spin_unlock(&rdi->n_mcast_grps_lock);
+
+ mcast->n_attached++;
+
+ list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+ atomic_inc(&mcast->refcount);
+ rb_link_node(&mcast->rb_node, pn, n);
+ rb_insert_color(&mcast->rb_node, &ibp->mcast_tree);
+
+ ret = 0;
+
+bail:
+ spin_unlock_irq(&ibp->lock);
+
+ return ret;
+}
+
+/**
+ * rvt_attach_mcast - attach a qp to a multicast group
+ * @ibqp: Infiniband qp
+ * @gid: multicast guid
+ * @lid: multicast lid
+ *
+ * Return: 0 on success
+ */
+int rvt_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+ struct rvt_ibport *ibp = rdi->ports[qp->port_num - 1];
+ struct rvt_mcast *mcast;
+ struct rvt_mcast_qp *mqp;
+ int ret = -ENOMEM;
+
+ if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET)
+ return -EINVAL;
+
+ /*
+ * Allocate data structures since its better to do this outside of
+ * spin locks and it will most likely be needed.
+ */
+ mcast = rvt_mcast_alloc(gid, lid);
+ if (!mcast)
+ return -ENOMEM;
+
+ mqp = rvt_mcast_qp_alloc(qp);
+ if (!mqp)
+ goto bail_mcast;
+
+ switch (rvt_mcast_add(rdi, ibp, mcast, mqp)) {
+ case ESRCH:
+ /* Neither was used: OK to attach the same QP twice. */
+ ret = 0;
+ goto bail_mqp;
+ case EEXIST: /* The mcast wasn't used */
+ ret = 0;
+ goto bail_mcast;
+ case ENOMEM:
+ /* Exceeded the maximum number of mcast groups. */
+ ret = -ENOMEM;
+ goto bail_mqp;
+ case EINVAL:
+ /* Invalid MGID/MLID pair */
+ ret = -EINVAL;
+ goto bail_mqp;
+ default:
+ break;
+ }
+
+ return 0;
+
+bail_mqp:
+ rvt_mcast_qp_free(mqp);
+
+bail_mcast:
+ rvt_mcast_free(mcast);
+
+ return ret;
+}
+
+/**
+ * rvt_detach_mcast - remove a qp from a multicast group
+ * @ibqp: Infiniband qp
+ * @gid: multicast guid
+ * @lid: multicast lid
+ *
+ * Return: 0 on success
+ */
+int rvt_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+ struct rvt_ibport *ibp = rdi->ports[qp->port_num - 1];
+ struct rvt_mcast *mcast = NULL;
+ struct rvt_mcast_qp *p, *tmp, *delp = NULL;
+ struct rb_node *n;
+ int last = 0;
+ int ret = 0;
+
+ if (ibqp->qp_num <= 1)
+ return -EINVAL;
+
+ spin_lock_irq(&ibp->lock);
+
+ /* Find the GID in the mcast table. */
+ n = ibp->mcast_tree.rb_node;
+ while (1) {
+ if (!n) {
+ spin_unlock_irq(&ibp->lock);
+ return -EINVAL;
+ }
+
+ mcast = rb_entry(n, struct rvt_mcast, rb_node);
+ ret = memcmp(gid->raw, mcast->mcast_addr.mgid.raw,
+ sizeof(*gid));
+ if (ret < 0) {
+ n = n->rb_left;
+ } else if (ret > 0) {
+ n = n->rb_right;
+ } else {
+ /* MGID/MLID must match */
+ if (mcast->mcast_addr.lid != lid) {
+ spin_unlock_irq(&ibp->lock);
+ return -EINVAL;
+ }
+ break;
+ }
+ }
+
+ /* Search the QP list. */
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+ if (p->qp != qp)
+ continue;
+ /*
+ * We found it, so remove it, but don't poison the forward
+ * link until we are sure there are no list walkers.
+ */
+ list_del_rcu(&p->list);
+ mcast->n_attached--;
+ delp = p;
+
+ /* If this was the last attached QP, remove the GID too. */
+ if (list_empty(&mcast->qp_list)) {
+ rb_erase(&mcast->rb_node, &ibp->mcast_tree);
+ last = 1;
+ }
+ break;
+ }
+
+ spin_unlock_irq(&ibp->lock);
+ /* QP not attached */
+ if (!delp)
+ return -EINVAL;
+
+ /*
+ * Wait for any list walkers to finish before freeing the
+ * list element.
+ */
+ wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+ rvt_mcast_qp_free(delp);
+
+ if (last) {
+ atomic_dec(&mcast->refcount);
+ wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+ rvt_mcast_free(mcast);
+ spin_lock_irq(&rdi->n_mcast_grps_lock);
+ rdi->n_mcast_grps_allocated--;
+ spin_unlock_irq(&rdi->n_mcast_grps_lock);
+ }
+
+ return 0;
+}
+
+/**
+ * rvt_mcast_tree_empty - determine if any qps are attached to any mcast group
+ * @rdi: rvt dev struct
+ *
+ * Return: in use count
+ */
+int rvt_mcast_tree_empty(struct rvt_dev_info *rdi)
+{
+ int i;
+ int in_use = 0;
+
+ for (i = 0; i < rdi->dparms.nports; i++)
+ if (rdi->ports[i]->mcast_tree.rb_node)
+ in_use++;
+ return in_use;
+}
diff --git a/drivers/infiniband/sw/rdmavt/mcast.h b/drivers/infiniband/sw/rdmavt/mcast.h
new file mode 100644
index 0000000000..b96d86f962
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/mcast.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#ifndef DEF_RVTMCAST_H
+#define DEF_RVTMCAST_H
+
+#include <rdma/rdma_vt.h>
+
+void rvt_driver_mcast_init(struct rvt_dev_info *rdi);
+int rvt_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+int rvt_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+int rvt_mcast_tree_empty(struct rvt_dev_info *rdi);
+
+#endif /* DEF_RVTMCAST_H */
diff --git a/drivers/infiniband/sw/rdmavt/mmap.c b/drivers/infiniband/sw/rdmavt/mmap.c
new file mode 100644
index 0000000000..4d2238f3f3
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/mmap.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <rdma/uverbs_ioctl.h>
+#include "mmap.h"
+
+/**
+ * rvt_mmap_init - init link list and lock for mem map
+ * @rdi: rvt dev struct
+ */
+void rvt_mmap_init(struct rvt_dev_info *rdi)
+{
+ INIT_LIST_HEAD(&rdi->pending_mmaps);
+ spin_lock_init(&rdi->pending_lock);
+ rdi->mmap_offset = PAGE_SIZE;
+ spin_lock_init(&rdi->mmap_offset_lock);
+}
+
+/**
+ * rvt_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct rvt_mmap_info
+ */
+void rvt_release_mmap_info(struct kref *ref)
+{
+ struct rvt_mmap_info *ip =
+ container_of(ref, struct rvt_mmap_info, ref);
+ struct rvt_dev_info *rdi = ib_to_rvt(ip->context->device);
+
+ spin_lock_irq(&rdi->pending_lock);
+ list_del(&ip->pending_mmaps);
+ spin_unlock_irq(&rdi->pending_lock);
+
+ vfree(ip->obj);
+ kfree(ip);
+}
+
+static void rvt_vma_open(struct vm_area_struct *vma)
+{
+ struct rvt_mmap_info *ip = vma->vm_private_data;
+
+ kref_get(&ip->ref);
+}
+
+static void rvt_vma_close(struct vm_area_struct *vma)
+{
+ struct rvt_mmap_info *ip = vma->vm_private_data;
+
+ kref_put(&ip->ref, rvt_release_mmap_info);
+}
+
+static const struct vm_operations_struct rvt_vm_ops = {
+ .open = rvt_vma_open,
+ .close = rvt_vma_close,
+};
+
+/**
+ * rvt_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ *
+ * Return: zero if the mmap is OK. Otherwise, return an errno.
+ */
+int rvt_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(context->device);
+ unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long size = vma->vm_end - vma->vm_start;
+ struct rvt_mmap_info *ip, *pp;
+ int ret = -EINVAL;
+
+ /*
+ * Search the device's list of objects waiting for a mmap call.
+ * Normally, this list is very short since a call to create a
+ * CQ, QP, or SRQ is soon followed by a call to mmap().
+ */
+ spin_lock_irq(&rdi->pending_lock);
+ list_for_each_entry_safe(ip, pp, &rdi->pending_mmaps,
+ pending_mmaps) {
+ /* Only the creator is allowed to mmap the object */
+ if (context != ip->context || (__u64)offset != ip->offset)
+ continue;
+ /* Don't allow a mmap larger than the object. */
+ if (size > ip->size)
+ break;
+
+ list_del_init(&ip->pending_mmaps);
+ spin_unlock_irq(&rdi->pending_lock);
+
+ ret = remap_vmalloc_range(vma, ip->obj, 0);
+ if (ret)
+ goto done;
+ vma->vm_ops = &rvt_vm_ops;
+ vma->vm_private_data = ip;
+ rvt_vma_open(vma);
+ goto done;
+ }
+ spin_unlock_irq(&rdi->pending_lock);
+done:
+ return ret;
+}
+
+/**
+ * rvt_create_mmap_info - allocate information for hfi1_mmap
+ * @rdi: rvt dev struct
+ * @size: size in bytes to map
+ * @udata: user data (must be valid!)
+ * @obj: opaque pointer to a cq, wq etc
+ *
+ * Return: rvt_mmap struct on success, ERR_PTR on failure
+ */
+struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, u32 size,
+ struct ib_udata *udata, void *obj)
+{
+ struct rvt_mmap_info *ip;
+
+ if (!udata)
+ return ERR_PTR(-EINVAL);
+
+ ip = kmalloc_node(sizeof(*ip), GFP_KERNEL, rdi->dparms.node);
+ if (!ip)
+ return ERR_PTR(-ENOMEM);
+
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irq(&rdi->mmap_offset_lock);
+ if (rdi->mmap_offset == 0)
+ rdi->mmap_offset = ALIGN(PAGE_SIZE, SHMLBA);
+ ip->offset = rdi->mmap_offset;
+ rdi->mmap_offset += ALIGN(size, SHMLBA);
+ spin_unlock_irq(&rdi->mmap_offset_lock);
+
+ INIT_LIST_HEAD(&ip->pending_mmaps);
+ ip->size = size;
+ ip->context =
+ container_of(udata, struct uverbs_attr_bundle, driver_udata)
+ ->context;
+ ip->obj = obj;
+ kref_init(&ip->ref);
+
+ return ip;
+}
+
+/**
+ * rvt_update_mmap_info - update a mem map
+ * @rdi: rvt dev struct
+ * @ip: mmap info pointer
+ * @size: size to grow by
+ * @obj: opaque pointer to cq, wq, etc.
+ */
+void rvt_update_mmap_info(struct rvt_dev_info *rdi, struct rvt_mmap_info *ip,
+ u32 size, void *obj)
+{
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irq(&rdi->mmap_offset_lock);
+ if (rdi->mmap_offset == 0)
+ rdi->mmap_offset = PAGE_SIZE;
+ ip->offset = rdi->mmap_offset;
+ rdi->mmap_offset += size;
+ spin_unlock_irq(&rdi->mmap_offset_lock);
+
+ ip->size = size;
+ ip->obj = obj;
+}
diff --git a/drivers/infiniband/sw/rdmavt/mmap.h b/drivers/infiniband/sw/rdmavt/mmap.h
new file mode 100644
index 0000000000..7e92cf28e0
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/mmap.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#ifndef DEF_RDMAVTMMAP_H
+#define DEF_RDMAVTMMAP_H
+
+#include <rdma/rdma_vt.h>
+
+void rvt_mmap_init(struct rvt_dev_info *rdi);
+void rvt_release_mmap_info(struct kref *ref);
+int rvt_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, u32 size,
+ struct ib_udata *udata, void *obj);
+void rvt_update_mmap_info(struct rvt_dev_info *rdi, struct rvt_mmap_info *ip,
+ u32 size, void *obj);
+
+#endif /* DEF_RDMAVTMMAP_H */
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
new file mode 100644
index 0000000000..8a1f2e2851
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -0,0 +1,917 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <rdma/ib_umem.h>
+#include <rdma/rdma_vt.h>
+#include "vt.h"
+#include "mr.h"
+#include "trace.h"
+
+/**
+ * rvt_driver_mr_init - Init MR resources per driver
+ * @rdi: rvt dev struct
+ *
+ * Do any intilization needed when a driver registers with rdmavt.
+ *
+ * Return: 0 on success or errno on failure
+ */
+int rvt_driver_mr_init(struct rvt_dev_info *rdi)
+{
+ unsigned int lkey_table_size = rdi->dparms.lkey_table_size;
+ unsigned lk_tab_size;
+ int i;
+
+ /*
+ * The top hfi1_lkey_table_size bits are used to index the
+ * table. The lower 8 bits can be owned by the user (copied from
+ * the LKEY). The remaining bits act as a generation number or tag.
+ */
+ if (!lkey_table_size)
+ return -EINVAL;
+
+ spin_lock_init(&rdi->lkey_table.lock);
+
+ /* ensure generation is at least 4 bits */
+ if (lkey_table_size > RVT_MAX_LKEY_TABLE_BITS) {
+ rvt_pr_warn(rdi, "lkey bits %u too large, reduced to %u\n",
+ lkey_table_size, RVT_MAX_LKEY_TABLE_BITS);
+ rdi->dparms.lkey_table_size = RVT_MAX_LKEY_TABLE_BITS;
+ lkey_table_size = rdi->dparms.lkey_table_size;
+ }
+ rdi->lkey_table.max = 1 << lkey_table_size;
+ rdi->lkey_table.shift = 32 - lkey_table_size;
+ lk_tab_size = rdi->lkey_table.max * sizeof(*rdi->lkey_table.table);
+ rdi->lkey_table.table = (struct rvt_mregion __rcu **)
+ vmalloc_node(lk_tab_size, rdi->dparms.node);
+ if (!rdi->lkey_table.table)
+ return -ENOMEM;
+
+ RCU_INIT_POINTER(rdi->dma_mr, NULL);
+ for (i = 0; i < rdi->lkey_table.max; i++)
+ RCU_INIT_POINTER(rdi->lkey_table.table[i], NULL);
+
+ rdi->dparms.props.max_mr = rdi->lkey_table.max;
+ return 0;
+}
+
+/**
+ * rvt_mr_exit - clean up MR
+ * @rdi: rvt dev structure
+ *
+ * called when drivers have unregistered or perhaps failed to register with us
+ */
+void rvt_mr_exit(struct rvt_dev_info *rdi)
+{
+ if (rdi->dma_mr)
+ rvt_pr_err(rdi, "DMA MR not null!\n");
+
+ vfree(rdi->lkey_table.table);
+}
+
+static void rvt_deinit_mregion(struct rvt_mregion *mr)
+{
+ int i = mr->mapsz;
+
+ mr->mapsz = 0;
+ while (i)
+ kfree(mr->map[--i]);
+ percpu_ref_exit(&mr->refcount);
+}
+
+static void __rvt_mregion_complete(struct percpu_ref *ref)
+{
+ struct rvt_mregion *mr = container_of(ref, struct rvt_mregion,
+ refcount);
+
+ complete(&mr->comp);
+}
+
+static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd,
+ int count, unsigned int percpu_flags)
+{
+ int m, i = 0;
+ struct rvt_dev_info *dev = ib_to_rvt(pd->device);
+
+ mr->mapsz = 0;
+ m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
+ for (; i < m; i++) {
+ mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL,
+ dev->dparms.node);
+ if (!mr->map[i])
+ goto bail;
+ mr->mapsz++;
+ }
+ init_completion(&mr->comp);
+ /* count returning the ptr to user */
+ if (percpu_ref_init(&mr->refcount, &__rvt_mregion_complete,
+ percpu_flags, GFP_KERNEL))
+ goto bail;
+
+ atomic_set(&mr->lkey_invalid, 0);
+ mr->pd = pd;
+ mr->max_segs = count;
+ return 0;
+bail:
+ rvt_deinit_mregion(mr);
+ return -ENOMEM;
+}
+
+/**
+ * rvt_alloc_lkey - allocate an lkey
+ * @mr: memory region that this lkey protects
+ * @dma_region: 0->normal key, 1->restricted DMA key
+ *
+ * Returns 0 if successful, otherwise returns -errno.
+ *
+ * Increments mr reference count as required.
+ *
+ * Sets the lkey field mr for non-dma regions.
+ *
+ */
+static int rvt_alloc_lkey(struct rvt_mregion *mr, int dma_region)
+{
+ unsigned long flags;
+ u32 r;
+ u32 n;
+ int ret = 0;
+ struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device);
+ struct rvt_lkey_table *rkt = &dev->lkey_table;
+
+ rvt_get_mr(mr);
+ spin_lock_irqsave(&rkt->lock, flags);
+
+ /* special case for dma_mr lkey == 0 */
+ if (dma_region) {
+ struct rvt_mregion *tmr;
+
+ tmr = rcu_access_pointer(dev->dma_mr);
+ if (!tmr) {
+ mr->lkey_published = 1;
+ /* Insure published written first */
+ rcu_assign_pointer(dev->dma_mr, mr);
+ rvt_get_mr(mr);
+ }
+ goto success;
+ }
+
+ /* Find the next available LKEY */
+ r = rkt->next;
+ n = r;
+ for (;;) {
+ if (!rcu_access_pointer(rkt->table[r]))
+ break;
+ r = (r + 1) & (rkt->max - 1);
+ if (r == n)
+ goto bail;
+ }
+ rkt->next = (r + 1) & (rkt->max - 1);
+ /*
+ * Make sure lkey is never zero which is reserved to indicate an
+ * unrestricted LKEY.
+ */
+ rkt->gen++;
+ /*
+ * bits are capped to ensure enough bits for generation number
+ */
+ mr->lkey = (r << (32 - dev->dparms.lkey_table_size)) |
+ ((((1 << (24 - dev->dparms.lkey_table_size)) - 1) & rkt->gen)
+ << 8);
+ if (mr->lkey == 0) {
+ mr->lkey |= 1 << 8;
+ rkt->gen++;
+ }
+ mr->lkey_published = 1;
+ /* Insure published written first */
+ rcu_assign_pointer(rkt->table[r], mr);
+success:
+ spin_unlock_irqrestore(&rkt->lock, flags);
+out:
+ return ret;
+bail:
+ rvt_put_mr(mr);
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ ret = -ENOMEM;
+ goto out;
+}
+
+/**
+ * rvt_free_lkey - free an lkey
+ * @mr: mr to free from tables
+ */
+static void rvt_free_lkey(struct rvt_mregion *mr)
+{
+ unsigned long flags;
+ u32 lkey = mr->lkey;
+ u32 r;
+ struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device);
+ struct rvt_lkey_table *rkt = &dev->lkey_table;
+ int freed = 0;
+
+ spin_lock_irqsave(&rkt->lock, flags);
+ if (!lkey) {
+ if (mr->lkey_published) {
+ mr->lkey_published = 0;
+ /* insure published is written before pointer */
+ rcu_assign_pointer(dev->dma_mr, NULL);
+ rvt_put_mr(mr);
+ }
+ } else {
+ if (!mr->lkey_published)
+ goto out;
+ r = lkey >> (32 - dev->dparms.lkey_table_size);
+ mr->lkey_published = 0;
+ /* insure published is written before pointer */
+ rcu_assign_pointer(rkt->table[r], NULL);
+ }
+ freed++;
+out:
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ if (freed)
+ percpu_ref_kill(&mr->refcount);
+}
+
+static struct rvt_mr *__rvt_alloc_mr(int count, struct ib_pd *pd)
+{
+ struct rvt_mr *mr;
+ int rval = -ENOMEM;
+ int m;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
+ mr = kzalloc(struct_size(mr, mr.map, m), GFP_KERNEL);
+ if (!mr)
+ goto bail;
+
+ rval = rvt_init_mregion(&mr->mr, pd, count, 0);
+ if (rval)
+ goto bail;
+ /*
+ * ib_reg_phys_mr() will initialize mr->ibmr except for
+ * lkey and rkey.
+ */
+ rval = rvt_alloc_lkey(&mr->mr, 0);
+ if (rval)
+ goto bail_mregion;
+ mr->ibmr.lkey = mr->mr.lkey;
+ mr->ibmr.rkey = mr->mr.lkey;
+done:
+ return mr;
+
+bail_mregion:
+ rvt_deinit_mregion(&mr->mr);
+bail:
+ kfree(mr);
+ mr = ERR_PTR(rval);
+ goto done;
+}
+
+static void __rvt_free_mr(struct rvt_mr *mr)
+{
+ rvt_free_lkey(&mr->mr);
+ rvt_deinit_mregion(&mr->mr);
+ kfree(mr);
+}
+
+/**
+ * rvt_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Return: the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct rvt_mr *mr;
+ struct ib_mr *ret;
+ int rval;
+
+ if (ibpd_to_rvtpd(pd)->user)
+ return ERR_PTR(-EPERM);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ rval = rvt_init_mregion(&mr->mr, pd, 0, 0);
+ if (rval) {
+ ret = ERR_PTR(rval);
+ goto bail;
+ }
+
+ rval = rvt_alloc_lkey(&mr->mr, 1);
+ if (rval) {
+ ret = ERR_PTR(rval);
+ goto bail_mregion;
+ }
+
+ mr->mr.access_flags = acc;
+ ret = &mr->ibmr;
+done:
+ return ret;
+
+bail_mregion:
+ rvt_deinit_mregion(&mr->mr);
+bail:
+ kfree(mr);
+ goto done;
+}
+
+/**
+ * rvt_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @virt_addr: associated virtual address
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the driver
+ *
+ * Return: the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata)
+{
+ struct rvt_mr *mr;
+ struct ib_umem *umem;
+ struct sg_page_iter sg_iter;
+ int n, m;
+ struct ib_mr *ret;
+
+ if (length == 0)
+ return ERR_PTR(-EINVAL);
+
+ umem = ib_umem_get(pd->device, start, length, mr_access_flags);
+ if (IS_ERR(umem))
+ return (void *)umem;
+
+ n = ib_umem_num_pages(umem);
+
+ mr = __rvt_alloc_mr(n, pd);
+ if (IS_ERR(mr)) {
+ ret = (struct ib_mr *)mr;
+ goto bail_umem;
+ }
+
+ mr->mr.user_base = start;
+ mr->mr.iova = virt_addr;
+ mr->mr.length = length;
+ mr->mr.offset = ib_umem_offset(umem);
+ mr->mr.access_flags = mr_access_flags;
+ mr->umem = umem;
+
+ mr->mr.page_shift = PAGE_SHIFT;
+ m = 0;
+ n = 0;
+ for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) {
+ void *vaddr;
+
+ vaddr = page_address(sg_page_iter_page(&sg_iter));
+ if (!vaddr) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail_inval;
+ }
+ mr->mr.map[m]->segs[n].vaddr = vaddr;
+ mr->mr.map[m]->segs[n].length = PAGE_SIZE;
+ trace_rvt_mr_user_seg(&mr->mr, m, n, vaddr, PAGE_SIZE);
+ if (++n == RVT_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ return &mr->ibmr;
+
+bail_inval:
+ __rvt_free_mr(mr);
+
+bail_umem:
+ ib_umem_release(umem);
+
+ return ret;
+}
+
+/**
+ * rvt_dereg_clean_qp_cb - callback from iterator
+ * @qp: the qp
+ * @v: the mregion (as u64)
+ *
+ * This routine fields the callback for all QPs and
+ * for QPs in the same PD as the MR will call the
+ * rvt_qp_mr_clean() to potentially cleanup references.
+ */
+static void rvt_dereg_clean_qp_cb(struct rvt_qp *qp, u64 v)
+{
+ struct rvt_mregion *mr = (struct rvt_mregion *)v;
+
+ /* skip PDs that are not ours */
+ if (mr->pd != qp->ibqp.pd)
+ return;
+ rvt_qp_mr_clean(qp, mr->lkey);
+}
+
+/**
+ * rvt_dereg_clean_qps - find QPs for reference cleanup
+ * @mr: the MR that is being deregistered
+ *
+ * This routine iterates RC QPs looking for references
+ * to the lkey noted in mr.
+ */
+static void rvt_dereg_clean_qps(struct rvt_mregion *mr)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device);
+
+ rvt_qp_iter(rdi, (u64)mr, rvt_dereg_clean_qp_cb);
+}
+
+/**
+ * rvt_check_refs - check references
+ * @mr: the megion
+ * @t: the caller identification
+ *
+ * This routine checks MRs holding a reference during
+ * when being de-registered.
+ *
+ * If the count is non-zero, the code calls a clean routine then
+ * waits for the timeout for the count to zero.
+ */
+static int rvt_check_refs(struct rvt_mregion *mr, const char *t)
+{
+ unsigned long timeout;
+ struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device);
+
+ if (mr->lkey) {
+ /* avoid dma mr */
+ rvt_dereg_clean_qps(mr);
+ /* @mr was indexed on rcu protected @lkey_table */
+ synchronize_rcu();
+ }
+
+ timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ);
+ if (!timeout) {
+ rvt_pr_err(rdi,
+ "%s timeout mr %p pd %p lkey %x refcount %ld\n",
+ t, mr, mr->pd, mr->lkey,
+ atomic_long_read(&mr->refcount.data->count));
+ rvt_get_mr(mr);
+ return -EBUSY;
+ }
+ return 0;
+}
+
+/**
+ * rvt_mr_has_lkey - is MR
+ * @mr: the mregion
+ * @lkey: the lkey
+ */
+bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey)
+{
+ return mr && lkey == mr->lkey;
+}
+
+/**
+ * rvt_ss_has_lkey - is mr in sge tests
+ * @ss: the sge state
+ * @lkey: the lkey
+ *
+ * This code tests for an MR in the indicated
+ * sge state.
+ */
+bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey)
+{
+ int i;
+ bool rval = false;
+
+ if (!ss->num_sge)
+ return rval;
+ /* first one */
+ rval = rvt_mr_has_lkey(ss->sge.mr, lkey);
+ /* any others */
+ for (i = 0; !rval && i < ss->num_sge - 1; i++)
+ rval = rvt_mr_has_lkey(ss->sg_list[i].mr, lkey);
+ return rval;
+}
+
+/**
+ * rvt_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ * @udata: unused by the driver
+ *
+ * Note that this is called to free MRs created by rvt_get_dma_mr()
+ * or rvt_reg_user_mr().
+ *
+ * Returns 0 on success.
+ */
+int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+ struct rvt_mr *mr = to_imr(ibmr);
+ int ret;
+
+ rvt_free_lkey(&mr->mr);
+
+ rvt_put_mr(&mr->mr); /* will set completion if last */
+ ret = rvt_check_refs(&mr->mr, __func__);
+ if (ret)
+ goto out;
+ rvt_deinit_mregion(&mr->mr);
+ ib_umem_release(mr->umem);
+ kfree(mr);
+out:
+ return ret;
+}
+
+/**
+ * rvt_alloc_mr - Allocate a memory region usable with the
+ * @pd: protection domain for this memory region
+ * @mr_type: mem region type
+ * @max_num_sg: Max number of segments allowed
+ *
+ * Return: the memory region on success, otherwise return an errno.
+ */
+struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+ u32 max_num_sg)
+{
+ struct rvt_mr *mr;
+
+ if (mr_type != IB_MR_TYPE_MEM_REG)
+ return ERR_PTR(-EINVAL);
+
+ mr = __rvt_alloc_mr(max_num_sg, pd);
+ if (IS_ERR(mr))
+ return (struct ib_mr *)mr;
+
+ return &mr->ibmr;
+}
+
+/**
+ * rvt_set_page - page assignment function called by ib_sg_to_pages
+ * @ibmr: memory region
+ * @addr: dma address of mapped page
+ *
+ * Return: 0 on success
+ */
+static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
+{
+ struct rvt_mr *mr = to_imr(ibmr);
+ u32 ps = 1 << mr->mr.page_shift;
+ u32 mapped_segs = mr->mr.length >> mr->mr.page_shift;
+ int m, n;
+
+ if (unlikely(mapped_segs == mr->mr.max_segs))
+ return -ENOMEM;
+
+ m = mapped_segs / RVT_SEGSZ;
+ n = mapped_segs % RVT_SEGSZ;
+ mr->mr.map[m]->segs[n].vaddr = (void *)addr;
+ mr->mr.map[m]->segs[n].length = ps;
+ mr->mr.length += ps;
+ trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
+
+ return 0;
+}
+
+/**
+ * rvt_map_mr_sg - map sg list and set it the memory region
+ * @ibmr: memory region
+ * @sg: dma mapped scatterlist
+ * @sg_nents: number of entries in sg
+ * @sg_offset: offset in bytes into sg
+ *
+ * Overwrite rvt_mr length with mr length calculated by ib_sg_to_pages.
+ *
+ * Return: number of sg elements mapped to the memory region
+ */
+int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+ int sg_nents, unsigned int *sg_offset)
+{
+ struct rvt_mr *mr = to_imr(ibmr);
+ int ret;
+
+ mr->mr.length = 0;
+ mr->mr.page_shift = PAGE_SHIFT;
+ ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rvt_set_page);
+ mr->mr.user_base = ibmr->iova;
+ mr->mr.iova = ibmr->iova;
+ mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr;
+ mr->mr.length = (size_t)ibmr->length;
+ trace_rvt_map_mr_sg(ibmr, sg_nents, sg_offset);
+ return ret;
+}
+
+/**
+ * rvt_fast_reg_mr - fast register physical MR
+ * @qp: the queue pair where the work request comes from
+ * @ibmr: the memory region to be registered
+ * @key: updated key for this memory region
+ * @access: access flags for this memory region
+ *
+ * Returns 0 on success.
+ */
+int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
+ int access)
+{
+ struct rvt_mr *mr = to_imr(ibmr);
+
+ if (qp->ibqp.pd != mr->mr.pd)
+ return -EACCES;
+
+ /* not applicable to dma MR or user MR */
+ if (!mr->mr.lkey || mr->umem)
+ return -EINVAL;
+
+ if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00))
+ return -EINVAL;
+
+ ibmr->lkey = key;
+ ibmr->rkey = key;
+ mr->mr.lkey = key;
+ mr->mr.access_flags = access;
+ mr->mr.iova = ibmr->iova;
+ atomic_set(&mr->mr.lkey_invalid, 0);
+
+ return 0;
+}
+EXPORT_SYMBOL(rvt_fast_reg_mr);
+
+/**
+ * rvt_invalidate_rkey - invalidate an MR rkey
+ * @qp: queue pair associated with the invalidate op
+ * @rkey: rkey to invalidate
+ *
+ * Returns 0 on success.
+ */
+int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey)
+{
+ struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
+ struct rvt_lkey_table *rkt = &dev->lkey_table;
+ struct rvt_mregion *mr;
+
+ if (rkey == 0)
+ return -EINVAL;
+
+ rcu_read_lock();
+ mr = rcu_dereference(
+ rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
+ if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+ goto bail;
+
+ atomic_set(&mr->lkey_invalid, 1);
+ rcu_read_unlock();
+ return 0;
+
+bail:
+ rcu_read_unlock();
+ return -EINVAL;
+}
+EXPORT_SYMBOL(rvt_invalidate_rkey);
+
+/**
+ * rvt_sge_adjacent - is isge compressible
+ * @last_sge: last outgoing SGE written
+ * @sge: SGE to check
+ *
+ * If adjacent will update last_sge to add length.
+ *
+ * Return: true if isge is adjacent to last sge
+ */
+static inline bool rvt_sge_adjacent(struct rvt_sge *last_sge,
+ struct ib_sge *sge)
+{
+ if (last_sge && sge->lkey == last_sge->mr->lkey &&
+ ((uint64_t)(last_sge->vaddr + last_sge->length) == sge->addr)) {
+ if (sge->lkey) {
+ if (unlikely((sge->addr - last_sge->mr->user_base +
+ sge->length > last_sge->mr->length)))
+ return false; /* overrun, caller will catch */
+ } else {
+ last_sge->length += sge->length;
+ }
+ last_sge->sge_length += sge->length;
+ trace_rvt_sge_adjacent(last_sge, sge);
+ return true;
+ }
+ return false;
+}
+
+/**
+ * rvt_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @pd: protection domain
+ * @isge: outgoing internal SGE
+ * @last_sge: last outgoing SGE written
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ *
+ * Increments the reference count when a new sge is stored.
+ *
+ * Return: 0 if compressed, 1 if added , otherwise returns -errno.
+ */
+int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
+ struct rvt_sge *isge, struct rvt_sge *last_sge,
+ struct ib_sge *sge, int acc)
+{
+ struct rvt_mregion *mr;
+ unsigned n, m;
+ size_t off;
+
+ /*
+ * We use LKEY == zero for kernel virtual addresses
+ * (see rvt_get_dma_mr()).
+ */
+ if (sge->lkey == 0) {
+ struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device);
+
+ if (pd->user)
+ return -EINVAL;
+ if (rvt_sge_adjacent(last_sge, sge))
+ return 0;
+ rcu_read_lock();
+ mr = rcu_dereference(dev->dma_mr);
+ if (!mr)
+ goto bail;
+ rvt_get_mr(mr);
+ rcu_read_unlock();
+
+ isge->mr = mr;
+ isge->vaddr = (void *)sge->addr;
+ isge->length = sge->length;
+ isge->sge_length = sge->length;
+ isge->m = 0;
+ isge->n = 0;
+ goto ok;
+ }
+ if (rvt_sge_adjacent(last_sge, sge))
+ return 0;
+ rcu_read_lock();
+ mr = rcu_dereference(rkt->table[sge->lkey >> rkt->shift]);
+ if (!mr)
+ goto bail;
+ rvt_get_mr(mr);
+ if (!READ_ONCE(mr->lkey_published))
+ goto bail_unref;
+
+ if (unlikely(atomic_read(&mr->lkey_invalid) ||
+ mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
+ goto bail_unref;
+
+ off = sge->addr - mr->user_base;
+ if (unlikely(sge->addr < mr->user_base ||
+ off + sge->length > mr->length ||
+ (mr->access_flags & acc) != acc))
+ goto bail_unref;
+ rcu_read_unlock();
+
+ off += mr->offset;
+ if (mr->page_shift) {
+ /*
+ * page sizes are uniform power of 2 so no loop is necessary
+ * entries_spanned_by_off is the number of times the loop below
+ * would have executed.
+ */
+ size_t entries_spanned_by_off;
+
+ entries_spanned_by_off = off >> mr->page_shift;
+ off -= (entries_spanned_by_off << mr->page_shift);
+ m = entries_spanned_by_off / RVT_SEGSZ;
+ n = entries_spanned_by_off % RVT_SEGSZ;
+ } else {
+ m = 0;
+ n = 0;
+ while (off >= mr->map[m]->segs[n].length) {
+ off -= mr->map[m]->segs[n].length;
+ n++;
+ if (n >= RVT_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ }
+ isge->mr = mr;
+ isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+ isge->length = mr->map[m]->segs[n].length - off;
+ isge->sge_length = sge->length;
+ isge->m = m;
+ isge->n = n;
+ok:
+ trace_rvt_sge_new(isge, sge);
+ return 1;
+bail_unref:
+ rvt_put_mr(mr);
+bail:
+ rcu_read_unlock();
+ return -EINVAL;
+}
+EXPORT_SYMBOL(rvt_lkey_ok);
+
+/**
+ * rvt_rkey_ok - check the IB virtual address, length, and RKEY
+ * @qp: qp for validation
+ * @sge: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return: 1 if successful, otherwise 0.
+ *
+ * increments the reference count upon success
+ */
+int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
+ u32 len, u64 vaddr, u32 rkey, int acc)
+{
+ struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
+ struct rvt_lkey_table *rkt = &dev->lkey_table;
+ struct rvt_mregion *mr;
+ unsigned n, m;
+ size_t off;
+
+ /*
+ * We use RKEY == zero for kernel virtual addresses
+ * (see rvt_get_dma_mr()).
+ */
+ rcu_read_lock();
+ if (rkey == 0) {
+ struct rvt_pd *pd = ibpd_to_rvtpd(qp->ibqp.pd);
+ struct rvt_dev_info *rdi = ib_to_rvt(pd->ibpd.device);
+
+ if (pd->user)
+ goto bail;
+ mr = rcu_dereference(rdi->dma_mr);
+ if (!mr)
+ goto bail;
+ rvt_get_mr(mr);
+ rcu_read_unlock();
+
+ sge->mr = mr;
+ sge->vaddr = (void *)vaddr;
+ sge->length = len;
+ sge->sge_length = len;
+ sge->m = 0;
+ sge->n = 0;
+ goto ok;
+ }
+
+ mr = rcu_dereference(rkt->table[rkey >> rkt->shift]);
+ if (!mr)
+ goto bail;
+ rvt_get_mr(mr);
+ /* insure mr read is before test */
+ if (!READ_ONCE(mr->lkey_published))
+ goto bail_unref;
+ if (unlikely(atomic_read(&mr->lkey_invalid) ||
+ mr->lkey != rkey || qp->ibqp.pd != mr->pd))
+ goto bail_unref;
+
+ off = vaddr - mr->iova;
+ if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+ (mr->access_flags & acc) == 0))
+ goto bail_unref;
+ rcu_read_unlock();
+
+ off += mr->offset;
+ if (mr->page_shift) {
+ /*
+ * page sizes are uniform power of 2 so no loop is necessary
+ * entries_spanned_by_off is the number of times the loop below
+ * would have executed.
+ */
+ size_t entries_spanned_by_off;
+
+ entries_spanned_by_off = off >> mr->page_shift;
+ off -= (entries_spanned_by_off << mr->page_shift);
+ m = entries_spanned_by_off / RVT_SEGSZ;
+ n = entries_spanned_by_off % RVT_SEGSZ;
+ } else {
+ m = 0;
+ n = 0;
+ while (off >= mr->map[m]->segs[n].length) {
+ off -= mr->map[m]->segs[n].length;
+ n++;
+ if (n >= RVT_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ }
+ sge->mr = mr;
+ sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+ sge->length = mr->map[m]->segs[n].length - off;
+ sge->sge_length = len;
+ sge->m = m;
+ sge->n = n;
+ok:
+ return 1;
+bail_unref:
+ rvt_put_mr(mr);
+bail:
+ rcu_read_unlock();
+ return 0;
+}
+EXPORT_SYMBOL(rvt_rkey_ok);
diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h
new file mode 100644
index 0000000000..d17f1400b5
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/mr.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#ifndef DEF_RVTMR_H
+#define DEF_RVTMR_H
+
+#include <rdma/rdma_vt.h>
+
+struct rvt_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+ struct rvt_mregion mr; /* must be last */
+};
+
+static inline struct rvt_mr *to_imr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct rvt_mr, ibmr);
+}
+
+int rvt_driver_mr_init(struct rvt_dev_info *rdi);
+void rvt_mr_exit(struct rvt_dev_info *rdi);
+
+/* Mem Regions */
+struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc);
+struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata);
+int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
+struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+ u32 max_num_sg);
+int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+ int sg_nents, unsigned int *sg_offset);
+
+#endif /* DEF_RVTMR_H */
diff --git a/drivers/infiniband/sw/rdmavt/pd.c b/drivers/infiniband/sw/rdmavt/pd.c
new file mode 100644
index 0000000000..ae62071969
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/pd.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#include <linux/slab.h>
+#include "pd.h"
+
+/**
+ * rvt_alloc_pd - allocate a protection domain
+ * @ibpd: PD
+ * @udata: optional user data
+ *
+ * Allocate and keep track of a PD.
+ *
+ * Return: 0 on success
+ */
+int rvt_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+ struct ib_device *ibdev = ibpd->device;
+ struct rvt_dev_info *dev = ib_to_rvt(ibdev);
+ struct rvt_pd *pd = ibpd_to_rvtpd(ibpd);
+ int ret = 0;
+
+ /*
+ * While we could continue allocating protecetion domains, being
+ * constrained only by system resources. The IBTA spec defines that
+ * there is a max_pd limit that can be set and we need to check for
+ * that.
+ */
+
+ spin_lock(&dev->n_pds_lock);
+ if (dev->n_pds_allocated == dev->dparms.props.max_pd) {
+ spin_unlock(&dev->n_pds_lock);
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ dev->n_pds_allocated++;
+ spin_unlock(&dev->n_pds_lock);
+
+ /* ib_alloc_pd() will initialize pd->ibpd. */
+ pd->user = !!udata;
+
+bail:
+ return ret;
+}
+
+/**
+ * rvt_dealloc_pd - Free PD
+ * @ibpd: Free up PD
+ * @udata: Valid user data or NULL for kernel object
+ *
+ * Return: always 0
+ */
+int rvt_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+ struct rvt_dev_info *dev = ib_to_rvt(ibpd->device);
+
+ spin_lock(&dev->n_pds_lock);
+ dev->n_pds_allocated--;
+ spin_unlock(&dev->n_pds_lock);
+ return 0;
+}
diff --git a/drivers/infiniband/sw/rdmavt/pd.h b/drivers/infiniband/sw/rdmavt/pd.h
new file mode 100644
index 0000000000..42a0ef3b7d
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/pd.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#ifndef DEF_RDMAVTPD_H
+#define DEF_RDMAVTPD_H
+
+#include <rdma/rdma_vt.h>
+
+int rvt_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+int rvt_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+
+#endif /* DEF_RDMAVTPD_H */
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
new file mode 100644
index 0000000000..dc83d0ac6a
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -0,0 +1,3220 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 - 2020 Intel Corporation.
+ */
+
+#include <linux/hash.h>
+#include <linux/bitops.h>
+#include <linux/lockdep.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_hdrs.h>
+#include <rdma/opa_addr.h>
+#include <rdma/uverbs_ioctl.h>
+#include "qp.h"
+#include "vt.h"
+#include "trace.h"
+
+#define RVT_RWQ_COUNT_THRESHOLD 16
+
+static void rvt_rc_timeout(struct timer_list *t);
+static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ enum ib_qp_type type);
+
+/*
+ * Convert the AETH RNR timeout code into the number of microseconds.
+ */
+static const u32 ib_rvt_rnr_table[32] = {
+ 655360, /* 00: 655.36 */
+ 10, /* 01: .01 */
+ 20, /* 02 .02 */
+ 30, /* 03: .03 */
+ 40, /* 04: .04 */
+ 60, /* 05: .06 */
+ 80, /* 06: .08 */
+ 120, /* 07: .12 */
+ 160, /* 08: .16 */
+ 240, /* 09: .24 */
+ 320, /* 0A: .32 */
+ 480, /* 0B: .48 */
+ 640, /* 0C: .64 */
+ 960, /* 0D: .96 */
+ 1280, /* 0E: 1.28 */
+ 1920, /* 0F: 1.92 */
+ 2560, /* 10: 2.56 */
+ 3840, /* 11: 3.84 */
+ 5120, /* 12: 5.12 */
+ 7680, /* 13: 7.68 */
+ 10240, /* 14: 10.24 */
+ 15360, /* 15: 15.36 */
+ 20480, /* 16: 20.48 */
+ 30720, /* 17: 30.72 */
+ 40960, /* 18: 40.96 */
+ 61440, /* 19: 61.44 */
+ 81920, /* 1A: 81.92 */
+ 122880, /* 1B: 122.88 */
+ 163840, /* 1C: 163.84 */
+ 245760, /* 1D: 245.76 */
+ 327680, /* 1E: 327.68 */
+ 491520 /* 1F: 491.52 */
+};
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; rvt_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = 0,
+ [IB_QPS_INIT] = RVT_POST_RECV_OK,
+ [IB_QPS_RTR] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK,
+ [IB_QPS_RTS] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
+ RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK |
+ RVT_PROCESS_NEXT_SEND_OK,
+ [IB_QPS_SQD] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
+ RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK,
+ [IB_QPS_SQE] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
+ RVT_POST_SEND_OK | RVT_FLUSH_SEND,
+ [IB_QPS_ERR] = RVT_POST_RECV_OK | RVT_FLUSH_RECV |
+ RVT_POST_SEND_OK | RVT_FLUSH_SEND,
+};
+EXPORT_SYMBOL(ib_rvt_state_ops);
+
+/* platform specific: return the last level cache (llc) size, in KiB */
+static int rvt_wss_llc_size(void)
+{
+ /* assume that the boot CPU value is universal for all CPUs */
+ return boot_cpu_data.x86_cache_size;
+}
+
+/* platform specific: cacheless copy */
+static void cacheless_memcpy(void *dst, void *src, size_t n)
+{
+ /*
+ * Use the only available X64 cacheless copy. Add a __user cast
+ * to quiet sparse. The src agument is already in the kernel so
+ * there are no security issues. The extra fault recovery machinery
+ * is not invoked.
+ */
+ __copy_user_nocache(dst, (void __user *)src, n);
+}
+
+void rvt_wss_exit(struct rvt_dev_info *rdi)
+{
+ struct rvt_wss *wss = rdi->wss;
+
+ if (!wss)
+ return;
+
+ /* coded to handle partially initialized and repeat callers */
+ kfree(wss->entries);
+ wss->entries = NULL;
+ kfree(rdi->wss);
+ rdi->wss = NULL;
+}
+
+/*
+ * rvt_wss_init - Init wss data structures
+ *
+ * Return: 0 on success
+ */
+int rvt_wss_init(struct rvt_dev_info *rdi)
+{
+ unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
+ unsigned int wss_threshold = rdi->dparms.wss_threshold;
+ unsigned int wss_clean_period = rdi->dparms.wss_clean_period;
+ long llc_size;
+ long llc_bits;
+ long table_size;
+ long table_bits;
+ struct rvt_wss *wss;
+ int node = rdi->dparms.node;
+
+ if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) {
+ rdi->wss = NULL;
+ return 0;
+ }
+
+ rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node);
+ if (!rdi->wss)
+ return -ENOMEM;
+ wss = rdi->wss;
+
+ /* check for a valid percent range - default to 80 if none or invalid */
+ if (wss_threshold < 1 || wss_threshold > 100)
+ wss_threshold = 80;
+
+ /* reject a wildly large period */
+ if (wss_clean_period > 1000000)
+ wss_clean_period = 256;
+
+ /* reject a zero period */
+ if (wss_clean_period == 0)
+ wss_clean_period = 1;
+
+ /*
+ * Calculate the table size - the next power of 2 larger than the
+ * LLC size. LLC size is in KiB.
+ */
+ llc_size = rvt_wss_llc_size() * 1024;
+ table_size = roundup_pow_of_two(llc_size);
+
+ /* one bit per page in rounded up table */
+ llc_bits = llc_size / PAGE_SIZE;
+ table_bits = table_size / PAGE_SIZE;
+ wss->pages_mask = table_bits - 1;
+ wss->num_entries = table_bits / BITS_PER_LONG;
+
+ wss->threshold = (llc_bits * wss_threshold) / 100;
+ if (wss->threshold == 0)
+ wss->threshold = 1;
+
+ wss->clean_period = wss_clean_period;
+ atomic_set(&wss->clean_counter, wss_clean_period);
+
+ wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries),
+ GFP_KERNEL, node);
+ if (!wss->entries) {
+ rvt_wss_exit(rdi);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Advance the clean counter. When the clean period has expired,
+ * clean an entry.
+ *
+ * This is implemented in atomics to avoid locking. Because multiple
+ * variables are involved, it can be racy which can lead to slightly
+ * inaccurate information. Since this is only a heuristic, this is
+ * OK. Any innaccuracies will clean themselves out as the counter
+ * advances. That said, it is unlikely the entry clean operation will
+ * race - the next possible racer will not start until the next clean
+ * period.
+ *
+ * The clean counter is implemented as a decrement to zero. When zero
+ * is reached an entry is cleaned.
+ */
+static void wss_advance_clean_counter(struct rvt_wss *wss)
+{
+ int entry;
+ int weight;
+ unsigned long bits;
+
+ /* become the cleaner if we decrement the counter to zero */
+ if (atomic_dec_and_test(&wss->clean_counter)) {
+ /*
+ * Set, not add, the clean period. This avoids an issue
+ * where the counter could decrement below the clean period.
+ * Doing a set can result in lost decrements, slowing the
+ * clean advance. Since this a heuristic, this possible
+ * slowdown is OK.
+ *
+ * An alternative is to loop, advancing the counter by a
+ * clean period until the result is > 0. However, this could
+ * lead to several threads keeping another in the clean loop.
+ * This could be mitigated by limiting the number of times
+ * we stay in the loop.
+ */
+ atomic_set(&wss->clean_counter, wss->clean_period);
+
+ /*
+ * Uniquely grab the entry to clean and move to next.
+ * The current entry is always the lower bits of
+ * wss.clean_entry. The table size, wss.num_entries,
+ * is always a power-of-2.
+ */
+ entry = (atomic_inc_return(&wss->clean_entry) - 1)
+ & (wss->num_entries - 1);
+
+ /* clear the entry and count the bits */
+ bits = xchg(&wss->entries[entry], 0);
+ weight = hweight64((u64)bits);
+ /* only adjust the contended total count if needed */
+ if (weight)
+ atomic_sub(weight, &wss->total_count);
+ }
+}
+
+/*
+ * Insert the given address into the working set array.
+ */
+static void wss_insert(struct rvt_wss *wss, void *address)
+{
+ u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask;
+ u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
+ u32 nr = page & (BITS_PER_LONG - 1);
+
+ if (!test_and_set_bit(nr, &wss->entries[entry]))
+ atomic_inc(&wss->total_count);
+
+ wss_advance_clean_counter(wss);
+}
+
+/*
+ * Is the working set larger than the threshold?
+ */
+static inline bool wss_exceeds_threshold(struct rvt_wss *wss)
+{
+ return atomic_read(&wss->total_count) >= wss->threshold;
+}
+
+static void get_map_page(struct rvt_qpn_table *qpt,
+ struct rvt_qpn_map *map)
+{
+ unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+ /*
+ * Free the page if someone raced with us installing it.
+ */
+
+ spin_lock(&qpt->lock);
+ if (map->page)
+ free_page(page);
+ else
+ map->page = (void *)page;
+ spin_unlock(&qpt->lock);
+}
+
+/**
+ * init_qpn_table - initialize the QP number table for a device
+ * @rdi: rvt dev struct
+ * @qpt: the QPN table
+ */
+static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt)
+{
+ u32 offset, i;
+ struct rvt_qpn_map *map;
+ int ret = 0;
+
+ if (!(rdi->dparms.qpn_res_end >= rdi->dparms.qpn_res_start))
+ return -EINVAL;
+
+ spin_lock_init(&qpt->lock);
+
+ qpt->last = rdi->dparms.qpn_start;
+ qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift;
+
+ /*
+ * Drivers may want some QPs beyond what we need for verbs let them use
+ * our qpn table. No need for two. Lets go ahead and mark the bitmaps
+ * for those. The reserved range must be *after* the range which verbs
+ * will pick from.
+ */
+
+ /* Figure out number of bit maps needed before reserved range */
+ qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE;
+
+ /* This should always be zero */
+ offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK;
+
+ /* Starting with the first reserved bit map */
+ map = &qpt->map[qpt->nmaps];
+
+ rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n",
+ rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end);
+ for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) {
+ if (!map->page) {
+ get_map_page(qpt, map);
+ if (!map->page) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+ set_bit(offset, map->page);
+ offset++;
+ if (offset == RVT_BITS_PER_PAGE) {
+ /* next page */
+ qpt->nmaps++;
+ map++;
+ offset = 0;
+ }
+ }
+ return ret;
+}
+
+/**
+ * free_qpn_table - free the QP number table for a device
+ * @qpt: the QPN table
+ */
+static void free_qpn_table(struct rvt_qpn_table *qpt)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
+ free_page((unsigned long)qpt->map[i].page);
+}
+
+/**
+ * rvt_driver_qp_init - Init driver qp resources
+ * @rdi: rvt dev strucutre
+ *
+ * Return: 0 on success
+ */
+int rvt_driver_qp_init(struct rvt_dev_info *rdi)
+{
+ int i;
+ int ret = -ENOMEM;
+
+ if (!rdi->dparms.qp_table_size)
+ return -EINVAL;
+
+ /*
+ * If driver is not doing any QP allocation then make sure it is
+ * providing the necessary QP functions.
+ */
+ if (!rdi->driver_f.free_all_qps ||
+ !rdi->driver_f.qp_priv_alloc ||
+ !rdi->driver_f.qp_priv_free ||
+ !rdi->driver_f.notify_qp_reset ||
+ !rdi->driver_f.notify_restart_rc)
+ return -EINVAL;
+
+ /* allocate parent object */
+ rdi->qp_dev = kzalloc_node(sizeof(*rdi->qp_dev), GFP_KERNEL,
+ rdi->dparms.node);
+ if (!rdi->qp_dev)
+ return -ENOMEM;
+
+ /* allocate hash table */
+ rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size;
+ rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size);
+ rdi->qp_dev->qp_table =
+ kmalloc_array_node(rdi->qp_dev->qp_table_size,
+ sizeof(*rdi->qp_dev->qp_table),
+ GFP_KERNEL, rdi->dparms.node);
+ if (!rdi->qp_dev->qp_table)
+ goto no_qp_table;
+
+ for (i = 0; i < rdi->qp_dev->qp_table_size; i++)
+ RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL);
+
+ spin_lock_init(&rdi->qp_dev->qpt_lock);
+
+ /* initialize qpn map */
+ if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table))
+ goto fail_table;
+
+ spin_lock_init(&rdi->n_qps_lock);
+
+ return 0;
+
+fail_table:
+ kfree(rdi->qp_dev->qp_table);
+ free_qpn_table(&rdi->qp_dev->qpn_table);
+
+no_qp_table:
+ kfree(rdi->qp_dev);
+
+ return ret;
+}
+
+/**
+ * rvt_free_qp_cb - callback function to reset a qp
+ * @qp: the qp to reset
+ * @v: a 64-bit value
+ *
+ * This function resets the qp and removes it from the
+ * qp hash table.
+ */
+static void rvt_free_qp_cb(struct rvt_qp *qp, u64 v)
+{
+ unsigned int *qp_inuse = (unsigned int *)v;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+
+ /* Reset the qp and remove it from the qp hash list */
+ rvt_reset_qp(rdi, qp, qp->ibqp.qp_type);
+
+ /* Increment the qp_inuse count */
+ (*qp_inuse)++;
+}
+
+/**
+ * rvt_free_all_qps - check for QPs still in use
+ * @rdi: rvt device info structure
+ *
+ * There should not be any QPs still in use.
+ * Free memory for table.
+ * Return the number of QPs still in use.
+ */
+static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi)
+{
+ unsigned int qp_inuse = 0;
+
+ qp_inuse += rvt_mcast_tree_empty(rdi);
+
+ rvt_qp_iter(rdi, (u64)&qp_inuse, rvt_free_qp_cb);
+
+ return qp_inuse;
+}
+
+/**
+ * rvt_qp_exit - clean up qps on device exit
+ * @rdi: rvt dev structure
+ *
+ * Check for qp leaks and free resources.
+ */
+void rvt_qp_exit(struct rvt_dev_info *rdi)
+{
+ u32 qps_inuse = rvt_free_all_qps(rdi);
+
+ if (qps_inuse)
+ rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
+ qps_inuse);
+
+ kfree(rdi->qp_dev->qp_table);
+ free_qpn_table(&rdi->qp_dev->qpn_table);
+ kfree(rdi->qp_dev);
+}
+
+static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
+ struct rvt_qpn_map *map, unsigned off)
+{
+ return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
+}
+
+/**
+ * alloc_qpn - Allocate the next available qpn or zero/one for QP type
+ * IB_QPT_SMI/IB_QPT_GSI
+ * @rdi: rvt device info structure
+ * @qpt: queue pair number table pointer
+ * @type: the QP type
+ * @port_num: IB port number, 1 based, comes from core
+ * @exclude_prefix: prefix of special queue pair number being allocated
+ *
+ * Return: The queue pair number
+ */
+static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
+ enum ib_qp_type type, u8 port_num, u8 exclude_prefix)
+{
+ u32 i, offset, max_scan, qpn;
+ struct rvt_qpn_map *map;
+ u32 ret;
+ u32 max_qpn = exclude_prefix == RVT_AIP_QP_PREFIX ?
+ RVT_AIP_QPN_MAX : RVT_QPN_MAX;
+
+ if (rdi->driver_f.alloc_qpn)
+ return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num);
+
+ if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
+ unsigned n;
+
+ ret = type == IB_QPT_GSI;
+ n = 1 << (ret + 2 * (port_num - 1));
+ spin_lock(&qpt->lock);
+ if (qpt->flags & n)
+ ret = -EINVAL;
+ else
+ qpt->flags |= n;
+ spin_unlock(&qpt->lock);
+ goto bail;
+ }
+
+ qpn = qpt->last + qpt->incr;
+ if (qpn >= max_qpn)
+ qpn = qpt->incr | ((qpt->last & 1) ^ 1);
+ /* offset carries bit 0 */
+ offset = qpn & RVT_BITS_PER_PAGE_MASK;
+ map = &qpt->map[qpn / RVT_BITS_PER_PAGE];
+ max_scan = qpt->nmaps - !offset;
+ for (i = 0;;) {
+ if (unlikely(!map->page)) {
+ get_map_page(qpt, map);
+ if (unlikely(!map->page))
+ break;
+ }
+ do {
+ if (!test_and_set_bit(offset, map->page)) {
+ qpt->last = qpn;
+ ret = qpn;
+ goto bail;
+ }
+ offset += qpt->incr;
+ /*
+ * This qpn might be bogus if offset >= BITS_PER_PAGE.
+ * That is OK. It gets re-assigned below
+ */
+ qpn = mk_qpn(qpt, map, offset);
+ } while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX);
+ /*
+ * In order to keep the number of pages allocated to a
+ * minimum, we scan the all existing pages before increasing
+ * the size of the bitmap table.
+ */
+ if (++i > max_scan) {
+ if (qpt->nmaps == RVT_QPNMAP_ENTRIES)
+ break;
+ map = &qpt->map[qpt->nmaps++];
+ /* start at incr with current bit 0 */
+ offset = qpt->incr | (offset & 1);
+ } else if (map < &qpt->map[qpt->nmaps]) {
+ ++map;
+ /* start at incr with current bit 0 */
+ offset = qpt->incr | (offset & 1);
+ } else {
+ map = &qpt->map[0];
+ /* wrap to first map page, invert bit 0 */
+ offset = qpt->incr | ((offset & 1) ^ 1);
+ }
+ /* there can be no set bits in low-order QoS bits */
+ WARN_ON(rdi->dparms.qos_shift > 1 &&
+ offset & ((BIT(rdi->dparms.qos_shift - 1) - 1) << 1));
+ qpn = mk_qpn(qpt, map, offset);
+ }
+
+ ret = -ENOMEM;
+
+bail:
+ return ret;
+}
+
+/**
+ * rvt_clear_mr_refs - Drop help mr refs
+ * @qp: rvt qp data structure
+ * @clr_sends: If shoudl clear send side or not
+ */
+static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
+{
+ unsigned n;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+
+ if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
+ rvt_put_ss(&qp->s_rdma_read_sge);
+
+ rvt_put_ss(&qp->r_sge);
+
+ if (clr_sends) {
+ while (qp->s_last != qp->s_head) {
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+
+ rvt_put_qp_swqe(qp, wqe);
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ smp_wmb(); /* see qp_set_savail */
+ }
+ if (qp->s_rdma_mr) {
+ rvt_put_mr(qp->s_rdma_mr);
+ qp->s_rdma_mr = NULL;
+ }
+ }
+
+ for (n = 0; qp->s_ack_queue && n < rvt_max_atomic(rdi); n++) {
+ struct rvt_ack_entry *e = &qp->s_ack_queue[n];
+
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ }
+}
+
+/**
+ * rvt_swqe_has_lkey - return true if lkey is used by swqe
+ * @wqe: the send wqe
+ * @lkey: the lkey
+ *
+ * Test the swqe for using lkey
+ */
+static bool rvt_swqe_has_lkey(struct rvt_swqe *wqe, u32 lkey)
+{
+ int i;
+
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct rvt_sge *sge = &wqe->sg_list[i];
+
+ if (rvt_mr_has_lkey(sge->mr, lkey))
+ return true;
+ }
+ return false;
+}
+
+/**
+ * rvt_qp_sends_has_lkey - return true is qp sends use lkey
+ * @qp: the rvt_qp
+ * @lkey: the lkey
+ */
+static bool rvt_qp_sends_has_lkey(struct rvt_qp *qp, u32 lkey)
+{
+ u32 s_last = qp->s_last;
+
+ while (s_last != qp->s_head) {
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, s_last);
+
+ if (rvt_swqe_has_lkey(wqe, lkey))
+ return true;
+
+ if (++s_last >= qp->s_size)
+ s_last = 0;
+ }
+ if (qp->s_rdma_mr)
+ if (rvt_mr_has_lkey(qp->s_rdma_mr, lkey))
+ return true;
+ return false;
+}
+
+/**
+ * rvt_qp_acks_has_lkey - return true if acks have lkey
+ * @qp: the qp
+ * @lkey: the lkey
+ */
+static bool rvt_qp_acks_has_lkey(struct rvt_qp *qp, u32 lkey)
+{
+ int i;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+
+ for (i = 0; qp->s_ack_queue && i < rvt_max_atomic(rdi); i++) {
+ struct rvt_ack_entry *e = &qp->s_ack_queue[i];
+
+ if (rvt_mr_has_lkey(e->rdma_sge.mr, lkey))
+ return true;
+ }
+ return false;
+}
+
+/**
+ * rvt_qp_mr_clean - clean up remote ops for lkey
+ * @qp: the qp
+ * @lkey: the lkey that is being de-registered
+ *
+ * This routine checks if the lkey is being used by
+ * the qp.
+ *
+ * If so, the qp is put into an error state to elminate
+ * any references from the qp.
+ */
+void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey)
+{
+ bool lastwqe = false;
+
+ if (qp->ibqp.qp_type == IB_QPT_SMI ||
+ qp->ibqp.qp_type == IB_QPT_GSI)
+ /* avoid special QPs */
+ return;
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_hlock);
+ spin_lock(&qp->s_lock);
+
+ if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
+ goto check_lwqe;
+
+ if (rvt_ss_has_lkey(&qp->r_sge, lkey) ||
+ rvt_qp_sends_has_lkey(qp, lkey) ||
+ rvt_qp_acks_has_lkey(qp, lkey))
+ lastwqe = rvt_error_qp(qp, IB_WC_LOC_PROT_ERR);
+check_lwqe:
+ spin_unlock(&qp->s_lock);
+ spin_unlock(&qp->s_hlock);
+ spin_unlock_irq(&qp->r_lock);
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+}
+
+/**
+ * rvt_remove_qp - remove qp form table
+ * @rdi: rvt dev struct
+ * @qp: qp to remove
+ *
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive routine.
+ */
+static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+ struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
+ u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
+ unsigned long flags;
+ int removed = 1;
+
+ spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
+
+ if (rcu_dereference_protected(rvp->qp[0],
+ lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
+ RCU_INIT_POINTER(rvp->qp[0], NULL);
+ } else if (rcu_dereference_protected(rvp->qp[1],
+ lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
+ RCU_INIT_POINTER(rvp->qp[1], NULL);
+ } else {
+ struct rvt_qp *q;
+ struct rvt_qp __rcu **qpp;
+
+ removed = 0;
+ qpp = &rdi->qp_dev->qp_table[n];
+ for (; (q = rcu_dereference_protected(*qpp,
+ lockdep_is_held(&rdi->qp_dev->qpt_lock))) != NULL;
+ qpp = &q->next) {
+ if (q == qp) {
+ RCU_INIT_POINTER(*qpp,
+ rcu_dereference_protected(qp->next,
+ lockdep_is_held(&rdi->qp_dev->qpt_lock)));
+ removed = 1;
+ trace_rvt_qpremove(qp, n);
+ break;
+ }
+ }
+ }
+
+ spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
+ if (removed) {
+ synchronize_rcu();
+ rvt_put_qp(qp);
+ }
+}
+
+/**
+ * rvt_alloc_rq - allocate memory for user or kernel buffer
+ * @rq: receive queue data structure
+ * @size: number of request queue entries
+ * @node: The NUMA node
+ * @udata: True if user data is available or not false
+ *
+ * Return: If memory allocation failed, return -ENONEM
+ * This function is used by both shared receive
+ * queues and non-shared receive queues to allocate
+ * memory.
+ */
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
+ struct ib_udata *udata)
+{
+ if (udata) {
+ rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size);
+ if (!rq->wq)
+ goto bail;
+ /* need kwq with no buffers */
+ rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node);
+ if (!rq->kwq)
+ goto bail;
+ rq->kwq->curr_wq = rq->wq->wq;
+ } else {
+ /* need kwq with buffers */
+ rq->kwq =
+ vzalloc_node(sizeof(struct rvt_krwq) + size, node);
+ if (!rq->kwq)
+ goto bail;
+ rq->kwq->curr_wq = rq->kwq->wq;
+ }
+
+ spin_lock_init(&rq->kwq->p_lock);
+ spin_lock_init(&rq->kwq->c_lock);
+ return 0;
+bail:
+ rvt_free_rq(rq);
+ return -ENOMEM;
+}
+
+/**
+ * rvt_init_qp - initialize the QP state to the reset state
+ * @rdi: rvt dev struct
+ * @qp: the QP to init or reinit
+ * @type: the QP type
+ *
+ * This function is called from both rvt_create_qp() and
+ * rvt_reset_qp(). The difference is that the reset
+ * patch the necessary locks to protect against concurent
+ * access.
+ */
+static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ enum ib_qp_type type)
+{
+ qp->remote_qpn = 0;
+ qp->qkey = 0;
+ qp->qp_access_flags = 0;
+ qp->s_flags &= RVT_S_SIGNAL_REQ_WR;
+ qp->s_hdrwords = 0;
+ qp->s_wqe = NULL;
+ qp->s_draining = 0;
+ qp->s_next_psn = 0;
+ qp->s_last_psn = 0;
+ qp->s_sending_psn = 0;
+ qp->s_sending_hpsn = 0;
+ qp->s_psn = 0;
+ qp->r_psn = 0;
+ qp->r_msn = 0;
+ if (type == IB_QPT_RC) {
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ qp->r_state = IB_OPCODE_RC_SEND_LAST;
+ } else {
+ qp->s_state = IB_OPCODE_UC_SEND_LAST;
+ qp->r_state = IB_OPCODE_UC_SEND_LAST;
+ }
+ qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+ qp->r_nak_state = 0;
+ qp->r_aflags = 0;
+ qp->r_flags = 0;
+ qp->s_head = 0;
+ qp->s_tail = 0;
+ qp->s_cur = 0;
+ qp->s_acked = 0;
+ qp->s_last = 0;
+ qp->s_ssn = 1;
+ qp->s_lsn = 0;
+ qp->s_mig_state = IB_MIG_MIGRATED;
+ qp->r_head_ack_queue = 0;
+ qp->s_tail_ack_queue = 0;
+ qp->s_acked_ack_queue = 0;
+ qp->s_num_rd_atomic = 0;
+ qp->r_sge.num_sge = 0;
+ atomic_set(&qp->s_reserved_used, 0);
+}
+
+/**
+ * _rvt_reset_qp - initialize the QP state to the reset state
+ * @rdi: rvt dev struct
+ * @qp: the QP to reset
+ * @type: the QP type
+ *
+ * r_lock, s_hlock, and s_lock are required to be held by the caller
+ */
+static void _rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ enum ib_qp_type type)
+ __must_hold(&qp->s_lock)
+ __must_hold(&qp->s_hlock)
+ __must_hold(&qp->r_lock)
+{
+ lockdep_assert_held(&qp->r_lock);
+ lockdep_assert_held(&qp->s_hlock);
+ lockdep_assert_held(&qp->s_lock);
+ if (qp->state != IB_QPS_RESET) {
+ qp->state = IB_QPS_RESET;
+
+ /* Let drivers flush their waitlist */
+ rdi->driver_f.flush_qp_waiters(qp);
+ rvt_stop_rc_timers(qp);
+ qp->s_flags &= ~(RVT_S_TIMER | RVT_S_ANY_WAIT);
+ spin_unlock(&qp->s_lock);
+ spin_unlock(&qp->s_hlock);
+ spin_unlock_irq(&qp->r_lock);
+
+ /* Stop the send queue and the retry timer */
+ rdi->driver_f.stop_send_queue(qp);
+ rvt_del_timers_sync(qp);
+ /* Wait for things to stop */
+ rdi->driver_f.quiesce_qp(qp);
+
+ /* take qp out the hash and wait for it to be unused */
+ rvt_remove_qp(rdi, qp);
+
+ /* grab the lock b/c it was locked at call time */
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_hlock);
+ spin_lock(&qp->s_lock);
+
+ rvt_clear_mr_refs(qp, 1);
+ /*
+ * Let the driver do any tear down or re-init it needs to for
+ * a qp that has been reset
+ */
+ rdi->driver_f.notify_qp_reset(qp);
+ }
+ rvt_init_qp(rdi, qp, type);
+ lockdep_assert_held(&qp->r_lock);
+ lockdep_assert_held(&qp->s_hlock);
+ lockdep_assert_held(&qp->s_lock);
+}
+
+/**
+ * rvt_reset_qp - initialize the QP state to the reset state
+ * @rdi: the device info
+ * @qp: the QP to reset
+ * @type: the QP type
+ *
+ * This is the wrapper function to acquire the r_lock, s_hlock, and s_lock
+ * before calling _rvt_reset_qp().
+ */
+static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ enum ib_qp_type type)
+{
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_hlock);
+ spin_lock(&qp->s_lock);
+ _rvt_reset_qp(rdi, qp, type);
+ spin_unlock(&qp->s_lock);
+ spin_unlock(&qp->s_hlock);
+ spin_unlock_irq(&qp->r_lock);
+}
+
+/**
+ * rvt_free_qpn - Free a qpn from the bit map
+ * @qpt: QP table
+ * @qpn: queue pair number to free
+ */
+static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
+{
+ struct rvt_qpn_map *map;
+
+ if ((qpn & RVT_AIP_QP_PREFIX_MASK) == RVT_AIP_QP_BASE)
+ qpn &= RVT_AIP_QP_SUFFIX;
+
+ map = qpt->map + (qpn & RVT_QPN_MASK) / RVT_BITS_PER_PAGE;
+ if (map->page)
+ clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
+}
+
+/**
+ * get_allowed_ops - Given a QP type return the appropriate allowed OP
+ * @type: valid, supported, QP type
+ */
+static u8 get_allowed_ops(enum ib_qp_type type)
+{
+ return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ?
+ IB_OPCODE_UC : IB_OPCODE_UD;
+}
+
+/**
+ * free_ud_wq_attr - Clean up AH attribute cache for UD QPs
+ * @qp: Valid QP with allowed_ops set
+ *
+ * The rvt_swqe data structure being used is a union, so this is
+ * only valid for UD QPs.
+ */
+static void free_ud_wq_attr(struct rvt_qp *qp)
+{
+ struct rvt_swqe *wqe;
+ int i;
+
+ for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ kfree(wqe->ud_wr.attr);
+ wqe->ud_wr.attr = NULL;
+ }
+}
+
+/**
+ * alloc_ud_wq_attr - AH attribute cache for UD QPs
+ * @qp: Valid QP with allowed_ops set
+ * @node: Numa node for allocation
+ *
+ * The rvt_swqe data structure being used is a union, so this is
+ * only valid for UD QPs.
+ */
+static int alloc_ud_wq_attr(struct rvt_qp *qp, int node)
+{
+ struct rvt_swqe *wqe;
+ int i;
+
+ for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr),
+ GFP_KERNEL, node);
+ if (!wqe->ud_wr.attr) {
+ free_ud_wq_attr(qp);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * rvt_create_qp - create a queue pair for a device
+ * @ibqp: the queue pair
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Queue pair creation is mostly an rvt issue. However, drivers have their own
+ * unique idea of what queue pair numbers mean. For instance there is a reserved
+ * range for PSM.
+ *
+ * Return: 0 on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+int rvt_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
+ int ret = -ENOMEM;
+ struct rvt_swqe *swq = NULL;
+ size_t sz;
+ size_t sg_list_sz = 0;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+ void *priv = NULL;
+ size_t sqsize;
+ u8 exclude_prefix = 0;
+
+ if (!rdi)
+ return -EINVAL;
+
+ if (init_attr->create_flags & ~IB_QP_CREATE_NETDEV_USE)
+ return -EOPNOTSUPP;
+
+ if (init_attr->cap.max_send_sge > rdi->dparms.props.max_send_sge ||
+ init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr)
+ return -EINVAL;
+
+ /* Check receive queue parameters if no SRQ is specified. */
+ if (!init_attr->srq) {
+ if (init_attr->cap.max_recv_sge >
+ rdi->dparms.props.max_recv_sge ||
+ init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr)
+ return -EINVAL;
+
+ if (init_attr->cap.max_send_sge +
+ init_attr->cap.max_send_wr +
+ init_attr->cap.max_recv_sge +
+ init_attr->cap.max_recv_wr == 0)
+ return -EINVAL;
+ }
+ sqsize =
+ init_attr->cap.max_send_wr + 1 +
+ rdi->dparms.reserved_operations;
+ switch (init_attr->qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ if (init_attr->port_num == 0 ||
+ init_attr->port_num > ibqp->device->phys_port_cnt)
+ return -EINVAL;
+ fallthrough;
+ case IB_QPT_UC:
+ case IB_QPT_RC:
+ case IB_QPT_UD:
+ sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge);
+ swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node);
+ if (!swq)
+ return -ENOMEM;
+
+ if (init_attr->srq) {
+ struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);
+
+ if (srq->rq.max_sge > 1)
+ sg_list_sz = sizeof(*qp->r_sg_list) *
+ (srq->rq.max_sge - 1);
+ } else if (init_attr->cap.max_recv_sge > 1)
+ sg_list_sz = sizeof(*qp->r_sg_list) *
+ (init_attr->cap.max_recv_sge - 1);
+ qp->r_sg_list =
+ kzalloc_node(sg_list_sz, GFP_KERNEL, rdi->dparms.node);
+ if (!qp->r_sg_list)
+ goto bail_qp;
+ qp->allowed_ops = get_allowed_ops(init_attr->qp_type);
+
+ RCU_INIT_POINTER(qp->next, NULL);
+ if (init_attr->qp_type == IB_QPT_RC) {
+ qp->s_ack_queue =
+ kcalloc_node(rvt_max_atomic(rdi),
+ sizeof(*qp->s_ack_queue),
+ GFP_KERNEL,
+ rdi->dparms.node);
+ if (!qp->s_ack_queue)
+ goto bail_qp;
+ }
+ /* initialize timers needed for rc qp */
+ timer_setup(&qp->s_timer, rvt_rc_timeout, 0);
+ hrtimer_init(&qp->s_rnr_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ qp->s_rnr_timer.function = rvt_rc_rnr_retry;
+
+ /*
+ * Driver needs to set up it's private QP structure and do any
+ * initialization that is needed.
+ */
+ priv = rdi->driver_f.qp_priv_alloc(rdi, qp);
+ if (IS_ERR(priv)) {
+ ret = PTR_ERR(priv);
+ goto bail_qp;
+ }
+ qp->priv = priv;
+ qp->timeout_jiffies =
+ usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+ 1000UL);
+ if (init_attr->srq) {
+ sz = 0;
+ } else {
+ qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+ qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+ sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+ sizeof(struct rvt_rwqe);
+ ret = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz,
+ rdi->dparms.node, udata);
+ if (ret)
+ goto bail_driver_priv;
+ }
+
+ /*
+ * ib_create_qp() will initialize qp->ibqp
+ * except for qp->ibqp.qp_num.
+ */
+ spin_lock_init(&qp->r_lock);
+ spin_lock_init(&qp->s_hlock);
+ spin_lock_init(&qp->s_lock);
+ atomic_set(&qp->refcount, 0);
+ atomic_set(&qp->local_ops_pending, 0);
+ init_waitqueue_head(&qp->wait);
+ INIT_LIST_HEAD(&qp->rspwait);
+ qp->state = IB_QPS_RESET;
+ qp->s_wq = swq;
+ qp->s_size = sqsize;
+ qp->s_avail = init_attr->cap.max_send_wr;
+ qp->s_max_sge = init_attr->cap.max_send_sge;
+ if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+ qp->s_flags = RVT_S_SIGNAL_REQ_WR;
+ ret = alloc_ud_wq_attr(qp, rdi->dparms.node);
+ if (ret)
+ goto bail_rq_rvt;
+
+ if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE)
+ exclude_prefix = RVT_AIP_QP_PREFIX;
+
+ ret = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
+ init_attr->qp_type,
+ init_attr->port_num,
+ exclude_prefix);
+ if (ret < 0)
+ goto bail_rq_wq;
+
+ qp->ibqp.qp_num = ret;
+ if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE)
+ qp->ibqp.qp_num |= RVT_AIP_QP_BASE;
+ qp->port_num = init_attr->port_num;
+ rvt_init_qp(rdi, qp, init_attr->qp_type);
+ if (rdi->driver_f.qp_priv_init) {
+ ret = rdi->driver_f.qp_priv_init(rdi, qp, init_attr);
+ if (ret)
+ goto bail_rq_wq;
+ }
+ break;
+
+ default:
+ /* Don't support raw QPs */
+ return -EOPNOTSUPP;
+ }
+
+ init_attr->cap.max_inline_data = 0;
+
+ /*
+ * Return the address of the RWQ as the offset to mmap.
+ * See rvt_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ if (!qp->r_rq.wq) {
+ __u64 offset = 0;
+
+ ret = ib_copy_to_udata(udata, &offset,
+ sizeof(offset));
+ if (ret)
+ goto bail_qpn;
+ } else {
+ u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
+
+ qp->ip = rvt_create_mmap_info(rdi, s, udata,
+ qp->r_rq.wq);
+ if (IS_ERR(qp->ip)) {
+ ret = PTR_ERR(qp->ip);
+ goto bail_qpn;
+ }
+
+ ret = ib_copy_to_udata(udata, &qp->ip->offset,
+ sizeof(qp->ip->offset));
+ if (ret)
+ goto bail_ip;
+ }
+ qp->pid = current->pid;
+ }
+
+ spin_lock(&rdi->n_qps_lock);
+ if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
+ spin_unlock(&rdi->n_qps_lock);
+ ret = -ENOMEM;
+ goto bail_ip;
+ }
+
+ rdi->n_qps_allocated++;
+ /*
+ * Maintain a busy_jiffies variable that will be added to the timeout
+ * period in mod_retry_timer and add_retry_timer. This busy jiffies
+ * is scaled by the number of rc qps created for the device to reduce
+ * the number of timeouts occurring when there is a large number of
+ * qps. busy_jiffies is incremented every rc qp scaling interval.
+ * The scaling interval is selected based on extensive performance
+ * evaluation of targeted workloads.
+ */
+ if (init_attr->qp_type == IB_QPT_RC) {
+ rdi->n_rc_qps++;
+ rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
+ }
+ spin_unlock(&rdi->n_qps_lock);
+
+ if (qp->ip) {
+ spin_lock_irq(&rdi->pending_lock);
+ list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps);
+ spin_unlock_irq(&rdi->pending_lock);
+ }
+
+ return 0;
+
+bail_ip:
+ if (qp->ip)
+ kref_put(&qp->ip->ref, rvt_release_mmap_info);
+
+bail_qpn:
+ rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
+
+bail_rq_wq:
+ free_ud_wq_attr(qp);
+
+bail_rq_rvt:
+ rvt_free_rq(&qp->r_rq);
+
+bail_driver_priv:
+ rdi->driver_f.qp_priv_free(rdi, qp);
+
+bail_qp:
+ kfree(qp->s_ack_queue);
+ kfree(qp->r_sg_list);
+ vfree(swq);
+ return ret;
+}
+
+/**
+ * rvt_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ *
+ * Return: true if last WQE event should be generated.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
+{
+ struct ib_wc wc;
+ int ret = 0;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+
+ lockdep_assert_held(&qp->r_lock);
+ lockdep_assert_held(&qp->s_lock);
+ if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
+ goto bail;
+
+ qp->state = IB_QPS_ERR;
+
+ if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
+ qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
+ del_timer(&qp->s_timer);
+ }
+
+ if (qp->s_flags & RVT_S_ANY_WAIT_SEND)
+ qp->s_flags &= ~RVT_S_ANY_WAIT_SEND;
+
+ rdi->driver_f.notify_error_qp(qp);
+
+ /* Schedule the sending tasklet to drain the send work queue. */
+ if (READ_ONCE(qp->s_last) != qp->s_head)
+ rdi->driver_f.schedule_send(qp);
+
+ rvt_clear_mr_refs(qp, 0);
+
+ memset(&wc, 0, sizeof(wc));
+ wc.qp = &qp->ibqp;
+ wc.opcode = IB_WC_RECV;
+
+ if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) {
+ wc.wr_id = qp->r_wr_id;
+ wc.status = err;
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
+ }
+ wc.status = IB_WC_WR_FLUSH_ERR;
+
+ if (qp->r_rq.kwq) {
+ u32 head;
+ u32 tail;
+ struct rvt_rwq *wq = NULL;
+ struct rvt_krwq *kwq = NULL;
+
+ spin_lock(&qp->r_rq.kwq->c_lock);
+ /* qp->ip used to validate if there is a user buffer mmaped */
+ if (qp->ip) {
+ wq = qp->r_rq.wq;
+ head = RDMA_READ_UAPI_ATOMIC(wq->head);
+ tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
+ } else {
+ kwq = qp->r_rq.kwq;
+ head = kwq->head;
+ tail = kwq->tail;
+ }
+ /* sanity check pointers before trusting them */
+ if (head >= qp->r_rq.size)
+ head = 0;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ while (tail != head) {
+ wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+ if (++tail >= qp->r_rq.size)
+ tail = 0;
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
+ }
+ if (qp->ip)
+ RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
+ else
+ kwq->tail = tail;
+ spin_unlock(&qp->r_rq.kwq->c_lock);
+ } else if (qp->ibqp.event_handler) {
+ ret = 1;
+ }
+
+bail:
+ return ret;
+}
+EXPORT_SYMBOL(rvt_error_qp);
+
+/*
+ * Put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static void rvt_insert_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+ struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
+ unsigned long flags;
+
+ rvt_get_qp(qp);
+ spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
+
+ if (qp->ibqp.qp_num <= 1) {
+ rcu_assign_pointer(rvp->qp[qp->ibqp.qp_num], qp);
+ } else {
+ u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
+
+ qp->next = rdi->qp_dev->qp_table[n];
+ rcu_assign_pointer(rdi->qp_dev->qp_table[n], qp);
+ trace_rvt_qpinsert(qp, n);
+ }
+
+ spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
+}
+
+/**
+ * rvt_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair who's attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for libibverbs.so
+ *
+ * Return: 0 on success, otherwise returns an errno.
+ */
+int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+ struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ struct ib_event ev;
+ int lastwqe = 0;
+ int mig = 0;
+ int pmtu = 0; /* for gcc warning only */
+ int opa_ah;
+
+ if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
+ return -EOPNOTSUPP;
+
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_hlock);
+ spin_lock(&qp->s_lock);
+
+ cur_state = attr_mask & IB_QP_CUR_STATE ?
+ attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+ opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num);
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+ attr_mask))
+ goto inval;
+
+ if (rdi->driver_f.check_modify_qp &&
+ rdi->driver_f.check_modify_qp(qp, attr, attr_mask, udata))
+ goto inval;
+
+ if (attr_mask & IB_QP_AV) {
+ if (opa_ah) {
+ if (rdma_ah_get_dlid(&attr->ah_attr) >=
+ opa_get_mcast_base(OPA_MCAST_NR))
+ goto inval;
+ } else {
+ if (rdma_ah_get_dlid(&attr->ah_attr) >=
+ be16_to_cpu(IB_MULTICAST_LID_BASE))
+ goto inval;
+ }
+
+ if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr))
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (opa_ah) {
+ if (rdma_ah_get_dlid(&attr->alt_ah_attr) >=
+ opa_get_mcast_base(OPA_MCAST_NR))
+ goto inval;
+ } else {
+ if (rdma_ah_get_dlid(&attr->alt_ah_attr) >=
+ be16_to_cpu(IB_MULTICAST_LID_BASE))
+ goto inval;
+ }
+
+ if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
+ goto inval;
+ if (attr->alt_pkey_index >= rvt_get_npkeys(rdi))
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ if (attr->pkey_index >= rvt_get_npkeys(rdi))
+ goto inval;
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ if (attr->min_rnr_timer > 31)
+ goto inval;
+
+ if (attr_mask & IB_QP_PORT)
+ if (qp->ibqp.qp_type == IB_QPT_SMI ||
+ qp->ibqp.qp_type == IB_QPT_GSI ||
+ attr->port_num == 0 ||
+ attr->port_num > ibqp->device->phys_port_cnt)
+ goto inval;
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ if (attr->dest_qp_num > RVT_QPN_MASK)
+ goto inval;
+
+ if (attr_mask & IB_QP_RETRY_CNT)
+ if (attr->retry_cnt > 7)
+ goto inval;
+
+ if (attr_mask & IB_QP_RNR_RETRY)
+ if (attr->rnr_retry > 7)
+ goto inval;
+
+ /*
+ * Don't allow invalid path_mtu values. OK to set greater
+ * than the active mtu (or even the max_cap, if we have tuned
+ * that to a small mtu. We'll set qp->path_mtu
+ * to the lesser of requested attribute mtu and active,
+ * for packetizing messages.
+ * Note that the QP port has to be set in INIT and MTU in RTR.
+ */
+ if (attr_mask & IB_QP_PATH_MTU) {
+ pmtu = rdi->driver_f.get_pmtu_from_attr(rdi, qp, attr);
+ if (pmtu < 0)
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE) {
+ if (attr->path_mig_state == IB_MIG_REARM) {
+ if (qp->s_mig_state == IB_MIG_ARMED)
+ goto inval;
+ if (new_state != IB_QPS_RTS)
+ goto inval;
+ } else if (attr->path_mig_state == IB_MIG_MIGRATED) {
+ if (qp->s_mig_state == IB_MIG_REARM)
+ goto inval;
+ if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
+ goto inval;
+ if (qp->s_mig_state == IB_MIG_ARMED)
+ mig = 1;
+ } else {
+ goto inval;
+ }
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ if (attr->max_dest_rd_atomic > rdi->dparms.max_rdma_atomic)
+ goto inval;
+
+ switch (new_state) {
+ case IB_QPS_RESET:
+ if (qp->state != IB_QPS_RESET)
+ _rvt_reset_qp(rdi, qp, ibqp->qp_type);
+ break;
+
+ case IB_QPS_RTR:
+ /* Allow event to re-trigger if QP set to RTR more than once */
+ qp->r_flags &= ~RVT_R_COMM_EST;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_SQD:
+ qp->s_draining = qp->s_last != qp->s_cur;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_SQE:
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ goto inval;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_ERR:
+ lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ break;
+
+ default:
+ qp->state = new_state;
+ break;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ qp->s_pkey_index = attr->pkey_index;
+
+ if (attr_mask & IB_QP_PORT)
+ qp->port_num = attr->port_num;
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ qp->remote_qpn = attr->dest_qp_num;
+
+ if (attr_mask & IB_QP_SQ_PSN) {
+ qp->s_next_psn = attr->sq_psn & rdi->dparms.psn_modify_mask;
+ qp->s_psn = qp->s_next_psn;
+ qp->s_sending_psn = qp->s_next_psn;
+ qp->s_last_psn = qp->s_next_psn - 1;
+ qp->s_sending_hpsn = qp->s_last_psn;
+ }
+
+ if (attr_mask & IB_QP_RQ_PSN)
+ qp->r_psn = attr->rq_psn & rdi->dparms.psn_modify_mask;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->qp_access_flags = attr->qp_access_flags;
+
+ if (attr_mask & IB_QP_AV) {
+ rdma_replace_ah_attr(&qp->remote_ah_attr, &attr->ah_attr);
+ qp->s_srate = rdma_ah_get_static_rate(&attr->ah_attr);
+ qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ rdma_replace_ah_attr(&qp->alt_ah_attr, &attr->alt_ah_attr);
+ qp->s_alt_pkey_index = attr->alt_pkey_index;
+ }
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE) {
+ qp->s_mig_state = attr->path_mig_state;
+ if (mig) {
+ qp->remote_ah_attr = qp->alt_ah_attr;
+ qp->port_num = rdma_ah_get_port_num(&qp->alt_ah_attr);
+ qp->s_pkey_index = qp->s_alt_pkey_index;
+ }
+ }
+
+ if (attr_mask & IB_QP_PATH_MTU) {
+ qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu);
+ qp->log_pmtu = ilog2(qp->pmtu);
+ }
+
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ qp->s_retry_cnt = attr->retry_cnt;
+ qp->s_retry = attr->retry_cnt;
+ }
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ qp->s_rnr_retry_cnt = attr->rnr_retry;
+ qp->s_rnr_retry = attr->rnr_retry;
+ }
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+ if (attr_mask & IB_QP_TIMEOUT) {
+ qp->timeout = attr->timeout;
+ qp->timeout_jiffies = rvt_timeout_to_jiffies(qp->timeout);
+ }
+
+ if (attr_mask & IB_QP_QKEY)
+ qp->qkey = attr->qkey;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+ qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+ if (rdi->driver_f.modify_qp)
+ rdi->driver_f.modify_qp(qp, attr, attr_mask, udata);
+
+ spin_unlock(&qp->s_lock);
+ spin_unlock(&qp->s_hlock);
+ spin_unlock_irq(&qp->r_lock);
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ rvt_insert_qp(rdi, qp);
+
+ if (lastwqe) {
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ if (mig) {
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_PATH_MIG;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ return 0;
+
+inval:
+ spin_unlock(&qp->s_lock);
+ spin_unlock(&qp->s_hlock);
+ spin_unlock_irq(&qp->r_lock);
+ return -EINVAL;
+}
+
+/**
+ * rvt_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ * @udata: unused by the driver
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ *
+ * Return: 0 on success.
+ */
+int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
+{
+ struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ rvt_reset_qp(rdi, qp, ibqp->qp_type);
+
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+ /* qpn is now available for use again */
+ rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
+
+ spin_lock(&rdi->n_qps_lock);
+ rdi->n_qps_allocated--;
+ if (qp->ibqp.qp_type == IB_QPT_RC) {
+ rdi->n_rc_qps--;
+ rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
+ }
+ spin_unlock(&rdi->n_qps_lock);
+
+ if (qp->ip)
+ kref_put(&qp->ip->ref, rvt_release_mmap_info);
+ kvfree(qp->r_rq.kwq);
+ rdi->driver_f.qp_priv_free(rdi, qp);
+ kfree(qp->s_ack_queue);
+ kfree(qp->r_sg_list);
+ rdma_destroy_ah_attr(&qp->remote_ah_attr);
+ rdma_destroy_ah_attr(&qp->alt_ah_attr);
+ free_ud_wq_attr(qp);
+ vfree(qp->s_wq);
+ return 0;
+}
+
+/**
+ * rvt_query_qp - query an ipbq
+ * @ibqp: IB qp to query
+ * @attr: attr struct to fill in
+ * @attr_mask: attr mask ignored
+ * @init_attr: struct to fill in
+ *
+ * Return: always 0
+ */
+int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+ struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ attr->qp_state = qp->state;
+ attr->cur_qp_state = attr->qp_state;
+ attr->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu);
+ attr->path_mig_state = qp->s_mig_state;
+ attr->qkey = qp->qkey;
+ attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask;
+ attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
+ attr->dest_qp_num = qp->remote_qpn;
+ attr->qp_access_flags = qp->qp_access_flags;
+ attr->cap.max_send_wr = qp->s_size - 1 -
+ rdi->dparms.reserved_operations;
+ attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+ attr->cap.max_send_sge = qp->s_max_sge;
+ attr->cap.max_recv_sge = qp->r_rq.max_sge;
+ attr->cap.max_inline_data = 0;
+ attr->ah_attr = qp->remote_ah_attr;
+ attr->alt_ah_attr = qp->alt_ah_attr;
+ attr->pkey_index = qp->s_pkey_index;
+ attr->alt_pkey_index = qp->s_alt_pkey_index;
+ attr->en_sqd_async_notify = 0;
+ attr->sq_draining = qp->s_draining;
+ attr->max_rd_atomic = qp->s_max_rd_atomic;
+ attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+ attr->min_rnr_timer = qp->r_min_rnr_timer;
+ attr->port_num = qp->port_num;
+ attr->timeout = qp->timeout;
+ attr->retry_cnt = qp->s_retry_cnt;
+ attr->rnr_retry = qp->s_rnr_retry_cnt;
+ attr->alt_port_num =
+ rdma_ah_get_port_num(&qp->alt_ah_attr);
+ attr->alt_timeout = qp->alt_timeout;
+
+ init_attr->event_handler = qp->ibqp.event_handler;
+ init_attr->qp_context = qp->ibqp.qp_context;
+ init_attr->send_cq = qp->ibqp.send_cq;
+ init_attr->recv_cq = qp->ibqp.recv_cq;
+ init_attr->srq = qp->ibqp.srq;
+ init_attr->cap = attr->cap;
+ if (qp->s_flags & RVT_S_SIGNAL_REQ_WR)
+ init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ else
+ init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+ init_attr->qp_type = qp->ibqp.qp_type;
+ init_attr->port_num = qp->port_num;
+ return 0;
+}
+
+/**
+ * rvt_post_recv - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ *
+ * Return: 0 on success otherwise errno
+ */
+int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
+ struct rvt_krwq *wq = qp->r_rq.kwq;
+ unsigned long flags;
+ int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
+ !qp->ibqp.srq;
+
+ /* Check that state is OK to post receive. */
+ if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) || !wq) {
+ *bad_wr = wr;
+ return -EINVAL;
+ }
+
+ for (; wr; wr = wr->next) {
+ struct rvt_rwqe *wqe;
+ u32 next;
+ int i;
+
+ if ((unsigned)wr->num_sge > qp->r_rq.max_sge) {
+ *bad_wr = wr;
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags);
+ next = wq->head + 1;
+ if (next >= qp->r_rq.size)
+ next = 0;
+ if (next == READ_ONCE(wq->tail)) {
+ spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
+ *bad_wr = wr;
+ return -ENOMEM;
+ }
+ if (unlikely(qp_err_flush)) {
+ struct ib_wc wc;
+
+ memset(&wc, 0, sizeof(wc));
+ wc.qp = &qp->ibqp;
+ wc.opcode = IB_WC_RECV;
+ wc.wr_id = wr->wr_id;
+ wc.status = IB_WC_WR_FLUSH_ERR;
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
+ } else {
+ wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
+ wqe->wr_id = wr->wr_id;
+ wqe->num_sge = wr->num_sge;
+ for (i = 0; i < wr->num_sge; i++) {
+ wqe->sg_list[i].addr = wr->sg_list[i].addr;
+ wqe->sg_list[i].length = wr->sg_list[i].length;
+ wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
+ }
+ /*
+ * Make sure queue entry is written
+ * before the head index.
+ */
+ smp_store_release(&wq->head, next);
+ }
+ spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
+ }
+ return 0;
+}
+
+/**
+ * rvt_qp_valid_operation - validate post send wr request
+ * @qp: the qp
+ * @post_parms: the post send table for the driver
+ * @wr: the work request
+ *
+ * The routine validates the operation based on the
+ * validation table an returns the length of the operation
+ * which can extend beyond the ib_send_bw. Operation
+ * dependent flags key atomic operation validation.
+ *
+ * There is an exception for UD qps that validates the pd and
+ * overrides the length to include the additional UD specific
+ * length.
+ *
+ * Returns a negative error or the length of the work request
+ * for building the swqe.
+ */
+static inline int rvt_qp_valid_operation(
+ struct rvt_qp *qp,
+ const struct rvt_operation_params *post_parms,
+ const struct ib_send_wr *wr)
+{
+ int len;
+
+ if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length)
+ return -EINVAL;
+ if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type)))
+ return -EINVAL;
+ if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) &&
+ ibpd_to_rvtpd(qp->ibqp.pd)->user)
+ return -EINVAL;
+ if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE &&
+ (wr->num_sge == 0 ||
+ wr->sg_list[0].length < sizeof(u64) ||
+ wr->sg_list[0].addr & (sizeof(u64) - 1)))
+ return -EINVAL;
+ if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC &&
+ !qp->s_max_rd_atomic)
+ return -EINVAL;
+ len = post_parms[wr->opcode].length;
+ /* UD specific */
+ if (qp->ibqp.qp_type != IB_QPT_UC &&
+ qp->ibqp.qp_type != IB_QPT_RC) {
+ if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
+ return -EINVAL;
+ len = sizeof(struct ib_ud_wr);
+ }
+ return len;
+}
+
+/**
+ * rvt_qp_is_avail - determine queue capacity
+ * @qp: the qp
+ * @rdi: the rdmavt device
+ * @reserved_op: is reserved operation
+ *
+ * This assumes the s_hlock is held but the s_last
+ * qp variable is uncontrolled.
+ *
+ * For non reserved operations, the qp->s_avail
+ * may be changed.
+ *
+ * The return value is zero or a -ENOMEM.
+ */
+static inline int rvt_qp_is_avail(
+ struct rvt_qp *qp,
+ struct rvt_dev_info *rdi,
+ bool reserved_op)
+{
+ u32 slast;
+ u32 avail;
+ u32 reserved_used;
+
+ /* see rvt_qp_wqe_unreserve() */
+ smp_mb__before_atomic();
+ if (unlikely(reserved_op)) {
+ /* see rvt_qp_wqe_unreserve() */
+ reserved_used = atomic_read(&qp->s_reserved_used);
+ if (reserved_used >= rdi->dparms.reserved_operations)
+ return -ENOMEM;
+ return 0;
+ }
+ /* non-reserved operations */
+ if (likely(qp->s_avail))
+ return 0;
+ /* See rvt_qp_complete_swqe() */
+ slast = smp_load_acquire(&qp->s_last);
+ if (qp->s_head >= slast)
+ avail = qp->s_size - (qp->s_head - slast);
+ else
+ avail = slast - qp->s_head;
+
+ reserved_used = atomic_read(&qp->s_reserved_used);
+ avail = avail - 1 -
+ (rdi->dparms.reserved_operations - reserved_used);
+ /* insure we don't assign a negative s_avail */
+ if ((s32)avail <= 0)
+ return -ENOMEM;
+ qp->s_avail = avail;
+ if (WARN_ON(qp->s_avail >
+ (qp->s_size - 1 - rdi->dparms.reserved_operations)))
+ rvt_pr_err(rdi,
+ "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
+ qp->ibqp.qp_num, qp->s_size, qp->s_avail,
+ qp->s_head, qp->s_tail, qp->s_cur,
+ qp->s_acked, qp->s_last);
+ return 0;
+}
+
+/**
+ * rvt_post_one_wr - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ * @call_send: kick the send engine into gear
+ */
+static int rvt_post_one_wr(struct rvt_qp *qp,
+ const struct ib_send_wr *wr,
+ bool *call_send)
+{
+ struct rvt_swqe *wqe;
+ u32 next;
+ int i;
+ int j;
+ int acc;
+ struct rvt_lkey_table *rkt;
+ struct rvt_pd *pd;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ u8 log_pmtu;
+ int ret;
+ size_t cplen;
+ bool reserved_op;
+ int local_ops_delayed = 0;
+
+ BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));
+
+ /* IB spec says that num_sge == 0 is OK. */
+ if (unlikely(wr->num_sge > qp->s_max_sge))
+ return -EINVAL;
+
+ ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
+ if (ret < 0)
+ return ret;
+ cplen = ret;
+
+ /*
+ * Local operations include fast register and local invalidate.
+ * Fast register needs to be processed immediately because the
+ * registered lkey may be used by following work requests and the
+ * lkey needs to be valid at the time those requests are posted.
+ * Local invalidate can be processed immediately if fencing is
+ * not required and no previous local invalidate ops are pending.
+ * Signaled local operations that have been processed immediately
+ * need to have requests with "completion only" flags set posted
+ * to the send queue in order to generate completions.
+ */
+ if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
+ switch (wr->opcode) {
+ case IB_WR_REG_MR:
+ ret = rvt_fast_reg_mr(qp,
+ reg_wr(wr)->mr,
+ reg_wr(wr)->key,
+ reg_wr(wr)->access);
+ if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
+ return ret;
+ break;
+ case IB_WR_LOCAL_INV:
+ if ((wr->send_flags & IB_SEND_FENCE) ||
+ atomic_read(&qp->local_ops_pending)) {
+ local_ops_delayed = 1;
+ } else {
+ ret = rvt_invalidate_rkey(
+ qp, wr->ex.invalidate_rkey);
+ if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
+ return ret;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ reserved_op = rdi->post_parms[wr->opcode].flags &
+ RVT_OPERATION_USE_RESERVE;
+ /* check for avail */
+ ret = rvt_qp_is_avail(qp, rdi, reserved_op);
+ if (ret)
+ return ret;
+ next = qp->s_head + 1;
+ if (next >= qp->s_size)
+ next = 0;
+
+ rkt = &rdi->lkey_table;
+ pd = ibpd_to_rvtpd(qp->ibqp.pd);
+ wqe = rvt_get_swqe_ptr(qp, qp->s_head);
+
+ /* cplen has length from above */
+ memcpy(&wqe->ud_wr, wr, cplen);
+
+ wqe->length = 0;
+ j = 0;
+ if (wr->num_sge) {
+ struct rvt_sge *last_sge = NULL;
+
+ acc = wr->opcode >= IB_WR_RDMA_READ ?
+ IB_ACCESS_LOCAL_WRITE : 0;
+ for (i = 0; i < wr->num_sge; i++) {
+ u32 length = wr->sg_list[i].length;
+
+ if (length == 0)
+ continue;
+ ret = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge,
+ &wr->sg_list[i], acc);
+ if (unlikely(ret < 0))
+ goto bail_inval_free;
+ wqe->length += length;
+ if (ret)
+ last_sge = &wqe->sg_list[j];
+ j += ret;
+ }
+ wqe->wr.num_sge = j;
+ }
+
+ /*
+ * Calculate and set SWQE PSN values prior to handing it off
+ * to the driver's check routine. This give the driver the
+ * opportunity to adjust PSN values based on internal checks.
+ */
+ log_pmtu = qp->log_pmtu;
+ if (qp->allowed_ops == IB_OPCODE_UD) {
+ struct rvt_ah *ah = rvt_get_swqe_ah(wqe);
+
+ log_pmtu = ah->log_pmtu;
+ rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
+ }
+
+ if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
+ if (local_ops_delayed)
+ atomic_inc(&qp->local_ops_pending);
+ else
+ wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY;
+ wqe->ssn = 0;
+ wqe->psn = 0;
+ wqe->lpsn = 0;
+ } else {
+ wqe->ssn = qp->s_ssn++;
+ wqe->psn = qp->s_next_psn;
+ wqe->lpsn = wqe->psn +
+ (wqe->length ?
+ ((wqe->length - 1) >> log_pmtu) :
+ 0);
+ }
+
+ /* general part of wqe valid - allow for driver checks */
+ if (rdi->driver_f.setup_wqe) {
+ ret = rdi->driver_f.setup_wqe(qp, wqe, call_send);
+ if (ret < 0)
+ goto bail_inval_free_ref;
+ }
+
+ if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL))
+ qp->s_next_psn = wqe->lpsn + 1;
+
+ if (unlikely(reserved_op)) {
+ wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
+ rvt_qp_wqe_reserve(qp, wqe);
+ } else {
+ wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED;
+ qp->s_avail--;
+ }
+ trace_rvt_post_one_wr(qp, wqe, wr->num_sge);
+ smp_wmb(); /* see request builders */
+ qp->s_head = next;
+
+ return 0;
+
+bail_inval_free_ref:
+ if (qp->allowed_ops == IB_OPCODE_UD)
+ rdma_destroy_ah_attr(wqe->ud_wr.attr);
+bail_inval_free:
+ /* release mr holds */
+ while (j) {
+ struct rvt_sge *sge = &wqe->sg_list[--j];
+
+ rvt_put_mr(sge->mr);
+ }
+ return ret;
+}
+
+/**
+ * rvt_post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ *
+ * Return: 0 on success else errno
+ */
+int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr)
+{
+ struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+ unsigned long flags = 0;
+ bool call_send;
+ unsigned nreq = 0;
+ int err = 0;
+
+ spin_lock_irqsave(&qp->s_hlock, flags);
+
+ /*
+ * Ensure QP state is such that we can send. If not bail out early,
+ * there is no need to do this every time we post a send.
+ */
+ if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) {
+ spin_unlock_irqrestore(&qp->s_hlock, flags);
+ return -EINVAL;
+ }
+
+ /*
+ * If the send queue is empty, and we only have a single WR then just go
+ * ahead and kick the send engine into gear. Otherwise we will always
+ * just schedule the send to happen later.
+ */
+ call_send = qp->s_head == READ_ONCE(qp->s_last) && !wr->next;
+
+ for (; wr; wr = wr->next) {
+ err = rvt_post_one_wr(qp, wr, &call_send);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto bail;
+ }
+ nreq++;
+ }
+bail:
+ spin_unlock_irqrestore(&qp->s_hlock, flags);
+ if (nreq) {
+ /*
+ * Only call do_send if there is exactly one packet, and the
+ * driver said it was ok.
+ */
+ if (nreq == 1 && call_send)
+ rdi->driver_f.do_send(qp);
+ else
+ rdi->driver_f.schedule_send_no_lock(qp);
+ }
+ return err;
+}
+
+/**
+ * rvt_post_srq_recv - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: A pointer to the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ *
+ * Return: 0 on success else errno
+ */
+int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
+ struct rvt_krwq *wq;
+ unsigned long flags;
+
+ for (; wr; wr = wr->next) {
+ struct rvt_rwqe *wqe;
+ u32 next;
+ int i;
+
+ if ((unsigned)wr->num_sge > srq->rq.max_sge) {
+ *bad_wr = wr;
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&srq->rq.kwq->p_lock, flags);
+ wq = srq->rq.kwq;
+ next = wq->head + 1;
+ if (next >= srq->rq.size)
+ next = 0;
+ if (next == READ_ONCE(wq->tail)) {
+ spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
+ *bad_wr = wr;
+ return -ENOMEM;
+ }
+
+ wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
+ wqe->wr_id = wr->wr_id;
+ wqe->num_sge = wr->num_sge;
+ for (i = 0; i < wr->num_sge; i++) {
+ wqe->sg_list[i].addr = wr->sg_list[i].addr;
+ wqe->sg_list[i].length = wr->sg_list[i].length;
+ wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
+ }
+ /* Make sure queue entry is written before the head index. */
+ smp_store_release(&wq->head, next);
+ spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
+ }
+ return 0;
+}
+
+/*
+ * rvt used the internal kernel struct as part of its ABI, for now make sure
+ * the kernel struct does not change layout. FIXME: rvt should never cast the
+ * user struct to a kernel struct.
+ */
+static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge)
+{
+ BUILD_BUG_ON(offsetof(struct ib_sge, addr) !=
+ offsetof(struct rvt_wqe_sge, addr));
+ BUILD_BUG_ON(offsetof(struct ib_sge, length) !=
+ offsetof(struct rvt_wqe_sge, length));
+ BUILD_BUG_ON(offsetof(struct ib_sge, lkey) !=
+ offsetof(struct rvt_wqe_sge, lkey));
+ return (struct ib_sge *)sge;
+}
+
+/*
+ * Validate a RWQE and fill in the SGE state.
+ * Return 1 if OK.
+ */
+static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
+{
+ int i, j, ret;
+ struct ib_wc wc;
+ struct rvt_lkey_table *rkt;
+ struct rvt_pd *pd;
+ struct rvt_sge_state *ss;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+
+ rkt = &rdi->lkey_table;
+ pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
+ ss = &qp->r_sge;
+ ss->sg_list = qp->r_sg_list;
+ qp->r_len = 0;
+ for (i = j = 0; i < wqe->num_sge; i++) {
+ if (wqe->sg_list[i].length == 0)
+ continue;
+ /* Check LKEY */
+ ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+ NULL, rvt_cast_sge(&wqe->sg_list[i]),
+ IB_ACCESS_LOCAL_WRITE);
+ if (unlikely(ret <= 0))
+ goto bad_lkey;
+ qp->r_len += wqe->sg_list[i].length;
+ j++;
+ }
+ ss->num_sge = j;
+ ss->total_len = qp->r_len;
+ return 1;
+
+bad_lkey:
+ while (j) {
+ struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
+
+ rvt_put_mr(sge->mr);
+ }
+ ss->num_sge = 0;
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr_id;
+ wc.status = IB_WC_LOC_PROT_ERR;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ /* Signal solicited completion event. */
+ rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
+ return 0;
+}
+
+/**
+ * get_rvt_head - get head indices of the circular buffer
+ * @rq: data structure for request queue entry
+ * @ip: the QP
+ *
+ * Return - head index value
+ */
+static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip)
+{
+ u32 head;
+
+ if (ip)
+ head = RDMA_READ_UAPI_ATOMIC(rq->wq->head);
+ else
+ head = rq->kwq->head;
+
+ return head;
+}
+
+/**
+ * rvt_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return -1 if there is a local error, 0 if no RWQE is available,
+ * otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
+{
+ unsigned long flags;
+ struct rvt_rq *rq;
+ struct rvt_krwq *kwq = NULL;
+ struct rvt_rwq *wq;
+ struct rvt_srq *srq;
+ struct rvt_rwqe *wqe;
+ void (*handler)(struct ib_event *, void *);
+ u32 tail;
+ u32 head;
+ int ret;
+ void *ip = NULL;
+
+ if (qp->ibqp.srq) {
+ srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
+ handler = srq->ibsrq.event_handler;
+ rq = &srq->rq;
+ ip = srq->ip;
+ } else {
+ srq = NULL;
+ handler = NULL;
+ rq = &qp->r_rq;
+ ip = qp->ip;
+ }
+
+ spin_lock_irqsave(&rq->kwq->c_lock, flags);
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ ret = 0;
+ goto unlock;
+ }
+ kwq = rq->kwq;
+ if (ip) {
+ wq = rq->wq;
+ tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
+ } else {
+ tail = kwq->tail;
+ }
+
+ /* Validate tail before using it since it is user writable. */
+ if (tail >= rq->size)
+ tail = 0;
+
+ if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) {
+ head = get_rvt_head(rq, ip);
+ kwq->count = rvt_get_rq_count(rq, head, tail);
+ }
+ if (unlikely(kwq->count == 0)) {
+ ret = 0;
+ goto unlock;
+ }
+ /* Make sure entry is read after the count is read. */
+ smp_rmb();
+ wqe = rvt_get_rwqe_ptr(rq, tail);
+ /*
+ * Even though we update the tail index in memory, the verbs
+ * consumer is not supposed to post more entries until a
+ * completion is generated.
+ */
+ if (++tail >= rq->size)
+ tail = 0;
+ if (ip)
+ RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
+ else
+ kwq->tail = tail;
+ if (!wr_id_only && !init_sge(qp, wqe)) {
+ ret = -1;
+ goto unlock;
+ }
+ qp->r_wr_id = wqe->wr_id;
+
+ kwq->count--;
+ ret = 1;
+ set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
+ if (handler) {
+ /*
+ * Validate head pointer value and compute
+ * the number of remaining WQEs.
+ */
+ if (kwq->count < srq->limit) {
+ kwq->count =
+ rvt_get_rq_count(rq,
+ get_rvt_head(rq, ip), tail);
+ if (kwq->count < srq->limit) {
+ struct ib_event ev;
+
+ srq->limit = 0;
+ spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ handler(&ev, srq->ibsrq.srq_context);
+ goto bail;
+ }
+ }
+ }
+unlock:
+ spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
+bail:
+ return ret;
+}
+EXPORT_SYMBOL(rvt_get_rwqe);
+
+/**
+ * rvt_comm_est - handle trap with QP established
+ * @qp: the QP
+ */
+void rvt_comm_est(struct rvt_qp *qp)
+{
+ qp->r_flags |= RVT_R_COMM_EST;
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_COMM_EST;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+}
+EXPORT_SYMBOL(rvt_comm_est);
+
+void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
+{
+ unsigned long flags;
+ int lastwqe;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ lastwqe = rvt_error_qp(qp, err);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+}
+EXPORT_SYMBOL(rvt_rc_error);
+
+/*
+ * rvt_rnr_tbl_to_usec - return index into ib_rvt_rnr_table
+ * @index - the index
+ * return usec from an index into ib_rvt_rnr_table
+ */
+unsigned long rvt_rnr_tbl_to_usec(u32 index)
+{
+ return ib_rvt_rnr_table[(index & IB_AETH_CREDIT_MASK)];
+}
+EXPORT_SYMBOL(rvt_rnr_tbl_to_usec);
+
+static inline unsigned long rvt_aeth_to_usec(u32 aeth)
+{
+ return ib_rvt_rnr_table[(aeth >> IB_AETH_CREDIT_SHIFT) &
+ IB_AETH_CREDIT_MASK];
+}
+
+/*
+ * rvt_add_retry_timer_ext - add/start a retry timer
+ * @qp - the QP
+ * @shift - timeout shift to wait for multiple packets
+ * add a retry timer on the QP
+ */
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ lockdep_assert_held(&qp->s_lock);
+ qp->s_flags |= RVT_S_TIMER;
+ /* 4.096 usec. * (1 << qp->timeout) */
+ qp->s_timer.expires = jiffies + rdi->busy_jiffies +
+ (qp->timeout_jiffies << shift);
+ add_timer(&qp->s_timer);
+}
+EXPORT_SYMBOL(rvt_add_retry_timer_ext);
+
+/**
+ * rvt_add_rnr_timer - add/start an rnr timer on the QP
+ * @qp: the QP
+ * @aeth: aeth of RNR timeout, simulated aeth for loopback
+ */
+void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth)
+{
+ u32 to;
+
+ lockdep_assert_held(&qp->s_lock);
+ qp->s_flags |= RVT_S_WAIT_RNR;
+ to = rvt_aeth_to_usec(aeth);
+ trace_rvt_rnrnak_add(qp, to);
+ hrtimer_start(&qp->s_rnr_timer,
+ ns_to_ktime(1000 * to), HRTIMER_MODE_REL_PINNED);
+}
+EXPORT_SYMBOL(rvt_add_rnr_timer);
+
+/**
+ * rvt_stop_rc_timers - stop all timers
+ * @qp: the QP
+ * stop any pending timers
+ */
+void rvt_stop_rc_timers(struct rvt_qp *qp)
+{
+ lockdep_assert_held(&qp->s_lock);
+ /* Remove QP from all timers */
+ if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
+ qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
+ del_timer(&qp->s_timer);
+ hrtimer_try_to_cancel(&qp->s_rnr_timer);
+ }
+}
+EXPORT_SYMBOL(rvt_stop_rc_timers);
+
+/**
+ * rvt_stop_rnr_timer - stop an rnr timer
+ * @qp: the QP
+ *
+ * stop an rnr timer and return if the timer
+ * had been pending.
+ */
+static void rvt_stop_rnr_timer(struct rvt_qp *qp)
+{
+ lockdep_assert_held(&qp->s_lock);
+ /* Remove QP from rnr timer */
+ if (qp->s_flags & RVT_S_WAIT_RNR) {
+ qp->s_flags &= ~RVT_S_WAIT_RNR;
+ trace_rvt_rnrnak_stop(qp, 0);
+ }
+}
+
+/**
+ * rvt_del_timers_sync - wait for any timeout routines to exit
+ * @qp: the QP
+ */
+void rvt_del_timers_sync(struct rvt_qp *qp)
+{
+ del_timer_sync(&qp->s_timer);
+ hrtimer_cancel(&qp->s_rnr_timer);
+}
+EXPORT_SYMBOL(rvt_del_timers_sync);
+
+/*
+ * This is called from s_timer for missing responses.
+ */
+static void rvt_rc_timeout(struct timer_list *t)
+{
+ struct rvt_qp *qp = from_timer(qp, t, s_timer);
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ if (qp->s_flags & RVT_S_TIMER) {
+ struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
+
+ qp->s_flags &= ~RVT_S_TIMER;
+ rvp->n_rc_timeouts++;
+ del_timer(&qp->s_timer);
+ trace_rvt_rc_timeout(qp, qp->s_last_psn + 1);
+ if (rdi->driver_f.notify_restart_rc)
+ rdi->driver_f.notify_restart_rc(qp,
+ qp->s_last_psn + 1,
+ 1);
+ rdi->driver_f.schedule_send(qp);
+ }
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+/*
+ * This is called from s_timer for RNR timeouts.
+ */
+enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t)
+{
+ struct rvt_qp *qp = container_of(t, struct rvt_qp, s_rnr_timer);
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ rvt_stop_rnr_timer(qp);
+ trace_rvt_rnrnak_timeout(qp, 0);
+ rdi->driver_f.schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return HRTIMER_NORESTART;
+}
+EXPORT_SYMBOL(rvt_rc_rnr_retry);
+
+/**
+ * rvt_qp_iter_init - initial for QP iteration
+ * @rdi: rvt devinfo
+ * @v: u64 value
+ * @cb: user-defined callback
+ *
+ * This returns an iterator suitable for iterating QPs
+ * in the system.
+ *
+ * The @cb is a user-defined callback and @v is a 64-bit
+ * value passed to and relevant for processing in the
+ * @cb. An example use case would be to alter QP processing
+ * based on criteria not part of the rvt_qp.
+ *
+ * Use cases that require memory allocation to succeed
+ * must preallocate appropriately.
+ *
+ * Return: a pointer to an rvt_qp_iter or NULL
+ */
+struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi,
+ u64 v,
+ void (*cb)(struct rvt_qp *qp, u64 v))
+{
+ struct rvt_qp_iter *i;
+
+ i = kzalloc(sizeof(*i), GFP_KERNEL);
+ if (!i)
+ return NULL;
+
+ i->rdi = rdi;
+ /* number of special QPs (SMI/GSI) for device */
+ i->specials = rdi->ibdev.phys_port_cnt * 2;
+ i->v = v;
+ i->cb = cb;
+
+ return i;
+}
+EXPORT_SYMBOL(rvt_qp_iter_init);
+
+/**
+ * rvt_qp_iter_next - return the next QP in iter
+ * @iter: the iterator
+ *
+ * Fine grained QP iterator suitable for use
+ * with debugfs seq_file mechanisms.
+ *
+ * Updates iter->qp with the current QP when the return
+ * value is 0.
+ *
+ * Return: 0 - iter->qp is valid 1 - no more QPs
+ */
+int rvt_qp_iter_next(struct rvt_qp_iter *iter)
+ __must_hold(RCU)
+{
+ int n = iter->n;
+ int ret = 1;
+ struct rvt_qp *pqp = iter->qp;
+ struct rvt_qp *qp;
+ struct rvt_dev_info *rdi = iter->rdi;
+
+ /*
+ * The approach is to consider the special qps
+ * as additional table entries before the
+ * real hash table. Since the qp code sets
+ * the qp->next hash link to NULL, this works just fine.
+ *
+ * iter->specials is 2 * # ports
+ *
+ * n = 0..iter->specials is the special qp indices
+ *
+ * n = iter->specials..rdi->qp_dev->qp_table_size+iter->specials are
+ * the potential hash bucket entries
+ *
+ */
+ for (; n < rdi->qp_dev->qp_table_size + iter->specials; n++) {
+ if (pqp) {
+ qp = rcu_dereference(pqp->next);
+ } else {
+ if (n < iter->specials) {
+ struct rvt_ibport *rvp;
+ int pidx;
+
+ pidx = n % rdi->ibdev.phys_port_cnt;
+ rvp = rdi->ports[pidx];
+ qp = rcu_dereference(rvp->qp[n & 1]);
+ } else {
+ qp = rcu_dereference(
+ rdi->qp_dev->qp_table[
+ (n - iter->specials)]);
+ }
+ }
+ pqp = qp;
+ if (qp) {
+ iter->qp = qp;
+ iter->n = n;
+ return 0;
+ }
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rvt_qp_iter_next);
+
+/**
+ * rvt_qp_iter - iterate all QPs
+ * @rdi: rvt devinfo
+ * @v: a 64-bit value
+ * @cb: a callback
+ *
+ * This provides a way for iterating all QPs.
+ *
+ * The @cb is a user-defined callback and @v is a 64-bit
+ * value passed to and relevant for processing in the
+ * cb. An example use case would be to alter QP processing
+ * based on criteria not part of the rvt_qp.
+ *
+ * The code has an internal iterator to simplify
+ * non seq_file use cases.
+ */
+void rvt_qp_iter(struct rvt_dev_info *rdi,
+ u64 v,
+ void (*cb)(struct rvt_qp *qp, u64 v))
+{
+ int ret;
+ struct rvt_qp_iter i = {
+ .rdi = rdi,
+ .specials = rdi->ibdev.phys_port_cnt * 2,
+ .v = v,
+ .cb = cb
+ };
+
+ rcu_read_lock();
+ do {
+ ret = rvt_qp_iter_next(&i);
+ if (!ret) {
+ rvt_get_qp(i.qp);
+ rcu_read_unlock();
+ i.cb(i.qp, i.v);
+ rcu_read_lock();
+ rvt_put_qp(i.qp);
+ }
+ } while (!ret);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(rvt_qp_iter);
+
+/*
+ * This should be called with s_lock and r_lock held.
+ */
+void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ enum ib_wc_status status)
+{
+ u32 old_last, last;
+ struct rvt_dev_info *rdi;
+
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+ return;
+ rdi = ib_to_rvt(qp->ibqp.device);
+
+ old_last = qp->s_last;
+ trace_rvt_qp_send_completion(qp, wqe, old_last);
+ last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode],
+ status);
+ if (qp->s_acked == old_last)
+ qp->s_acked = last;
+ if (qp->s_cur == old_last)
+ qp->s_cur = last;
+ if (qp->s_tail == old_last)
+ qp->s_tail = last;
+ if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+ qp->s_draining = 0;
+}
+EXPORT_SYMBOL(rvt_send_complete);
+
+/**
+ * rvt_copy_sge - copy data to SGE memory
+ * @qp: associated QP
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ * @release: boolean to release MR
+ * @copy_last: do a separate copy of the last 8 bytes
+ */
+void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
+ void *data, u32 length,
+ bool release, bool copy_last)
+{
+ struct rvt_sge *sge = &ss->sge;
+ int i;
+ bool in_last = false;
+ bool cacheless_copy = false;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ struct rvt_wss *wss = rdi->wss;
+ unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
+
+ if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) {
+ cacheless_copy = length >= PAGE_SIZE;
+ } else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) {
+ if (length >= PAGE_SIZE) {
+ /*
+ * NOTE: this *assumes*:
+ * o The first vaddr is the dest.
+ * o If multiple pages, then vaddr is sequential.
+ */
+ wss_insert(wss, sge->vaddr);
+ if (length >= (2 * PAGE_SIZE))
+ wss_insert(wss, (sge->vaddr + PAGE_SIZE));
+
+ cacheless_copy = wss_exceeds_threshold(wss);
+ } else {
+ wss_advance_clean_counter(wss);
+ }
+ }
+
+ if (copy_last) {
+ if (length > 8) {
+ length -= 8;
+ } else {
+ copy_last = false;
+ in_last = true;
+ }
+ }
+
+again:
+ while (length) {
+ u32 len = rvt_get_sge_length(sge, length);
+
+ WARN_ON_ONCE(len == 0);
+ if (unlikely(in_last)) {
+ /* enforce byte transfer ordering */
+ for (i = 0; i < len; i++)
+ ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
+ } else if (cacheless_copy) {
+ cacheless_memcpy(sge->vaddr, data, len);
+ } else {
+ memcpy(sge->vaddr, data, len);
+ }
+ rvt_update_sge(ss, len, release);
+ data += len;
+ length -= len;
+ }
+
+ if (copy_last) {
+ copy_last = false;
+ in_last = true;
+ length = 8;
+ goto again;
+ }
+}
+EXPORT_SYMBOL(rvt_copy_sge);
+
+static enum ib_wc_status loopback_qp_drop(struct rvt_ibport *rvp,
+ struct rvt_qp *sqp)
+{
+ rvp->n_pkt_drops++;
+ /*
+ * For RC, the requester would timeout and retry so
+ * shortcut the timeouts and just signal too many retries.
+ */
+ return sqp->ibqp.qp_type == IB_QPT_RC ?
+ IB_WC_RETRY_EXC_ERR : IB_WC_SUCCESS;
+}
+
+/**
+ * rvt_ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from rvt_do_send() to forward a WQE addressed to the same HFI
+ * Note that although we are single threaded due to the send engine, we still
+ * have to protect against post_send(). We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+void rvt_ruc_loopback(struct rvt_qp *sqp)
+{
+ struct rvt_ibport *rvp = NULL;
+ struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device);
+ struct rvt_qp *qp;
+ struct rvt_swqe *wqe;
+ struct rvt_sge *sge;
+ unsigned long flags;
+ struct ib_wc wc;
+ u64 sdata;
+ atomic64_t *maddr;
+ enum ib_wc_status send_status;
+ bool release;
+ int ret;
+ bool copy_last = false;
+ int local_ops = 0;
+
+ rcu_read_lock();
+ rvp = rdi->ports[sqp->port_num - 1];
+
+ /*
+ * Note that we check the responder QP state after
+ * checking the requester's state.
+ */
+
+ qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp,
+ sqp->remote_qpn);
+
+ spin_lock_irqsave(&sqp->s_lock, flags);
+
+ /* Return if we are already busy processing a work request. */
+ if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
+ !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
+ goto unlock;
+
+ sqp->s_flags |= RVT_S_BUSY;
+
+again:
+ if (sqp->s_last == READ_ONCE(sqp->s_head))
+ goto clr_busy;
+ wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
+
+ /* Return if it is not OK to start a new work request. */
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
+ goto clr_busy;
+ /* We are in the error state, flush the work request. */
+ send_status = IB_WC_WR_FLUSH_ERR;
+ goto flush_send;
+ }
+
+ /*
+ * We can rely on the entry not changing without the s_lock
+ * being held until we update s_last.
+ * We increment s_cur to indicate s_last is in progress.
+ */
+ if (sqp->s_last == sqp->s_cur) {
+ if (++sqp->s_cur >= sqp->s_size)
+ sqp->s_cur = 0;
+ }
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+ if (!qp) {
+ send_status = loopback_qp_drop(rvp, sqp);
+ goto serr_no_r_lock;
+ }
+ spin_lock_irqsave(&qp->r_lock, flags);
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
+ qp->ibqp.qp_type != sqp->ibqp.qp_type) {
+ send_status = loopback_qp_drop(rvp, sqp);
+ goto serr;
+ }
+
+ memset(&wc, 0, sizeof(wc));
+ send_status = IB_WC_SUCCESS;
+
+ release = true;
+ sqp->s_sge.sge = wqe->sg_list[0];
+ sqp->s_sge.sg_list = wqe->sg_list + 1;
+ sqp->s_sge.num_sge = wqe->wr.num_sge;
+ sqp->s_len = wqe->length;
+ switch (wqe->wr.opcode) {
+ case IB_WR_REG_MR:
+ goto send_comp;
+
+ case IB_WR_LOCAL_INV:
+ if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
+ if (rvt_invalidate_rkey(sqp,
+ wqe->wr.ex.invalidate_rkey))
+ send_status = IB_WC_LOC_PROT_ERR;
+ local_ops = 1;
+ }
+ goto send_comp;
+
+ case IB_WR_SEND_WITH_INV:
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_SEND:
+ ret = rvt_get_rwqe(qp, false);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto rnr_nak;
+ if (wqe->length > qp->r_len)
+ goto inv_err;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND_WITH_INV:
+ if (!rvt_invalidate_rkey(qp,
+ wqe->wr.ex.invalidate_rkey)) {
+ wc.wc_flags = IB_WC_WITH_INVALIDATE;
+ wc.ex.invalidate_rkey =
+ wqe->wr.ex.invalidate_rkey;
+ }
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ break;
+ default:
+ break;
+ }
+ break;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ ret = rvt_get_rwqe(qp, true);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto rnr_nak;
+ /* skip copy_last set and qp_access_flags recheck */
+ goto do_write;
+ case IB_WR_RDMA_WRITE:
+ copy_last = rvt_is_user_qp(qp);
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+do_write:
+ if (wqe->length == 0)
+ break;
+ if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
+ wqe->rdma_wr.remote_addr,
+ wqe->rdma_wr.rkey,
+ IB_ACCESS_REMOTE_WRITE)))
+ goto acc_err;
+ qp->r_sge.sg_list = NULL;
+ qp->r_sge.num_sge = 1;
+ qp->r_sge.total_len = wqe->length;
+ break;
+
+ case IB_WR_RDMA_READ:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto inv_err;
+ if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
+ wqe->rdma_wr.remote_addr,
+ wqe->rdma_wr.rkey,
+ IB_ACCESS_REMOTE_READ)))
+ goto acc_err;
+ release = false;
+ sqp->s_sge.sg_list = NULL;
+ sqp->s_sge.num_sge = 1;
+ qp->r_sge.sge = wqe->sg_list[0];
+ qp->r_sge.sg_list = wqe->sg_list + 1;
+ qp->r_sge.num_sge = wqe->wr.num_sge;
+ qp->r_sge.total_len = wqe->length;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ goto inv_err;
+ if (unlikely(wqe->atomic_wr.remote_addr & (sizeof(u64) - 1)))
+ goto inv_err;
+ if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+ wqe->atomic_wr.remote_addr,
+ wqe->atomic_wr.rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto acc_err;
+ /* Perform atomic OP and save result. */
+ maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
+ sdata = wqe->atomic_wr.compare_add;
+ *(u64 *)sqp->s_sge.sge.vaddr =
+ (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+ (u64)atomic64_add_return(sdata, maddr) - sdata :
+ (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
+ sdata, wqe->atomic_wr.swap);
+ rvt_put_mr(qp->r_sge.sge.mr);
+ qp->r_sge.num_sge = 0;
+ goto send_comp;
+
+ default:
+ send_status = IB_WC_LOC_QP_OP_ERR;
+ goto serr;
+ }
+
+ sge = &sqp->s_sge.sge;
+ while (sqp->s_len) {
+ u32 len = rvt_get_sge_length(sge, sqp->s_len);
+
+ WARN_ON_ONCE(len == 0);
+ rvt_copy_sge(qp, &qp->r_sge, sge->vaddr,
+ len, release, copy_last);
+ rvt_update_sge(&sqp->s_sge, len, !release);
+ sqp->s_len -= len;
+ }
+ if (release)
+ rvt_put_ss(&qp->r_sge);
+
+ if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
+ goto send_comp;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
+ wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
+ wc.port_num = 1;
+ /* Signal completion event if the solicited bit is set. */
+ rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ rvp->n_loop_pkts++;
+flush_send:
+ sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+ spin_lock(&sqp->r_lock);
+ rvt_send_complete(sqp, wqe, send_status);
+ spin_unlock(&sqp->r_lock);
+ if (local_ops) {
+ atomic_dec(&sqp->local_ops_pending);
+ local_ops = 0;
+ }
+ goto again;
+
+rnr_nak:
+ /* Handle RNR NAK */
+ if (qp->ibqp.qp_type == IB_QPT_UC)
+ goto send_comp;
+ rvp->n_rnr_naks++;
+ /*
+ * Note: we don't need the s_lock held since the BUSY flag
+ * makes this single threaded.
+ */
+ if (sqp->s_rnr_retry == 0) {
+ send_status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto serr;
+ }
+ if (sqp->s_rnr_retry_cnt < 7)
+ sqp->s_rnr_retry--;
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
+ goto clr_busy;
+ rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
+ IB_AETH_CREDIT_SHIFT);
+ goto clr_busy;
+
+op_err:
+ send_status = IB_WC_REM_OP_ERR;
+ wc.status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+
+inv_err:
+ send_status =
+ sqp->ibqp.qp_type == IB_QPT_RC ?
+ IB_WC_REM_INV_REQ_ERR :
+ IB_WC_SUCCESS;
+ wc.status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+
+acc_err:
+ send_status = IB_WC_REM_ACCESS_ERR;
+ wc.status = IB_WC_LOC_PROT_ERR;
+err:
+ /* responder goes to error state */
+ rvt_rc_error(qp, wc.status);
+
+serr:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+serr_no_r_lock:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ spin_lock(&sqp->r_lock);
+ rvt_send_complete(sqp, wqe, send_status);
+ spin_unlock(&sqp->r_lock);
+ if (sqp->ibqp.qp_type == IB_QPT_RC) {
+ int lastwqe;
+
+ spin_lock(&sqp->r_lock);
+ lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+ spin_unlock(&sqp->r_lock);
+
+ sqp->s_flags &= ~RVT_S_BUSY;
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = sqp->ibqp.device;
+ ev.element.qp = &sqp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+ }
+ goto done;
+ }
+clr_busy:
+ sqp->s_flags &= ~RVT_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(rvt_ruc_loopback);
diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h
new file mode 100644
index 0000000000..bd04be8072
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/qp.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#ifndef DEF_RVTQP_H
+#define DEF_RVTQP_H
+
+#include <rdma/rdmavt_qp.h>
+
+int rvt_driver_qp_init(struct rvt_dev_info *rdi);
+void rvt_qp_exit(struct rvt_dev_info *rdi);
+int rvt_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
+int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr);
+int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr);
+int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr);
+int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr);
+int rvt_wss_init(struct rvt_dev_info *rdi);
+void rvt_wss_exit(struct rvt_dev_info *rdi);
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
+ struct ib_udata *udata);
+#endif /* DEF_RVTQP_H */
diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c
new file mode 100644
index 0000000000..4e5d4a2763
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/rc.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#include <rdma/rdmavt_qp.h>
+#include <rdma/ib_hdrs.h>
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static const u16 credit_table[31] = {
+ 0, /* 0 */
+ 1, /* 1 */
+ 2, /* 2 */
+ 3, /* 3 */
+ 4, /* 4 */
+ 6, /* 5 */
+ 8, /* 6 */
+ 12, /* 7 */
+ 16, /* 8 */
+ 24, /* 9 */
+ 32, /* A */
+ 48, /* B */
+ 64, /* C */
+ 96, /* D */
+ 128, /* E */
+ 192, /* F */
+ 256, /* 10 */
+ 384, /* 11 */
+ 512, /* 12 */
+ 768, /* 13 */
+ 1024, /* 14 */
+ 1536, /* 15 */
+ 2048, /* 16 */
+ 3072, /* 17 */
+ 4096, /* 18 */
+ 6144, /* 19 */
+ 8192, /* 1A */
+ 12288, /* 1B */
+ 16384, /* 1C */
+ 24576, /* 1D */
+ 32768 /* 1E */
+};
+
+/**
+ * rvt_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 rvt_compute_aeth(struct rvt_qp *qp)
+{
+ u32 aeth = qp->r_msn & IB_MSN_MASK;
+
+ if (qp->ibqp.srq) {
+ /*
+ * Shared receive queues don't generate credits.
+ * Set the credit field to the invalid value.
+ */
+ aeth |= IB_AETH_CREDIT_INVAL << IB_AETH_CREDIT_SHIFT;
+ } else {
+ u32 min, max, x;
+ u32 credits;
+ u32 head;
+ u32 tail;
+
+ credits = READ_ONCE(qp->r_rq.kwq->count);
+ if (credits == 0) {
+ /* sanity check pointers before trusting them */
+ if (qp->ip) {
+ head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head);
+ tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail);
+ } else {
+ head = READ_ONCE(qp->r_rq.kwq->head);
+ tail = READ_ONCE(qp->r_rq.kwq->tail);
+ }
+ if (head >= qp->r_rq.size)
+ head = 0;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ /*
+ * Compute the number of credits available (RWQEs).
+ * There is a small chance that the pair of reads are
+ * not atomic, which is OK, since the fuzziness is
+ * resolved as further ACKs go out.
+ */
+ credits = rvt_get_rq_count(&qp->r_rq, head, tail);
+ }
+ /*
+ * Binary search the credit table to find the code to
+ * use.
+ */
+ min = 0;
+ max = 31;
+ for (;;) {
+ x = (min + max) / 2;
+ if (credit_table[x] == credits)
+ break;
+ if (credit_table[x] > credits) {
+ max = x;
+ } else {
+ if (min == x)
+ break;
+ min = x;
+ }
+ }
+ aeth |= x << IB_AETH_CREDIT_SHIFT;
+ }
+ return cpu_to_be32(aeth);
+}
+EXPORT_SYMBOL(rvt_compute_aeth);
+
+/**
+ * rvt_get_credit - flush the send work queue of a QP
+ * @qp: the qp who's send work queue to flush
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void rvt_get_credit(struct rvt_qp *qp, u32 aeth)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ u32 credit = (aeth >> IB_AETH_CREDIT_SHIFT) & IB_AETH_CREDIT_MASK;
+
+ lockdep_assert_held(&qp->s_lock);
+ /*
+ * If the credit is invalid, we can send
+ * as many packets as we like. Otherwise, we have to
+ * honor the credit field.
+ */
+ if (credit == IB_AETH_CREDIT_INVAL) {
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
+ qp->s_flags |= RVT_S_UNLIMITED_CREDIT;
+ if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
+ qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
+ rdi->driver_f.schedule_send(qp);
+ }
+ }
+ } else if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) {
+ /* Compute new LSN (i.e., MSN + credit) */
+ credit = (aeth + credit_table[credit]) & IB_MSN_MASK;
+ if (rvt_cmp_msn(credit, qp->s_lsn) > 0) {
+ qp->s_lsn = credit;
+ if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) {
+ qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT;
+ rdi->driver_f.schedule_send(qp);
+ }
+ }
+ }
+}
+EXPORT_SYMBOL(rvt_get_credit);
+
+/**
+ * rvt_restart_sge - rewind the sge state for a wqe
+ * @ss: the sge state pointer
+ * @wqe: the wqe to rewind
+ * @len: the data length from the start of the wqe in bytes
+ *
+ * Returns the remaining data length.
+ */
+u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len)
+{
+ ss->sge = wqe->sg_list[0];
+ ss->sg_list = wqe->sg_list + 1;
+ ss->num_sge = wqe->wr.num_sge;
+ ss->total_len = wqe->length;
+ rvt_skip_sge(ss, len, false);
+ return wqe->length - len;
+}
+EXPORT_SYMBOL(rvt_restart_sge);
+
diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c
new file mode 100644
index 0000000000..14d196bde2
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/srq.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "srq.h"
+#include "vt.h"
+#include "qp.h"
+/**
+ * rvt_driver_srq_init - init srq resources on a per driver basis
+ * @rdi: rvt dev structure
+ *
+ * Do any initialization needed when a driver registers with rdmavt.
+ */
+void rvt_driver_srq_init(struct rvt_dev_info *rdi)
+{
+ spin_lock_init(&rdi->n_srqs_lock);
+ rdi->n_srqs_allocated = 0;
+}
+
+/**
+ * rvt_create_srq - create a shared receive queue
+ * @ibsrq: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libibverbs when creating a user SRQ
+ *
+ * Return: 0 on success
+ */
+int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata)
+{
+ struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
+ struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
+ u32 sz;
+ int ret;
+
+ if (srq_init_attr->srq_type != IB_SRQT_BASIC)
+ return -EOPNOTSUPP;
+
+ if (srq_init_attr->attr.max_sge == 0 ||
+ srq_init_attr->attr.max_sge > dev->dparms.props.max_srq_sge ||
+ srq_init_attr->attr.max_wr == 0 ||
+ srq_init_attr->attr.max_wr > dev->dparms.props.max_srq_wr)
+ return -EINVAL;
+
+ /*
+ * Need to use vmalloc() if we want to support large #s of entries.
+ */
+ srq->rq.size = srq_init_attr->attr.max_wr + 1;
+ srq->rq.max_sge = srq_init_attr->attr.max_sge;
+ sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+ sizeof(struct rvt_rwqe);
+ if (rvt_alloc_rq(&srq->rq, srq->rq.size * sz,
+ dev->dparms.node, udata)) {
+ ret = -ENOMEM;
+ goto bail_srq;
+ }
+
+ /*
+ * Return the address of the RWQ as the offset to mmap.
+ * See rvt_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ u32 s = sizeof(struct rvt_rwq) + srq->rq.size * sz;
+
+ srq->ip = rvt_create_mmap_info(dev, s, udata, srq->rq.wq);
+ if (IS_ERR(srq->ip)) {
+ ret = PTR_ERR(srq->ip);
+ goto bail_wq;
+ }
+
+ ret = ib_copy_to_udata(udata, &srq->ip->offset,
+ sizeof(srq->ip->offset));
+ if (ret)
+ goto bail_ip;
+ }
+
+ /*
+ * ib_create_srq() will initialize srq->ibsrq.
+ */
+ spin_lock_init(&srq->rq.lock);
+ srq->limit = srq_init_attr->attr.srq_limit;
+
+ spin_lock(&dev->n_srqs_lock);
+ if (dev->n_srqs_allocated == dev->dparms.props.max_srq) {
+ spin_unlock(&dev->n_srqs_lock);
+ ret = -ENOMEM;
+ goto bail_ip;
+ }
+
+ dev->n_srqs_allocated++;
+ spin_unlock(&dev->n_srqs_lock);
+
+ if (srq->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ return 0;
+
+bail_ip:
+ kfree(srq->ip);
+bail_wq:
+ rvt_free_rq(&srq->rq);
+bail_srq:
+ return ret;
+}
+
+/**
+ * rvt_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for libibverbs.so
+ *
+ * Return: 0 on success
+ */
+int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask,
+ struct ib_udata *udata)
+{
+ struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
+ struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
+ struct rvt_rq tmp_rq = {};
+ int ret = 0;
+
+ if (attr_mask & IB_SRQ_MAX_WR) {
+ struct rvt_krwq *okwq = NULL;
+ struct rvt_rwq *owq = NULL;
+ struct rvt_rwqe *p;
+ u32 sz, size, n, head, tail;
+
+ /* Check that the requested sizes are below the limits. */
+ if ((attr->max_wr > dev->dparms.props.max_srq_wr) ||
+ ((attr_mask & IB_SRQ_LIMIT) ?
+ attr->srq_limit : srq->limit) > attr->max_wr)
+ return -EINVAL;
+ sz = sizeof(struct rvt_rwqe) +
+ srq->rq.max_sge * sizeof(struct ib_sge);
+ size = attr->max_wr + 1;
+ if (rvt_alloc_rq(&tmp_rq, size * sz, dev->dparms.node,
+ udata))
+ return -ENOMEM;
+ /* Check that we can write the offset to mmap. */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ __u64 offset_addr;
+ __u64 offset = 0;
+
+ ret = ib_copy_from_udata(&offset_addr, udata,
+ sizeof(offset_addr));
+ if (ret)
+ goto bail_free;
+ udata->outbuf = (void __user *)
+ (unsigned long)offset_addr;
+ ret = ib_copy_to_udata(udata, &offset,
+ sizeof(offset));
+ if (ret)
+ goto bail_free;
+ }
+
+ spin_lock_irq(&srq->rq.kwq->c_lock);
+ /*
+ * validate head and tail pointer values and compute
+ * the number of remaining WQEs.
+ */
+ if (udata) {
+ owq = srq->rq.wq;
+ head = RDMA_READ_UAPI_ATOMIC(owq->head);
+ tail = RDMA_READ_UAPI_ATOMIC(owq->tail);
+ } else {
+ okwq = srq->rq.kwq;
+ head = okwq->head;
+ tail = okwq->tail;
+ }
+ if (head >= srq->rq.size || tail >= srq->rq.size) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ n = head;
+ if (n < tail)
+ n += srq->rq.size - tail;
+ else
+ n -= tail;
+ if (size <= n) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ n = 0;
+ p = tmp_rq.kwq->curr_wq;
+ while (tail != head) {
+ struct rvt_rwqe *wqe;
+ int i;
+
+ wqe = rvt_get_rwqe_ptr(&srq->rq, tail);
+ p->wr_id = wqe->wr_id;
+ p->num_sge = wqe->num_sge;
+ for (i = 0; i < wqe->num_sge; i++)
+ p->sg_list[i] = wqe->sg_list[i];
+ n++;
+ p = (struct rvt_rwqe *)((char *)p + sz);
+ if (++tail >= srq->rq.size)
+ tail = 0;
+ }
+ srq->rq.kwq = tmp_rq.kwq;
+ if (udata) {
+ srq->rq.wq = tmp_rq.wq;
+ RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->head, n);
+ RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->tail, 0);
+ } else {
+ tmp_rq.kwq->head = n;
+ tmp_rq.kwq->tail = 0;
+ }
+ srq->rq.size = size;
+ if (attr_mask & IB_SRQ_LIMIT)
+ srq->limit = attr->srq_limit;
+ spin_unlock_irq(&srq->rq.kwq->c_lock);
+
+ vfree(owq);
+ kvfree(okwq);
+
+ if (srq->ip) {
+ struct rvt_mmap_info *ip = srq->ip;
+ struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device);
+ u32 s = sizeof(struct rvt_rwq) + size * sz;
+
+ rvt_update_mmap_info(dev, ip, s, tmp_rq.wq);
+
+ /*
+ * Return the offset to mmap.
+ * See rvt_mmap() for details.
+ */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ ret = ib_copy_to_udata(udata, &ip->offset,
+ sizeof(ip->offset));
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Put user mapping info onto the pending list
+ * unless it already is on the list.
+ */
+ spin_lock_irq(&dev->pending_lock);
+ if (list_empty(&ip->pending_mmaps))
+ list_add(&ip->pending_mmaps,
+ &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+ } else if (attr_mask & IB_SRQ_LIMIT) {
+ spin_lock_irq(&srq->rq.kwq->c_lock);
+ if (attr->srq_limit >= srq->rq.size)
+ ret = -EINVAL;
+ else
+ srq->limit = attr->srq_limit;
+ spin_unlock_irq(&srq->rq.kwq->c_lock);
+ }
+ return ret;
+
+bail_unlock:
+ spin_unlock_irq(&srq->rq.kwq->c_lock);
+bail_free:
+ rvt_free_rq(&tmp_rq);
+ return ret;
+}
+
+/**
+ * rvt_query_srq - query srq data
+ * @ibsrq: srq to query
+ * @attr: return info in attr
+ *
+ * Return: always 0
+ */
+int rvt_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+ struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
+
+ attr->max_wr = srq->rq.size - 1;
+ attr->max_sge = srq->rq.max_sge;
+ attr->srq_limit = srq->limit;
+ return 0;
+}
+
+/**
+ * rvt_destroy_srq - destory an srq
+ * @ibsrq: srq object to destroy
+ * @udata: user data for libibverbs.so
+ */
+int rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
+{
+ struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
+ struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
+
+ spin_lock(&dev->n_srqs_lock);
+ dev->n_srqs_allocated--;
+ spin_unlock(&dev->n_srqs_lock);
+ if (srq->ip)
+ kref_put(&srq->ip->ref, rvt_release_mmap_info);
+ kvfree(srq->rq.kwq);
+ return 0;
+}
diff --git a/drivers/infiniband/sw/rdmavt/srq.h b/drivers/infiniband/sw/rdmavt/srq.h
new file mode 100644
index 0000000000..7d17372cd2
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/srq.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#ifndef DEF_RVTSRQ_H
+#define DEF_RVTSRQ_H
+
+#include <rdma/rdma_vt.h>
+void rvt_driver_srq_init(struct rvt_dev_info *rdi);
+int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask,
+ struct ib_udata *udata);
+int rvt_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+int rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
+
+#endif /* DEF_RVTSRQ_H */
diff --git a/drivers/infiniband/sw/rdmavt/trace.c b/drivers/infiniband/sw/rdmavt/trace.c
new file mode 100644
index 0000000000..01704b8dd6
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/trace.c
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/drivers/infiniband/sw/rdmavt/trace.h b/drivers/infiniband/sw/rdmavt/trace.h
new file mode 100644
index 0000000000..30eb4a72ea
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/trace.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016, 2017 Intel Corporation.
+ */
+
+#define RDI_DEV_ENTRY(rdi) __string(dev, rvt_get_ibdev_name(rdi))
+#define RDI_DEV_ASSIGN(rdi) __assign_str(dev, rvt_get_ibdev_name(rdi))
+
+#include "trace_rvt.h"
+#include "trace_qp.h"
+#include "trace_tx.h"
+#include "trace_mr.h"
+#include "trace_cq.h"
+#include "trace_rc.h"
diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h b/drivers/infiniband/sw/rdmavt/trace_cq.h
new file mode 100644
index 0000000000..30dd1d9bae
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/trace_cq.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 - 2018 Intel Corporation.
+ */
+#if !defined(__RVT_TRACE_CQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __RVT_TRACE_CQ_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdmavt_cq.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rvt_cq
+
+#define wc_opcode_name(opcode) { IB_WC_##opcode, #opcode }
+#define show_wc_opcode(opcode) \
+__print_symbolic(opcode, \
+ wc_opcode_name(SEND), \
+ wc_opcode_name(RDMA_WRITE), \
+ wc_opcode_name(RDMA_READ), \
+ wc_opcode_name(COMP_SWAP), \
+ wc_opcode_name(FETCH_ADD), \
+ wc_opcode_name(LSO), \
+ wc_opcode_name(LOCAL_INV), \
+ wc_opcode_name(REG_MR), \
+ wc_opcode_name(MASKED_COMP_SWAP), \
+ wc_opcode_name(RECV), \
+ wc_opcode_name(RECV_RDMA_WITH_IMM))
+
+#define CQ_ATTR_PRINT \
+"[%s] user cq %s cqe %u comp_vector %d comp_vector_cpu %d flags %x"
+
+DECLARE_EVENT_CLASS(rvt_cq_template,
+ TP_PROTO(struct rvt_cq *cq,
+ const struct ib_cq_init_attr *attr),
+ TP_ARGS(cq, attr),
+ TP_STRUCT__entry(RDI_DEV_ENTRY(cq->rdi)
+ __field(struct rvt_mmap_info *, ip)
+ __field(unsigned int, cqe)
+ __field(int, comp_vector)
+ __field(int, comp_vector_cpu)
+ __field(u32, flags)
+ ),
+ TP_fast_assign(RDI_DEV_ASSIGN(cq->rdi);
+ __entry->ip = cq->ip;
+ __entry->cqe = attr->cqe;
+ __entry->comp_vector = attr->comp_vector;
+ __entry->comp_vector_cpu =
+ cq->comp_vector_cpu;
+ __entry->flags = attr->flags;
+ ),
+ TP_printk(CQ_ATTR_PRINT, __get_str(dev),
+ __entry->ip ? "true" : "false", __entry->cqe,
+ __entry->comp_vector, __entry->comp_vector_cpu,
+ __entry->flags
+ )
+);
+
+DEFINE_EVENT(rvt_cq_template, rvt_create_cq,
+ TP_PROTO(struct rvt_cq *cq, const struct ib_cq_init_attr *attr),
+ TP_ARGS(cq, attr));
+
+#define CQ_PRN \
+"[%s] idx %u wr_id %llx status %u opcode %u,%s length %u qpn %x flags %x imm %x"
+
+DECLARE_EVENT_CLASS(
+ rvt_cq_entry_template,
+ TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx),
+ TP_ARGS(cq, wc, idx),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(cq->rdi)
+ __field(u64, wr_id)
+ __field(u32, status)
+ __field(u32, opcode)
+ __field(u32, qpn)
+ __field(u32, length)
+ __field(u32, idx)
+ __field(u32, flags)
+ __field(u32, imm)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(cq->rdi);
+ __entry->wr_id = wc->wr_id;
+ __entry->status = wc->status;
+ __entry->opcode = wc->opcode;
+ __entry->length = wc->byte_len;
+ __entry->qpn = wc->qp->qp_num;
+ __entry->idx = idx;
+ __entry->flags = wc->wc_flags;
+ __entry->imm = be32_to_cpu(wc->ex.imm_data);
+ ),
+ TP_printk(
+ CQ_PRN,
+ __get_str(dev),
+ __entry->idx,
+ __entry->wr_id,
+ __entry->status,
+ __entry->opcode, show_wc_opcode(__entry->opcode),
+ __entry->length,
+ __entry->qpn,
+ __entry->flags,
+ __entry->imm
+ )
+);
+
+DEFINE_EVENT(
+ rvt_cq_entry_template, rvt_cq_enter,
+ TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx),
+ TP_ARGS(cq, wc, idx));
+
+DEFINE_EVENT(
+ rvt_cq_entry_template, rvt_cq_poll,
+ TP_PROTO(struct rvt_cq *cq, struct ib_wc *wc, u32 idx),
+ TP_ARGS(cq, wc, idx));
+
+#endif /* __RVT_TRACE_CQ_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_cq
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h
new file mode 100644
index 0000000000..1de7012000
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/trace_mr.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+#if !defined(__RVT_TRACE_MR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __RVT_TRACE_MR_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_mr.h>
+
+#include "mr.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rvt_mr
+DECLARE_EVENT_CLASS(
+ rvt_mr_template,
+ TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len),
+ TP_ARGS(mr, m, n, v, len),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(ib_to_rvt(mr->pd->device))
+ __field(void *, vaddr)
+ __field(struct page *, page)
+ __field(u64, iova)
+ __field(u64, user_base)
+ __field(size_t, len)
+ __field(size_t, length)
+ __field(u32, lkey)
+ __field(u32, offset)
+ __field(u16, m)
+ __field(u16, n)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(ib_to_rvt(mr->pd->device));
+ __entry->vaddr = v;
+ __entry->page = virt_to_page(v);
+ __entry->iova = mr->iova;
+ __entry->user_base = mr->user_base;
+ __entry->lkey = mr->lkey;
+ __entry->m = m;
+ __entry->n = n;
+ __entry->len = len;
+ __entry->length = mr->length;
+ __entry->offset = mr->offset;
+ ),
+ TP_printk(
+ "[%s] lkey %x iova %llx user_base %llx mr_len %lu vaddr %llx page %p m %u n %u len %lu off %u",
+ __get_str(dev),
+ __entry->lkey,
+ __entry->iova,
+ __entry->user_base,
+ __entry->length,
+ (unsigned long long)__entry->vaddr,
+ __entry->page,
+ __entry->m,
+ __entry->n,
+ __entry->len,
+ __entry->offset
+ )
+);
+
+DEFINE_EVENT(
+ rvt_mr_template, rvt_mr_page_seg,
+ TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len),
+ TP_ARGS(mr, m, n, v, len));
+
+DEFINE_EVENT(
+ rvt_mr_template, rvt_mr_fmr_seg,
+ TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len),
+ TP_ARGS(mr, m, n, v, len));
+
+DEFINE_EVENT(
+ rvt_mr_template, rvt_mr_user_seg,
+ TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len),
+ TP_ARGS(mr, m, n, v, len));
+
+DECLARE_EVENT_CLASS(
+ rvt_sge_template,
+ TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge),
+ TP_ARGS(sge, isge),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(ib_to_rvt(sge->mr->pd->device))
+ __field(struct rvt_mregion *, mr)
+ __field(struct rvt_sge *, sge)
+ __field(struct ib_sge *, isge)
+ __field(void *, vaddr)
+ __field(u64, ivaddr)
+ __field(u32, lkey)
+ __field(u32, sge_length)
+ __field(u32, length)
+ __field(u32, ilength)
+ __field(int, user)
+ __field(u16, m)
+ __field(u16, n)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(ib_to_rvt(sge->mr->pd->device));
+ __entry->mr = sge->mr;
+ __entry->sge = sge;
+ __entry->isge = isge;
+ __entry->vaddr = sge->vaddr;
+ __entry->ivaddr = isge->addr;
+ __entry->lkey = sge->mr->lkey;
+ __entry->sge_length = sge->sge_length;
+ __entry->length = sge->length;
+ __entry->ilength = isge->length;
+ __entry->m = sge->m;
+ __entry->n = sge->m;
+ __entry->user = ibpd_to_rvtpd(sge->mr->pd)->user;
+ ),
+ TP_printk(
+ "[%s] mr %p sge %p isge %p vaddr %p ivaddr %llx lkey %x sge_length %u length %u ilength %u m %u n %u user %u",
+ __get_str(dev),
+ __entry->mr,
+ __entry->sge,
+ __entry->isge,
+ __entry->vaddr,
+ __entry->ivaddr,
+ __entry->lkey,
+ __entry->sge_length,
+ __entry->length,
+ __entry->ilength,
+ __entry->m,
+ __entry->n,
+ __entry->user
+ )
+);
+
+DEFINE_EVENT(
+ rvt_sge_template, rvt_sge_adjacent,
+ TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge),
+ TP_ARGS(sge, isge));
+
+DEFINE_EVENT(
+ rvt_sge_template, rvt_sge_new,
+ TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge),
+ TP_ARGS(sge, isge));
+
+TRACE_EVENT(
+ rvt_map_mr_sg,
+ TP_PROTO(struct ib_mr *ibmr, int sg_nents, unsigned int *sg_offset),
+ TP_ARGS(ibmr, sg_nents, sg_offset),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(ib_to_rvt(to_imr(ibmr)->mr.pd->device))
+ __field(u64, iova)
+ __field(u64, ibmr_iova)
+ __field(u64, user_base)
+ __field(u64, ibmr_length)
+ __field(int, sg_nents)
+ __field(uint, sg_offset)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(ib_to_rvt(to_imr(ibmr)->mr.pd->device));
+ __entry->ibmr_iova = ibmr->iova;
+ __entry->iova = to_imr(ibmr)->mr.iova;
+ __entry->user_base = to_imr(ibmr)->mr.user_base;
+ __entry->ibmr_length = to_imr(ibmr)->mr.length;
+ __entry->sg_nents = sg_nents;
+ __entry->sg_offset = sg_offset ? *sg_offset : 0;
+ ),
+ TP_printk(
+ "[%s] ibmr_iova %llx iova %llx user_base %llx length %llx sg_nents %d sg_offset %u",
+ __get_str(dev),
+ __entry->ibmr_iova,
+ __entry->iova,
+ __entry->user_base,
+ __entry->ibmr_length,
+ __entry->sg_nents,
+ __entry->sg_offset
+ )
+);
+
+#endif /* __RVT_TRACE_MR_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_mr
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/sw/rdmavt/trace_qp.h b/drivers/infiniband/sw/rdmavt/trace_qp.h
new file mode 100644
index 0000000000..c28c81fcb3
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/trace_qp.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+#if !defined(__RVT_TRACE_QP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __RVT_TRACE_QP_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdmavt_qp.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rvt_qp
+
+DECLARE_EVENT_CLASS(rvt_qphash_template,
+ TP_PROTO(struct rvt_qp *qp, u32 bucket),
+ TP_ARGS(qp, bucket),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, bucket)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->bucket = bucket;
+ ),
+ TP_printk(
+ "[%s] qpn 0x%x bucket %u",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->bucket
+ )
+);
+
+DEFINE_EVENT(rvt_qphash_template, rvt_qpinsert,
+ TP_PROTO(struct rvt_qp *qp, u32 bucket),
+ TP_ARGS(qp, bucket));
+
+DEFINE_EVENT(rvt_qphash_template, rvt_qpremove,
+ TP_PROTO(struct rvt_qp *qp, u32 bucket),
+ TP_ARGS(qp, bucket));
+
+DECLARE_EVENT_CLASS(
+ rvt_rnrnak_template,
+ TP_PROTO(struct rvt_qp *qp, u32 to),
+ TP_ARGS(qp, to),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(void *, hrtimer)
+ __field(u32, s_flags)
+ __field(u32, to)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->hrtimer = &qp->s_rnr_timer;
+ __entry->s_flags = qp->s_flags;
+ __entry->to = to;
+ ),
+ TP_printk(
+ "[%s] qpn 0x%x hrtimer 0x%p s_flags 0x%x timeout %u us",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->hrtimer,
+ __entry->s_flags,
+ __entry->to
+ )
+);
+
+DEFINE_EVENT(
+ rvt_rnrnak_template, rvt_rnrnak_add,
+ TP_PROTO(struct rvt_qp *qp, u32 to),
+ TP_ARGS(qp, to));
+
+DEFINE_EVENT(
+ rvt_rnrnak_template, rvt_rnrnak_timeout,
+ TP_PROTO(struct rvt_qp *qp, u32 to),
+ TP_ARGS(qp, to));
+
+DEFINE_EVENT(
+ rvt_rnrnak_template, rvt_rnrnak_stop,
+ TP_PROTO(struct rvt_qp *qp, u32 to),
+ TP_ARGS(qp, to));
+
+#endif /* __RVT_TRACE_QP_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_qp
+#include <trace/define_trace.h>
+
diff --git a/drivers/infiniband/sw/rdmavt/trace_rc.h b/drivers/infiniband/sw/rdmavt/trace_rc.h
new file mode 100644
index 0000000000..833bf778b0
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/trace_rc.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2017 Intel Corporation.
+ */
+#if !defined(__RVT_TRACE_RC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __RVT_TRACE_RC_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdmavt_qp.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rvt_rc
+
+DECLARE_EVENT_CLASS(rvt_rc_template,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, s_flags)
+ __field(u32, psn)
+ __field(u32, s_psn)
+ __field(u32, s_next_psn)
+ __field(u32, s_sending_psn)
+ __field(u32, s_sending_hpsn)
+ __field(u32, r_psn)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->s_flags = qp->s_flags;
+ __entry->psn = psn;
+ __entry->s_psn = qp->s_psn;
+ __entry->s_next_psn = qp->s_next_psn;
+ __entry->s_sending_psn = qp->s_sending_psn;
+ __entry->s_sending_hpsn = qp->s_sending_hpsn;
+ __entry->r_psn = qp->r_psn;
+ ),
+ TP_printk(
+ "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->s_flags,
+ __entry->psn,
+ __entry->s_psn,
+ __entry->s_next_psn,
+ __entry->s_sending_psn,
+ __entry->s_sending_hpsn,
+ __entry->r_psn
+ )
+);
+
+DEFINE_EVENT(rvt_rc_template, rvt_rc_timeout,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+#endif /* __RVT_TRACE_RC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rc
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/sw/rdmavt/trace_rvt.h b/drivers/infiniband/sw/rdmavt/trace_rvt.h
new file mode 100644
index 0000000000..9df6b0b826
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/trace_rvt.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+#if !defined(__RVT_TRACE_RVT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __RVT_TRACE_RVT_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_vt.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rvt
+
+TRACE_EVENT(rvt_dbg,
+ TP_PROTO(struct rvt_dev_info *rdi,
+ const char *msg),
+ TP_ARGS(rdi, msg),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(rdi)
+ __string(msg, msg)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(rdi);
+ __assign_str(msg, msg);
+ ),
+ TP_printk("[%s]: %s", __get_str(dev), __get_str(msg))
+);
+
+#endif /* __RVT_TRACE_MISC_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_rvt
+#include <trace/define_trace.h>
+
diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h b/drivers/infiniband/sw/rdmavt/trace_tx.h
new file mode 100644
index 0000000000..ff7d39a307
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/trace_tx.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+#if !defined(__RVT_TRACE_TX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __RVT_TRACE_TX_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/rdmavt_qp.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rvt_tx
+
+#define wr_opcode_name(opcode) { IB_WR_##opcode, #opcode }
+#define show_wr_opcode(opcode) \
+__print_symbolic(opcode, \
+ wr_opcode_name(RDMA_WRITE), \
+ wr_opcode_name(RDMA_WRITE_WITH_IMM), \
+ wr_opcode_name(SEND), \
+ wr_opcode_name(SEND_WITH_IMM), \
+ wr_opcode_name(RDMA_READ), \
+ wr_opcode_name(ATOMIC_CMP_AND_SWP), \
+ wr_opcode_name(ATOMIC_FETCH_AND_ADD), \
+ wr_opcode_name(LSO), \
+ wr_opcode_name(SEND_WITH_INV), \
+ wr_opcode_name(RDMA_READ_WITH_INV), \
+ wr_opcode_name(LOCAL_INV), \
+ wr_opcode_name(MASKED_ATOMIC_CMP_AND_SWP), \
+ wr_opcode_name(MASKED_ATOMIC_FETCH_AND_ADD), \
+ wr_opcode_name(RESERVED1), \
+ wr_opcode_name(RESERVED2), \
+ wr_opcode_name(RESERVED3), \
+ wr_opcode_name(RESERVED4), \
+ wr_opcode_name(RESERVED5), \
+ wr_opcode_name(RESERVED6), \
+ wr_opcode_name(RESERVED7), \
+ wr_opcode_name(RESERVED8), \
+ wr_opcode_name(RESERVED9), \
+ wr_opcode_name(RESERVED10))
+
+#define POS_PRN \
+"[%s] wqe %p wr_id %llx send_flags %x qpn %x qpt %u psn %x lpsn %x ssn %x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u pid %u num_sge %u wr_num_sge %u"
+
+TRACE_EVENT(
+ rvt_post_one_wr,
+ TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, int wr_num_sge),
+ TP_ARGS(qp, wqe, wr_num_sge),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device))
+ __field(u64, wr_id)
+ __field(struct rvt_swqe *, wqe)
+ __field(u32, qpn)
+ __field(u32, qpt)
+ __field(u32, psn)
+ __field(u32, lpsn)
+ __field(u32, length)
+ __field(u32, opcode)
+ __field(u32, size)
+ __field(u32, avail)
+ __field(u32, head)
+ __field(u32, last)
+ __field(u32, ssn)
+ __field(int, send_flags)
+ __field(pid_t, pid)
+ __field(int, num_sge)
+ __field(int, wr_num_sge)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device));
+ __entry->wqe = wqe;
+ __entry->wr_id = wqe->wr.wr_id;
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->qpt = qp->ibqp.qp_type;
+ __entry->psn = wqe->psn;
+ __entry->lpsn = wqe->lpsn;
+ __entry->length = wqe->length;
+ __entry->opcode = wqe->wr.opcode;
+ __entry->size = qp->s_size;
+ __entry->avail = qp->s_avail;
+ __entry->head = qp->s_head;
+ __entry->last = qp->s_last;
+ __entry->pid = qp->pid;
+ __entry->ssn = wqe->ssn;
+ __entry->send_flags = wqe->wr.send_flags;
+ __entry->num_sge = wqe->wr.num_sge;
+ __entry->wr_num_sge = wr_num_sge;
+ ),
+ TP_printk(
+ POS_PRN,
+ __get_str(dev),
+ __entry->wqe,
+ __entry->wr_id,
+ __entry->send_flags,
+ __entry->qpn,
+ __entry->qpt,
+ __entry->psn,
+ __entry->lpsn,
+ __entry->ssn,
+ __entry->length,
+ __entry->opcode, show_wr_opcode(__entry->opcode),
+ __entry->size,
+ __entry->avail,
+ __entry->head,
+ __entry->last,
+ __entry->pid,
+ __entry->num_sge,
+ __entry->wr_num_sge
+ )
+);
+
+TRACE_EVENT(
+ rvt_qp_send_completion,
+ TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 idx),
+ TP_ARGS(qp, wqe, idx),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device))
+ __field(struct rvt_swqe *, wqe)
+ __field(u64, wr_id)
+ __field(u32, qpn)
+ __field(u32, qpt)
+ __field(u32, length)
+ __field(u32, idx)
+ __field(u32, ssn)
+ __field(enum ib_wr_opcode, opcode)
+ __field(int, send_flags)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device));
+ __entry->wqe = wqe;
+ __entry->wr_id = wqe->wr.wr_id;
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->qpt = qp->ibqp.qp_type;
+ __entry->length = wqe->length;
+ __entry->idx = idx;
+ __entry->ssn = wqe->ssn;
+ __entry->opcode = wqe->wr.opcode;
+ __entry->send_flags = wqe->wr.send_flags;
+ ),
+ TP_printk(
+ "[%s] qpn 0x%x qpt %u wqe %p idx %u wr_id %llx length %u ssn %u opcode %x send_flags %x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->qpt,
+ __entry->wqe,
+ __entry->idx,
+ __entry->wr_id,
+ __entry->length,
+ __entry->ssn,
+ __entry->opcode,
+ __entry->send_flags
+ )
+);
+#endif /* __RVT_TRACE_TX_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_tx
+#include <trace/define_trace.h>
+
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
new file mode 100644
index 0000000000..d61f8de7f2
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -0,0 +1,619 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2016 - 2018 Intel Corporation.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include "vt.h"
+#include "cq.h"
+#include "trace.h"
+
+#define RVT_UVERBS_ABI_VERSION 2
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("RDMA Verbs Transport Library");
+
+static int __init rvt_init(void)
+{
+ int ret = rvt_driver_cq_init();
+
+ if (ret)
+ pr_err("Error in driver CQ init.\n");
+
+ return ret;
+}
+module_init(rvt_init);
+
+static void __exit rvt_cleanup(void)
+{
+ rvt_cq_exit();
+}
+module_exit(rvt_cleanup);
+
+/**
+ * rvt_alloc_device - allocate rdi
+ * @size: how big of a structure to allocate
+ * @nports: number of ports to allocate array slots for
+ *
+ * Use IB core device alloc to allocate space for the rdi which is assumed to be
+ * inside of the ib_device. Any extra space that drivers require should be
+ * included in size.
+ *
+ * We also allocate a port array based on the number of ports.
+ *
+ * Return: pointer to allocated rdi
+ */
+struct rvt_dev_info *rvt_alloc_device(size_t size, int nports)
+{
+ struct rvt_dev_info *rdi;
+
+ rdi = container_of(_ib_alloc_device(size), struct rvt_dev_info, ibdev);
+ if (!rdi)
+ return rdi;
+
+ rdi->ports = kcalloc(nports, sizeof(*rdi->ports), GFP_KERNEL);
+ if (!rdi->ports)
+ ib_dealloc_device(&rdi->ibdev);
+
+ return rdi;
+}
+EXPORT_SYMBOL(rvt_alloc_device);
+
+/**
+ * rvt_dealloc_device - deallocate rdi
+ * @rdi: structure to free
+ *
+ * Free a structure allocated with rvt_alloc_device()
+ */
+void rvt_dealloc_device(struct rvt_dev_info *rdi)
+{
+ kfree(rdi->ports);
+ ib_dealloc_device(&rdi->ibdev);
+}
+EXPORT_SYMBOL(rvt_dealloc_device);
+
+static int rvt_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props,
+ struct ib_udata *uhw)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+
+ if (uhw->inlen || uhw->outlen)
+ return -EINVAL;
+ /*
+ * Return rvt_dev_info.dparms.props contents
+ */
+ *props = rdi->dparms.props;
+ return 0;
+}
+
+static int rvt_get_numa_node(struct ib_device *ibdev)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+
+ return rdi->dparms.node;
+}
+
+static int rvt_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ /*
+ * There is currently no need to supply this based on qib and hfi1.
+ * Future drivers may need to implement this though.
+ */
+
+ return -EOPNOTSUPP;
+}
+
+/**
+ * rvt_query_port - Passes the query port call to the driver
+ * @ibdev: Verbs IB dev
+ * @port_num: port number, 1 based from ib core
+ * @props: structure to hold returned properties
+ *
+ * Return: 0 on success
+ */
+static int rvt_query_port(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_attr *props)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+ struct rvt_ibport *rvp;
+ u32 port_index = ibport_num_to_idx(ibdev, port_num);
+
+ rvp = rdi->ports[port_index];
+ /* props being zeroed by the caller, avoid zeroing it here */
+ props->sm_lid = rvp->sm_lid;
+ props->sm_sl = rvp->sm_sl;
+ props->port_cap_flags = rvp->port_cap_flags;
+ props->max_msg_sz = 0x80000000;
+ props->pkey_tbl_len = rvt_get_npkeys(rdi);
+ props->bad_pkey_cntr = rvp->pkey_violations;
+ props->qkey_viol_cntr = rvp->qkey_violations;
+ props->subnet_timeout = rvp->subnet_timeout;
+ props->init_type_reply = 0;
+
+ /* Populate the remaining ib_port_attr elements */
+ return rdi->driver_f.query_port_state(rdi, port_num, props);
+}
+
+/**
+ * rvt_modify_port - modify port
+ * @ibdev: Verbs IB dev
+ * @port_num: Port number, 1 based from ib core
+ * @port_modify_mask: How to change the port
+ * @props: Structure to fill in
+ *
+ * Return: 0 on success
+ */
+static int rvt_modify_port(struct ib_device *ibdev, u32 port_num,
+ int port_modify_mask, struct ib_port_modify *props)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+ struct rvt_ibport *rvp;
+ int ret = 0;
+ u32 port_index = ibport_num_to_idx(ibdev, port_num);
+
+ rvp = rdi->ports[port_index];
+ if (port_modify_mask & IB_PORT_OPA_MASK_CHG) {
+ rvp->port_cap3_flags |= props->set_port_cap_mask;
+ rvp->port_cap3_flags &= ~props->clr_port_cap_mask;
+ } else {
+ rvp->port_cap_flags |= props->set_port_cap_mask;
+ rvp->port_cap_flags &= ~props->clr_port_cap_mask;
+ }
+
+ if (props->set_port_cap_mask || props->clr_port_cap_mask)
+ rdi->driver_f.cap_mask_chg(rdi, port_num);
+ if (port_modify_mask & IB_PORT_SHUTDOWN)
+ ret = rdi->driver_f.shut_down_port(rdi, port_num);
+ if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+ rvp->qkey_violations = 0;
+
+ return ret;
+}
+
+/**
+ * rvt_query_pkey - Return a pkey from the table at a given index
+ * @ibdev: Verbs IB dev
+ * @port_num: Port number, 1 based from ib core
+ * @index: Index into pkey table
+ * @pkey: returned pkey from the port pkey table
+ *
+ * Return: 0 on failure pkey otherwise
+ */
+static int rvt_query_pkey(struct ib_device *ibdev, u32 port_num, u16 index,
+ u16 *pkey)
+{
+ /*
+ * Driver will be responsible for keeping rvt_dev_info.pkey_table up to
+ * date. This function will just return that value. There is no need to
+ * lock, if a stale value is read and sent to the user so be it there is
+ * no way to protect against that anyway.
+ */
+ struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+ u32 port_index;
+
+ port_index = ibport_num_to_idx(ibdev, port_num);
+
+ if (index >= rvt_get_npkeys(rdi))
+ return -EINVAL;
+
+ *pkey = rvt_get_pkey(rdi, port_index, index);
+ return 0;
+}
+
+/**
+ * rvt_query_gid - Return a gid from the table
+ * @ibdev: Verbs IB dev
+ * @port_num: Port number, 1 based from ib core
+ * @guid_index: Index in table
+ * @gid: Gid to return
+ *
+ * Return: 0 on success
+ */
+static int rvt_query_gid(struct ib_device *ibdev, u32 port_num,
+ int guid_index, union ib_gid *gid)
+{
+ struct rvt_dev_info *rdi;
+ struct rvt_ibport *rvp;
+ u32 port_index;
+
+ /*
+ * Driver is responsible for updating the guid table. Which will be used
+ * to craft the return value. This will work similar to how query_pkey()
+ * is being done.
+ */
+ port_index = ibport_num_to_idx(ibdev, port_num);
+
+ rdi = ib_to_rvt(ibdev);
+ rvp = rdi->ports[port_index];
+
+ gid->global.subnet_prefix = rvp->gid_prefix;
+
+ return rdi->driver_f.get_guid_be(rdi, rvp, guid_index,
+ &gid->global.interface_id);
+}
+
+/**
+ * rvt_alloc_ucontext - Allocate a user context
+ * @uctx: Verbs context
+ * @udata: User data allocated
+ */
+static int rvt_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
+{
+ return 0;
+}
+
+/**
+ * rvt_dealloc_ucontext - Free a user context
+ * @context: Unused
+ */
+static void rvt_dealloc_ucontext(struct ib_ucontext *context)
+{
+ return;
+}
+
+static int rvt_get_port_immutable(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_immutable *immutable)
+{
+ struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
+ struct ib_port_attr attr;
+ int err;
+
+ immutable->core_cap_flags = rdi->dparms.core_cap_flags;
+
+ err = ib_query_port(ibdev, port_num, &attr);
+ if (err)
+ return err;
+
+ immutable->pkey_tbl_len = attr.pkey_tbl_len;
+ immutable->gid_tbl_len = attr.gid_tbl_len;
+ immutable->max_mad_size = rdi->dparms.max_mad_size;
+
+ return 0;
+}
+
+enum {
+ MISC,
+ QUERY_DEVICE,
+ MODIFY_DEVICE,
+ QUERY_PORT,
+ MODIFY_PORT,
+ QUERY_PKEY,
+ QUERY_GID,
+ ALLOC_UCONTEXT,
+ DEALLOC_UCONTEXT,
+ GET_PORT_IMMUTABLE,
+ CREATE_QP,
+ MODIFY_QP,
+ DESTROY_QP,
+ QUERY_QP,
+ POST_SEND,
+ POST_RECV,
+ POST_SRQ_RECV,
+ CREATE_AH,
+ DESTROY_AH,
+ MODIFY_AH,
+ QUERY_AH,
+ CREATE_SRQ,
+ MODIFY_SRQ,
+ DESTROY_SRQ,
+ QUERY_SRQ,
+ ATTACH_MCAST,
+ DETACH_MCAST,
+ GET_DMA_MR,
+ REG_USER_MR,
+ DEREG_MR,
+ ALLOC_MR,
+ MAP_MR_SG,
+ ALLOC_FMR,
+ MAP_PHYS_FMR,
+ UNMAP_FMR,
+ DEALLOC_FMR,
+ MMAP,
+ CREATE_CQ,
+ DESTROY_CQ,
+ POLL_CQ,
+ REQ_NOTFIY_CQ,
+ RESIZE_CQ,
+ ALLOC_PD,
+ DEALLOC_PD,
+ _VERB_IDX_MAX /* Must always be last! */
+};
+
+static const struct ib_device_ops rvt_dev_ops = {
+ .uverbs_abi_ver = RVT_UVERBS_ABI_VERSION,
+
+ .alloc_mr = rvt_alloc_mr,
+ .alloc_pd = rvt_alloc_pd,
+ .alloc_ucontext = rvt_alloc_ucontext,
+ .attach_mcast = rvt_attach_mcast,
+ .create_ah = rvt_create_ah,
+ .create_cq = rvt_create_cq,
+ .create_qp = rvt_create_qp,
+ .create_srq = rvt_create_srq,
+ .create_user_ah = rvt_create_ah,
+ .dealloc_pd = rvt_dealloc_pd,
+ .dealloc_ucontext = rvt_dealloc_ucontext,
+ .dereg_mr = rvt_dereg_mr,
+ .destroy_ah = rvt_destroy_ah,
+ .destroy_cq = rvt_destroy_cq,
+ .destroy_qp = rvt_destroy_qp,
+ .destroy_srq = rvt_destroy_srq,
+ .detach_mcast = rvt_detach_mcast,
+ .get_dma_mr = rvt_get_dma_mr,
+ .get_numa_node = rvt_get_numa_node,
+ .get_port_immutable = rvt_get_port_immutable,
+ .map_mr_sg = rvt_map_mr_sg,
+ .mmap = rvt_mmap,
+ .modify_ah = rvt_modify_ah,
+ .modify_device = rvt_modify_device,
+ .modify_port = rvt_modify_port,
+ .modify_qp = rvt_modify_qp,
+ .modify_srq = rvt_modify_srq,
+ .poll_cq = rvt_poll_cq,
+ .post_recv = rvt_post_recv,
+ .post_send = rvt_post_send,
+ .post_srq_recv = rvt_post_srq_recv,
+ .query_ah = rvt_query_ah,
+ .query_device = rvt_query_device,
+ .query_gid = rvt_query_gid,
+ .query_pkey = rvt_query_pkey,
+ .query_port = rvt_query_port,
+ .query_qp = rvt_query_qp,
+ .query_srq = rvt_query_srq,
+ .reg_user_mr = rvt_reg_user_mr,
+ .req_notify_cq = rvt_req_notify_cq,
+ .resize_cq = rvt_resize_cq,
+
+ INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah),
+ INIT_RDMA_OBJ_SIZE(ib_cq, rvt_cq, ibcq),
+ INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_qp, rvt_qp, ibqp),
+ INIT_RDMA_OBJ_SIZE(ib_srq, rvt_srq, ibsrq),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext),
+};
+
+static noinline int check_support(struct rvt_dev_info *rdi, int verb)
+{
+ switch (verb) {
+ case MISC:
+ /*
+ * These functions are not part of verbs specifically but are
+ * required for rdmavt to function.
+ */
+ if ((!rdi->ibdev.ops.port_groups) ||
+ (!rdi->driver_f.get_pci_dev))
+ return -EINVAL;
+ break;
+
+ case MODIFY_DEVICE:
+ /*
+ * rdmavt does not support modify device currently drivers must
+ * provide.
+ */
+ if (!rdi->ibdev.ops.modify_device)
+ return -EOPNOTSUPP;
+ break;
+
+ case QUERY_PORT:
+ if (!rdi->ibdev.ops.query_port)
+ if (!rdi->driver_f.query_port_state)
+ return -EINVAL;
+ break;
+
+ case MODIFY_PORT:
+ if (!rdi->ibdev.ops.modify_port)
+ if (!rdi->driver_f.cap_mask_chg ||
+ !rdi->driver_f.shut_down_port)
+ return -EINVAL;
+ break;
+
+ case QUERY_GID:
+ if (!rdi->ibdev.ops.query_gid)
+ if (!rdi->driver_f.get_guid_be)
+ return -EINVAL;
+ break;
+
+ case CREATE_QP:
+ if (!rdi->ibdev.ops.create_qp)
+ if (!rdi->driver_f.qp_priv_alloc ||
+ !rdi->driver_f.qp_priv_free ||
+ !rdi->driver_f.notify_qp_reset ||
+ !rdi->driver_f.flush_qp_waiters ||
+ !rdi->driver_f.stop_send_queue ||
+ !rdi->driver_f.quiesce_qp)
+ return -EINVAL;
+ break;
+
+ case MODIFY_QP:
+ if (!rdi->ibdev.ops.modify_qp)
+ if (!rdi->driver_f.notify_qp_reset ||
+ !rdi->driver_f.schedule_send ||
+ !rdi->driver_f.get_pmtu_from_attr ||
+ !rdi->driver_f.flush_qp_waiters ||
+ !rdi->driver_f.stop_send_queue ||
+ !rdi->driver_f.quiesce_qp ||
+ !rdi->driver_f.notify_error_qp ||
+ !rdi->driver_f.mtu_from_qp ||
+ !rdi->driver_f.mtu_to_path_mtu)
+ return -EINVAL;
+ break;
+
+ case DESTROY_QP:
+ if (!rdi->ibdev.ops.destroy_qp)
+ if (!rdi->driver_f.qp_priv_free ||
+ !rdi->driver_f.notify_qp_reset ||
+ !rdi->driver_f.flush_qp_waiters ||
+ !rdi->driver_f.stop_send_queue ||
+ !rdi->driver_f.quiesce_qp)
+ return -EINVAL;
+ break;
+
+ case POST_SEND:
+ if (!rdi->ibdev.ops.post_send)
+ if (!rdi->driver_f.schedule_send ||
+ !rdi->driver_f.do_send ||
+ !rdi->post_parms)
+ return -EINVAL;
+ break;
+
+ }
+
+ return 0;
+}
+
+/**
+ * rvt_register_device - register a driver
+ * @rdi: main dev structure for all of rdmavt operations
+ *
+ * It is up to drivers to allocate the rdi and fill in the appropriate
+ * information.
+ *
+ * Return: 0 on success otherwise an errno.
+ */
+int rvt_register_device(struct rvt_dev_info *rdi)
+{
+ int ret = 0, i;
+
+ if (!rdi)
+ return -EINVAL;
+
+ /*
+ * Check to ensure drivers have setup the required helpers for the verbs
+ * they want rdmavt to handle
+ */
+ for (i = 0; i < _VERB_IDX_MAX; i++)
+ if (check_support(rdi, i)) {
+ pr_err("Driver support req not met at %d\n", i);
+ return -EINVAL;
+ }
+
+ ib_set_device_ops(&rdi->ibdev, &rvt_dev_ops);
+
+ /* Once we get past here we can use rvt_pr macros and tracepoints */
+ trace_rvt_dbg(rdi, "Driver attempting registration");
+ rvt_mmap_init(rdi);
+
+ /* Queue Pairs */
+ ret = rvt_driver_qp_init(rdi);
+ if (ret) {
+ pr_err("Error in driver QP init.\n");
+ return -EINVAL;
+ }
+
+ /* Address Handle */
+ spin_lock_init(&rdi->n_ahs_lock);
+ rdi->n_ahs_allocated = 0;
+
+ /* Shared Receive Queue */
+ rvt_driver_srq_init(rdi);
+
+ /* Multicast */
+ rvt_driver_mcast_init(rdi);
+
+ /* Mem Region */
+ ret = rvt_driver_mr_init(rdi);
+ if (ret) {
+ pr_err("Error in driver MR init.\n");
+ goto bail_no_mr;
+ }
+
+ /* Memory Working Set Size */
+ ret = rvt_wss_init(rdi);
+ if (ret) {
+ rvt_pr_err(rdi, "Error in WSS init.\n");
+ goto bail_mr;
+ }
+
+ /* Completion queues */
+ spin_lock_init(&rdi->n_cqs_lock);
+
+ /* Protection Domain */
+ spin_lock_init(&rdi->n_pds_lock);
+ rdi->n_pds_allocated = 0;
+
+ /*
+ * There are some things which could be set by underlying drivers but
+ * really should be up to rdmavt to set. For instance drivers can't know
+ * exactly which functions rdmavt supports, nor do they know the ABI
+ * version, so we do all of this sort of stuff here.
+ */
+ rdi->ibdev.uverbs_cmd_mask |=
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV) |
+ (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+ rdi->ibdev.node_type = RDMA_NODE_IB_CA;
+ if (!rdi->ibdev.num_comp_vectors)
+ rdi->ibdev.num_comp_vectors = 1;
+
+ /* We are now good to announce we exist */
+ ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev), NULL);
+ if (ret) {
+ rvt_pr_err(rdi, "Failed to register driver with ib core.\n");
+ goto bail_wss;
+ }
+
+ rvt_create_mad_agents(rdi);
+
+ rvt_pr_info(rdi, "Registration with rdmavt done.\n");
+ return ret;
+
+bail_wss:
+ rvt_wss_exit(rdi);
+bail_mr:
+ rvt_mr_exit(rdi);
+
+bail_no_mr:
+ rvt_qp_exit(rdi);
+
+ return ret;
+}
+EXPORT_SYMBOL(rvt_register_device);
+
+/**
+ * rvt_unregister_device - remove a driver
+ * @rdi: rvt dev struct
+ */
+void rvt_unregister_device(struct rvt_dev_info *rdi)
+{
+ trace_rvt_dbg(rdi, "Driver is unregistering.");
+ if (!rdi)
+ return;
+
+ rvt_free_mad_agents(rdi);
+
+ ib_unregister_device(&rdi->ibdev);
+ rvt_wss_exit(rdi);
+ rvt_mr_exit(rdi);
+ rvt_qp_exit(rdi);
+}
+EXPORT_SYMBOL(rvt_unregister_device);
+
+/**
+ * rvt_init_port - init internal data for driver port
+ * @rdi: rvt_dev_info struct
+ * @port: rvt port
+ * @port_index: 0 based index of ports, different from IB core port num
+ * @pkey_table: pkey_table for @port
+ *
+ * Keep track of a list of ports. No need to have a detach port.
+ * They persist until the driver goes away.
+ *
+ * Return: always 0
+ */
+int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port,
+ int port_index, u16 *pkey_table)
+{
+
+ rdi->ports[port_index] = port;
+ rdi->ports[port_index]->pkey_table = pkey_table;
+
+ return 0;
+}
+EXPORT_SYMBOL(rvt_init_port);
diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h
new file mode 100644
index 0000000000..461574e3f6
--- /dev/null
+++ b/drivers/infiniband/sw/rdmavt/vt.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2016 Intel Corporation.
+ */
+
+#ifndef DEF_RDMAVT_H
+#define DEF_RDMAVT_H
+
+#include <rdma/rdma_vt.h>
+#include <linux/pci.h>
+#include "pd.h"
+#include "qp.h"
+#include "ah.h"
+#include "mr.h"
+#include "srq.h"
+#include "mcast.h"
+#include "mmap.h"
+#include "cq.h"
+#include "mad.h"
+
+#define rvt_pr_info(rdi, fmt, ...) \
+ __rvt_pr_info(rdi->driver_f.get_pci_dev(rdi), \
+ rvt_get_ibdev_name(rdi), \
+ fmt, \
+ ##__VA_ARGS__)
+
+#define rvt_pr_warn(rdi, fmt, ...) \
+ __rvt_pr_warn(rdi->driver_f.get_pci_dev(rdi), \
+ rvt_get_ibdev_name(rdi), \
+ fmt, \
+ ##__VA_ARGS__)
+
+#define rvt_pr_err(rdi, fmt, ...) \
+ __rvt_pr_err(rdi->driver_f.get_pci_dev(rdi), \
+ rvt_get_ibdev_name(rdi), \
+ fmt, \
+ ##__VA_ARGS__)
+
+#define rvt_pr_err_ratelimited(rdi, fmt, ...) \
+ __rvt_pr_err_ratelimited((rdi)->driver_f.get_pci_dev(rdi), \
+ rvt_get_ibdev_name(rdi), \
+ fmt, \
+ ##__VA_ARGS__)
+
+#define __rvt_pr_info(pdev, name, fmt, ...) \
+ dev_info(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
+
+#define __rvt_pr_warn(pdev, name, fmt, ...) \
+ dev_warn(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
+
+#define __rvt_pr_err(pdev, name, fmt, ...) \
+ dev_err(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
+
+#define __rvt_pr_err_ratelimited(pdev, name, fmt, ...) \
+ dev_err_ratelimited(&(pdev)->dev, "%s: " fmt, name, ##__VA_ARGS__)
+
+static inline u32 ibport_num_to_idx(struct ib_device *ibdev, u32 port_num)
+{
+ return port_num - 1; /* IB ports start at 1 our arrays at 0 */
+}
+
+#endif /* DEF_RDMAVT_H */
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig
new file mode 100644
index 0000000000..06b8dc5093
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/Kconfig
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config RDMA_RXE
+ tristate "Software RDMA over Ethernet (RoCE) driver"
+ depends on INET && PCI && INFINIBAND
+ depends on INFINIBAND_VIRT_DMA
+ select NET_UDP_TUNNEL
+ select CRYPTO
+ select CRYPTO_CRC32
+ help
+ This driver implements the InfiniBand RDMA transport over
+ the Linux network stack. It enables a system with a
+ standard Ethernet adapter to interoperate with a RoCE
+ adapter or with another system running the RXE driver.
+ Documentation on InfiniBand and RoCE can be downloaded at
+ www.infinibandta.org and www.openfabrics.org. (See also
+ siw which is a similar software driver for iWARP.)
+
+ The driver is split into two layers, one interfaces with the
+ Linux RDMA stack and implements a kernel or user space
+ verbs API. The user space verbs API requires a support
+ library named librxe which is loaded by the generic user
+ space verbs API, libibverbs. The other layer interfaces
+ with the Linux network stack at layer 3.
+
+ To configure and work with soft-RoCE driver please use the
+ following wiki page under "configure Soft-RoCE (RXE)" section:
+
+ https://github.com/linux-rdma/rdma-core/blob/master/Documentation/rxe.md
diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile
new file mode 100644
index 0000000000..5395a581f4
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_RDMA_RXE) += rdma_rxe.o
+
+rdma_rxe-y := \
+ rxe.o \
+ rxe_comp.o \
+ rxe_req.o \
+ rxe_resp.o \
+ rxe_recv.o \
+ rxe_pool.o \
+ rxe_queue.o \
+ rxe_verbs.o \
+ rxe_av.o \
+ rxe_srq.o \
+ rxe_qp.o \
+ rxe_cq.o \
+ rxe_mr.o \
+ rxe_mw.o \
+ rxe_opcode.o \
+ rxe_mmap.o \
+ rxe_icrc.o \
+ rxe_mcast.o \
+ rxe_task.o \
+ rxe_net.o \
+ rxe_hw_counters.o
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
new file mode 100644
index 0000000000..54c723a6ed
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <rdma/rdma_netlink.h>
+#include <net/addrconf.h>
+#include "rxe.h"
+#include "rxe_loc.h"
+
+MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib");
+MODULE_DESCRIPTION("Soft RDMA transport");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* free resources for a rxe device all objects created for this device must
+ * have been destroyed
+ */
+void rxe_dealloc(struct ib_device *ib_dev)
+{
+ struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+
+ rxe_pool_cleanup(&rxe->uc_pool);
+ rxe_pool_cleanup(&rxe->pd_pool);
+ rxe_pool_cleanup(&rxe->ah_pool);
+ rxe_pool_cleanup(&rxe->srq_pool);
+ rxe_pool_cleanup(&rxe->qp_pool);
+ rxe_pool_cleanup(&rxe->cq_pool);
+ rxe_pool_cleanup(&rxe->mr_pool);
+ rxe_pool_cleanup(&rxe->mw_pool);
+
+ WARN_ON(!RB_EMPTY_ROOT(&rxe->mcg_tree));
+
+ if (rxe->tfm)
+ crypto_free_shash(rxe->tfm);
+}
+
+/* initialize rxe device parameters */
+static void rxe_init_device_param(struct rxe_dev *rxe)
+{
+ rxe->max_inline_data = RXE_MAX_INLINE_DATA;
+
+ rxe->attr.vendor_id = RXE_VENDOR_ID;
+ rxe->attr.max_mr_size = RXE_MAX_MR_SIZE;
+ rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP;
+ rxe->attr.max_qp = RXE_MAX_QP;
+ rxe->attr.max_qp_wr = RXE_MAX_QP_WR;
+ rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS;
+ rxe->attr.kernel_cap_flags = IBK_ALLOW_USER_UNREG;
+ rxe->attr.max_send_sge = RXE_MAX_SGE;
+ rxe->attr.max_recv_sge = RXE_MAX_SGE;
+ rxe->attr.max_sge_rd = RXE_MAX_SGE_RD;
+ rxe->attr.max_cq = RXE_MAX_CQ;
+ rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1;
+ rxe->attr.max_mr = RXE_MAX_MR;
+ rxe->attr.max_mw = RXE_MAX_MW;
+ rxe->attr.max_pd = RXE_MAX_PD;
+ rxe->attr.max_qp_rd_atom = RXE_MAX_QP_RD_ATOM;
+ rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM;
+ rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM;
+ rxe->attr.atomic_cap = IB_ATOMIC_HCA;
+ rxe->attr.max_mcast_grp = RXE_MAX_MCAST_GRP;
+ rxe->attr.max_mcast_qp_attach = RXE_MAX_MCAST_QP_ATTACH;
+ rxe->attr.max_total_mcast_qp_attach = RXE_MAX_TOT_MCAST_QP_ATTACH;
+ rxe->attr.max_ah = RXE_MAX_AH;
+ rxe->attr.max_srq = RXE_MAX_SRQ;
+ rxe->attr.max_srq_wr = RXE_MAX_SRQ_WR;
+ rxe->attr.max_srq_sge = RXE_MAX_SRQ_SGE;
+ rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN;
+ rxe->attr.max_pkeys = RXE_MAX_PKEYS;
+ rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY;
+ addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid,
+ rxe->ndev->dev_addr);
+
+ rxe->max_ucontext = RXE_MAX_UCONTEXT;
+}
+
+/* initialize port attributes */
+static void rxe_init_port_param(struct rxe_port *port)
+{
+ port->attr.state = IB_PORT_DOWN;
+ port->attr.max_mtu = IB_MTU_4096;
+ port->attr.active_mtu = IB_MTU_256;
+ port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN;
+ port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS;
+ port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ;
+ port->attr.bad_pkey_cntr = RXE_PORT_BAD_PKEY_CNTR;
+ port->attr.qkey_viol_cntr = RXE_PORT_QKEY_VIOL_CNTR;
+ port->attr.pkey_tbl_len = RXE_PORT_PKEY_TBL_LEN;
+ port->attr.lid = RXE_PORT_LID;
+ port->attr.sm_lid = RXE_PORT_SM_LID;
+ port->attr.lmc = RXE_PORT_LMC;
+ port->attr.max_vl_num = RXE_PORT_MAX_VL_NUM;
+ port->attr.sm_sl = RXE_PORT_SM_SL;
+ port->attr.subnet_timeout = RXE_PORT_SUBNET_TIMEOUT;
+ port->attr.init_type_reply = RXE_PORT_INIT_TYPE_REPLY;
+ port->attr.active_width = RXE_PORT_ACTIVE_WIDTH;
+ port->attr.active_speed = RXE_PORT_ACTIVE_SPEED;
+ port->attr.phys_state = RXE_PORT_PHYS_STATE;
+ port->mtu_cap = ib_mtu_enum_to_int(IB_MTU_256);
+ port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX);
+}
+
+/* initialize port state, note IB convention that HCA ports are always
+ * numbered from 1
+ */
+static void rxe_init_ports(struct rxe_dev *rxe)
+{
+ struct rxe_port *port = &rxe->port;
+
+ rxe_init_port_param(port);
+ addrconf_addr_eui48((unsigned char *)&port->port_guid,
+ rxe->ndev->dev_addr);
+ spin_lock_init(&port->port_lock);
+}
+
+/* init pools of managed objects */
+static void rxe_init_pools(struct rxe_dev *rxe)
+{
+ rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC);
+ rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD);
+ rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH);
+ rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ);
+ rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP);
+ rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ);
+ rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR);
+ rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW);
+}
+
+/* initialize rxe device state */
+static void rxe_init(struct rxe_dev *rxe)
+{
+ /* init default device parameters */
+ rxe_init_device_param(rxe);
+
+ rxe_init_ports(rxe);
+ rxe_init_pools(rxe);
+
+ /* init pending mmap list */
+ spin_lock_init(&rxe->mmap_offset_lock);
+ spin_lock_init(&rxe->pending_lock);
+ INIT_LIST_HEAD(&rxe->pending_mmaps);
+
+ /* init multicast support */
+ spin_lock_init(&rxe->mcg_lock);
+ rxe->mcg_tree = RB_ROOT;
+
+ mutex_init(&rxe->usdev_lock);
+}
+
+void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
+{
+ struct rxe_port *port = &rxe->port;
+ enum ib_mtu mtu;
+
+ mtu = eth_mtu_int_to_enum(ndev_mtu);
+
+ /* Make sure that new MTU in range */
+ mtu = mtu ? min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256;
+
+ port->attr.active_mtu = mtu;
+ port->mtu_cap = ib_mtu_enum_to_int(mtu);
+
+ rxe_info_dev(rxe, "Set mtu to %d", port->mtu_cap);
+}
+
+/* called by ifc layer to create new rxe device.
+ * The caller should allocate memory for rxe by calling ib_alloc_device.
+ */
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
+{
+ rxe_init(rxe);
+ rxe_set_mtu(rxe, mtu);
+
+ return rxe_register_device(rxe, ibdev_name);
+}
+
+static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
+{
+ struct rxe_dev *rxe;
+ int err = 0;
+
+ if (is_vlan_dev(ndev)) {
+ rxe_err("rxe creation allowed on top of a real device only");
+ err = -EPERM;
+ goto err;
+ }
+
+ rxe = rxe_get_dev_from_net(ndev);
+ if (rxe) {
+ ib_device_put(&rxe->ib_dev);
+ rxe_err_dev(rxe, "already configured on %s", ndev->name);
+ err = -EEXIST;
+ goto err;
+ }
+
+ err = rxe_net_add(ibdev_name, ndev);
+ if (err) {
+ rxe_err("failed to add %s\n", ndev->name);
+ goto err;
+ }
+err:
+ return err;
+}
+
+static struct rdma_link_ops rxe_link_ops = {
+ .type = "rxe",
+ .newlink = rxe_newlink,
+};
+
+static int __init rxe_module_init(void)
+{
+ int err;
+
+ err = rxe_alloc_wq();
+ if (err)
+ return err;
+
+ err = rxe_net_init();
+ if (err) {
+ rxe_destroy_wq();
+ return err;
+ }
+
+ rdma_link_register(&rxe_link_ops);
+ pr_info("loaded\n");
+ return 0;
+}
+
+static void __exit rxe_module_exit(void)
+{
+ rdma_link_unregister(&rxe_link_ops);
+ ib_unregister_driver(RDMA_DRIVER_RXE);
+ rxe_net_exit();
+ rxe_destroy_wq();
+
+ pr_info("unloaded\n");
+}
+
+late_initcall(rxe_module_init);
+module_exit(rxe_module_exit);
+
+MODULE_ALIAS_RDMA_LINK("rxe");
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
new file mode 100644
index 0000000000..d33dd6cf83
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe.h
@@ -0,0 +1,161 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_H
+#define RXE_H
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/skbuff.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+#include <crypto/hash.h>
+
+#include "rxe_net.h"
+#include "rxe_opcode.h"
+#include "rxe_hdr.h"
+#include "rxe_param.h"
+#include "rxe_verbs.h"
+#include "rxe_loc.h"
+
+/*
+ * Version 1 and Version 2 are identical on 64 bit machines, but on 32 bit
+ * machines Version 2 has a different struct layout.
+ */
+#define RXE_UVERBS_ABI_VERSION 2
+
+#define RXE_ROCE_V2_SPORT (0xc000)
+
+#define rxe_dbg(fmt, ...) pr_debug("%s: " fmt "\n", __func__, ##__VA_ARGS__)
+#define rxe_dbg_dev(rxe, fmt, ...) ibdev_dbg(&(rxe)->ib_dev, \
+ "%s: " fmt, __func__, ##__VA_ARGS__)
+#define rxe_dbg_uc(uc, fmt, ...) ibdev_dbg((uc)->ibuc.device, \
+ "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_dbg_pd(pd, fmt, ...) ibdev_dbg((pd)->ibpd.device, \
+ "pd#%d %s: " fmt, (pd)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_dbg_ah(ah, fmt, ...) ibdev_dbg((ah)->ibah.device, \
+ "ah#%d %s: " fmt, (ah)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_dbg_srq(srq, fmt, ...) ibdev_dbg((srq)->ibsrq.device, \
+ "srq#%d %s: " fmt, (srq)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_dbg_qp(qp, fmt, ...) ibdev_dbg((qp)->ibqp.device, \
+ "qp#%d %s: " fmt, (qp)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_dbg_cq(cq, fmt, ...) ibdev_dbg((cq)->ibcq.device, \
+ "cq#%d %s: " fmt, (cq)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_dbg_mr(mr, fmt, ...) ibdev_dbg((mr)->ibmr.device, \
+ "mr#%d %s: " fmt, (mr)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_dbg_mw(mw, fmt, ...) ibdev_dbg((mw)->ibmw.device, \
+ "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__)
+
+#define rxe_err(fmt, ...) pr_err_ratelimited("%s: " fmt "\n", __func__, \
+ ##__VA_ARGS__)
+#define rxe_err_dev(rxe, fmt, ...) ibdev_err_ratelimited(&(rxe)->ib_dev, \
+ "%s: " fmt, __func__, ##__VA_ARGS__)
+#define rxe_err_uc(uc, fmt, ...) ibdev_err_ratelimited((uc)->ibuc.device, \
+ "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_err_pd(pd, fmt, ...) ibdev_err_ratelimited((pd)->ibpd.device, \
+ "pd#%d %s: " fmt, (pd)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_err_ah(ah, fmt, ...) ibdev_err_ratelimited((ah)->ibah.device, \
+ "ah#%d %s: " fmt, (ah)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_err_srq(srq, fmt, ...) ibdev_err_ratelimited((srq)->ibsrq.device, \
+ "srq#%d %s: " fmt, (srq)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_err_qp(qp, fmt, ...) ibdev_err_ratelimited((qp)->ibqp.device, \
+ "qp#%d %s: " fmt, (qp)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_err_cq(cq, fmt, ...) ibdev_err_ratelimited((cq)->ibcq.device, \
+ "cq#%d %s: " fmt, (cq)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_err_mr(mr, fmt, ...) ibdev_err_ratelimited((mr)->ibmr.device, \
+ "mr#%d %s: " fmt, (mr)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_err_mw(mw, fmt, ...) ibdev_err_ratelimited((mw)->ibmw.device, \
+ "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__)
+
+#define rxe_info(fmt, ...) pr_info_ratelimited("%s: " fmt "\n", __func__, \
+ ##__VA_ARGS__)
+#define rxe_info_dev(rxe, fmt, ...) ibdev_info_ratelimited(&(rxe)->ib_dev, \
+ "%s: " fmt, __func__, ##__VA_ARGS__)
+#define rxe_info_uc(uc, fmt, ...) ibdev_info_ratelimited((uc)->ibuc.device, \
+ "uc#%d %s: " fmt, (uc)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_info_pd(pd, fmt, ...) ibdev_info_ratelimited((pd)->ibpd.device, \
+ "pd#%d %s: " fmt, (pd)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_info_ah(ah, fmt, ...) ibdev_info_ratelimited((ah)->ibah.device, \
+ "ah#%d %s: " fmt, (ah)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_info_srq(srq, fmt, ...) ibdev_info_ratelimited((srq)->ibsrq.device, \
+ "srq#%d %s: " fmt, (srq)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_info_qp(qp, fmt, ...) ibdev_info_ratelimited((qp)->ibqp.device, \
+ "qp#%d %s: " fmt, (qp)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_info_cq(cq, fmt, ...) ibdev_info_ratelimited((cq)->ibcq.device, \
+ "cq#%d %s: " fmt, (cq)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_info_mr(mr, fmt, ...) ibdev_info_ratelimited((mr)->ibmr.device, \
+ "mr#%d %s: " fmt, (mr)->elem.index, __func__, ##__VA_ARGS__)
+#define rxe_info_mw(mw, fmt, ...) ibdev_info_ratelimited((mw)->ibmw.device, \
+ "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__)
+
+/* responder states */
+enum resp_states {
+ RESPST_NONE,
+ RESPST_GET_REQ,
+ RESPST_CHK_PSN,
+ RESPST_CHK_OP_SEQ,
+ RESPST_CHK_OP_VALID,
+ RESPST_CHK_RESOURCE,
+ RESPST_CHK_LENGTH,
+ RESPST_CHK_RKEY,
+ RESPST_EXECUTE,
+ RESPST_READ_REPLY,
+ RESPST_ATOMIC_REPLY,
+ RESPST_ATOMIC_WRITE_REPLY,
+ RESPST_PROCESS_FLUSH,
+ RESPST_COMPLETE,
+ RESPST_ACKNOWLEDGE,
+ RESPST_CLEANUP,
+ RESPST_DUPLICATE_REQUEST,
+ RESPST_ERR_MALFORMED_WQE,
+ RESPST_ERR_UNSUPPORTED_OPCODE,
+ RESPST_ERR_MISALIGNED_ATOMIC,
+ RESPST_ERR_PSN_OUT_OF_SEQ,
+ RESPST_ERR_MISSING_OPCODE_FIRST,
+ RESPST_ERR_MISSING_OPCODE_LAST_C,
+ RESPST_ERR_MISSING_OPCODE_LAST_D1E,
+ RESPST_ERR_TOO_MANY_RDMA_ATM_REQ,
+ RESPST_ERR_RNR,
+ RESPST_ERR_RKEY_VIOLATION,
+ RESPST_ERR_INVALIDATE_RKEY,
+ RESPST_ERR_LENGTH,
+ RESPST_ERR_CQ_OVERFLOW,
+ RESPST_ERROR,
+ RESPST_DONE,
+ RESPST_EXIT,
+};
+
+void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
+
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name);
+
+void rxe_rcv(struct sk_buff *skb);
+
+/* The caller must do a matching ib_device_put(&dev->ib_dev) */
+static inline struct rxe_dev *rxe_get_dev_from_net(struct net_device *ndev)
+{
+ struct ib_device *ibdev =
+ ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE);
+
+ if (!ibdev)
+ return NULL;
+ return container_of(ibdev, struct rxe_dev, ib_dev);
+}
+
+void rxe_port_up(struct rxe_dev *rxe);
+void rxe_port_down(struct rxe_dev *rxe);
+void rxe_set_port_state(struct rxe_dev *rxe);
+
+#endif /* RXE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c
new file mode 100644
index 0000000000..889d7adbd4
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_av.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av)
+{
+ rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr);
+ rxe_av_fill_ip_info(av, attr);
+ memcpy(av->dmac, attr->roce.dmac, ETH_ALEN);
+}
+
+static int chk_attr(void *obj, struct rdma_ah_attr *attr, bool obj_is_ah)
+{
+ const struct ib_global_route *grh = rdma_ah_read_grh(attr);
+ struct rxe_port *port;
+ struct rxe_dev *rxe;
+ struct rxe_qp *qp;
+ struct rxe_ah *ah;
+ int type;
+
+ if (obj_is_ah) {
+ ah = obj;
+ rxe = to_rdev(ah->ibah.device);
+ } else {
+ qp = obj;
+ rxe = to_rdev(qp->ibqp.device);
+ }
+
+ port = &rxe->port;
+
+ if (rdma_ah_get_ah_flags(attr) & IB_AH_GRH) {
+ if (grh->sgid_index > port->attr.gid_tbl_len) {
+ if (obj_is_ah)
+ rxe_dbg_ah(ah, "invalid sgid index = %d\n",
+ grh->sgid_index);
+ else
+ rxe_dbg_qp(qp, "invalid sgid index = %d\n",
+ grh->sgid_index);
+ return -EINVAL;
+ }
+
+ type = rdma_gid_attr_network_type(grh->sgid_attr);
+ if (type < RDMA_NETWORK_IPV4 ||
+ type > RDMA_NETWORK_IPV6) {
+ if (obj_is_ah)
+ rxe_dbg_ah(ah, "invalid network type for rdma_rxe = %d\n",
+ type);
+ else
+ rxe_dbg_qp(qp, "invalid network type for rdma_rxe = %d\n",
+ type);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+int rxe_av_chk_attr(struct rxe_qp *qp, struct rdma_ah_attr *attr)
+{
+ return chk_attr(qp, attr, false);
+}
+
+int rxe_ah_chk_attr(struct rxe_ah *ah, struct rdma_ah_attr *attr)
+{
+ return chk_attr(ah, attr, true);
+}
+
+void rxe_av_from_attr(u8 port_num, struct rxe_av *av,
+ struct rdma_ah_attr *attr)
+{
+ const struct ib_global_route *grh = rdma_ah_read_grh(attr);
+
+ memset(av, 0, sizeof(*av));
+ memcpy(av->grh.dgid.raw, grh->dgid.raw, sizeof(grh->dgid.raw));
+ av->grh.flow_label = grh->flow_label;
+ av->grh.sgid_index = grh->sgid_index;
+ av->grh.hop_limit = grh->hop_limit;
+ av->grh.traffic_class = grh->traffic_class;
+ av->port_num = port_num;
+}
+
+void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr)
+{
+ struct ib_global_route *grh = rdma_ah_retrieve_grh(attr);
+
+ attr->type = RDMA_AH_ATTR_TYPE_ROCE;
+
+ memcpy(grh->dgid.raw, av->grh.dgid.raw, sizeof(av->grh.dgid.raw));
+ grh->flow_label = av->grh.flow_label;
+ grh->sgid_index = av->grh.sgid_index;
+ grh->hop_limit = av->grh.hop_limit;
+ grh->traffic_class = av->grh.traffic_class;
+
+ rdma_ah_set_ah_flags(attr, IB_AH_GRH);
+ rdma_ah_set_port_num(attr, av->port_num);
+}
+
+void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr)
+{
+ const struct ib_gid_attr *sgid_attr = attr->grh.sgid_attr;
+ int ibtype;
+ int type;
+
+ rdma_gid2ip((struct sockaddr *)&av->sgid_addr, &sgid_attr->gid);
+ rdma_gid2ip((struct sockaddr *)&av->dgid_addr,
+ &rdma_ah_read_grh(attr)->dgid);
+
+ ibtype = rdma_gid_attr_network_type(sgid_attr);
+
+ switch (ibtype) {
+ case RDMA_NETWORK_IPV4:
+ type = RXE_NETWORK_TYPE_IPV4;
+ break;
+ case RDMA_NETWORK_IPV6:
+ type = RXE_NETWORK_TYPE_IPV6;
+ break;
+ default:
+ /* not reached - checked in rxe_av_chk_attr */
+ type = 0;
+ break;
+ }
+
+ av->network_type = type;
+}
+
+struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp)
+{
+ struct rxe_ah *ah;
+ u32 ah_num;
+
+ if (ahp)
+ *ahp = NULL;
+
+ if (!pkt || !pkt->qp)
+ return NULL;
+
+ if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC)
+ return &pkt->qp->pri_av;
+
+ if (!pkt->wqe)
+ return NULL;
+
+ ah_num = pkt->wqe->wr.wr.ud.ah_num;
+ if (ah_num) {
+ /* only new user provider or kernel client */
+ ah = rxe_pool_get_index(&pkt->rxe->ah_pool, ah_num);
+ if (!ah) {
+ rxe_dbg_qp(pkt->qp, "Unable to find AH matching ah_num\n");
+ return NULL;
+ }
+
+ if (rxe_ah_pd(ah) != pkt->qp->pd) {
+ rxe_dbg_qp(pkt->qp, "PDs don't match for AH and QP\n");
+ rxe_put(ah);
+ return NULL;
+ }
+
+ if (ahp)
+ *ahp = ah;
+ else
+ rxe_put(ah);
+
+ return &ah->av;
+ }
+
+ /* only old user provider for UD sends*/
+ return &pkt->wqe->wr.wr.ud.av;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
new file mode 100644
index 0000000000..d0bdc2d8ad
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -0,0 +1,851 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+#include "rxe_task.h"
+
+enum comp_state {
+ COMPST_GET_ACK,
+ COMPST_GET_WQE,
+ COMPST_COMP_WQE,
+ COMPST_COMP_ACK,
+ COMPST_CHECK_PSN,
+ COMPST_CHECK_ACK,
+ COMPST_READ,
+ COMPST_ATOMIC,
+ COMPST_WRITE_SEND,
+ COMPST_UPDATE_COMP,
+ COMPST_ERROR_RETRY,
+ COMPST_RNR_RETRY,
+ COMPST_ERROR,
+ COMPST_EXIT, /* We have an issue, and we want to rerun the completer */
+ COMPST_DONE, /* The completer finished successflly */
+};
+
+static char *comp_state_name[] = {
+ [COMPST_GET_ACK] = "GET ACK",
+ [COMPST_GET_WQE] = "GET WQE",
+ [COMPST_COMP_WQE] = "COMP WQE",
+ [COMPST_COMP_ACK] = "COMP ACK",
+ [COMPST_CHECK_PSN] = "CHECK PSN",
+ [COMPST_CHECK_ACK] = "CHECK ACK",
+ [COMPST_READ] = "READ",
+ [COMPST_ATOMIC] = "ATOMIC",
+ [COMPST_WRITE_SEND] = "WRITE/SEND",
+ [COMPST_UPDATE_COMP] = "UPDATE COMP",
+ [COMPST_ERROR_RETRY] = "ERROR RETRY",
+ [COMPST_RNR_RETRY] = "RNR RETRY",
+ [COMPST_ERROR] = "ERROR",
+ [COMPST_EXIT] = "EXIT",
+ [COMPST_DONE] = "DONE",
+};
+
+static unsigned long rnrnak_usec[32] = {
+ [IB_RNR_TIMER_655_36] = 655360,
+ [IB_RNR_TIMER_000_01] = 10,
+ [IB_RNR_TIMER_000_02] = 20,
+ [IB_RNR_TIMER_000_03] = 30,
+ [IB_RNR_TIMER_000_04] = 40,
+ [IB_RNR_TIMER_000_06] = 60,
+ [IB_RNR_TIMER_000_08] = 80,
+ [IB_RNR_TIMER_000_12] = 120,
+ [IB_RNR_TIMER_000_16] = 160,
+ [IB_RNR_TIMER_000_24] = 240,
+ [IB_RNR_TIMER_000_32] = 320,
+ [IB_RNR_TIMER_000_48] = 480,
+ [IB_RNR_TIMER_000_64] = 640,
+ [IB_RNR_TIMER_000_96] = 960,
+ [IB_RNR_TIMER_001_28] = 1280,
+ [IB_RNR_TIMER_001_92] = 1920,
+ [IB_RNR_TIMER_002_56] = 2560,
+ [IB_RNR_TIMER_003_84] = 3840,
+ [IB_RNR_TIMER_005_12] = 5120,
+ [IB_RNR_TIMER_007_68] = 7680,
+ [IB_RNR_TIMER_010_24] = 10240,
+ [IB_RNR_TIMER_015_36] = 15360,
+ [IB_RNR_TIMER_020_48] = 20480,
+ [IB_RNR_TIMER_030_72] = 30720,
+ [IB_RNR_TIMER_040_96] = 40960,
+ [IB_RNR_TIMER_061_44] = 61410,
+ [IB_RNR_TIMER_081_92] = 81920,
+ [IB_RNR_TIMER_122_88] = 122880,
+ [IB_RNR_TIMER_163_84] = 163840,
+ [IB_RNR_TIMER_245_76] = 245760,
+ [IB_RNR_TIMER_327_68] = 327680,
+ [IB_RNR_TIMER_491_52] = 491520,
+};
+
+static inline unsigned long rnrnak_jiffies(u8 timeout)
+{
+ return max_t(unsigned long,
+ usecs_to_jiffies(rnrnak_usec[timeout]), 1);
+}
+
+static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode)
+{
+ switch (opcode) {
+ case IB_WR_RDMA_WRITE: return IB_WC_RDMA_WRITE;
+ case IB_WR_RDMA_WRITE_WITH_IMM: return IB_WC_RDMA_WRITE;
+ case IB_WR_SEND: return IB_WC_SEND;
+ case IB_WR_SEND_WITH_IMM: return IB_WC_SEND;
+ case IB_WR_RDMA_READ: return IB_WC_RDMA_READ;
+ case IB_WR_ATOMIC_CMP_AND_SWP: return IB_WC_COMP_SWAP;
+ case IB_WR_ATOMIC_FETCH_AND_ADD: return IB_WC_FETCH_ADD;
+ case IB_WR_LSO: return IB_WC_LSO;
+ case IB_WR_SEND_WITH_INV: return IB_WC_SEND;
+ case IB_WR_RDMA_READ_WITH_INV: return IB_WC_RDMA_READ;
+ case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV;
+ case IB_WR_REG_MR: return IB_WC_REG_MR;
+ case IB_WR_BIND_MW: return IB_WC_BIND_MW;
+ case IB_WR_ATOMIC_WRITE: return IB_WC_ATOMIC_WRITE;
+ case IB_WR_FLUSH: return IB_WC_FLUSH;
+
+ default:
+ return 0xff;
+ }
+}
+
+void retransmit_timer(struct timer_list *t)
+{
+ struct rxe_qp *qp = from_timer(qp, t, retrans_timer);
+ unsigned long flags;
+
+ rxe_dbg_qp(qp, "retransmit timer fired\n");
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (qp->valid) {
+ qp->comp.timeout = 1;
+ rxe_sched_task(&qp->comp.task);
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+}
+
+void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
+{
+ int must_sched;
+
+ skb_queue_tail(&qp->resp_pkts, skb);
+
+ must_sched = skb_queue_len(&qp->resp_pkts) > 1;
+ if (must_sched != 0)
+ rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED);
+
+ if (must_sched)
+ rxe_sched_task(&qp->comp.task);
+ else
+ rxe_run_task(&qp->comp.task);
+}
+
+static inline enum comp_state get_wqe(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe **wqe_p)
+{
+ struct rxe_send_wqe *wqe;
+
+ /* we come here whether or not we found a response packet to see if
+ * there are any posted WQEs
+ */
+ wqe = queue_head(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT);
+ *wqe_p = wqe;
+
+ /* no WQE or requester has not started it yet */
+ if (!wqe || wqe->state == wqe_state_posted)
+ return pkt ? COMPST_DONE : COMPST_EXIT;
+
+ /* WQE does not require an ack */
+ if (wqe->state == wqe_state_done)
+ return COMPST_COMP_WQE;
+
+ /* WQE caused an error */
+ if (wqe->state == wqe_state_error)
+ return COMPST_ERROR;
+
+ /* we have a WQE, if we also have an ack check its PSN */
+ return pkt ? COMPST_CHECK_PSN : COMPST_EXIT;
+}
+
+static inline void reset_retry_counters(struct rxe_qp *qp)
+{
+ qp->comp.retry_cnt = qp->attr.retry_cnt;
+ qp->comp.rnr_retry = qp->attr.rnr_retry;
+ qp->comp.started_retry = 0;
+}
+
+static inline enum comp_state check_psn(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ s32 diff;
+
+ /* check to see if response is past the oldest WQE. if it is, complete
+ * send/write or error read/atomic
+ */
+ diff = psn_compare(pkt->psn, wqe->last_psn);
+ if (diff > 0) {
+ if (wqe->state == wqe_state_pending) {
+ if (wqe->mask & WR_ATOMIC_OR_READ_MASK)
+ return COMPST_ERROR_RETRY;
+
+ reset_retry_counters(qp);
+ return COMPST_COMP_WQE;
+ } else {
+ return COMPST_DONE;
+ }
+ }
+
+ /* compare response packet to expected response */
+ diff = psn_compare(pkt->psn, qp->comp.psn);
+ if (diff < 0) {
+ /* response is most likely a retried packet if it matches an
+ * uncompleted WQE go complete it else ignore it
+ */
+ if (pkt->psn == wqe->last_psn)
+ return COMPST_COMP_ACK;
+ else if (pkt->opcode == IB_OPCODE_RC_ACKNOWLEDGE &&
+ (qp->comp.opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST ||
+ qp->comp.opcode == IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE))
+ return COMPST_CHECK_ACK;
+ else
+ return COMPST_DONE;
+ } else if ((diff > 0) && (wqe->mask & WR_ATOMIC_OR_READ_MASK)) {
+ return COMPST_DONE;
+ } else {
+ return COMPST_CHECK_ACK;
+ }
+}
+
+static inline enum comp_state check_ack(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ unsigned int mask = pkt->mask;
+ u8 syn;
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+
+ /* Check the sequence only */
+ switch (qp->comp.opcode) {
+ case -1:
+ /* Will catch all *_ONLY cases. */
+ if (!(mask & RXE_START_MASK))
+ return COMPST_ERROR;
+
+ break;
+
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+ /* Check NAK code to handle a remote error */
+ if (pkt->opcode == IB_OPCODE_RC_ACKNOWLEDGE)
+ break;
+
+ if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE &&
+ pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) {
+ /* read retries of partial data may restart from
+ * read response first or response only.
+ */
+ if ((pkt->psn == wqe->first_psn &&
+ pkt->opcode ==
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) ||
+ (wqe->first_psn == wqe->last_psn &&
+ pkt->opcode ==
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY))
+ break;
+
+ return COMPST_ERROR;
+ }
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+
+ /* Check operation validity. */
+ switch (pkt->opcode) {
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST:
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST:
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY:
+ syn = aeth_syn(pkt);
+
+ if ((syn & AETH_TYPE_MASK) != AETH_ACK)
+ return COMPST_ERROR;
+
+ if (wqe->wr.opcode == IB_WR_ATOMIC_WRITE)
+ return COMPST_WRITE_SEND;
+
+ fallthrough;
+ /* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH)
+ */
+ case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
+ if (wqe->wr.opcode != IB_WR_RDMA_READ &&
+ wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV &&
+ wqe->wr.opcode != IB_WR_FLUSH) {
+ wqe->status = IB_WC_FATAL_ERR;
+ return COMPST_ERROR;
+ }
+ reset_retry_counters(qp);
+ return COMPST_READ;
+
+ case IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE:
+ syn = aeth_syn(pkt);
+
+ if ((syn & AETH_TYPE_MASK) != AETH_ACK)
+ return COMPST_ERROR;
+
+ if (wqe->wr.opcode != IB_WR_ATOMIC_CMP_AND_SWP &&
+ wqe->wr.opcode != IB_WR_ATOMIC_FETCH_AND_ADD)
+ return COMPST_ERROR;
+ reset_retry_counters(qp);
+ return COMPST_ATOMIC;
+
+ case IB_OPCODE_RC_ACKNOWLEDGE:
+ syn = aeth_syn(pkt);
+ switch (syn & AETH_TYPE_MASK) {
+ case AETH_ACK:
+ reset_retry_counters(qp);
+ return COMPST_WRITE_SEND;
+
+ case AETH_RNR_NAK:
+ rxe_counter_inc(rxe, RXE_CNT_RCV_RNR);
+ return COMPST_RNR_RETRY;
+
+ case AETH_NAK:
+ switch (syn) {
+ case AETH_NAK_PSN_SEQ_ERROR:
+ /* a nak implicitly acks all packets with psns
+ * before
+ */
+ if (psn_compare(pkt->psn, qp->comp.psn) > 0) {
+ rxe_counter_inc(rxe,
+ RXE_CNT_RCV_SEQ_ERR);
+ qp->comp.psn = pkt->psn;
+ if (qp->req.wait_psn) {
+ qp->req.wait_psn = 0;
+ rxe_sched_task(&qp->req.task);
+ }
+ }
+ return COMPST_ERROR_RETRY;
+
+ case AETH_NAK_INVALID_REQ:
+ wqe->status = IB_WC_REM_INV_REQ_ERR;
+ return COMPST_ERROR;
+
+ case AETH_NAK_REM_ACC_ERR:
+ wqe->status = IB_WC_REM_ACCESS_ERR;
+ return COMPST_ERROR;
+
+ case AETH_NAK_REM_OP_ERR:
+ wqe->status = IB_WC_REM_OP_ERR;
+ return COMPST_ERROR;
+
+ default:
+ rxe_dbg_qp(qp, "unexpected nak %x\n", syn);
+ wqe->status = IB_WC_REM_OP_ERR;
+ return COMPST_ERROR;
+ }
+
+ default:
+ return COMPST_ERROR;
+ }
+ break;
+
+ default:
+ rxe_dbg_qp(qp, "unexpected opcode\n");
+ }
+
+ return COMPST_ERROR;
+}
+
+static inline enum comp_state do_read(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ int ret;
+
+ ret = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE,
+ &wqe->dma, payload_addr(pkt),
+ payload_size(pkt), RXE_TO_MR_OBJ);
+ if (ret) {
+ wqe->status = IB_WC_LOC_PROT_ERR;
+ return COMPST_ERROR;
+ }
+
+ if (wqe->dma.resid == 0 && (pkt->mask & RXE_END_MASK))
+ return COMPST_COMP_ACK;
+
+ return COMPST_UPDATE_COMP;
+}
+
+static inline enum comp_state do_atomic(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ int ret;
+
+ u64 atomic_orig = atmack_orig(pkt);
+
+ ret = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE,
+ &wqe->dma, &atomic_orig,
+ sizeof(u64), RXE_TO_MR_OBJ);
+ if (ret) {
+ wqe->status = IB_WC_LOC_PROT_ERR;
+ return COMPST_ERROR;
+ }
+
+ return COMPST_COMP_ACK;
+}
+
+static void make_send_cqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+ struct rxe_cqe *cqe)
+{
+ struct ib_wc *wc = &cqe->ibwc;
+ struct ib_uverbs_wc *uwc = &cqe->uibwc;
+
+ memset(cqe, 0, sizeof(*cqe));
+
+ if (!qp->is_user) {
+ wc->wr_id = wqe->wr.wr_id;
+ wc->status = wqe->status;
+ wc->qp = &qp->ibqp;
+ } else {
+ uwc->wr_id = wqe->wr.wr_id;
+ uwc->status = wqe->status;
+ uwc->qp_num = qp->ibqp.qp_num;
+ }
+
+ if (wqe->status == IB_WC_SUCCESS) {
+ if (!qp->is_user) {
+ wc->opcode = wr_to_wc_opcode(wqe->wr.opcode);
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
+ wqe->wr.opcode == IB_WR_SEND_WITH_IMM)
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->byte_len = wqe->dma.length;
+ } else {
+ uwc->opcode = wr_to_wc_opcode(wqe->wr.opcode);
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
+ wqe->wr.opcode == IB_WR_SEND_WITH_IMM)
+ uwc->wc_flags = IB_WC_WITH_IMM;
+ uwc->byte_len = wqe->dma.length;
+ }
+ } else {
+ if (wqe->status != IB_WC_WR_FLUSH_ERR)
+ rxe_err_qp(qp, "non-flush error status = %d",
+ wqe->status);
+ }
+}
+
+/*
+ * IBA Spec. Section 10.7.3.1 SIGNALED COMPLETIONS
+ * ---------8<---------8<-------------
+ * ...Note that if a completion error occurs, a Work Completion
+ * will always be generated, even if the signaling
+ * indicator requests an Unsignaled Completion.
+ * ---------8<---------8<-------------
+ */
+static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_cqe cqe;
+ bool post;
+
+ /* do we need to post a completion */
+ post = ((qp->sq_sig_type == IB_SIGNAL_ALL_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+ wqe->status != IB_WC_SUCCESS);
+
+ if (post)
+ make_send_cqe(qp, wqe, &cqe);
+
+ queue_advance_consumer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT);
+
+ if (post)
+ rxe_cq_post(qp->scq, &cqe, 0);
+
+ if (wqe->wr.opcode == IB_WR_SEND ||
+ wqe->wr.opcode == IB_WR_SEND_WITH_IMM ||
+ wqe->wr.opcode == IB_WR_SEND_WITH_INV)
+ rxe_counter_inc(rxe, RXE_CNT_RDMA_SEND);
+
+ /*
+ * we completed something so let req run again
+ * if it is trying to fence
+ */
+ if (qp->req.wait_fence) {
+ qp->req.wait_fence = 0;
+ rxe_sched_task(&qp->req.task);
+ }
+}
+
+static void comp_check_sq_drain_done(struct rxe_qp *qp)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (unlikely(qp_state(qp) == IB_QPS_SQD)) {
+ if (qp->attr.sq_draining && qp->comp.psn == qp->req.psn) {
+ qp->attr.sq_draining = 0;
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_SQ_DRAINED;
+ qp->ibqp.event_handler(&ev,
+ qp->ibqp.qp_context);
+ }
+ return;
+ }
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+}
+
+static inline enum comp_state complete_ack(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ if (wqe->has_rd_atomic) {
+ wqe->has_rd_atomic = 0;
+ atomic_inc(&qp->req.rd_atomic);
+ if (qp->req.need_rd_atomic) {
+ qp->comp.timeout_retry = 0;
+ qp->req.need_rd_atomic = 0;
+ rxe_sched_task(&qp->req.task);
+ }
+ }
+
+ comp_check_sq_drain_done(qp);
+
+ do_complete(qp, wqe);
+
+ if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+ return COMPST_UPDATE_COMP;
+ else
+ return COMPST_DONE;
+}
+
+static inline enum comp_state complete_wqe(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ struct rxe_send_wqe *wqe)
+{
+ if (pkt && wqe->state == wqe_state_pending) {
+ if (psn_compare(wqe->last_psn, qp->comp.psn) >= 0) {
+ qp->comp.psn = (wqe->last_psn + 1) & BTH_PSN_MASK;
+ qp->comp.opcode = -1;
+ }
+
+ if (qp->req.wait_psn) {
+ qp->req.wait_psn = 0;
+ rxe_sched_task(&qp->req.task);
+ }
+ }
+
+ do_complete(qp, wqe);
+
+ return COMPST_GET_WQE;
+}
+
+/* drain incoming response packet queue */
+static void drain_resp_pkts(struct rxe_qp *qp)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&qp->resp_pkts))) {
+ rxe_put(qp);
+ kfree_skb(skb);
+ ib_device_put(qp->ibqp.device);
+ }
+}
+
+/* complete send wqe with flush error */
+static int flush_send_wqe(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+ struct rxe_cqe cqe = {};
+ struct ib_wc *wc = &cqe.ibwc;
+ struct ib_uverbs_wc *uwc = &cqe.uibwc;
+ int err;
+
+ if (qp->is_user) {
+ uwc->wr_id = wqe->wr.wr_id;
+ uwc->status = IB_WC_WR_FLUSH_ERR;
+ uwc->qp_num = qp->ibqp.qp_num;
+ } else {
+ wc->wr_id = wqe->wr.wr_id;
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ wc->qp = &qp->ibqp;
+ }
+
+ err = rxe_cq_post(qp->scq, &cqe, 0);
+ if (err)
+ rxe_dbg_cq(qp->scq, "post cq failed, err = %d", err);
+
+ return err;
+}
+
+/* drain and optionally complete the send queue
+ * if unable to complete a wqe, i.e. cq is full, stop
+ * completing and flush the remaining wqes
+ */
+static void flush_send_queue(struct rxe_qp *qp, bool notify)
+{
+ struct rxe_send_wqe *wqe;
+ struct rxe_queue *q = qp->sq.queue;
+ int err;
+
+ /* send queue never got created. nothing to do. */
+ if (!qp->sq.queue)
+ return;
+
+ while ((wqe = queue_head(q, q->type))) {
+ if (notify) {
+ err = flush_send_wqe(qp, wqe);
+ if (err)
+ notify = 0;
+ }
+ queue_advance_consumer(q, q->type);
+ }
+}
+
+static void free_pkt(struct rxe_pkt_info *pkt)
+{
+ struct sk_buff *skb = PKT_TO_SKB(pkt);
+ struct rxe_qp *qp = pkt->qp;
+ struct ib_device *dev = qp->ibqp.device;
+
+ kfree_skb(skb);
+ rxe_put(qp);
+ ib_device_put(dev);
+}
+
+/* reset the retry timer if
+ * - QP is type RC
+ * - there is a packet sent by the requester that
+ * might be acked (we still might get spurious
+ * timeouts but try to keep them as few as possible)
+ * - the timeout parameter is set
+ * - the QP is alive
+ */
+static void reset_retry_timer(struct rxe_qp *qp)
+{
+ unsigned long flags;
+
+ if (qp_type(qp) == IB_QPT_RC && qp->qp_timeout_jiffies) {
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (qp_state(qp) >= IB_QPS_RTS &&
+ psn_compare(qp->req.psn, qp->comp.psn) > 0)
+ mod_timer(&qp->retrans_timer,
+ jiffies + qp->qp_timeout_jiffies);
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ }
+}
+
+int rxe_completer(struct rxe_qp *qp)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_send_wqe *wqe = NULL;
+ struct sk_buff *skb = NULL;
+ struct rxe_pkt_info *pkt = NULL;
+ enum comp_state state;
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (!qp->valid || qp_state(qp) == IB_QPS_ERR ||
+ qp_state(qp) == IB_QPS_RESET) {
+ bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR);
+
+ drain_resp_pkts(qp);
+ flush_send_queue(qp, notify);
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ goto exit;
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ if (qp->comp.timeout) {
+ qp->comp.timeout_retry = 1;
+ qp->comp.timeout = 0;
+ } else {
+ qp->comp.timeout_retry = 0;
+ }
+
+ if (qp->req.need_retry)
+ goto exit;
+
+ state = COMPST_GET_ACK;
+
+ while (1) {
+ rxe_dbg_qp(qp, "state = %s\n", comp_state_name[state]);
+ switch (state) {
+ case COMPST_GET_ACK:
+ skb = skb_dequeue(&qp->resp_pkts);
+ if (skb) {
+ pkt = SKB_TO_PKT(skb);
+ qp->comp.timeout_retry = 0;
+ }
+ state = COMPST_GET_WQE;
+ break;
+
+ case COMPST_GET_WQE:
+ state = get_wqe(qp, pkt, &wqe);
+ break;
+
+ case COMPST_CHECK_PSN:
+ state = check_psn(qp, pkt, wqe);
+ break;
+
+ case COMPST_CHECK_ACK:
+ state = check_ack(qp, pkt, wqe);
+ break;
+
+ case COMPST_READ:
+ state = do_read(qp, pkt, wqe);
+ break;
+
+ case COMPST_ATOMIC:
+ state = do_atomic(qp, pkt, wqe);
+ break;
+
+ case COMPST_WRITE_SEND:
+ if (wqe->state == wqe_state_pending &&
+ wqe->last_psn == pkt->psn)
+ state = COMPST_COMP_ACK;
+ else
+ state = COMPST_UPDATE_COMP;
+ break;
+
+ case COMPST_COMP_ACK:
+ state = complete_ack(qp, pkt, wqe);
+ break;
+
+ case COMPST_COMP_WQE:
+ state = complete_wqe(qp, pkt, wqe);
+ break;
+
+ case COMPST_UPDATE_COMP:
+ if (pkt->mask & RXE_END_MASK)
+ qp->comp.opcode = -1;
+ else
+ qp->comp.opcode = pkt->opcode;
+
+ if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
+ qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+
+ if (qp->req.wait_psn) {
+ qp->req.wait_psn = 0;
+ rxe_sched_task(&qp->req.task);
+ }
+
+ state = COMPST_DONE;
+ break;
+
+ case COMPST_DONE:
+ goto done;
+
+ case COMPST_EXIT:
+ if (qp->comp.timeout_retry && wqe) {
+ state = COMPST_ERROR_RETRY;
+ break;
+ }
+
+ reset_retry_timer(qp);
+ goto exit;
+
+ case COMPST_ERROR_RETRY:
+ /* we come here if the retry timer fired and we did
+ * not receive a response packet. try to retry the send
+ * queue if that makes sense and the limits have not
+ * been exceeded. remember that some timeouts are
+ * spurious since we do not reset the timer but kick
+ * it down the road or let it expire
+ */
+
+ /* there is nothing to retry in this case */
+ if (!wqe || (wqe->state == wqe_state_posted))
+ goto exit;
+
+ /* if we've started a retry, don't start another
+ * retry sequence, unless this is a timeout.
+ */
+ if (qp->comp.started_retry &&
+ !qp->comp.timeout_retry)
+ goto done;
+
+ if (qp->comp.retry_cnt > 0) {
+ if (qp->comp.retry_cnt != 7)
+ qp->comp.retry_cnt--;
+
+ /* no point in retrying if we have already
+ * seen the last ack that the requester could
+ * have caused
+ */
+ if (psn_compare(qp->req.psn,
+ qp->comp.psn) > 0) {
+ /* tell the requester to retry the
+ * send queue next time around
+ */
+ rxe_counter_inc(rxe,
+ RXE_CNT_COMP_RETRY);
+ qp->req.need_retry = 1;
+ qp->comp.started_retry = 1;
+ rxe_sched_task(&qp->req.task);
+ }
+ goto done;
+
+ } else {
+ rxe_counter_inc(rxe, RXE_CNT_RETRY_EXCEEDED);
+ wqe->status = IB_WC_RETRY_EXC_ERR;
+ state = COMPST_ERROR;
+ }
+ break;
+
+ case COMPST_RNR_RETRY:
+ /* we come here if we received an RNR NAK */
+ if (qp->comp.rnr_retry > 0) {
+ if (qp->comp.rnr_retry != 7)
+ qp->comp.rnr_retry--;
+
+ /* don't start a retry flow until the
+ * rnr timer has fired
+ */
+ qp->req.wait_for_rnr_timer = 1;
+ rxe_dbg_qp(qp, "set rnr nak timer\n");
+ // TODO who protects from destroy_qp??
+ mod_timer(&qp->rnr_nak_timer,
+ jiffies + rnrnak_jiffies(aeth_syn(pkt)
+ & ~AETH_TYPE_MASK));
+ goto exit;
+ } else {
+ rxe_counter_inc(rxe,
+ RXE_CNT_RNR_RETRY_EXCEEDED);
+ wqe->status = IB_WC_RNR_RETRY_EXC_ERR;
+ state = COMPST_ERROR;
+ }
+ break;
+
+ case COMPST_ERROR:
+ WARN_ON_ONCE(wqe->status == IB_WC_SUCCESS);
+ do_complete(qp, wqe);
+ rxe_qp_error(qp);
+ goto exit;
+ }
+ }
+
+ /* A non-zero return value will cause rxe_do_task to
+ * exit its loop and end the work item. A zero return
+ * will continue looping and return to rxe_completer
+ */
+done:
+ ret = 0;
+ goto out;
+exit:
+ ret = -EAGAIN;
+out:
+ if (pkt)
+ free_pkt(pkt);
+ return ret;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
new file mode 100644
index 0000000000..d5486cbb3f
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_cq.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+#include <linux/vmalloc.h>
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
+ int cqe, int comp_vector)
+{
+ int count;
+
+ if (cqe <= 0) {
+ rxe_dbg_dev(rxe, "cqe(%d) <= 0\n", cqe);
+ goto err1;
+ }
+
+ if (cqe > rxe->attr.max_cqe) {
+ rxe_dbg_dev(rxe, "cqe(%d) > max_cqe(%d)\n",
+ cqe, rxe->attr.max_cqe);
+ goto err1;
+ }
+
+ if (cq) {
+ count = queue_count(cq->queue, QUEUE_TYPE_TO_CLIENT);
+ if (cqe < count) {
+ rxe_dbg_cq(cq, "cqe(%d) < current # elements in queue (%d)",
+ cqe, count);
+ goto err1;
+ }
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
+ int comp_vector, struct ib_udata *udata,
+ struct rxe_create_cq_resp __user *uresp)
+{
+ int err;
+ enum queue_type type;
+
+ type = QUEUE_TYPE_TO_CLIENT;
+ cq->queue = rxe_queue_init(rxe, &cqe,
+ sizeof(struct rxe_cqe), type);
+ if (!cq->queue) {
+ rxe_dbg_dev(rxe, "unable to create cq\n");
+ return -ENOMEM;
+ }
+
+ err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata,
+ cq->queue->buf, cq->queue->buf_size, &cq->queue->ip);
+ if (err) {
+ vfree(cq->queue->buf);
+ kfree(cq->queue);
+ return err;
+ }
+
+ cq->is_user = uresp;
+
+ spin_lock_init(&cq->cq_lock);
+ cq->ibcq.cqe = cqe;
+ return 0;
+}
+
+int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe,
+ struct rxe_resize_cq_resp __user *uresp,
+ struct ib_udata *udata)
+{
+ int err;
+
+ err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe,
+ sizeof(struct rxe_cqe), udata,
+ uresp ? &uresp->mi : NULL, NULL, &cq->cq_lock);
+ if (!err)
+ cq->ibcq.cqe = cqe;
+
+ return err;
+}
+
+/* caller holds reference to cq */
+int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
+{
+ struct ib_event ev;
+ int full;
+ void *addr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->cq_lock, flags);
+
+ full = queue_full(cq->queue, QUEUE_TYPE_TO_CLIENT);
+ if (unlikely(full)) {
+ rxe_err_cq(cq, "queue full");
+ spin_unlock_irqrestore(&cq->cq_lock, flags);
+ if (cq->ibcq.event_handler) {
+ ev.device = cq->ibcq.device;
+ ev.element.cq = &cq->ibcq;
+ ev.event = IB_EVENT_CQ_ERR;
+ cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+ }
+
+ return -EBUSY;
+ }
+
+ addr = queue_producer_addr(cq->queue, QUEUE_TYPE_TO_CLIENT);
+ memcpy(addr, cqe, sizeof(*cqe));
+
+ queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT);
+
+ if ((cq->notify & IB_CQ_NEXT_COMP) ||
+ (cq->notify & IB_CQ_SOLICITED && solicited)) {
+ cq->notify = 0;
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+ }
+
+ spin_unlock_irqrestore(&cq->cq_lock, flags);
+
+ return 0;
+}
+
+void rxe_cq_cleanup(struct rxe_pool_elem *elem)
+{
+ struct rxe_cq *cq = container_of(elem, typeof(*cq), elem);
+
+ if (cq->queue)
+ rxe_queue_cleanup(cq->queue);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h
new file mode 100644
index 0000000000..46f82b27fc
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_hdr.h
@@ -0,0 +1,977 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_HDR_H
+#define RXE_HDR_H
+
+/* extracted information about a packet carried in an sk_buff struct fits in
+ * the skbuff cb array. Must be at most 48 bytes. stored in control block of
+ * sk_buff for received packets.
+ */
+struct rxe_pkt_info {
+ struct rxe_dev *rxe; /* device that owns packet */
+ struct rxe_qp *qp; /* qp that owns packet */
+ struct rxe_send_wqe *wqe; /* send wqe */
+ u8 *hdr; /* points to bth */
+ u32 mask; /* useful info about pkt */
+ u32 psn; /* bth psn of packet */
+ u16 pkey_index; /* partition of pkt */
+ u16 paylen; /* length of bth - icrc */
+ u8 port_num; /* port pkt received on */
+ u8 opcode; /* bth opcode of packet */
+};
+
+/* Macros should be used only for received skb */
+static inline struct rxe_pkt_info *SKB_TO_PKT(struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(struct rxe_pkt_info) > sizeof(skb->cb));
+ return (void *)skb->cb;
+}
+
+static inline struct sk_buff *PKT_TO_SKB(struct rxe_pkt_info *pkt)
+{
+ return container_of((void *)pkt, struct sk_buff, cb);
+}
+
+/*
+ * IBA header types and methods
+ *
+ * Some of these are for reference and completeness only since
+ * rxe does not currently support RD transport
+ * most of this could be moved into IB core. ib_pack.h has
+ * part of this but is incomplete
+ *
+ * Header specific routines to insert/extract values to/from headers
+ * the routines that are named __hhh_(set_)fff() take a pointer to a
+ * hhh header and get(set) the fff field. The routines named
+ * hhh_(set_)fff take a packet info struct and find the
+ * header and field based on the opcode in the packet.
+ * Conversion to/from network byte order from cpu order is also done.
+ */
+
+#define RXE_ICRC_SIZE (4)
+#define RXE_MAX_HDR_LENGTH (80)
+
+/******************************************************************************
+ * Base Transport Header
+ ******************************************************************************/
+struct rxe_bth {
+ u8 opcode;
+ u8 flags;
+ __be16 pkey;
+ __be32 qpn;
+ __be32 apsn;
+};
+
+#define BTH_TVER (0)
+#define BTH_DEF_PKEY (0xffff)
+
+#define BTH_SE_MASK (0x80)
+#define BTH_MIG_MASK (0x40)
+#define BTH_PAD_MASK (0x30)
+#define BTH_TVER_MASK (0x0f)
+#define BTH_FECN_MASK (0x80000000)
+#define BTH_BECN_MASK (0x40000000)
+#define BTH_RESV6A_MASK (0x3f000000)
+#define BTH_QPN_MASK (0x00ffffff)
+#define BTH_ACK_MASK (0x80000000)
+#define BTH_RESV7_MASK (0x7f000000)
+#define BTH_PSN_MASK (0x00ffffff)
+
+static inline u8 __bth_opcode(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return bth->opcode;
+}
+
+static inline void __bth_set_opcode(void *arg, u8 opcode)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->opcode = opcode;
+}
+
+static inline u8 __bth_se(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return 0 != (BTH_SE_MASK & bth->flags);
+}
+
+static inline void __bth_set_se(void *arg, int se)
+{
+ struct rxe_bth *bth = arg;
+
+ if (se)
+ bth->flags |= BTH_SE_MASK;
+ else
+ bth->flags &= ~BTH_SE_MASK;
+}
+
+static inline u8 __bth_mig(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return 0 != (BTH_MIG_MASK & bth->flags);
+}
+
+static inline void __bth_set_mig(void *arg, u8 mig)
+{
+ struct rxe_bth *bth = arg;
+
+ if (mig)
+ bth->flags |= BTH_MIG_MASK;
+ else
+ bth->flags &= ~BTH_MIG_MASK;
+}
+
+static inline u8 __bth_pad(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return (BTH_PAD_MASK & bth->flags) >> 4;
+}
+
+static inline void __bth_set_pad(void *arg, u8 pad)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->flags = (BTH_PAD_MASK & (pad << 4)) |
+ (~BTH_PAD_MASK & bth->flags);
+}
+
+static inline u8 __bth_tver(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return BTH_TVER_MASK & bth->flags;
+}
+
+static inline void __bth_set_tver(void *arg, u8 tver)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->flags = (BTH_TVER_MASK & tver) |
+ (~BTH_TVER_MASK & bth->flags);
+}
+
+static inline u16 __bth_pkey(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return be16_to_cpu(bth->pkey);
+}
+
+static inline void __bth_set_pkey(void *arg, u16 pkey)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->pkey = cpu_to_be16(pkey);
+}
+
+static inline u32 __bth_qpn(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return BTH_QPN_MASK & be32_to_cpu(bth->qpn);
+}
+
+static inline void __bth_set_qpn(void *arg, u32 qpn)
+{
+ struct rxe_bth *bth = arg;
+ u32 resvqpn = be32_to_cpu(bth->qpn);
+
+ bth->qpn = cpu_to_be32((BTH_QPN_MASK & qpn) |
+ (~BTH_QPN_MASK & resvqpn));
+}
+
+static inline int __bth_fecn(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return 0 != (cpu_to_be32(BTH_FECN_MASK) & bth->qpn);
+}
+
+static inline void __bth_set_fecn(void *arg, int fecn)
+{
+ struct rxe_bth *bth = arg;
+
+ if (fecn)
+ bth->qpn |= cpu_to_be32(BTH_FECN_MASK);
+ else
+ bth->qpn &= ~cpu_to_be32(BTH_FECN_MASK);
+}
+
+static inline int __bth_becn(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return 0 != (cpu_to_be32(BTH_BECN_MASK) & bth->qpn);
+}
+
+static inline void __bth_set_becn(void *arg, int becn)
+{
+ struct rxe_bth *bth = arg;
+
+ if (becn)
+ bth->qpn |= cpu_to_be32(BTH_BECN_MASK);
+ else
+ bth->qpn &= ~cpu_to_be32(BTH_BECN_MASK);
+}
+
+static inline u8 __bth_resv6a(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return (BTH_RESV6A_MASK & be32_to_cpu(bth->qpn)) >> 24;
+}
+
+static inline void __bth_set_resv6a(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK);
+}
+
+static inline int __bth_ack(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return 0 != (cpu_to_be32(BTH_ACK_MASK) & bth->apsn);
+}
+
+static inline void __bth_set_ack(void *arg, int ack)
+{
+ struct rxe_bth *bth = arg;
+
+ if (ack)
+ bth->apsn |= cpu_to_be32(BTH_ACK_MASK);
+ else
+ bth->apsn &= ~cpu_to_be32(BTH_ACK_MASK);
+}
+
+static inline void __bth_set_resv7(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ bth->apsn &= ~cpu_to_be32(BTH_RESV7_MASK);
+}
+
+static inline u32 __bth_psn(void *arg)
+{
+ struct rxe_bth *bth = arg;
+
+ return BTH_PSN_MASK & be32_to_cpu(bth->apsn);
+}
+
+static inline void __bth_set_psn(void *arg, u32 psn)
+{
+ struct rxe_bth *bth = arg;
+ u32 apsn = be32_to_cpu(bth->apsn);
+
+ bth->apsn = cpu_to_be32((BTH_PSN_MASK & psn) |
+ (~BTH_PSN_MASK & apsn));
+}
+
+static inline u8 bth_opcode(struct rxe_pkt_info *pkt)
+{
+ return __bth_opcode(pkt->hdr);
+}
+
+static inline void bth_set_opcode(struct rxe_pkt_info *pkt, u8 opcode)
+{
+ __bth_set_opcode(pkt->hdr, opcode);
+}
+
+static inline u8 bth_se(struct rxe_pkt_info *pkt)
+{
+ return __bth_se(pkt->hdr);
+}
+
+static inline void bth_set_se(struct rxe_pkt_info *pkt, int se)
+{
+ __bth_set_se(pkt->hdr, se);
+}
+
+static inline u8 bth_mig(struct rxe_pkt_info *pkt)
+{
+ return __bth_mig(pkt->hdr);
+}
+
+static inline void bth_set_mig(struct rxe_pkt_info *pkt, u8 mig)
+{
+ __bth_set_mig(pkt->hdr, mig);
+}
+
+static inline u8 bth_pad(struct rxe_pkt_info *pkt)
+{
+ return __bth_pad(pkt->hdr);
+}
+
+static inline void bth_set_pad(struct rxe_pkt_info *pkt, u8 pad)
+{
+ __bth_set_pad(pkt->hdr, pad);
+}
+
+static inline u8 bth_tver(struct rxe_pkt_info *pkt)
+{
+ return __bth_tver(pkt->hdr);
+}
+
+static inline void bth_set_tver(struct rxe_pkt_info *pkt, u8 tver)
+{
+ __bth_set_tver(pkt->hdr, tver);
+}
+
+static inline u16 bth_pkey(struct rxe_pkt_info *pkt)
+{
+ return __bth_pkey(pkt->hdr);
+}
+
+static inline void bth_set_pkey(struct rxe_pkt_info *pkt, u16 pkey)
+{
+ __bth_set_pkey(pkt->hdr, pkey);
+}
+
+static inline u32 bth_qpn(struct rxe_pkt_info *pkt)
+{
+ return __bth_qpn(pkt->hdr);
+}
+
+static inline void bth_set_qpn(struct rxe_pkt_info *pkt, u32 qpn)
+{
+ __bth_set_qpn(pkt->hdr, qpn);
+}
+
+static inline int bth_fecn(struct rxe_pkt_info *pkt)
+{
+ return __bth_fecn(pkt->hdr);
+}
+
+static inline void bth_set_fecn(struct rxe_pkt_info *pkt, int fecn)
+{
+ __bth_set_fecn(pkt->hdr, fecn);
+}
+
+static inline int bth_becn(struct rxe_pkt_info *pkt)
+{
+ return __bth_becn(pkt->hdr);
+}
+
+static inline void bth_set_becn(struct rxe_pkt_info *pkt, int becn)
+{
+ __bth_set_becn(pkt->hdr, becn);
+}
+
+static inline u8 bth_resv6a(struct rxe_pkt_info *pkt)
+{
+ return __bth_resv6a(pkt->hdr);
+}
+
+static inline void bth_set_resv6a(struct rxe_pkt_info *pkt)
+{
+ __bth_set_resv6a(pkt->hdr);
+}
+
+static inline int bth_ack(struct rxe_pkt_info *pkt)
+{
+ return __bth_ack(pkt->hdr);
+}
+
+static inline void bth_set_ack(struct rxe_pkt_info *pkt, int ack)
+{
+ __bth_set_ack(pkt->hdr, ack);
+}
+
+static inline void bth_set_resv7(struct rxe_pkt_info *pkt)
+{
+ __bth_set_resv7(pkt->hdr);
+}
+
+static inline u32 bth_psn(struct rxe_pkt_info *pkt)
+{
+ return __bth_psn(pkt->hdr);
+}
+
+static inline void bth_set_psn(struct rxe_pkt_info *pkt, u32 psn)
+{
+ __bth_set_psn(pkt->hdr, psn);
+}
+
+static inline void bth_init(struct rxe_pkt_info *pkt, u8 opcode, int se,
+ int mig, int pad, u16 pkey, u32 qpn, int ack_req,
+ u32 psn)
+{
+ struct rxe_bth *bth = (struct rxe_bth *)(pkt->hdr);
+
+ bth->opcode = opcode;
+ bth->flags = (pad << 4) & BTH_PAD_MASK;
+ if (se)
+ bth->flags |= BTH_SE_MASK;
+ if (mig)
+ bth->flags |= BTH_MIG_MASK;
+ bth->pkey = cpu_to_be16(pkey);
+ bth->qpn = cpu_to_be32(qpn & BTH_QPN_MASK);
+ psn &= BTH_PSN_MASK;
+ if (ack_req)
+ psn |= BTH_ACK_MASK;
+ bth->apsn = cpu_to_be32(psn);
+}
+
+/******************************************************************************
+ * Reliable Datagram Extended Transport Header
+ ******************************************************************************/
+struct rxe_rdeth {
+ __be32 een;
+};
+
+#define RDETH_EEN_MASK (0x00ffffff)
+
+static inline u8 __rdeth_een(void *arg)
+{
+ struct rxe_rdeth *rdeth = arg;
+
+ return RDETH_EEN_MASK & be32_to_cpu(rdeth->een);
+}
+
+static inline void __rdeth_set_een(void *arg, u32 een)
+{
+ struct rxe_rdeth *rdeth = arg;
+
+ rdeth->een = cpu_to_be32(RDETH_EEN_MASK & een);
+}
+
+static inline u8 rdeth_een(struct rxe_pkt_info *pkt)
+{
+ return __rdeth_een(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_RDETH]);
+}
+
+static inline void rdeth_set_een(struct rxe_pkt_info *pkt, u32 een)
+{
+ __rdeth_set_een(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_RDETH], een);
+}
+
+/******************************************************************************
+ * Datagram Extended Transport Header
+ ******************************************************************************/
+struct rxe_deth {
+ __be32 qkey;
+ __be32 sqp;
+};
+
+#define GSI_QKEY (0x80010000)
+#define DETH_SQP_MASK (0x00ffffff)
+
+static inline u32 __deth_qkey(void *arg)
+{
+ struct rxe_deth *deth = arg;
+
+ return be32_to_cpu(deth->qkey);
+}
+
+static inline void __deth_set_qkey(void *arg, u32 qkey)
+{
+ struct rxe_deth *deth = arg;
+
+ deth->qkey = cpu_to_be32(qkey);
+}
+
+static inline u32 __deth_sqp(void *arg)
+{
+ struct rxe_deth *deth = arg;
+
+ return DETH_SQP_MASK & be32_to_cpu(deth->sqp);
+}
+
+static inline void __deth_set_sqp(void *arg, u32 sqp)
+{
+ struct rxe_deth *deth = arg;
+
+ deth->sqp = cpu_to_be32(DETH_SQP_MASK & sqp);
+}
+
+static inline u32 deth_qkey(struct rxe_pkt_info *pkt)
+{
+ return __deth_qkey(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_DETH]);
+}
+
+static inline void deth_set_qkey(struct rxe_pkt_info *pkt, u32 qkey)
+{
+ __deth_set_qkey(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_DETH], qkey);
+}
+
+static inline u32 deth_sqp(struct rxe_pkt_info *pkt)
+{
+ return __deth_sqp(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_DETH]);
+}
+
+static inline void deth_set_sqp(struct rxe_pkt_info *pkt, u32 sqp)
+{
+ __deth_set_sqp(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_DETH], sqp);
+}
+
+/******************************************************************************
+ * RDMA Extended Transport Header
+ ******************************************************************************/
+struct rxe_reth {
+ __be64 va;
+ __be32 rkey;
+ __be32 len;
+};
+
+static inline u64 __reth_va(void *arg)
+{
+ struct rxe_reth *reth = arg;
+
+ return be64_to_cpu(reth->va);
+}
+
+static inline void __reth_set_va(void *arg, u64 va)
+{
+ struct rxe_reth *reth = arg;
+
+ reth->va = cpu_to_be64(va);
+}
+
+static inline u32 __reth_rkey(void *arg)
+{
+ struct rxe_reth *reth = arg;
+
+ return be32_to_cpu(reth->rkey);
+}
+
+static inline void __reth_set_rkey(void *arg, u32 rkey)
+{
+ struct rxe_reth *reth = arg;
+
+ reth->rkey = cpu_to_be32(rkey);
+}
+
+static inline u32 __reth_len(void *arg)
+{
+ struct rxe_reth *reth = arg;
+
+ return be32_to_cpu(reth->len);
+}
+
+static inline void __reth_set_len(void *arg, u32 len)
+{
+ struct rxe_reth *reth = arg;
+
+ reth->len = cpu_to_be32(len);
+}
+
+static inline u64 reth_va(struct rxe_pkt_info *pkt)
+{
+ return __reth_va(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_RETH]);
+}
+
+static inline void reth_set_va(struct rxe_pkt_info *pkt, u64 va)
+{
+ __reth_set_va(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_RETH], va);
+}
+
+static inline u32 reth_rkey(struct rxe_pkt_info *pkt)
+{
+ return __reth_rkey(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_RETH]);
+}
+
+static inline void reth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+ __reth_set_rkey(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_RETH], rkey);
+}
+
+static inline u32 reth_len(struct rxe_pkt_info *pkt)
+{
+ return __reth_len(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_RETH]);
+}
+
+static inline void reth_set_len(struct rxe_pkt_info *pkt, u32 len)
+{
+ __reth_set_len(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_RETH], len);
+}
+
+/******************************************************************************
+ * FLUSH Extended Transport Header
+ ******************************************************************************/
+
+struct rxe_feth {
+ __be32 bits;
+};
+
+#define FETH_PLT_MASK (0x0000000f) /* bits 3-0 */
+#define FETH_SEL_MASK (0x00000030) /* bits 5-4 */
+#define FETH_SEL_SHIFT (4U)
+
+static inline u32 __feth_plt(void *arg)
+{
+ struct rxe_feth *feth = arg;
+
+ return be32_to_cpu(feth->bits) & FETH_PLT_MASK;
+}
+
+static inline u32 __feth_sel(void *arg)
+{
+ struct rxe_feth *feth = arg;
+
+ return (be32_to_cpu(feth->bits) & FETH_SEL_MASK) >> FETH_SEL_SHIFT;
+}
+
+static inline u32 feth_plt(struct rxe_pkt_info *pkt)
+{
+ return __feth_plt(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]);
+}
+
+static inline u32 feth_sel(struct rxe_pkt_info *pkt)
+{
+ return __feth_sel(pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]);
+}
+
+static inline void feth_init(struct rxe_pkt_info *pkt, u8 type, u8 level)
+{
+ struct rxe_feth *feth = (struct rxe_feth *)
+ (pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_FETH]);
+ u32 bits = ((level << FETH_SEL_SHIFT) & FETH_SEL_MASK) |
+ (type & FETH_PLT_MASK);
+
+ feth->bits = cpu_to_be32(bits);
+}
+
+/******************************************************************************
+ * Atomic Extended Transport Header
+ ******************************************************************************/
+struct rxe_atmeth {
+ __be64 va;
+ __be32 rkey;
+ __be64 swap_add;
+ __be64 comp;
+} __packed;
+
+static inline u64 __atmeth_va(void *arg)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ return be64_to_cpu(atmeth->va);
+}
+
+static inline void __atmeth_set_va(void *arg, u64 va)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ atmeth->va = cpu_to_be64(va);
+}
+
+static inline u32 __atmeth_rkey(void *arg)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ return be32_to_cpu(atmeth->rkey);
+}
+
+static inline void __atmeth_set_rkey(void *arg, u32 rkey)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ atmeth->rkey = cpu_to_be32(rkey);
+}
+
+static inline u64 __atmeth_swap_add(void *arg)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ return be64_to_cpu(atmeth->swap_add);
+}
+
+static inline void __atmeth_set_swap_add(void *arg, u64 swap_add)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ atmeth->swap_add = cpu_to_be64(swap_add);
+}
+
+static inline u64 __atmeth_comp(void *arg)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ return be64_to_cpu(atmeth->comp);
+}
+
+static inline void __atmeth_set_comp(void *arg, u64 comp)
+{
+ struct rxe_atmeth *atmeth = arg;
+
+ atmeth->comp = cpu_to_be64(comp);
+}
+
+static inline u64 atmeth_va(struct rxe_pkt_info *pkt)
+{
+ return __atmeth_va(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_ATMETH]);
+}
+
+static inline void atmeth_set_va(struct rxe_pkt_info *pkt, u64 va)
+{
+ __atmeth_set_va(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_ATMETH], va);
+}
+
+static inline u32 atmeth_rkey(struct rxe_pkt_info *pkt)
+{
+ return __atmeth_rkey(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_ATMETH]);
+}
+
+static inline void atmeth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+ __atmeth_set_rkey(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_ATMETH], rkey);
+}
+
+static inline u64 atmeth_swap_add(struct rxe_pkt_info *pkt)
+{
+ return __atmeth_swap_add(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_ATMETH]);
+}
+
+static inline void atmeth_set_swap_add(struct rxe_pkt_info *pkt, u64 swap_add)
+{
+ __atmeth_set_swap_add(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_ATMETH], swap_add);
+}
+
+static inline u64 atmeth_comp(struct rxe_pkt_info *pkt)
+{
+ return __atmeth_comp(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_ATMETH]);
+}
+
+static inline void atmeth_set_comp(struct rxe_pkt_info *pkt, u64 comp)
+{
+ __atmeth_set_comp(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_ATMETH], comp);
+}
+
+/******************************************************************************
+ * Ack Extended Transport Header
+ ******************************************************************************/
+struct rxe_aeth {
+ __be32 smsn;
+};
+
+#define AETH_SYN_MASK (0xff000000)
+#define AETH_MSN_MASK (0x00ffffff)
+
+enum aeth_syndrome {
+ AETH_TYPE_MASK = 0xe0,
+ AETH_ACK = 0x00,
+ AETH_RNR_NAK = 0x20,
+ AETH_RSVD = 0x40,
+ AETH_NAK = 0x60,
+ AETH_ACK_UNLIMITED = 0x1f,
+ AETH_NAK_PSN_SEQ_ERROR = 0x60,
+ AETH_NAK_INVALID_REQ = 0x61,
+ AETH_NAK_REM_ACC_ERR = 0x62,
+ AETH_NAK_REM_OP_ERR = 0x63,
+};
+
+static inline u8 __aeth_syn(void *arg)
+{
+ struct rxe_aeth *aeth = arg;
+
+ return (AETH_SYN_MASK & be32_to_cpu(aeth->smsn)) >> 24;
+}
+
+static inline void __aeth_set_syn(void *arg, u8 syn)
+{
+ struct rxe_aeth *aeth = arg;
+ u32 smsn = be32_to_cpu(aeth->smsn);
+
+ aeth->smsn = cpu_to_be32((AETH_SYN_MASK & (syn << 24)) |
+ (~AETH_SYN_MASK & smsn));
+}
+
+static inline u32 __aeth_msn(void *arg)
+{
+ struct rxe_aeth *aeth = arg;
+
+ return AETH_MSN_MASK & be32_to_cpu(aeth->smsn);
+}
+
+static inline void __aeth_set_msn(void *arg, u32 msn)
+{
+ struct rxe_aeth *aeth = arg;
+ u32 smsn = be32_to_cpu(aeth->smsn);
+
+ aeth->smsn = cpu_to_be32((AETH_MSN_MASK & msn) |
+ (~AETH_MSN_MASK & smsn));
+}
+
+static inline u8 aeth_syn(struct rxe_pkt_info *pkt)
+{
+ return __aeth_syn(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_AETH]);
+}
+
+static inline void aeth_set_syn(struct rxe_pkt_info *pkt, u8 syn)
+{
+ __aeth_set_syn(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_AETH], syn);
+}
+
+static inline u32 aeth_msn(struct rxe_pkt_info *pkt)
+{
+ return __aeth_msn(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_AETH]);
+}
+
+static inline void aeth_set_msn(struct rxe_pkt_info *pkt, u32 msn)
+{
+ __aeth_set_msn(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_AETH], msn);
+}
+
+/******************************************************************************
+ * Atomic Ack Extended Transport Header
+ ******************************************************************************/
+struct rxe_atmack {
+ __be64 orig;
+};
+
+static inline u64 __atmack_orig(void *arg)
+{
+ struct rxe_atmack *atmack = arg;
+
+ return be64_to_cpu(atmack->orig);
+}
+
+static inline void __atmack_set_orig(void *arg, u64 orig)
+{
+ struct rxe_atmack *atmack = arg;
+
+ atmack->orig = cpu_to_be64(orig);
+}
+
+static inline u64 atmack_orig(struct rxe_pkt_info *pkt)
+{
+ return __atmack_orig(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_ATMACK]);
+}
+
+static inline void atmack_set_orig(struct rxe_pkt_info *pkt, u64 orig)
+{
+ __atmack_set_orig(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_ATMACK], orig);
+}
+
+/******************************************************************************
+ * Immediate Extended Transport Header
+ ******************************************************************************/
+struct rxe_immdt {
+ __be32 imm;
+};
+
+static inline __be32 __immdt_imm(void *arg)
+{
+ struct rxe_immdt *immdt = arg;
+
+ return immdt->imm;
+}
+
+static inline void __immdt_set_imm(void *arg, __be32 imm)
+{
+ struct rxe_immdt *immdt = arg;
+
+ immdt->imm = imm;
+}
+
+static inline __be32 immdt_imm(struct rxe_pkt_info *pkt)
+{
+ return __immdt_imm(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_IMMDT]);
+}
+
+static inline void immdt_set_imm(struct rxe_pkt_info *pkt, __be32 imm)
+{
+ __immdt_set_imm(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_IMMDT], imm);
+}
+
+/******************************************************************************
+ * Invalidate Extended Transport Header
+ ******************************************************************************/
+struct rxe_ieth {
+ __be32 rkey;
+};
+
+static inline u32 __ieth_rkey(void *arg)
+{
+ struct rxe_ieth *ieth = arg;
+
+ return be32_to_cpu(ieth->rkey);
+}
+
+static inline void __ieth_set_rkey(void *arg, u32 rkey)
+{
+ struct rxe_ieth *ieth = arg;
+
+ ieth->rkey = cpu_to_be32(rkey);
+}
+
+static inline u32 ieth_rkey(struct rxe_pkt_info *pkt)
+{
+ return __ieth_rkey(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_IETH]);
+}
+
+static inline void ieth_set_rkey(struct rxe_pkt_info *pkt, u32 rkey)
+{
+ __ieth_set_rkey(pkt->hdr +
+ rxe_opcode[pkt->opcode].offset[RXE_IETH], rkey);
+}
+
+enum rxe_hdr_length {
+ RXE_BTH_BYTES = sizeof(struct rxe_bth),
+ RXE_DETH_BYTES = sizeof(struct rxe_deth),
+ RXE_IMMDT_BYTES = sizeof(struct rxe_immdt),
+ RXE_RETH_BYTES = sizeof(struct rxe_reth),
+ RXE_AETH_BYTES = sizeof(struct rxe_aeth),
+ RXE_ATMACK_BYTES = sizeof(struct rxe_atmack),
+ RXE_ATMETH_BYTES = sizeof(struct rxe_atmeth),
+ RXE_IETH_BYTES = sizeof(struct rxe_ieth),
+ RXE_RDETH_BYTES = sizeof(struct rxe_rdeth),
+ RXE_FETH_BYTES = sizeof(struct rxe_feth),
+};
+
+static inline size_t header_size(struct rxe_pkt_info *pkt)
+{
+ return rxe_opcode[pkt->opcode].length;
+}
+
+static inline void *payload_addr(struct rxe_pkt_info *pkt)
+{
+ return pkt->hdr + rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD];
+}
+
+static inline size_t payload_size(struct rxe_pkt_info *pkt)
+{
+ return pkt->paylen - rxe_opcode[pkt->opcode].offset[RXE_PAYLOAD]
+ - bth_pad(pkt) - RXE_ICRC_SIZE;
+}
+
+#endif /* RXE_HDR_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
new file mode 100644
index 0000000000..a012522b57
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved.
+ */
+
+#include "rxe.h"
+#include "rxe_hw_counters.h"
+
+static const struct rdma_stat_desc rxe_counter_descs[] = {
+ [RXE_CNT_SENT_PKTS].name = "sent_pkts",
+ [RXE_CNT_RCVD_PKTS].name = "rcvd_pkts",
+ [RXE_CNT_DUP_REQ].name = "duplicate_request",
+ [RXE_CNT_OUT_OF_SEQ_REQ].name = "out_of_seq_request",
+ [RXE_CNT_RCV_RNR].name = "rcvd_rnr_err",
+ [RXE_CNT_SND_RNR].name = "send_rnr_err",
+ [RXE_CNT_RCV_SEQ_ERR].name = "rcvd_seq_err",
+ [RXE_CNT_COMPLETER_SCHED].name = "ack_deferred",
+ [RXE_CNT_RETRY_EXCEEDED].name = "retry_exceeded_err",
+ [RXE_CNT_RNR_RETRY_EXCEEDED].name = "retry_rnr_exceeded_err",
+ [RXE_CNT_COMP_RETRY].name = "completer_retry_err",
+ [RXE_CNT_SEND_ERR].name = "send_err",
+ [RXE_CNT_LINK_DOWNED].name = "link_downed",
+ [RXE_CNT_RDMA_SEND].name = "rdma_sends",
+ [RXE_CNT_RDMA_RECV].name = "rdma_recvs",
+};
+
+int rxe_ib_get_hw_stats(struct ib_device *ibdev,
+ struct rdma_hw_stats *stats,
+ u32 port, int index)
+{
+ struct rxe_dev *dev = to_rdev(ibdev);
+ unsigned int cnt;
+
+ if (!port || !stats)
+ return -EINVAL;
+
+ for (cnt = 0; cnt < ARRAY_SIZE(rxe_counter_descs); cnt++)
+ stats->value[cnt] = atomic64_read(&dev->stats_counters[cnt]);
+
+ return ARRAY_SIZE(rxe_counter_descs);
+}
+
+struct rdma_hw_stats *rxe_ib_alloc_hw_port_stats(struct ib_device *ibdev,
+ u32 port_num)
+{
+ BUILD_BUG_ON(ARRAY_SIZE(rxe_counter_descs) != RXE_NUM_OF_COUNTERS);
+
+ return rdma_alloc_hw_stats_struct(rxe_counter_descs,
+ ARRAY_SIZE(rxe_counter_descs),
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.h b/drivers/infiniband/sw/rxe/rxe_hw_counters.h
new file mode 100644
index 0000000000..71f4d4fa9d
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2017 Mellanox Technologies Ltd. All rights reserved.
+ */
+
+#ifndef RXE_HW_COUNTERS_H
+#define RXE_HW_COUNTERS_H
+
+/*
+ * when adding counters to enum also add
+ * them to rxe_counter_name[] vector.
+ */
+enum rxe_counters {
+ RXE_CNT_SENT_PKTS,
+ RXE_CNT_RCVD_PKTS,
+ RXE_CNT_DUP_REQ,
+ RXE_CNT_OUT_OF_SEQ_REQ,
+ RXE_CNT_RCV_RNR,
+ RXE_CNT_SND_RNR,
+ RXE_CNT_RCV_SEQ_ERR,
+ RXE_CNT_COMPLETER_SCHED,
+ RXE_CNT_RETRY_EXCEEDED,
+ RXE_CNT_RNR_RETRY_EXCEEDED,
+ RXE_CNT_COMP_RETRY,
+ RXE_CNT_SEND_ERR,
+ RXE_CNT_LINK_DOWNED,
+ RXE_CNT_RDMA_SEND,
+ RXE_CNT_RDMA_RECV,
+ RXE_NUM_OF_COUNTERS
+};
+
+struct rdma_hw_stats *rxe_ib_alloc_hw_port_stats(struct ib_device *ibdev,
+ u32 port_num);
+int rxe_ib_get_hw_stats(struct ib_device *ibdev,
+ struct rdma_hw_stats *stats,
+ u32 port, int index);
+#endif /* RXE_HW_COUNTERS_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c
new file mode 100644
index 0000000000..fdf5f08cd8
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_icrc.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/crc32.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/**
+ * rxe_icrc_init() - Initialize crypto function for computing crc32
+ * @rxe: rdma_rxe device object
+ *
+ * Return: 0 on success else an error
+ */
+int rxe_icrc_init(struct rxe_dev *rxe)
+{
+ struct crypto_shash *tfm;
+
+ tfm = crypto_alloc_shash("crc32", 0, 0);
+ if (IS_ERR(tfm)) {
+ rxe_dbg_dev(rxe, "failed to init crc32 algorithm err: %ld\n",
+ PTR_ERR(tfm));
+ return PTR_ERR(tfm);
+ }
+
+ rxe->tfm = tfm;
+
+ return 0;
+}
+
+/**
+ * rxe_crc32() - Compute cumulative crc32 for a contiguous segment
+ * @rxe: rdma_rxe device object
+ * @crc: starting crc32 value from previous segments
+ * @next: starting address of current segment
+ * @len: length of current segment
+ *
+ * Return: the cumulative crc32 checksum
+ */
+static __be32 rxe_crc32(struct rxe_dev *rxe, __be32 crc, void *next, size_t len)
+{
+ __be32 icrc;
+ int err;
+
+ SHASH_DESC_ON_STACK(shash, rxe->tfm);
+
+ shash->tfm = rxe->tfm;
+ *(__be32 *)shash_desc_ctx(shash) = crc;
+ err = crypto_shash_update(shash, next, len);
+ if (unlikely(err)) {
+ rxe_dbg_dev(rxe, "failed crc calculation, err: %d\n", err);
+ return (__force __be32)crc32_le((__force u32)crc, next, len);
+ }
+
+ icrc = *(__be32 *)shash_desc_ctx(shash);
+ barrier_data(shash_desc_ctx(shash));
+
+ return icrc;
+}
+
+/**
+ * rxe_icrc_hdr() - Compute the partial ICRC for the network and transport
+ * headers of a packet.
+ * @skb: packet buffer
+ * @pkt: packet information
+ *
+ * Return: the partial ICRC
+ */
+static __be32 rxe_icrc_hdr(struct sk_buff *skb, struct rxe_pkt_info *pkt)
+{
+ unsigned int bth_offset = 0;
+ struct iphdr *ip4h = NULL;
+ struct ipv6hdr *ip6h = NULL;
+ struct udphdr *udph;
+ struct rxe_bth *bth;
+ __be32 crc;
+ int length;
+ int hdr_size = sizeof(struct udphdr) +
+ (skb->protocol == htons(ETH_P_IP) ?
+ sizeof(struct iphdr) : sizeof(struct ipv6hdr));
+ /* pseudo header buffer size is calculate using ipv6 header size since
+ * it is bigger than ipv4
+ */
+ u8 pshdr[sizeof(struct udphdr) +
+ sizeof(struct ipv6hdr) +
+ RXE_BTH_BYTES];
+
+ /* This seed is the result of computing a CRC with a seed of
+ * 0xfffffff and 8 bytes of 0xff representing a masked LRH.
+ */
+ crc = (__force __be32)0xdebb20e3;
+
+ if (skb->protocol == htons(ETH_P_IP)) { /* IPv4 */
+ memcpy(pshdr, ip_hdr(skb), hdr_size);
+ ip4h = (struct iphdr *)pshdr;
+ udph = (struct udphdr *)(ip4h + 1);
+
+ ip4h->ttl = 0xff;
+ ip4h->check = CSUM_MANGLED_0;
+ ip4h->tos = 0xff;
+ } else { /* IPv6 */
+ memcpy(pshdr, ipv6_hdr(skb), hdr_size);
+ ip6h = (struct ipv6hdr *)pshdr;
+ udph = (struct udphdr *)(ip6h + 1);
+
+ memset(ip6h->flow_lbl, 0xff, sizeof(ip6h->flow_lbl));
+ ip6h->priority = 0xf;
+ ip6h->hop_limit = 0xff;
+ }
+ udph->check = CSUM_MANGLED_0;
+
+ bth_offset += hdr_size;
+
+ memcpy(&pshdr[bth_offset], pkt->hdr, RXE_BTH_BYTES);
+ bth = (struct rxe_bth *)&pshdr[bth_offset];
+
+ /* exclude bth.resv8a */
+ bth->qpn |= cpu_to_be32(~BTH_QPN_MASK);
+
+ length = hdr_size + RXE_BTH_BYTES;
+ crc = rxe_crc32(pkt->rxe, crc, pshdr, length);
+
+ /* And finish to compute the CRC on the remainder of the headers. */
+ crc = rxe_crc32(pkt->rxe, crc, pkt->hdr + RXE_BTH_BYTES,
+ rxe_opcode[pkt->opcode].length - RXE_BTH_BYTES);
+ return crc;
+}
+
+/**
+ * rxe_icrc_check() - Compute ICRC for a packet and compare to the ICRC
+ * delivered in the packet.
+ * @skb: packet buffer
+ * @pkt: packet information
+ *
+ * Return: 0 if the values match else an error
+ */
+int rxe_icrc_check(struct sk_buff *skb, struct rxe_pkt_info *pkt)
+{
+ __be32 *icrcp;
+ __be32 pkt_icrc;
+ __be32 icrc;
+
+ icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE);
+ pkt_icrc = *icrcp;
+
+ icrc = rxe_icrc_hdr(skb, pkt);
+ icrc = rxe_crc32(pkt->rxe, icrc, (u8 *)payload_addr(pkt),
+ payload_size(pkt) + bth_pad(pkt));
+ icrc = ~icrc;
+
+ if (unlikely(icrc != pkt_icrc))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * rxe_icrc_generate() - compute ICRC for a packet.
+ * @skb: packet buffer
+ * @pkt: packet information
+ */
+void rxe_icrc_generate(struct sk_buff *skb, struct rxe_pkt_info *pkt)
+{
+ __be32 *icrcp;
+ __be32 icrc;
+
+ icrcp = (__be32 *)(pkt->hdr + pkt->paylen - RXE_ICRC_SIZE);
+ icrc = rxe_icrc_hdr(skb, pkt);
+ icrc = rxe_crc32(pkt->rxe, icrc, (u8 *)payload_addr(pkt),
+ payload_size(pkt) + bth_pad(pkt));
+ *icrcp = ~icrc;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
new file mode 100644
index 0000000000..4d2a8ef52c
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -0,0 +1,183 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_LOC_H
+#define RXE_LOC_H
+
+/* rxe_av.c */
+void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av);
+int rxe_av_chk_attr(struct rxe_qp *qp, struct rdma_ah_attr *attr);
+int rxe_ah_chk_attr(struct rxe_ah *ah, struct rdma_ah_attr *attr);
+void rxe_av_from_attr(u8 port_num, struct rxe_av *av,
+ struct rdma_ah_attr *attr);
+void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr);
+void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr);
+struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp);
+
+/* rxe_cq.c */
+int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
+ int cqe, int comp_vector);
+
+int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
+ int comp_vector, struct ib_udata *udata,
+ struct rxe_create_cq_resp __user *uresp);
+
+int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe,
+ struct rxe_resize_cq_resp __user *uresp,
+ struct ib_udata *udata);
+
+int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
+
+void rxe_cq_cleanup(struct rxe_pool_elem *elem);
+
+/* rxe_mcast.c */
+struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid);
+int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid);
+int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid);
+void rxe_cleanup_mcg(struct kref *kref);
+
+/* rxe_mmap.c */
+struct rxe_mmap_info {
+ struct list_head pending_mmaps;
+ struct ib_ucontext *context;
+ struct kref ref;
+ void *obj;
+
+ struct mminfo info;
+};
+
+void rxe_mmap_release(struct kref *ref);
+
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev, u32 size,
+ struct ib_udata *udata, void *obj);
+
+int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+/* rxe_mr.c */
+u8 rxe_get_next_key(u32 last_key);
+void rxe_mr_init_dma(int access, struct rxe_mr *mr);
+int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
+ int access, struct rxe_mr *mr);
+int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr);
+int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length);
+int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
+ unsigned int length, enum rxe_mr_copy_dir dir);
+int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma,
+ void *addr, int length, enum rxe_mr_copy_dir dir);
+int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+ int sg_nents, unsigned int *sg_offset);
+int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val);
+int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
+struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
+ enum rxe_mr_lookup_type type);
+int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length);
+int advance_dma_data(struct rxe_dma_info *dma, unsigned int length);
+int rxe_invalidate_mr(struct rxe_qp *qp, u32 key);
+int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe);
+void rxe_mr_cleanup(struct rxe_pool_elem *elem);
+
+/* rxe_mw.c */
+int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata);
+int rxe_dealloc_mw(struct ib_mw *ibmw);
+int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe);
+int rxe_invalidate_mw(struct rxe_qp *qp, u32 rkey);
+struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey);
+void rxe_mw_cleanup(struct rxe_pool_elem *elem);
+
+/* rxe_net.c */
+struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
+ int paylen, struct rxe_pkt_info *pkt);
+int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb);
+int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb);
+const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num);
+
+/* rxe_qp.c */
+int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init);
+int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
+ struct ib_qp_init_attr *init,
+ struct rxe_create_qp_resp __user *uresp,
+ struct ib_pd *ibpd, struct ib_udata *udata);
+int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init);
+int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct ib_qp_attr *attr, int mask);
+int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr,
+ int mask, struct ib_udata *udata);
+int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask);
+void rxe_qp_error(struct rxe_qp *qp);
+int rxe_qp_chk_destroy(struct rxe_qp *qp);
+void rxe_qp_cleanup(struct rxe_pool_elem *elem);
+
+static inline int qp_num(struct rxe_qp *qp)
+{
+ return qp->ibqp.qp_num;
+}
+
+static inline enum ib_qp_type qp_type(struct rxe_qp *qp)
+{
+ return qp->ibqp.qp_type;
+}
+
+static inline enum ib_qp_state qp_state(struct rxe_qp *qp)
+{
+ return qp->attr.qp_state;
+}
+
+static inline int qp_mtu(struct rxe_qp *qp)
+{
+ if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC)
+ return qp->attr.path_mtu;
+ else
+ return IB_MTU_4096;
+}
+
+void free_rd_atomic_resource(struct resp_res *res);
+
+static inline void rxe_advance_resp_resource(struct rxe_qp *qp)
+{
+ qp->resp.res_head++;
+ if (unlikely(qp->resp.res_head == qp->attr.max_dest_rd_atomic))
+ qp->resp.res_head = 0;
+}
+
+void retransmit_timer(struct timer_list *t);
+void rnr_nak_timer(struct timer_list *t);
+
+/* rxe_srq.c */
+int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init);
+int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_init_attr *init, struct ib_udata *udata,
+ struct rxe_create_srq_resp __user *uresp);
+int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_attr *attr, enum ib_srq_attr_mask mask);
+int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
+ struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata);
+void rxe_srq_cleanup(struct rxe_pool_elem *elem);
+
+void rxe_dealloc(struct ib_device *ib_dev);
+
+int rxe_completer(struct rxe_qp *qp);
+int rxe_requester(struct rxe_qp *qp);
+int rxe_responder(struct rxe_qp *qp);
+
+/* rxe_icrc.c */
+int rxe_icrc_init(struct rxe_dev *rxe);
+int rxe_icrc_check(struct sk_buff *skb, struct rxe_pkt_info *pkt);
+void rxe_icrc_generate(struct sk_buff *skb, struct rxe_pkt_info *pkt);
+
+void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb);
+
+void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb);
+
+static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
+{
+ return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type];
+}
+
+#endif /* RXE_LOC_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
new file mode 100644
index 0000000000..86cc2e18a7
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2022 Hewlett Packard Enterprise, Inc. All rights reserved.
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+/*
+ * rxe_mcast.c implements driver support for multicast transport.
+ * It is based on two data structures struct rxe_mcg ('mcg') and
+ * struct rxe_mca ('mca'). An mcg is allocated each time a qp is
+ * attached to a new mgid for the first time. These are indexed by
+ * a red-black tree using the mgid. This data structure is searched
+ * for the mcg when a multicast packet is received and when another
+ * qp is attached to the same mgid. It is cleaned up when the last qp
+ * is detached from the mcg. Each time a qp is attached to an mcg an
+ * mca is created. It holds a pointer to the qp and is added to a list
+ * of qp's that are attached to the mcg. The qp_list is used to replicate
+ * mcast packets in the rxe receive path.
+ */
+
+#include "rxe.h"
+
+/**
+ * rxe_mcast_add - add multicast address to rxe device
+ * @rxe: rxe device object
+ * @mgid: multicast address as a gid
+ *
+ * Returns 0 on success else an error
+ */
+static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+ unsigned char ll_addr[ETH_ALEN];
+
+ ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+
+ return dev_mc_add(rxe->ndev, ll_addr);
+}
+
+/**
+ * rxe_mcast_del - delete multicast address from rxe device
+ * @rxe: rxe device object
+ * @mgid: multicast address as a gid
+ *
+ * Returns 0 on success else an error
+ */
+static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+ unsigned char ll_addr[ETH_ALEN];
+
+ ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
+
+ return dev_mc_del(rxe->ndev, ll_addr);
+}
+
+/**
+ * __rxe_insert_mcg - insert an mcg into red-black tree (rxe->mcg_tree)
+ * @mcg: mcg object with an embedded red-black tree node
+ *
+ * Context: caller must hold a reference to mcg and rxe->mcg_lock and
+ * is responsible to avoid adding the same mcg twice to the tree.
+ */
+static void __rxe_insert_mcg(struct rxe_mcg *mcg)
+{
+ struct rb_root *tree = &mcg->rxe->mcg_tree;
+ struct rb_node **link = &tree->rb_node;
+ struct rb_node *node = NULL;
+ struct rxe_mcg *tmp;
+ int cmp;
+
+ while (*link) {
+ node = *link;
+ tmp = rb_entry(node, struct rxe_mcg, node);
+
+ cmp = memcmp(&tmp->mgid, &mcg->mgid, sizeof(mcg->mgid));
+ if (cmp > 0)
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+
+ rb_link_node(&mcg->node, node, link);
+ rb_insert_color(&mcg->node, tree);
+}
+
+/**
+ * __rxe_remove_mcg - remove an mcg from red-black tree holding lock
+ * @mcg: mcast group object with an embedded red-black tree node
+ *
+ * Context: caller must hold a reference to mcg and rxe->mcg_lock
+ */
+static void __rxe_remove_mcg(struct rxe_mcg *mcg)
+{
+ rb_erase(&mcg->node, &mcg->rxe->mcg_tree);
+}
+
+/**
+ * __rxe_lookup_mcg - lookup mcg in rxe->mcg_tree while holding lock
+ * @rxe: rxe device object
+ * @mgid: multicast IP address
+ *
+ * Context: caller must hold rxe->mcg_lock
+ * Returns: mcg on success and takes a ref to mcg else NULL
+ */
+static struct rxe_mcg *__rxe_lookup_mcg(struct rxe_dev *rxe,
+ union ib_gid *mgid)
+{
+ struct rb_root *tree = &rxe->mcg_tree;
+ struct rxe_mcg *mcg;
+ struct rb_node *node;
+ int cmp;
+
+ node = tree->rb_node;
+
+ while (node) {
+ mcg = rb_entry(node, struct rxe_mcg, node);
+
+ cmp = memcmp(&mcg->mgid, mgid, sizeof(*mgid));
+
+ if (cmp > 0)
+ node = node->rb_left;
+ else if (cmp < 0)
+ node = node->rb_right;
+ else
+ break;
+ }
+
+ if (node) {
+ kref_get(&mcg->ref_cnt);
+ return mcg;
+ }
+
+ return NULL;
+}
+
+/**
+ * rxe_lookup_mcg - lookup up mcg in red-back tree
+ * @rxe: rxe device object
+ * @mgid: multicast IP address
+ *
+ * Returns: mcg if found else NULL
+ */
+struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+ struct rxe_mcg *mcg;
+
+ spin_lock_bh(&rxe->mcg_lock);
+ mcg = __rxe_lookup_mcg(rxe, mgid);
+ spin_unlock_bh(&rxe->mcg_lock);
+
+ return mcg;
+}
+
+/**
+ * __rxe_init_mcg - initialize a new mcg
+ * @rxe: rxe device
+ * @mgid: multicast address as a gid
+ * @mcg: new mcg object
+ *
+ * Context: caller should hold rxe->mcg lock
+ */
+static void __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid,
+ struct rxe_mcg *mcg)
+{
+ kref_init(&mcg->ref_cnt);
+ memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid));
+ INIT_LIST_HEAD(&mcg->qp_list);
+ mcg->rxe = rxe;
+
+ /* caller holds a ref on mcg but that will be
+ * dropped when mcg goes out of scope. We need to take a ref
+ * on the pointer that will be saved in the red-black tree
+ * by __rxe_insert_mcg and used to lookup mcg from mgid later.
+ * Inserting mcg makes it visible to outside so this should
+ * be done last after the object is ready.
+ */
+ kref_get(&mcg->ref_cnt);
+ __rxe_insert_mcg(mcg);
+}
+
+/**
+ * rxe_get_mcg - lookup or allocate a mcg
+ * @rxe: rxe device object
+ * @mgid: multicast IP address as a gid
+ *
+ * Returns: mcg on success else ERR_PTR(error)
+ */
+static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid)
+{
+ struct rxe_mcg *mcg, *tmp;
+ int err;
+
+ if (rxe->attr.max_mcast_grp == 0)
+ return ERR_PTR(-EINVAL);
+
+ /* check to see if mcg already exists */
+ mcg = rxe_lookup_mcg(rxe, mgid);
+ if (mcg)
+ return mcg;
+
+ /* check to see if we have reached limit */
+ if (atomic_inc_return(&rxe->mcg_num) > rxe->attr.max_mcast_grp) {
+ err = -ENOMEM;
+ goto err_dec;
+ }
+
+ /* speculative alloc of new mcg */
+ mcg = kzalloc(sizeof(*mcg), GFP_KERNEL);
+ if (!mcg) {
+ err = -ENOMEM;
+ goto err_dec;
+ }
+
+ spin_lock_bh(&rxe->mcg_lock);
+ /* re-check to see if someone else just added it */
+ tmp = __rxe_lookup_mcg(rxe, mgid);
+ if (tmp) {
+ spin_unlock_bh(&rxe->mcg_lock);
+ atomic_dec(&rxe->mcg_num);
+ kfree(mcg);
+ return tmp;
+ }
+
+ __rxe_init_mcg(rxe, mgid, mcg);
+ spin_unlock_bh(&rxe->mcg_lock);
+
+ /* add mcast address outside of lock */
+ err = rxe_mcast_add(rxe, mgid);
+ if (!err)
+ return mcg;
+
+ kfree(mcg);
+err_dec:
+ atomic_dec(&rxe->mcg_num);
+ return ERR_PTR(err);
+}
+
+/**
+ * rxe_cleanup_mcg - cleanup mcg for kref_put
+ * @kref: struct kref embnedded in mcg
+ */
+void rxe_cleanup_mcg(struct kref *kref)
+{
+ struct rxe_mcg *mcg = container_of(kref, typeof(*mcg), ref_cnt);
+
+ kfree(mcg);
+}
+
+/**
+ * __rxe_destroy_mcg - destroy mcg object holding rxe->mcg_lock
+ * @mcg: the mcg object
+ *
+ * Context: caller is holding rxe->mcg_lock
+ * no qp's are attached to mcg
+ */
+static void __rxe_destroy_mcg(struct rxe_mcg *mcg)
+{
+ struct rxe_dev *rxe = mcg->rxe;
+
+ /* remove mcg from red-black tree then drop ref */
+ __rxe_remove_mcg(mcg);
+ kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
+
+ atomic_dec(&rxe->mcg_num);
+}
+
+/**
+ * rxe_destroy_mcg - destroy mcg object
+ * @mcg: the mcg object
+ *
+ * Context: no qp's are attached to mcg
+ */
+static void rxe_destroy_mcg(struct rxe_mcg *mcg)
+{
+ /* delete mcast address outside of lock */
+ rxe_mcast_del(mcg->rxe, &mcg->mgid);
+
+ spin_lock_bh(&mcg->rxe->mcg_lock);
+ __rxe_destroy_mcg(mcg);
+ spin_unlock_bh(&mcg->rxe->mcg_lock);
+}
+
+/**
+ * __rxe_init_mca - initialize a new mca holding lock
+ * @qp: qp object
+ * @mcg: mcg object
+ * @mca: empty space for new mca
+ *
+ * Context: caller must hold references on qp and mcg, rxe->mcg_lock
+ * and pass memory for new mca
+ *
+ * Returns: 0 on success else an error
+ */
+static int __rxe_init_mca(struct rxe_qp *qp, struct rxe_mcg *mcg,
+ struct rxe_mca *mca)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ int n;
+
+ n = atomic_inc_return(&rxe->mcg_attach);
+ if (n > rxe->attr.max_total_mcast_qp_attach) {
+ atomic_dec(&rxe->mcg_attach);
+ return -ENOMEM;
+ }
+
+ n = atomic_inc_return(&mcg->qp_num);
+ if (n > rxe->attr.max_mcast_qp_attach) {
+ atomic_dec(&mcg->qp_num);
+ atomic_dec(&rxe->mcg_attach);
+ return -ENOMEM;
+ }
+
+ atomic_inc(&qp->mcg_num);
+
+ rxe_get(qp);
+ mca->qp = qp;
+
+ list_add_tail(&mca->qp_list, &mcg->qp_list);
+
+ return 0;
+}
+
+/**
+ * rxe_attach_mcg - attach qp to mcg if not already attached
+ * @qp: qp object
+ * @mcg: mcg object
+ *
+ * Context: caller must hold reference on qp and mcg.
+ * Returns: 0 on success else an error
+ */
+static int rxe_attach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp)
+{
+ struct rxe_dev *rxe = mcg->rxe;
+ struct rxe_mca *mca, *tmp;
+ int err;
+
+ /* check to see if the qp is already a member of the group */
+ spin_lock_bh(&rxe->mcg_lock);
+ list_for_each_entry(mca, &mcg->qp_list, qp_list) {
+ if (mca->qp == qp) {
+ spin_unlock_bh(&rxe->mcg_lock);
+ return 0;
+ }
+ }
+ spin_unlock_bh(&rxe->mcg_lock);
+
+ /* speculative alloc new mca without using GFP_ATOMIC */
+ mca = kzalloc(sizeof(*mca), GFP_KERNEL);
+ if (!mca)
+ return -ENOMEM;
+
+ spin_lock_bh(&rxe->mcg_lock);
+ /* re-check to see if someone else just attached qp */
+ list_for_each_entry(tmp, &mcg->qp_list, qp_list) {
+ if (tmp->qp == qp) {
+ kfree(mca);
+ err = 0;
+ goto out;
+ }
+ }
+
+ err = __rxe_init_mca(qp, mcg, mca);
+ if (err)
+ kfree(mca);
+out:
+ spin_unlock_bh(&rxe->mcg_lock);
+ return err;
+}
+
+/**
+ * __rxe_cleanup_mca - cleanup mca object holding lock
+ * @mca: mca object
+ * @mcg: mcg object
+ *
+ * Context: caller must hold a reference to mcg and rxe->mcg_lock
+ */
+static void __rxe_cleanup_mca(struct rxe_mca *mca, struct rxe_mcg *mcg)
+{
+ list_del(&mca->qp_list);
+
+ atomic_dec(&mcg->qp_num);
+ atomic_dec(&mcg->rxe->mcg_attach);
+ atomic_dec(&mca->qp->mcg_num);
+ rxe_put(mca->qp);
+
+ kfree(mca);
+}
+
+/**
+ * rxe_detach_mcg - detach qp from mcg
+ * @mcg: mcg object
+ * @qp: qp object
+ *
+ * Returns: 0 on success else an error if qp is not attached.
+ */
+static int rxe_detach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp)
+{
+ struct rxe_dev *rxe = mcg->rxe;
+ struct rxe_mca *mca, *tmp;
+
+ spin_lock_bh(&rxe->mcg_lock);
+ list_for_each_entry_safe(mca, tmp, &mcg->qp_list, qp_list) {
+ if (mca->qp == qp) {
+ __rxe_cleanup_mca(mca, mcg);
+
+ /* if the number of qp's attached to the
+ * mcast group falls to zero go ahead and
+ * tear it down. This will not free the
+ * object since we are still holding a ref
+ * from the caller
+ */
+ if (atomic_read(&mcg->qp_num) <= 0)
+ __rxe_destroy_mcg(mcg);
+
+ spin_unlock_bh(&rxe->mcg_lock);
+ return 0;
+ }
+ }
+
+ /* we didn't find the qp on the list */
+ spin_unlock_bh(&rxe->mcg_lock);
+ return -EINVAL;
+}
+
+/**
+ * rxe_attach_mcast - attach qp to multicast group (see IBA-11.3.1)
+ * @ibqp: (IB) qp object
+ * @mgid: multicast IP address
+ * @mlid: multicast LID, ignored for RoCEv2 (see IBA-A17.5.6)
+ *
+ * Returns: 0 on success else an errno
+ */
+int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
+{
+ int err;
+ struct rxe_dev *rxe = to_rdev(ibqp->device);
+ struct rxe_qp *qp = to_rqp(ibqp);
+ struct rxe_mcg *mcg;
+
+ /* takes a ref on mcg if successful */
+ mcg = rxe_get_mcg(rxe, mgid);
+ if (IS_ERR(mcg))
+ return PTR_ERR(mcg);
+
+ err = rxe_attach_mcg(mcg, qp);
+
+ /* if we failed to attach the first qp to mcg tear it down */
+ if (atomic_read(&mcg->qp_num) == 0)
+ rxe_destroy_mcg(mcg);
+
+ kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
+
+ return err;
+}
+
+/**
+ * rxe_detach_mcast - detach qp from multicast group (see IBA-11.3.2)
+ * @ibqp: address of (IB) qp object
+ * @mgid: multicast IP address
+ * @mlid: multicast LID, ignored for RoCEv2 (see IBA-A17.5.6)
+ *
+ * Returns: 0 on success else an errno
+ */
+int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid)
+{
+ struct rxe_dev *rxe = to_rdev(ibqp->device);
+ struct rxe_qp *qp = to_rqp(ibqp);
+ struct rxe_mcg *mcg;
+ int err;
+
+ mcg = rxe_lookup_mcg(rxe, mgid);
+ if (!mcg)
+ return -EINVAL;
+
+ err = rxe_detach_mcg(mcg, qp);
+ kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
+
+ return err;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c
new file mode 100644
index 0000000000..6b7f2bd698
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mmap.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+void rxe_mmap_release(struct kref *ref)
+{
+ struct rxe_mmap_info *ip = container_of(ref,
+ struct rxe_mmap_info, ref);
+ struct rxe_dev *rxe = to_rdev(ip->context->device);
+
+ spin_lock_bh(&rxe->pending_lock);
+
+ if (!list_empty(&ip->pending_mmaps))
+ list_del(&ip->pending_mmaps);
+
+ spin_unlock_bh(&rxe->pending_lock);
+
+ vfree(ip->obj); /* buf */
+ kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the memory region is mapped,
+ * to avoid releasing it.
+ */
+static void rxe_vma_open(struct vm_area_struct *vma)
+{
+ struct rxe_mmap_info *ip = vma->vm_private_data;
+
+ kref_get(&ip->ref);
+}
+
+static void rxe_vma_close(struct vm_area_struct *vma)
+{
+ struct rxe_mmap_info *ip = vma->vm_private_data;
+
+ kref_put(&ip->ref, rxe_mmap_release);
+}
+
+static const struct vm_operations_struct rxe_vm_ops = {
+ .open = rxe_vma_open,
+ .close = rxe_vma_close,
+};
+
+/**
+ * rxe_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct rxe_dev *rxe = to_rdev(context->device);
+ unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long size = vma->vm_end - vma->vm_start;
+ struct rxe_mmap_info *ip, *pp;
+ int ret;
+
+ /*
+ * Search the device's list of objects waiting for a mmap call.
+ * Normally, this list is very short since a call to create a
+ * CQ, QP, or SRQ is soon followed by a call to mmap().
+ */
+ spin_lock_bh(&rxe->pending_lock);
+ list_for_each_entry_safe(ip, pp, &rxe->pending_mmaps, pending_mmaps) {
+ if (context != ip->context || (__u64)offset != ip->info.offset)
+ continue;
+
+ /* Don't allow a mmap larger than the object. */
+ if (size > ip->info.size) {
+ rxe_dbg_dev(rxe, "mmap region is larger than the object!\n");
+ spin_unlock_bh(&rxe->pending_lock);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ goto found_it;
+ }
+ rxe_dbg_dev(rxe, "unable to find pending mmap info\n");
+ spin_unlock_bh(&rxe->pending_lock);
+ ret = -EINVAL;
+ goto done;
+
+found_it:
+ list_del_init(&ip->pending_mmaps);
+ spin_unlock_bh(&rxe->pending_lock);
+
+ ret = remap_vmalloc_range(vma, ip->obj, 0);
+ if (ret) {
+ rxe_dbg_dev(rxe, "err %d from remap_vmalloc_range\n", ret);
+ goto done;
+ }
+
+ vma->vm_ops = &rxe_vm_ops;
+ vma->vm_private_data = ip;
+ rxe_vma_open(vma);
+done:
+ return ret;
+}
+
+/*
+ * Allocate information for rxe_mmap
+ */
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe, u32 size,
+ struct ib_udata *udata, void *obj)
+{
+ struct rxe_mmap_info *ip;
+
+ if (!udata)
+ return ERR_PTR(-EINVAL);
+
+ ip = kmalloc(sizeof(*ip), GFP_KERNEL);
+ if (!ip)
+ return ERR_PTR(-ENOMEM);
+
+ size = PAGE_ALIGN(size);
+
+ spin_lock_bh(&rxe->mmap_offset_lock);
+
+ if (rxe->mmap_offset == 0)
+ rxe->mmap_offset = ALIGN(PAGE_SIZE, SHMLBA);
+
+ ip->info.offset = rxe->mmap_offset;
+ rxe->mmap_offset += ALIGN(size, SHMLBA);
+
+ spin_unlock_bh(&rxe->mmap_offset_lock);
+
+ INIT_LIST_HEAD(&ip->pending_mmaps);
+ ip->info.size = size;
+ ip->context =
+ container_of(udata, struct uverbs_attr_bundle, driver_udata)
+ ->context;
+ ip->obj = obj;
+ kref_init(&ip->ref);
+
+ return ip;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
new file mode 100644
index 0000000000..f54042e9ae
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -0,0 +1,731 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/libnvdimm.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* Return a random 8 bit key value that is
+ * different than the last_key. Set last_key to -1
+ * if this is the first key for an MR or MW
+ */
+u8 rxe_get_next_key(u32 last_key)
+{
+ u8 key;
+
+ do {
+ get_random_bytes(&key, 1);
+ } while (key == last_key);
+
+ return key;
+}
+
+int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
+{
+ switch (mr->ibmr.type) {
+ case IB_MR_TYPE_DMA:
+ return 0;
+
+ case IB_MR_TYPE_USER:
+ case IB_MR_TYPE_MEM_REG:
+ if (iova < mr->ibmr.iova ||
+ iova + length > mr->ibmr.iova + mr->ibmr.length) {
+ rxe_dbg_mr(mr, "iova/length out of range");
+ return -EINVAL;
+ }
+ return 0;
+
+ default:
+ rxe_dbg_mr(mr, "mr type not supported\n");
+ return -EINVAL;
+ }
+}
+
+static void rxe_mr_init(int access, struct rxe_mr *mr)
+{
+ u32 key = mr->elem.index << 8 | rxe_get_next_key(-1);
+
+ /* set ibmr->l/rkey and also copy into private l/rkey
+ * for user MRs these will always be the same
+ * for cases where caller 'owns' the key portion
+ * they may be different until REG_MR WQE is executed.
+ */
+ mr->lkey = mr->ibmr.lkey = key;
+ mr->rkey = mr->ibmr.rkey = key;
+
+ mr->access = access;
+ mr->ibmr.page_size = PAGE_SIZE;
+ mr->page_mask = PAGE_MASK;
+ mr->page_shift = PAGE_SHIFT;
+ mr->state = RXE_MR_STATE_INVALID;
+}
+
+void rxe_mr_init_dma(int access, struct rxe_mr *mr)
+{
+ rxe_mr_init(access, mr);
+
+ mr->state = RXE_MR_STATE_VALID;
+ mr->ibmr.type = IB_MR_TYPE_DMA;
+}
+
+static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova)
+{
+ return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift);
+}
+
+static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova)
+{
+ return iova & (mr_page_size(mr) - 1);
+}
+
+static bool is_pmem_page(struct page *pg)
+{
+ unsigned long paddr = page_to_phys(pg);
+
+ return REGION_INTERSECTS ==
+ region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM,
+ IORES_DESC_PERSISTENT_MEMORY);
+}
+
+static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt)
+{
+ XA_STATE(xas, &mr->page_list, 0);
+ struct sg_page_iter sg_iter;
+ struct page *page;
+ bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT);
+
+ __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0);
+ if (!__sg_page_iter_next(&sg_iter))
+ return 0;
+
+ do {
+ xas_lock(&xas);
+ while (true) {
+ page = sg_page_iter_page(&sg_iter);
+
+ if (persistent && !is_pmem_page(page)) {
+ rxe_dbg_mr(mr, "Page can't be persistent\n");
+ xas_set_err(&xas, -EINVAL);
+ break;
+ }
+
+ xas_store(&xas, page);
+ if (xas_error(&xas))
+ break;
+ xas_next(&xas);
+ if (!__sg_page_iter_next(&sg_iter))
+ break;
+ }
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ return xas_error(&xas);
+}
+
+int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
+ int access, struct rxe_mr *mr)
+{
+ struct ib_umem *umem;
+ int err;
+
+ rxe_mr_init(access, mr);
+
+ xa_init(&mr->page_list);
+
+ umem = ib_umem_get(&rxe->ib_dev, start, length, access);
+ if (IS_ERR(umem)) {
+ rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n",
+ (int)PTR_ERR(umem));
+ return PTR_ERR(umem);
+ }
+
+ err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt);
+ if (err) {
+ ib_umem_release(umem);
+ return err;
+ }
+
+ mr->umem = umem;
+ mr->ibmr.type = IB_MR_TYPE_USER;
+ mr->state = RXE_MR_STATE_VALID;
+
+ return 0;
+}
+
+static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf)
+{
+ XA_STATE(xas, &mr->page_list, 0);
+ int i = 0;
+ int err;
+
+ xa_init(&mr->page_list);
+
+ do {
+ xas_lock(&xas);
+ while (i != num_buf) {
+ xas_store(&xas, XA_ZERO_ENTRY);
+ if (xas_error(&xas))
+ break;
+ xas_next(&xas);
+ i++;
+ }
+ xas_unlock(&xas);
+ } while (xas_nomem(&xas, GFP_KERNEL));
+
+ err = xas_error(&xas);
+ if (err)
+ return err;
+
+ mr->num_buf = num_buf;
+
+ return 0;
+}
+
+int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr)
+{
+ int err;
+
+ /* always allow remote access for FMRs */
+ rxe_mr_init(RXE_ACCESS_REMOTE, mr);
+
+ err = rxe_mr_alloc(mr, max_pages);
+ if (err)
+ goto err1;
+
+ mr->state = RXE_MR_STATE_FREE;
+ mr->ibmr.type = IB_MR_TYPE_MEM_REG;
+
+ return 0;
+
+err1:
+ return err;
+}
+
+static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr)
+{
+ struct rxe_mr *mr = to_rmr(ibmr);
+ struct page *page = ib_virt_dma_to_page(dma_addr);
+ bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT);
+ int err;
+
+ if (persistent && !is_pmem_page(page)) {
+ rxe_dbg_mr(mr, "Page cannot be persistent\n");
+ return -EINVAL;
+ }
+
+ if (unlikely(mr->nbuf == mr->num_buf))
+ return -ENOMEM;
+
+ err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL));
+ if (err)
+ return err;
+
+ mr->nbuf++;
+ return 0;
+}
+
+int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl,
+ int sg_nents, unsigned int *sg_offset)
+{
+ struct rxe_mr *mr = to_rmr(ibmr);
+ unsigned int page_size = mr_page_size(mr);
+
+ mr->nbuf = 0;
+ mr->page_shift = ilog2(page_size);
+ mr->page_mask = ~((u64)page_size - 1);
+ mr->page_offset = mr->ibmr.iova & (page_size - 1);
+
+ return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page);
+}
+
+static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr,
+ unsigned int length, enum rxe_mr_copy_dir dir)
+{
+ unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova);
+ unsigned long index = rxe_mr_iova_to_index(mr, iova);
+ unsigned int bytes;
+ struct page *page;
+ void *va;
+
+ while (length) {
+ page = xa_load(&mr->page_list, index);
+ if (!page)
+ return -EFAULT;
+
+ bytes = min_t(unsigned int, length,
+ mr_page_size(mr) - page_offset);
+ va = kmap_local_page(page);
+ if (dir == RXE_FROM_MR_OBJ)
+ memcpy(addr, va + page_offset, bytes);
+ else
+ memcpy(va + page_offset, addr, bytes);
+ kunmap_local(va);
+
+ page_offset = 0;
+ addr += bytes;
+ length -= bytes;
+ index++;
+ }
+
+ return 0;
+}
+
+static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 dma_addr, void *addr,
+ unsigned int length, enum rxe_mr_copy_dir dir)
+{
+ unsigned int page_offset = dma_addr & (PAGE_SIZE - 1);
+ unsigned int bytes;
+ struct page *page;
+ u8 *va;
+
+ while (length) {
+ page = ib_virt_dma_to_page(dma_addr);
+ bytes = min_t(unsigned int, length,
+ PAGE_SIZE - page_offset);
+ va = kmap_local_page(page);
+
+ if (dir == RXE_TO_MR_OBJ)
+ memcpy(va + page_offset, addr, bytes);
+ else
+ memcpy(addr, va + page_offset, bytes);
+
+ kunmap_local(va);
+ page_offset = 0;
+ dma_addr += bytes;
+ addr += bytes;
+ length -= bytes;
+ }
+}
+
+int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
+ unsigned int length, enum rxe_mr_copy_dir dir)
+{
+ int err;
+
+ if (length == 0)
+ return 0;
+
+ if (WARN_ON(!mr))
+ return -EINVAL;
+
+ if (mr->ibmr.type == IB_MR_TYPE_DMA) {
+ rxe_mr_copy_dma(mr, iova, addr, length, dir);
+ return 0;
+ }
+
+ err = mr_check_range(mr, iova, length);
+ if (unlikely(err)) {
+ rxe_dbg_mr(mr, "iova out of range");
+ return err;
+ }
+
+ return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
+}
+
+/* copy data in or out of a wqe, i.e. sg list
+ * under the control of a dma descriptor
+ */
+int copy_data(
+ struct rxe_pd *pd,
+ int access,
+ struct rxe_dma_info *dma,
+ void *addr,
+ int length,
+ enum rxe_mr_copy_dir dir)
+{
+ int bytes;
+ struct rxe_sge *sge = &dma->sge[dma->cur_sge];
+ int offset = dma->sge_offset;
+ int resid = dma->resid;
+ struct rxe_mr *mr = NULL;
+ u64 iova;
+ int err;
+
+ if (length == 0)
+ return 0;
+
+ if (length > resid) {
+ err = -EINVAL;
+ goto err2;
+ }
+
+ if (sge->length && (offset < sge->length)) {
+ mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL);
+ if (!mr) {
+ err = -EINVAL;
+ goto err1;
+ }
+ }
+
+ while (length > 0) {
+ bytes = length;
+
+ if (offset >= sge->length) {
+ if (mr) {
+ rxe_put(mr);
+ mr = NULL;
+ }
+ sge++;
+ dma->cur_sge++;
+ offset = 0;
+
+ if (dma->cur_sge >= dma->num_sge) {
+ err = -ENOSPC;
+ goto err2;
+ }
+
+ if (sge->length) {
+ mr = lookup_mr(pd, access, sge->lkey,
+ RXE_LOOKUP_LOCAL);
+ if (!mr) {
+ err = -EINVAL;
+ goto err1;
+ }
+ } else {
+ continue;
+ }
+ }
+
+ if (bytes > sge->length - offset)
+ bytes = sge->length - offset;
+
+ if (bytes > 0) {
+ iova = sge->addr + offset;
+ err = rxe_mr_copy(mr, iova, addr, bytes, dir);
+ if (err)
+ goto err2;
+
+ offset += bytes;
+ resid -= bytes;
+ length -= bytes;
+ addr += bytes;
+ }
+ }
+
+ dma->sge_offset = offset;
+ dma->resid = resid;
+
+ if (mr)
+ rxe_put(mr);
+
+ return 0;
+
+err2:
+ if (mr)
+ rxe_put(mr);
+err1:
+ return err;
+}
+
+int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
+{
+ unsigned int page_offset;
+ unsigned long index;
+ struct page *page;
+ unsigned int bytes;
+ int err;
+ u8 *va;
+
+ /* mr must be valid even if length is zero */
+ if (WARN_ON(!mr))
+ return -EINVAL;
+
+ if (length == 0)
+ return 0;
+
+ if (mr->ibmr.type == IB_MR_TYPE_DMA)
+ return -EFAULT;
+
+ err = mr_check_range(mr, iova, length);
+ if (err)
+ return err;
+
+ while (length > 0) {
+ index = rxe_mr_iova_to_index(mr, iova);
+ page = xa_load(&mr->page_list, index);
+ page_offset = rxe_mr_iova_to_page_offset(mr, iova);
+ if (!page)
+ return -EFAULT;
+ bytes = min_t(unsigned int, length,
+ mr_page_size(mr) - page_offset);
+
+ va = kmap_local_page(page);
+ arch_wb_cache_pmem(va + page_offset, bytes);
+ kunmap_local(va);
+
+ length -= bytes;
+ iova += bytes;
+ page_offset = 0;
+ }
+
+ return 0;
+}
+
+/* Guarantee atomicity of atomic operations at the machine level. */
+static DEFINE_SPINLOCK(atomic_ops_lock);
+
+int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val)
+{
+ unsigned int page_offset;
+ struct page *page;
+ u64 value;
+ u64 *va;
+
+ if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
+ rxe_dbg_mr(mr, "mr not in valid state");
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ if (mr->ibmr.type == IB_MR_TYPE_DMA) {
+ page_offset = iova & (PAGE_SIZE - 1);
+ page = ib_virt_dma_to_page(iova);
+ } else {
+ unsigned long index;
+ int err;
+
+ err = mr_check_range(mr, iova, sizeof(value));
+ if (err) {
+ rxe_dbg_mr(mr, "iova out of range");
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+ page_offset = rxe_mr_iova_to_page_offset(mr, iova);
+ index = rxe_mr_iova_to_index(mr, iova);
+ page = xa_load(&mr->page_list, index);
+ if (!page)
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ if (unlikely(page_offset & 0x7)) {
+ rxe_dbg_mr(mr, "iova not aligned");
+ return RESPST_ERR_MISALIGNED_ATOMIC;
+ }
+
+ va = kmap_local_page(page);
+
+ spin_lock_bh(&atomic_ops_lock);
+ value = *orig_val = va[page_offset >> 3];
+
+ if (opcode == IB_OPCODE_RC_COMPARE_SWAP) {
+ if (value == compare)
+ va[page_offset >> 3] = swap_add;
+ } else {
+ value += swap_add;
+ va[page_offset >> 3] = value;
+ }
+ spin_unlock_bh(&atomic_ops_lock);
+
+ kunmap_local(va);
+
+ return 0;
+}
+
+#if defined CONFIG_64BIT
+/* only implemented or called for 64 bit architectures */
+int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
+{
+ unsigned int page_offset;
+ struct page *page;
+ u64 *va;
+
+ /* See IBA oA19-28 */
+ if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
+ rxe_dbg_mr(mr, "mr not in valid state");
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ if (mr->ibmr.type == IB_MR_TYPE_DMA) {
+ page_offset = iova & (PAGE_SIZE - 1);
+ page = ib_virt_dma_to_page(iova);
+ } else {
+ unsigned long index;
+ int err;
+
+ /* See IBA oA19-28 */
+ err = mr_check_range(mr, iova, sizeof(value));
+ if (unlikely(err)) {
+ rxe_dbg_mr(mr, "iova out of range");
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+ page_offset = rxe_mr_iova_to_page_offset(mr, iova);
+ index = rxe_mr_iova_to_index(mr, iova);
+ page = xa_load(&mr->page_list, index);
+ if (!page)
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ /* See IBA A19.4.2 */
+ if (unlikely(page_offset & 0x7)) {
+ rxe_dbg_mr(mr, "misaligned address");
+ return RESPST_ERR_MISALIGNED_ATOMIC;
+ }
+
+ va = kmap_local_page(page);
+
+ /* Do atomic write after all prior operations have completed */
+ smp_store_release(&va[page_offset >> 3], value);
+
+ kunmap_local(va);
+
+ return 0;
+}
+#else
+int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
+{
+ return RESPST_ERR_UNSUPPORTED_OPCODE;
+}
+#endif
+
+int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
+{
+ struct rxe_sge *sge = &dma->sge[dma->cur_sge];
+ int offset = dma->sge_offset;
+ int resid = dma->resid;
+
+ while (length) {
+ unsigned int bytes;
+
+ if (offset >= sge->length) {
+ sge++;
+ dma->cur_sge++;
+ offset = 0;
+ if (dma->cur_sge >= dma->num_sge)
+ return -ENOSPC;
+ }
+
+ bytes = length;
+
+ if (bytes > sge->length - offset)
+ bytes = sge->length - offset;
+
+ offset += bytes;
+ resid -= bytes;
+ length -= bytes;
+ }
+
+ dma->sge_offset = offset;
+ dma->resid = resid;
+
+ return 0;
+}
+
+struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
+ enum rxe_mr_lookup_type type)
+{
+ struct rxe_mr *mr;
+ struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
+ int index = key >> 8;
+
+ mr = rxe_pool_get_index(&rxe->mr_pool, index);
+ if (!mr)
+ return NULL;
+
+ if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
+ (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
+ mr_pd(mr) != pd || ((access & mr->access) != access) ||
+ mr->state != RXE_MR_STATE_VALID)) {
+ rxe_put(mr);
+ mr = NULL;
+ }
+
+ return mr;
+}
+
+int rxe_invalidate_mr(struct rxe_qp *qp, u32 key)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_mr *mr;
+ int remote;
+ int ret;
+
+ mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8);
+ if (!mr) {
+ rxe_dbg_qp(qp, "No MR for key %#x\n", key);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ remote = mr->access & RXE_ACCESS_REMOTE;
+ if (remote ? (key != mr->rkey) : (key != mr->lkey)) {
+ rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n",
+ key, (remote ? mr->rkey : mr->lkey));
+ ret = -EINVAL;
+ goto err_drop_ref;
+ }
+
+ if (atomic_read(&mr->num_mw) > 0) {
+ rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n");
+ ret = -EINVAL;
+ goto err_drop_ref;
+ }
+
+ if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) {
+ rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type);
+ ret = -EINVAL;
+ goto err_drop_ref;
+ }
+
+ mr->state = RXE_MR_STATE_FREE;
+ ret = 0;
+
+err_drop_ref:
+ rxe_put(mr);
+err:
+ return ret;
+}
+
+/* user can (re)register fast MR by executing a REG_MR WQE.
+ * user is expected to hold a reference on the ib mr until the
+ * WQE completes.
+ * Once a fast MR is created this is the only way to change the
+ * private keys. It is the responsibility of the user to maintain
+ * the ib mr keys in sync with rxe mr keys.
+ */
+int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+ struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr);
+ u32 key = wqe->wr.wr.reg.key;
+ u32 access = wqe->wr.wr.reg.access;
+
+ /* user can only register MR in free state */
+ if (unlikely(mr->state != RXE_MR_STATE_FREE)) {
+ rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey);
+ return -EINVAL;
+ }
+
+ /* user can only register mr with qp in same protection domain */
+ if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) {
+ rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n");
+ return -EINVAL;
+ }
+
+ /* user is only allowed to change key portion of l/rkey */
+ if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) {
+ rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n",
+ key, mr->lkey);
+ return -EINVAL;
+ }
+
+ mr->access = access;
+ mr->lkey = key;
+ mr->rkey = key;
+ mr->ibmr.iova = wqe->wr.wr.reg.mr->iova;
+ mr->state = RXE_MR_STATE_VALID;
+
+ return 0;
+}
+
+void rxe_mr_cleanup(struct rxe_pool_elem *elem)
+{
+ struct rxe_mr *mr = container_of(elem, typeof(*mr), elem);
+
+ rxe_put(mr_pd(mr));
+ ib_umem_release(mr->umem);
+
+ if (mr->ibmr.type != IB_MR_TYPE_DMA)
+ xa_destroy(&mr->page_list);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c
new file mode 100644
index 0000000000..d9312b5c9d
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_mw.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020 Hewlett Packard Enterprise, Inc. All rights reserved.
+ */
+
+/*
+ * The rdma_rxe driver supports type 1 or type 2B memory windows.
+ * Type 1 MWs are created by ibv_alloc_mw() verbs calls and bound by
+ * ibv_bind_mw() calls. Type 2 MWs are also created by ibv_alloc_mw()
+ * but bound by bind_mw work requests. The ibv_bind_mw() call is converted
+ * by libibverbs to a bind_mw work request.
+ */
+
+#include "rxe.h"
+
+int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
+{
+ struct rxe_mw *mw = to_rmw(ibmw);
+ struct rxe_pd *pd = to_rpd(ibmw->pd);
+ struct rxe_dev *rxe = to_rdev(ibmw->device);
+ int ret;
+
+ rxe_get(pd);
+
+ ret = rxe_add_to_pool(&rxe->mw_pool, mw);
+ if (ret) {
+ rxe_put(pd);
+ return ret;
+ }
+
+ mw->rkey = ibmw->rkey = (mw->elem.index << 8) | rxe_get_next_key(-1);
+ mw->state = (mw->ibmw.type == IB_MW_TYPE_2) ?
+ RXE_MW_STATE_FREE : RXE_MW_STATE_VALID;
+ spin_lock_init(&mw->lock);
+
+ rxe_finalize(mw);
+
+ return 0;
+}
+
+int rxe_dealloc_mw(struct ib_mw *ibmw)
+{
+ struct rxe_mw *mw = to_rmw(ibmw);
+
+ rxe_cleanup(mw);
+
+ return 0;
+}
+
+static int rxe_check_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+ struct rxe_mw *mw, struct rxe_mr *mr, int access)
+{
+ if (mw->ibmw.type == IB_MW_TYPE_1) {
+ if (unlikely(mw->state != RXE_MW_STATE_VALID)) {
+ rxe_dbg_mw(mw,
+ "attempt to bind a type 1 MW not in the valid state\n");
+ return -EINVAL;
+ }
+
+ /* o10-36.2.2 */
+ if (unlikely((access & IB_ZERO_BASED))) {
+ rxe_dbg_mw(mw, "attempt to bind a zero based type 1 MW\n");
+ return -EINVAL;
+ }
+ }
+
+ if (mw->ibmw.type == IB_MW_TYPE_2) {
+ /* o10-37.2.30 */
+ if (unlikely(mw->state != RXE_MW_STATE_FREE)) {
+ rxe_dbg_mw(mw,
+ "attempt to bind a type 2 MW not in the free state\n");
+ return -EINVAL;
+ }
+
+ /* C10-72 */
+ if (unlikely(qp->pd != to_rpd(mw->ibmw.pd))) {
+ rxe_dbg_mw(mw,
+ "attempt to bind type 2 MW with qp with different PD\n");
+ return -EINVAL;
+ }
+
+ /* o10-37.2.40 */
+ if (unlikely(!mr || wqe->wr.wr.mw.length == 0)) {
+ rxe_dbg_mw(mw,
+ "attempt to invalidate type 2 MW by binding with NULL or zero length MR\n");
+ return -EINVAL;
+ }
+ }
+
+ /* remaining checks only apply to a nonzero MR */
+ if (!mr)
+ return 0;
+
+ if (unlikely(mr->access & IB_ZERO_BASED)) {
+ rxe_dbg_mw(mw, "attempt to bind MW to zero based MR\n");
+ return -EINVAL;
+ }
+
+ /* C10-73 */
+ if (unlikely(!(mr->access & IB_ACCESS_MW_BIND))) {
+ rxe_dbg_mw(mw,
+ "attempt to bind an MW to an MR without bind access\n");
+ return -EINVAL;
+ }
+
+ /* C10-74 */
+ if (unlikely((access &
+ (IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC)) &&
+ !(mr->access & IB_ACCESS_LOCAL_WRITE))) {
+ rxe_dbg_mw(mw,
+ "attempt to bind an Writable MW to an MR without local write access\n");
+ return -EINVAL;
+ }
+
+ /* C10-75 */
+ if (access & IB_ZERO_BASED) {
+ if (unlikely(wqe->wr.wr.mw.length > mr->ibmr.length)) {
+ rxe_dbg_mw(mw,
+ "attempt to bind a ZB MW outside of the MR\n");
+ return -EINVAL;
+ }
+ } else {
+ if (unlikely((wqe->wr.wr.mw.addr < mr->ibmr.iova) ||
+ ((wqe->wr.wr.mw.addr + wqe->wr.wr.mw.length) >
+ (mr->ibmr.iova + mr->ibmr.length)))) {
+ rxe_dbg_mw(mw,
+ "attempt to bind a VA MW outside of the MR\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static void rxe_do_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+ struct rxe_mw *mw, struct rxe_mr *mr, int access)
+{
+ u32 key = wqe->wr.wr.mw.rkey & 0xff;
+
+ mw->rkey = (mw->rkey & ~0xff) | key;
+ mw->access = access;
+ mw->state = RXE_MW_STATE_VALID;
+ mw->addr = wqe->wr.wr.mw.addr;
+ mw->length = wqe->wr.wr.mw.length;
+
+ if (mw->mr) {
+ rxe_put(mw->mr);
+ atomic_dec(&mw->mr->num_mw);
+ mw->mr = NULL;
+ }
+
+ if (mw->length) {
+ mw->mr = mr;
+ atomic_inc(&mr->num_mw);
+ rxe_get(mr);
+ }
+
+ if (mw->ibmw.type == IB_MW_TYPE_2) {
+ rxe_get(qp);
+ mw->qp = qp;
+ }
+}
+
+int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+ int ret;
+ struct rxe_mw *mw;
+ struct rxe_mr *mr;
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ u32 mw_rkey = wqe->wr.wr.mw.mw_rkey;
+ u32 mr_lkey = wqe->wr.wr.mw.mr_lkey;
+ int access = wqe->wr.wr.mw.access;
+
+ mw = rxe_pool_get_index(&rxe->mw_pool, mw_rkey >> 8);
+ if (unlikely(!mw)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (unlikely(mw->rkey != mw_rkey)) {
+ ret = -EINVAL;
+ goto err_drop_mw;
+ }
+
+ if (likely(wqe->wr.wr.mw.length)) {
+ mr = rxe_pool_get_index(&rxe->mr_pool, mr_lkey >> 8);
+ if (unlikely(!mr)) {
+ ret = -EINVAL;
+ goto err_drop_mw;
+ }
+
+ if (unlikely(mr->lkey != mr_lkey)) {
+ ret = -EINVAL;
+ goto err_drop_mr;
+ }
+ } else {
+ mr = NULL;
+ }
+
+ if (access & ~RXE_ACCESS_SUPPORTED_MW) {
+ rxe_err_mw(mw, "access %#x not supported", access);
+ ret = -EOPNOTSUPP;
+ goto err_drop_mr;
+ }
+
+ spin_lock_bh(&mw->lock);
+
+ ret = rxe_check_bind_mw(qp, wqe, mw, mr, access);
+ if (ret)
+ goto err_unlock;
+
+ rxe_do_bind_mw(qp, wqe, mw, mr, access);
+err_unlock:
+ spin_unlock_bh(&mw->lock);
+err_drop_mr:
+ if (mr)
+ rxe_put(mr);
+err_drop_mw:
+ rxe_put(mw);
+err:
+ return ret;
+}
+
+static int rxe_check_invalidate_mw(struct rxe_qp *qp, struct rxe_mw *mw)
+{
+ if (unlikely(mw->state == RXE_MW_STATE_INVALID))
+ return -EINVAL;
+
+ /* o10-37.2.26 */
+ if (unlikely(mw->ibmw.type == IB_MW_TYPE_1))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void rxe_do_invalidate_mw(struct rxe_mw *mw)
+{
+ struct rxe_qp *qp;
+ struct rxe_mr *mr;
+
+ /* valid type 2 MW will always have a QP pointer */
+ qp = mw->qp;
+ mw->qp = NULL;
+ rxe_put(qp);
+
+ /* valid type 2 MW will always have an MR pointer */
+ mr = mw->mr;
+ mw->mr = NULL;
+ atomic_dec(&mr->num_mw);
+ rxe_put(mr);
+
+ mw->access = 0;
+ mw->addr = 0;
+ mw->length = 0;
+ mw->state = RXE_MW_STATE_FREE;
+}
+
+int rxe_invalidate_mw(struct rxe_qp *qp, u32 rkey)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_mw *mw;
+ int ret;
+
+ mw = rxe_pool_get_index(&rxe->mw_pool, rkey >> 8);
+ if (!mw) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (rkey != mw->rkey) {
+ ret = -EINVAL;
+ goto err_drop_ref;
+ }
+
+ spin_lock_bh(&mw->lock);
+
+ ret = rxe_check_invalidate_mw(qp, mw);
+ if (ret)
+ goto err_unlock;
+
+ rxe_do_invalidate_mw(mw);
+err_unlock:
+ spin_unlock_bh(&mw->lock);
+err_drop_ref:
+ rxe_put(mw);
+err:
+ return ret;
+}
+
+struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_pd *pd = to_rpd(qp->ibqp.pd);
+ struct rxe_mw *mw;
+ int index = rkey >> 8;
+
+ mw = rxe_pool_get_index(&rxe->mw_pool, index);
+ if (!mw)
+ return NULL;
+
+ if (unlikely((mw->rkey != rkey) || rxe_mw_pd(mw) != pd ||
+ (mw->ibmw.type == IB_MW_TYPE_2 && mw->qp != qp) ||
+ (mw->length == 0) || ((access & mw->access) != access) ||
+ mw->state != RXE_MW_STATE_VALID)) {
+ rxe_put(mw);
+ return NULL;
+ }
+
+ return mw;
+}
+
+void rxe_mw_cleanup(struct rxe_pool_elem *elem)
+{
+ struct rxe_mw *mw = container_of(elem, typeof(*mw), elem);
+ struct rxe_pd *pd = to_rpd(mw->ibmw.pd);
+
+ rxe_put(pd);
+
+ if (mw->mr) {
+ struct rxe_mr *mr = mw->mr;
+
+ mw->mr = NULL;
+ atomic_dec(&mr->num_mw);
+ rxe_put(mr);
+ }
+
+ if (mw->qp) {
+ struct rxe_qp *qp = mw->qp;
+
+ mw->qp = NULL;
+ rxe_put(qp);
+ }
+
+ mw->access = 0;
+ mw->addr = 0;
+ mw->length = 0;
+ mw->state = RXE_MW_STATE_INVALID;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
new file mode 100644
index 0000000000..cd59666158
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -0,0 +1,696 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/if_vlan.h>
+#include <net/udp_tunnel.h>
+#include <net/sch_generic.h>
+#include <linux/netfilter.h>
+#include <rdma/ib_addr.h>
+
+#include "rxe.h"
+#include "rxe_net.h"
+#include "rxe_loc.h"
+
+static struct rxe_recv_sockets recv_sockets;
+
+static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
+ struct net_device *ndev,
+ struct in_addr *saddr,
+ struct in_addr *daddr)
+{
+ struct rtable *rt;
+ struct flowi4 fl = { { 0 } };
+
+ memset(&fl, 0, sizeof(fl));
+ fl.flowi4_oif = ndev->ifindex;
+ memcpy(&fl.saddr, saddr, sizeof(*saddr));
+ memcpy(&fl.daddr, daddr, sizeof(*daddr));
+ fl.flowi4_proto = IPPROTO_UDP;
+
+ rt = ip_route_output_key(&init_net, &fl);
+ if (IS_ERR(rt)) {
+ rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr);
+ return NULL;
+ }
+
+ return &rt->dst;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
+ struct net_device *ndev,
+ struct in6_addr *saddr,
+ struct in6_addr *daddr)
+{
+ struct dst_entry *ndst;
+ struct flowi6 fl6 = { { 0 } };
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = ndev->ifindex;
+ memcpy(&fl6.saddr, saddr, sizeof(*saddr));
+ memcpy(&fl6.daddr, daddr, sizeof(*daddr));
+ fl6.flowi6_proto = IPPROTO_UDP;
+
+ ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk),
+ recv_sockets.sk6->sk, &fl6,
+ NULL);
+ if (IS_ERR(ndst)) {
+ rxe_dbg_qp(qp, "no route to %pI6\n", daddr);
+ return NULL;
+ }
+
+ if (unlikely(ndst->error)) {
+ rxe_dbg_qp(qp, "no route to %pI6\n", daddr);
+ goto put;
+ }
+
+ return ndst;
+put:
+ dst_release(ndst);
+ return NULL;
+}
+
+#else
+
+static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
+ struct net_device *ndev,
+ struct in6_addr *saddr,
+ struct in6_addr *daddr)
+{
+ return NULL;
+}
+
+#endif
+
+static struct dst_entry *rxe_find_route(struct net_device *ndev,
+ struct rxe_qp *qp,
+ struct rxe_av *av)
+{
+ struct dst_entry *dst = NULL;
+
+ if (qp_type(qp) == IB_QPT_RC)
+ dst = sk_dst_get(qp->sk->sk);
+
+ if (!dst || !dst_check(dst, qp->dst_cookie)) {
+ if (dst)
+ dst_release(dst);
+
+ if (av->network_type == RXE_NETWORK_TYPE_IPV4) {
+ struct in_addr *saddr;
+ struct in_addr *daddr;
+
+ saddr = &av->sgid_addr._sockaddr_in.sin_addr;
+ daddr = &av->dgid_addr._sockaddr_in.sin_addr;
+ dst = rxe_find_route4(qp, ndev, saddr, daddr);
+ } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) {
+ struct in6_addr *saddr6;
+ struct in6_addr *daddr6;
+
+ saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr;
+ daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr;
+ dst = rxe_find_route6(qp, ndev, saddr6, daddr6);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (dst)
+ qp->dst_cookie =
+ rt6_get_cookie((struct rt6_info *)dst);
+#endif
+ }
+
+ if (dst && (qp_type(qp) == IB_QPT_RC)) {
+ dst_hold(dst);
+ sk_dst_set(qp->sk->sk, dst);
+ }
+ }
+ return dst;
+}
+
+static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct udphdr *udph;
+ struct rxe_dev *rxe;
+ struct net_device *ndev = skb->dev;
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+
+ /* takes a reference on rxe->ib_dev
+ * drop when skb is freed
+ */
+ rxe = rxe_get_dev_from_net(ndev);
+ if (!rxe && is_vlan_dev(ndev))
+ rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev));
+ if (!rxe)
+ goto drop;
+
+ if (skb_linearize(skb)) {
+ ib_device_put(&rxe->ib_dev);
+ goto drop;
+ }
+
+ udph = udp_hdr(skb);
+ pkt->rxe = rxe;
+ pkt->port_num = 1;
+ pkt->hdr = (u8 *)(udph + 1);
+ pkt->mask = RXE_GRH_MASK;
+ pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph);
+
+ /* remove udp header */
+ skb_pull(skb, sizeof(struct udphdr));
+
+ rxe_rcv(skb);
+
+ return 0;
+drop:
+ kfree_skb(skb);
+
+ return 0;
+}
+
+static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
+ bool ipv6)
+{
+ int err;
+ struct socket *sock;
+ struct udp_port_cfg udp_cfg = { };
+ struct udp_tunnel_sock_cfg tnl_cfg = { };
+
+ if (ipv6) {
+ udp_cfg.family = AF_INET6;
+ udp_cfg.ipv6_v6only = 1;
+ } else {
+ udp_cfg.family = AF_INET;
+ }
+
+ udp_cfg.local_udp_port = port;
+
+ /* Create UDP socket */
+ err = udp_sock_create(net, &udp_cfg, &sock);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ tnl_cfg.encap_type = 1;
+ tnl_cfg.encap_rcv = rxe_udp_encap_recv;
+
+ /* Setup UDP tunnel */
+ setup_udp_tunnel_sock(net, sock, &tnl_cfg);
+
+ return sock;
+}
+
+static void rxe_release_udp_tunnel(struct socket *sk)
+{
+ if (sk)
+ udp_tunnel_sock_release(sk);
+}
+
+static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port,
+ __be16 dst_port)
+{
+ struct udphdr *udph;
+
+ __skb_push(skb, sizeof(*udph));
+ skb_reset_transport_header(skb);
+ udph = udp_hdr(skb);
+
+ udph->dest = dst_port;
+ udph->source = src_port;
+ udph->len = htons(skb->len);
+ udph->check = 0;
+}
+
+static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb,
+ __be32 saddr, __be32 daddr, __u8 proto,
+ __u8 tos, __u8 ttl, __be16 df, bool xnet)
+{
+ struct iphdr *iph;
+
+ skb_scrub_packet(skb, xnet);
+
+ skb_clear_hash(skb);
+ skb_dst_set(skb, dst_clone(dst));
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+
+ iph = ip_hdr(skb);
+
+ iph->version = IPVERSION;
+ iph->ihl = sizeof(struct iphdr) >> 2;
+ iph->tot_len = htons(skb->len);
+ iph->frag_off = df;
+ iph->protocol = proto;
+ iph->tos = tos;
+ iph->daddr = daddr;
+ iph->saddr = saddr;
+ iph->ttl = ttl;
+ __ip_select_ident(dev_net(dst->dev), iph,
+ skb_shinfo(skb)->gso_segs ?: 1);
+}
+
+static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb,
+ struct in6_addr *saddr, struct in6_addr *daddr,
+ __u8 proto, __u8 prio, __u8 ttl)
+{
+ struct ipv6hdr *ip6h;
+
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
+ | IPSKB_REROUTED);
+ skb_dst_set(skb, dst_clone(dst));
+
+ __skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+ ip6_flow_hdr(ip6h, prio, htonl(0));
+ ip6h->payload_len = htons(skb->len);
+ ip6h->nexthdr = proto;
+ ip6h->hop_limit = ttl;
+ ip6h->daddr = *daddr;
+ ip6h->saddr = *saddr;
+ ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
+}
+
+static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb)
+{
+ struct rxe_qp *qp = pkt->qp;
+ struct dst_entry *dst;
+ bool xnet = false;
+ __be16 df = htons(IP_DF);
+ struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr;
+ struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr;
+
+ dst = rxe_find_route(skb->dev, qp, av);
+ if (!dst) {
+ rxe_dbg_qp(qp, "Host not reachable\n");
+ return -EHOSTUNREACH;
+ }
+
+ prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
+ cpu_to_be16(ROCE_V2_UDP_DPORT));
+
+ prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP,
+ av->grh.traffic_class, av->grh.hop_limit, df, xnet);
+
+ dst_release(dst);
+ return 0;
+}
+
+static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb)
+{
+ struct rxe_qp *qp = pkt->qp;
+ struct dst_entry *dst;
+ struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr;
+ struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr;
+
+ dst = rxe_find_route(skb->dev, qp, av);
+ if (!dst) {
+ rxe_dbg_qp(qp, "Host not reachable\n");
+ return -EHOSTUNREACH;
+ }
+
+ prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
+ cpu_to_be16(ROCE_V2_UDP_DPORT));
+
+ prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP,
+ av->grh.traffic_class,
+ av->grh.hop_limit);
+
+ dst_release(dst);
+ return 0;
+}
+
+int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb)
+{
+ int err = 0;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ err = prepare4(av, pkt, skb);
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ err = prepare6(av, pkt, skb);
+
+ if (ether_addr_equal(skb->dev->dev_addr, av->dmac))
+ pkt->mask |= RXE_LOOPBACK_MASK;
+
+ return err;
+}
+
+static void rxe_skb_tx_dtor(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct rxe_qp *qp = sk->sk_user_data;
+ int skb_out = atomic_dec_return(&qp->skb_out);
+
+ if (unlikely(qp->need_req_skb &&
+ skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW))
+ rxe_sched_task(&qp->req.task);
+
+ rxe_put(qp);
+}
+
+static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt)
+{
+ int err;
+
+ skb->destructor = rxe_skb_tx_dtor;
+ skb->sk = pkt->qp->sk->sk;
+
+ rxe_get(pkt->qp);
+ atomic_inc(&pkt->qp->skb_out);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+ } else {
+ rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n",
+ skb->protocol);
+ atomic_dec(&pkt->qp->skb_out);
+ rxe_put(pkt->qp);
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (unlikely(net_xmit_eval(err))) {
+ rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/* fix up a send packet to match the packets
+ * received from UDP before looping them back
+ */
+static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt)
+{
+ memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));
+
+ if (skb->protocol == htons(ETH_P_IP))
+ skb_pull(skb, sizeof(struct iphdr));
+ else
+ skb_pull(skb, sizeof(struct ipv6hdr));
+
+ if (WARN_ON(!ib_device_try_get(&pkt->rxe->ib_dev))) {
+ kfree_skb(skb);
+ return -EIO;
+ }
+
+ /* remove udp header */
+ skb_pull(skb, sizeof(struct udphdr));
+
+ rxe_rcv(skb);
+
+ return 0;
+}
+
+int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb)
+{
+ int err;
+ int is_request = pkt->mask & RXE_REQ_MASK;
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if ((is_request && (qp_state(qp) < IB_QPS_RTS)) ||
+ (!is_request && (qp_state(qp) < IB_QPS_RTR))) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n");
+ goto drop;
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ rxe_icrc_generate(skb, pkt);
+
+ if (pkt->mask & RXE_LOOPBACK_MASK)
+ err = rxe_loopback(skb, pkt);
+ else
+ err = rxe_send(skb, pkt);
+ if (err) {
+ rxe_counter_inc(rxe, RXE_CNT_SEND_ERR);
+ return err;
+ }
+
+ if ((qp_type(qp) != IB_QPT_RC) &&
+ (pkt->mask & RXE_END_MASK)) {
+ pkt->wqe->state = wqe_state_done;
+ rxe_sched_task(&qp->comp.task);
+ }
+
+ rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS);
+ goto done;
+
+drop:
+ kfree_skb(skb);
+ err = 0;
+done:
+ return err;
+}
+
+struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
+ int paylen, struct rxe_pkt_info *pkt)
+{
+ unsigned int hdr_len;
+ struct sk_buff *skb = NULL;
+ struct net_device *ndev;
+ const struct ib_gid_attr *attr;
+ const int port_num = 1;
+
+ attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index);
+ if (IS_ERR(attr))
+ return NULL;
+
+ if (av->network_type == RXE_NETWORK_TYPE_IPV4)
+ hdr_len = ETH_HLEN + sizeof(struct udphdr) +
+ sizeof(struct iphdr);
+ else
+ hdr_len = ETH_HLEN + sizeof(struct udphdr) +
+ sizeof(struct ipv6hdr);
+
+ rcu_read_lock();
+ ndev = rdma_read_gid_attr_ndev_rcu(attr);
+ if (IS_ERR(ndev)) {
+ rcu_read_unlock();
+ goto out;
+ }
+ skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev),
+ GFP_ATOMIC);
+
+ if (unlikely(!skb)) {
+ rcu_read_unlock();
+ goto out;
+ }
+
+ skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev));
+
+ /* FIXME: hold reference to this netdev until life of this skb. */
+ skb->dev = ndev;
+ rcu_read_unlock();
+
+ if (av->network_type == RXE_NETWORK_TYPE_IPV4)
+ skb->protocol = htons(ETH_P_IP);
+ else
+ skb->protocol = htons(ETH_P_IPV6);
+
+ pkt->rxe = rxe;
+ pkt->port_num = port_num;
+ pkt->hdr = skb_put(skb, paylen);
+ pkt->mask |= RXE_GRH_MASK;
+
+out:
+ rdma_put_gid_attr(attr);
+ return skb;
+}
+
+/*
+ * this is required by rxe_cfg to match rxe devices in
+ * /sys/class/infiniband up with their underlying ethernet devices
+ */
+const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num)
+{
+ return rxe->ndev->name;
+}
+
+int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
+{
+ int err;
+ struct rxe_dev *rxe = NULL;
+
+ rxe = ib_alloc_device(rxe_dev, ib_dev);
+ if (!rxe)
+ return -ENOMEM;
+
+ rxe->ndev = ndev;
+
+ err = rxe_add(rxe, ndev->mtu, ibdev_name);
+ if (err) {
+ ib_dealloc_device(&rxe->ib_dev);
+ return err;
+ }
+
+ return 0;
+}
+
+static void rxe_port_event(struct rxe_dev *rxe,
+ enum ib_event_type event)
+{
+ struct ib_event ev;
+
+ ev.device = &rxe->ib_dev;
+ ev.element.port_num = 1;
+ ev.event = event;
+
+ ib_dispatch_event(&ev);
+}
+
+/* Caller must hold net_info_lock */
+void rxe_port_up(struct rxe_dev *rxe)
+{
+ struct rxe_port *port;
+
+ port = &rxe->port;
+ port->attr.state = IB_PORT_ACTIVE;
+
+ rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE);
+ dev_info(&rxe->ib_dev.dev, "set active\n");
+}
+
+/* Caller must hold net_info_lock */
+void rxe_port_down(struct rxe_dev *rxe)
+{
+ struct rxe_port *port;
+
+ port = &rxe->port;
+ port->attr.state = IB_PORT_DOWN;
+
+ rxe_port_event(rxe, IB_EVENT_PORT_ERR);
+ rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED);
+ dev_info(&rxe->ib_dev.dev, "set down\n");
+}
+
+void rxe_set_port_state(struct rxe_dev *rxe)
+{
+ if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev))
+ rxe_port_up(rxe);
+ else
+ rxe_port_down(rxe);
+}
+
+static int rxe_notify(struct notifier_block *not_blk,
+ unsigned long event,
+ void *arg)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(arg);
+ struct rxe_dev *rxe = rxe_get_dev_from_net(ndev);
+
+ if (!rxe)
+ return NOTIFY_OK;
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ ib_unregister_device_queued(&rxe->ib_dev);
+ break;
+ case NETDEV_UP:
+ rxe_port_up(rxe);
+ break;
+ case NETDEV_DOWN:
+ rxe_port_down(rxe);
+ break;
+ case NETDEV_CHANGEMTU:
+ rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu);
+ rxe_set_mtu(rxe, ndev->mtu);
+ break;
+ case NETDEV_CHANGE:
+ rxe_set_port_state(rxe);
+ break;
+ case NETDEV_REBOOT:
+ case NETDEV_GOING_DOWN:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_CHANGENAME:
+ case NETDEV_FEAT_CHANGE:
+ default:
+ rxe_dbg_dev(rxe, "ignoring netdev event = %ld for %s\n",
+ event, ndev->name);
+ break;
+ }
+
+ ib_device_put(&rxe->ib_dev);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rxe_net_notifier = {
+ .notifier_call = rxe_notify,
+};
+
+static int rxe_net_ipv4_init(void)
+{
+ recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net,
+ htons(ROCE_V2_UDP_DPORT), false);
+ if (IS_ERR(recv_sockets.sk4)) {
+ recv_sockets.sk4 = NULL;
+ pr_err("Failed to create IPv4 UDP tunnel\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int rxe_net_ipv6_init(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+
+ recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net,
+ htons(ROCE_V2_UDP_DPORT), true);
+ if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) {
+ recv_sockets.sk6 = NULL;
+ pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n");
+ return 0;
+ }
+
+ if (IS_ERR(recv_sockets.sk6)) {
+ recv_sockets.sk6 = NULL;
+ pr_err("Failed to create IPv6 UDP tunnel\n");
+ return -1;
+ }
+#endif
+ return 0;
+}
+
+void rxe_net_exit(void)
+{
+ rxe_release_udp_tunnel(recv_sockets.sk6);
+ rxe_release_udp_tunnel(recv_sockets.sk4);
+ unregister_netdevice_notifier(&rxe_net_notifier);
+}
+
+int rxe_net_init(void)
+{
+ int err;
+
+ recv_sockets.sk6 = NULL;
+
+ err = rxe_net_ipv4_init();
+ if (err)
+ return err;
+ err = rxe_net_ipv6_init();
+ if (err)
+ goto err_out;
+ err = register_netdevice_notifier(&rxe_net_notifier);
+ if (err) {
+ pr_err("Failed to register netdev notifier\n");
+ goto err_out;
+ }
+ return 0;
+err_out:
+ rxe_net_exit();
+ return err;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h
new file mode 100644
index 0000000000..45d80d00f8
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_net.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_NET_H
+#define RXE_NET_H
+
+#include <net/sock.h>
+#include <net/if_inet6.h>
+#include <linux/module.h>
+
+struct rxe_recv_sockets {
+ struct socket *sk4;
+ struct socket *sk6;
+};
+
+int rxe_net_add(const char *ibdev_name, struct net_device *ndev);
+
+int rxe_net_init(void);
+void rxe_net_exit(void);
+
+#endif /* RXE_NET_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c
new file mode 100644
index 0000000000..5c0d5c6ffd
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.c
@@ -0,0 +1,975 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <rdma/ib_pack.h>
+#include "rxe_opcode.h"
+#include "rxe_hdr.h"
+
+/* useful information about work request opcodes and pkt opcodes in
+ * table form
+ */
+struct rxe_wr_opcode_info rxe_wr_opcode_info[] = {
+ [IB_WR_RDMA_WRITE] = {
+ .name = "IB_WR_RDMA_WRITE",
+ .mask = {
+ [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK,
+ [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK,
+ },
+ },
+ [IB_WR_RDMA_WRITE_WITH_IMM] = {
+ .name = "IB_WR_RDMA_WRITE_WITH_IMM",
+ .mask = {
+ [IB_QPT_RC] = WR_INLINE_MASK | WR_WRITE_MASK,
+ [IB_QPT_UC] = WR_INLINE_MASK | WR_WRITE_MASK,
+ },
+ },
+ [IB_WR_SEND] = {
+ .name = "IB_WR_SEND",
+ .mask = {
+ [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK,
+ },
+ },
+ [IB_WR_SEND_WITH_IMM] = {
+ .name = "IB_WR_SEND_WITH_IMM",
+ .mask = {
+ [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK,
+ },
+ },
+ [IB_WR_RDMA_READ] = {
+ .name = "IB_WR_RDMA_READ",
+ .mask = {
+ [IB_QPT_RC] = WR_READ_MASK,
+ },
+ },
+ [IB_WR_ATOMIC_CMP_AND_SWP] = {
+ .name = "IB_WR_ATOMIC_CMP_AND_SWP",
+ .mask = {
+ [IB_QPT_RC] = WR_ATOMIC_MASK,
+ },
+ },
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = {
+ .name = "IB_WR_ATOMIC_FETCH_AND_ADD",
+ .mask = {
+ [IB_QPT_RC] = WR_ATOMIC_MASK,
+ },
+ },
+ [IB_WR_LSO] = {
+ .name = "IB_WR_LSO",
+ .mask = {
+ /* not supported */
+ },
+ },
+ [IB_WR_SEND_WITH_INV] = {
+ .name = "IB_WR_SEND_WITH_INV",
+ .mask = {
+ [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK,
+ [IB_QPT_UD] = WR_INLINE_MASK | WR_SEND_MASK,
+ },
+ },
+ [IB_WR_RDMA_READ_WITH_INV] = {
+ .name = "IB_WR_RDMA_READ_WITH_INV",
+ .mask = {
+ [IB_QPT_RC] = WR_READ_MASK,
+ },
+ },
+ [IB_WR_LOCAL_INV] = {
+ .name = "IB_WR_LOCAL_INV",
+ .mask = {
+ [IB_QPT_RC] = WR_LOCAL_OP_MASK,
+ },
+ },
+ [IB_WR_REG_MR] = {
+ .name = "IB_WR_REG_MR",
+ .mask = {
+ [IB_QPT_RC] = WR_LOCAL_OP_MASK,
+ },
+ },
+ [IB_WR_BIND_MW] = {
+ .name = "IB_WR_BIND_MW",
+ .mask = {
+ [IB_QPT_RC] = WR_LOCAL_OP_MASK,
+ [IB_QPT_UC] = WR_LOCAL_OP_MASK,
+ },
+ },
+ [IB_WR_FLUSH] = {
+ .name = "IB_WR_FLUSH",
+ .mask = {
+ [IB_QPT_RC] = WR_FLUSH_MASK,
+ },
+ },
+ [IB_WR_ATOMIC_WRITE] = {
+ .name = "IB_WR_ATOMIC_WRITE",
+ .mask = {
+ [IB_QPT_RC] = WR_ATOMIC_WRITE_MASK,
+ },
+ },
+};
+
+struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE] = {
+ [IB_OPCODE_RC_SEND_FIRST] = {
+ .name = "IB_OPCODE_RC_SEND_FIRST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK |
+ RXE_SEND_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_MIDDLE] = {
+ .name = "IB_OPCODE_RC_SEND_MIDDLE",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK |
+ RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_LAST] = {
+ .name = "IB_OPCODE_RC_SEND_LAST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK |
+ RXE_SEND_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_ONLY] = {
+ .name = "IB_OPCODE_RC_SEND_ONLY",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK |
+ RXE_RWR_MASK | RXE_SEND_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_FIRST] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_FIRST",
+ .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_WRITE_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_MIDDLE",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK |
+ RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_LAST] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_LAST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY",
+ .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_WRITE_MASK | RXE_START_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK |
+ RXE_REQ_MASK | RXE_WRITE_MASK |
+ RXE_COMP_MASK | RXE_RWR_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES +
+ RXE_RETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RETH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_REQUEST] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_REQUEST",
+ .mask = RXE_RETH_MASK | RXE_REQ_MASK | RXE_READ_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST",
+ .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK |
+ RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_AETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE",
+ .mask = RXE_PAYLOAD_MASK | RXE_ACK_MASK | RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST",
+ .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_AETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = {
+ .name = "IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY",
+ .mask = RXE_AETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_AETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RC_ACKNOWLEDGE",
+ .mask = RXE_AETH_MASK | RXE_ACK_MASK | RXE_START_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_AETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE",
+ .mask = RXE_AETH_MASK | RXE_ATMACK_MASK | RXE_ACK_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_AETH] = RXE_BTH_BYTES,
+ [RXE_ATMACK] = RXE_BTH_BYTES +
+ RXE_AETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_ATMACK_BYTES +
+ RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_COMPARE_SWAP] = {
+ .name = "IB_OPCODE_RC_COMPARE_SWAP",
+ .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_ATMETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_ATMETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_FETCH_ADD] = {
+ .name = "IB_OPCODE_RC_FETCH_ADD",
+ .mask = RXE_ATMETH_MASK | RXE_REQ_MASK | RXE_ATOMIC_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_ATMETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_ATMETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = {
+ .name = "IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE",
+ .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_IETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = {
+ .name = "IB_OPCODE_RC_SEND_ONLY_INV",
+ .mask = RXE_IETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK |
+ RXE_END_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_IETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_IETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_FLUSH] = {
+ .name = "IB_OPCODE_RC_FLUSH",
+ .mask = RXE_FETH_MASK | RXE_RETH_MASK | RXE_FLUSH_MASK |
+ RXE_START_MASK | RXE_END_MASK | RXE_REQ_MASK,
+ .length = RXE_BTH_BYTES + RXE_FETH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_FETH] = RXE_BTH_BYTES,
+ [RXE_RETH] = RXE_BTH_BYTES + RXE_FETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RC_ATOMIC_WRITE] = {
+ .name = "IB_OPCODE_RC_ATOMIC_WRITE",
+ .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_ATOMIC_WRITE_MASK | RXE_START_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ }
+ },
+
+ /* UC */
+ [IB_OPCODE_UC_SEND_FIRST] = {
+ .name = "IB_OPCODE_UC_SEND_FIRST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_RWR_MASK |
+ RXE_SEND_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_MIDDLE] = {
+ .name = "IB_OPCODE_UC_SEND_MIDDLE",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_SEND_MASK |
+ RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_LAST] = {
+ .name = "IB_OPCODE_UC_SEND_LAST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK |
+ RXE_SEND_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_COMP_MASK | RXE_SEND_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_ONLY] = {
+ .name = "IB_OPCODE_UC_SEND_ONLY",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_COMP_MASK |
+ RXE_RWR_MASK | RXE_SEND_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_FIRST] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_FIRST",
+ .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_WRITE_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_MIDDLE",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK |
+ RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_LAST] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_LAST",
+ .mask = RXE_PAYLOAD_MASK | RXE_REQ_MASK | RXE_WRITE_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+ .mask = RXE_IMMDT_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_IMMDT] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY",
+ .mask = RXE_RETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_WRITE_MASK | RXE_START_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_RETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK |
+ RXE_REQ_MASK | RXE_WRITE_MASK |
+ RXE_COMP_MASK | RXE_RWR_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RETH] = RXE_BTH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES +
+ RXE_RETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RETH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+
+ /* RD */
+ [IB_OPCODE_RD_SEND_FIRST] = {
+ .name = "IB_OPCODE_RD_SEND_FIRST",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK |
+ RXE_REQ_MASK | RXE_RWR_MASK | RXE_SEND_MASK |
+ RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_MIDDLE] = {
+ .name = "IB_OPCODE_RD_SEND_MIDDLE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK |
+ RXE_REQ_MASK | RXE_SEND_MASK |
+ RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_LAST] = {
+ .name = "IB_OPCODE_RD_SEND_LAST",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK |
+ RXE_REQ_MASK | RXE_COMP_MASK | RXE_SEND_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_SEND_LAST_WITH_IMMEDIATE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK |
+ RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_COMP_MASK | RXE_SEND_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_ONLY] = {
+ .name = "IB_OPCODE_RD_SEND_ONLY",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK |
+ RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK |
+ RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK |
+ RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_FIRST] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_FIRST",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK |
+ RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_WRITE_MASK | RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_RETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_MIDDLE] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_MIDDLE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK |
+ RXE_REQ_MASK | RXE_WRITE_MASK |
+ RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_LAST] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_LAST",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_PAYLOAD_MASK |
+ RXE_REQ_MASK | RXE_WRITE_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_LAST_WITH_IMMEDIATE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_IMMDT_MASK |
+ RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_WRITE_MASK | RXE_COMP_MASK | RXE_RWR_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_ONLY] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK |
+ RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_WRITE_MASK | RXE_START_MASK |
+ RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_RETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_RETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_RD_RDMA_WRITE_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK |
+ RXE_IMMDT_MASK | RXE_PAYLOAD_MASK |
+ RXE_REQ_MASK | RXE_WRITE_MASK |
+ RXE_COMP_MASK | RXE_RWR_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_RETH_BYTES +
+ RXE_DETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_RETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_RETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_RETH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_REQUEST] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_REQUEST",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_RETH_MASK |
+ RXE_REQ_MASK | RXE_READ_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_RETH_BYTES + RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_RETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RETH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_FIRST",
+ .mask = RXE_RDETH_MASK | RXE_AETH_MASK |
+ RXE_PAYLOAD_MASK | RXE_ACK_MASK |
+ RXE_START_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_AETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_MIDDLE",
+ .mask = RXE_RDETH_MASK | RXE_PAYLOAD_MASK | RXE_ACK_MASK |
+ RXE_MIDDLE_MASK,
+ .length = RXE_BTH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_LAST",
+ .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK |
+ RXE_ACK_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_AETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY] = {
+ .name = "IB_OPCODE_RD_RDMA_READ_RESPONSE_ONLY",
+ .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_PAYLOAD_MASK |
+ RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_AETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RD_ACKNOWLEDGE",
+ .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ACK_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_AETH_BYTES + RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_AETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE] = {
+ .name = "IB_OPCODE_RD_ATOMIC_ACKNOWLEDGE",
+ .mask = RXE_RDETH_MASK | RXE_AETH_MASK | RXE_ATMACK_MASK |
+ RXE_ACK_MASK | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMACK_BYTES + RXE_AETH_BYTES +
+ RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_AETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_ATMACK] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_AETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_COMPARE_SWAP] = {
+ .name = "RD_COMPARE_SWAP",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK |
+ RXE_REQ_MASK | RXE_ATOMIC_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_ATMETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_ATMETH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ }
+ },
+ [IB_OPCODE_RD_FETCH_ADD] = {
+ .name = "IB_OPCODE_RD_FETCH_ADD",
+ .mask = RXE_RDETH_MASK | RXE_DETH_MASK | RXE_ATMETH_MASK |
+ RXE_REQ_MASK | RXE_ATOMIC_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_ATMETH_BYTES + RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_RDETH] = RXE_BTH_BYTES,
+ [RXE_DETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES,
+ [RXE_ATMETH] = RXE_BTH_BYTES +
+ RXE_RDETH_BYTES +
+ RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_ATMETH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_RDETH_BYTES,
+ }
+ },
+
+ /* UD */
+ [IB_OPCODE_UD_SEND_ONLY] = {
+ .name = "IB_OPCODE_UD_SEND_ONLY",
+ .mask = RXE_DETH_MASK | RXE_PAYLOAD_MASK | RXE_REQ_MASK |
+ RXE_COMP_MASK | RXE_RWR_MASK | RXE_SEND_MASK |
+ RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_DETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_DETH] = RXE_BTH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_DETH_BYTES,
+ }
+ },
+ [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = {
+ .name = "IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE",
+ .mask = RXE_DETH_MASK | RXE_IMMDT_MASK | RXE_PAYLOAD_MASK |
+ RXE_REQ_MASK | RXE_COMP_MASK | RXE_RWR_MASK |
+ RXE_SEND_MASK | RXE_START_MASK | RXE_END_MASK,
+ .length = RXE_BTH_BYTES + RXE_IMMDT_BYTES + RXE_DETH_BYTES,
+ .offset = {
+ [RXE_BTH] = 0,
+ [RXE_DETH] = RXE_BTH_BYTES,
+ [RXE_IMMDT] = RXE_BTH_BYTES +
+ RXE_DETH_BYTES,
+ [RXE_PAYLOAD] = RXE_BTH_BYTES +
+ RXE_DETH_BYTES +
+ RXE_IMMDT_BYTES,
+ }
+ },
+
+};
diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h
new file mode 100644
index 0000000000..5686b691d6
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_OPCODE_H
+#define RXE_OPCODE_H
+
+/*
+ * contains header bit mask definitions and header lengths
+ * declaration of the rxe_opcode_info struct and
+ * rxe_wr_opcode_info struct
+ */
+
+enum rxe_wr_mask {
+ WR_INLINE_MASK = BIT(0),
+ WR_ATOMIC_MASK = BIT(1),
+ WR_SEND_MASK = BIT(2),
+ WR_READ_MASK = BIT(3),
+ WR_WRITE_MASK = BIT(4),
+ WR_LOCAL_OP_MASK = BIT(5),
+ WR_FLUSH_MASK = BIT(6),
+ WR_ATOMIC_WRITE_MASK = BIT(7),
+
+ WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK,
+ WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK,
+ WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK,
+};
+
+#define WR_MAX_QPT (8)
+
+struct rxe_wr_opcode_info {
+ char *name;
+ enum rxe_wr_mask mask[WR_MAX_QPT];
+};
+
+extern struct rxe_wr_opcode_info rxe_wr_opcode_info[];
+
+enum rxe_hdr_type {
+ RXE_LRH,
+ RXE_GRH,
+ RXE_BTH,
+ RXE_RETH,
+ RXE_AETH,
+ RXE_ATMETH,
+ RXE_ATMACK,
+ RXE_IETH,
+ RXE_RDETH,
+ RXE_DETH,
+ RXE_IMMDT,
+ RXE_FETH,
+ RXE_PAYLOAD,
+ NUM_HDR_TYPES
+};
+
+enum rxe_hdr_mask {
+ RXE_LRH_MASK = BIT(RXE_LRH),
+ RXE_GRH_MASK = BIT(RXE_GRH),
+ RXE_BTH_MASK = BIT(RXE_BTH),
+ RXE_IMMDT_MASK = BIT(RXE_IMMDT),
+ RXE_RETH_MASK = BIT(RXE_RETH),
+ RXE_AETH_MASK = BIT(RXE_AETH),
+ RXE_ATMETH_MASK = BIT(RXE_ATMETH),
+ RXE_ATMACK_MASK = BIT(RXE_ATMACK),
+ RXE_IETH_MASK = BIT(RXE_IETH),
+ RXE_RDETH_MASK = BIT(RXE_RDETH),
+ RXE_DETH_MASK = BIT(RXE_DETH),
+ RXE_FETH_MASK = BIT(RXE_FETH),
+ RXE_PAYLOAD_MASK = BIT(RXE_PAYLOAD),
+
+ RXE_REQ_MASK = BIT(NUM_HDR_TYPES + 0),
+ RXE_ACK_MASK = BIT(NUM_HDR_TYPES + 1),
+ RXE_SEND_MASK = BIT(NUM_HDR_TYPES + 2),
+ RXE_WRITE_MASK = BIT(NUM_HDR_TYPES + 3),
+ RXE_READ_MASK = BIT(NUM_HDR_TYPES + 4),
+ RXE_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5),
+ RXE_FLUSH_MASK = BIT(NUM_HDR_TYPES + 6),
+
+ RXE_RWR_MASK = BIT(NUM_HDR_TYPES + 7),
+ RXE_COMP_MASK = BIT(NUM_HDR_TYPES + 8),
+
+ RXE_START_MASK = BIT(NUM_HDR_TYPES + 9),
+ RXE_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 10),
+ RXE_END_MASK = BIT(NUM_HDR_TYPES + 11),
+
+ RXE_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12),
+
+ RXE_ATOMIC_WRITE_MASK = BIT(NUM_HDR_TYPES + 14),
+
+ RXE_READ_OR_ATOMIC_MASK = (RXE_READ_MASK | RXE_ATOMIC_MASK),
+ RXE_WRITE_OR_SEND_MASK = (RXE_WRITE_MASK | RXE_SEND_MASK),
+ RXE_READ_OR_WRITE_MASK = (RXE_READ_MASK | RXE_WRITE_MASK),
+ RXE_RDMA_OP_MASK = (RXE_READ_MASK | RXE_WRITE_MASK |
+ RXE_ATOMIC_WRITE_MASK | RXE_FLUSH_MASK |
+ RXE_ATOMIC_MASK),
+};
+
+#define OPCODE_NONE (-1)
+#define RXE_NUM_OPCODE 256
+
+struct rxe_opcode_info {
+ char *name;
+ enum rxe_hdr_mask mask;
+ int length;
+ int offset[NUM_HDR_TYPES];
+};
+
+extern struct rxe_opcode_info rxe_opcode[RXE_NUM_OPCODE];
+
+#endif /* RXE_OPCODE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
new file mode 100644
index 0000000000..d2f57ead78
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_PARAM_H
+#define RXE_PARAM_H
+
+#include <uapi/rdma/rdma_user_rxe.h>
+
+#define DEFAULT_MAX_VALUE (1 << 20)
+
+static inline enum ib_mtu rxe_mtu_int_to_enum(int mtu)
+{
+ if (mtu < 256)
+ return 0;
+ else if (mtu < 512)
+ return IB_MTU_256;
+ else if (mtu < 1024)
+ return IB_MTU_512;
+ else if (mtu < 2048)
+ return IB_MTU_1024;
+ else if (mtu < 4096)
+ return IB_MTU_2048;
+ else
+ return IB_MTU_4096;
+}
+
+/* Find the IB mtu for a given network MTU. */
+static inline enum ib_mtu eth_mtu_int_to_enum(int mtu)
+{
+ mtu -= RXE_MAX_HDR_LENGTH;
+
+ return rxe_mtu_int_to_enum(mtu);
+}
+
+/* default/initial rxe device parameter settings */
+enum rxe_device_param {
+ RXE_MAX_MR_SIZE = -1ull,
+ RXE_PAGE_SIZE_CAP = 0xfffff000,
+ RXE_MAX_QP_WR = DEFAULT_MAX_VALUE,
+ RXE_DEVICE_CAP_FLAGS = IB_DEVICE_BAD_PKEY_CNTR
+ | IB_DEVICE_BAD_QKEY_CNTR
+ | IB_DEVICE_AUTO_PATH_MIG
+ | IB_DEVICE_CHANGE_PHY_PORT
+ | IB_DEVICE_UD_AV_PORT_ENFORCE
+ | IB_DEVICE_PORT_ACTIVE_EVENT
+ | IB_DEVICE_SYS_IMAGE_GUID
+ | IB_DEVICE_RC_RNR_NAK_GEN
+ | IB_DEVICE_SRQ_RESIZE
+ | IB_DEVICE_MEM_MGT_EXTENSIONS
+ | IB_DEVICE_MEM_WINDOW
+ | IB_DEVICE_FLUSH_GLOBAL
+ | IB_DEVICE_FLUSH_PERSISTENT
+#ifdef CONFIG_64BIT
+ | IB_DEVICE_MEM_WINDOW_TYPE_2B
+ | IB_DEVICE_ATOMIC_WRITE,
+#else
+ | IB_DEVICE_MEM_WINDOW_TYPE_2B,
+#endif /* CONFIG_64BIT */
+ RXE_MAX_SGE = 32,
+ RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) +
+ sizeof(struct ib_sge) * RXE_MAX_SGE,
+ RXE_MAX_INLINE_DATA = RXE_MAX_WQE_SIZE -
+ sizeof(struct rxe_send_wqe),
+ RXE_MAX_SGE_RD = 32,
+ RXE_MAX_CQ = DEFAULT_MAX_VALUE,
+ RXE_MAX_LOG_CQE = 15,
+ RXE_MAX_PD = DEFAULT_MAX_VALUE,
+ RXE_MAX_QP_RD_ATOM = 128,
+ RXE_MAX_RES_RD_ATOM = 0x3f000,
+ RXE_MAX_QP_INIT_RD_ATOM = 128,
+ RXE_MAX_MCAST_GRP = 8192,
+ RXE_MAX_MCAST_QP_ATTACH = 56,
+ RXE_MAX_TOT_MCAST_QP_ATTACH = 0x70000,
+ RXE_MAX_AH = (1<<15) - 1, /* 32Ki - 1 */
+ RXE_MIN_AH_INDEX = 1,
+ RXE_MAX_AH_INDEX = RXE_MAX_AH,
+ RXE_MAX_SRQ_WR = DEFAULT_MAX_VALUE,
+ RXE_MIN_SRQ_WR = 1,
+ RXE_MAX_SRQ_SGE = 27,
+ RXE_MIN_SRQ_SGE = 1,
+ RXE_MAX_FMR_PAGE_LIST_LEN = 512,
+ RXE_MAX_PKEYS = 64,
+ RXE_LOCAL_CA_ACK_DELAY = 15,
+
+ RXE_MAX_UCONTEXT = DEFAULT_MAX_VALUE,
+
+ RXE_NUM_PORT = 1,
+
+ RXE_MIN_QP_INDEX = 16,
+ RXE_MAX_QP_INDEX = DEFAULT_MAX_VALUE,
+ RXE_MAX_QP = DEFAULT_MAX_VALUE - RXE_MIN_QP_INDEX,
+
+ RXE_MIN_SRQ_INDEX = 0x00020001,
+ RXE_MAX_SRQ_INDEX = DEFAULT_MAX_VALUE,
+ RXE_MAX_SRQ = DEFAULT_MAX_VALUE - RXE_MIN_SRQ_INDEX,
+
+ RXE_MIN_MR_INDEX = 0x00000001,
+ RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE >> 1,
+ RXE_MAX_MR = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX,
+ RXE_MIN_MW_INDEX = RXE_MAX_MR_INDEX + 1,
+ RXE_MAX_MW_INDEX = DEFAULT_MAX_VALUE,
+ RXE_MAX_MW = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX,
+
+ RXE_MAX_PKT_PER_ACK = 64,
+
+ RXE_MAX_UNACKED_PSNS = 128,
+
+ /* Max inflight SKBs per queue pair */
+ RXE_INFLIGHT_SKBS_PER_QP_HIGH = 64,
+ RXE_INFLIGHT_SKBS_PER_QP_LOW = 16,
+
+ /* Max number of interations of each work item
+ * before yielding the cpu to let other
+ * work make progress
+ */
+ RXE_MAX_ITERATIONS = 1024,
+
+ /* Delay before calling arbiter timer */
+ RXE_NSEC_ARB_TIMER_DELAY = 200,
+
+ /* IBTA v1.4 A3.3.1 VENDOR INFORMATION section */
+ RXE_VENDOR_ID = 0XFFFFFF,
+};
+
+/* default/initial rxe port parameters */
+enum rxe_port_param {
+ RXE_PORT_GID_TBL_LEN = 1024,
+ RXE_PORT_PORT_CAP_FLAGS = IB_PORT_CM_SUP,
+ RXE_PORT_MAX_MSG_SZ = 0x800000,
+ RXE_PORT_BAD_PKEY_CNTR = 0,
+ RXE_PORT_QKEY_VIOL_CNTR = 0,
+ RXE_PORT_LID = 0,
+ RXE_PORT_SM_LID = 0,
+ RXE_PORT_SM_SL = 0,
+ RXE_PORT_LMC = 0,
+ RXE_PORT_MAX_VL_NUM = 1,
+ RXE_PORT_SUBNET_TIMEOUT = 0,
+ RXE_PORT_INIT_TYPE_REPLY = 0,
+ RXE_PORT_ACTIVE_WIDTH = IB_WIDTH_1X,
+ RXE_PORT_ACTIVE_SPEED = 1,
+ RXE_PORT_PKEY_TBL_LEN = 1,
+ RXE_PORT_PHYS_STATE = IB_PORT_PHYS_STATE_POLLING,
+ RXE_PORT_SUBNET_PREFIX = 0xfe80000000000000ULL,
+};
+
+/* default/initial port info parameters */
+enum rxe_port_info_param {
+ RXE_PORT_INFO_VL_CAP = 4, /* 1-8 */
+ RXE_PORT_INFO_MTU_CAP = 5, /* 4096 */
+ RXE_PORT_INFO_OPER_VL = 1, /* 1 */
+};
+
+#endif /* RXE_PARAM_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
new file mode 100644
index 0000000000..6215c6de3a
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include "rxe.h"
+
+#define RXE_POOL_TIMEOUT (200)
+#define RXE_POOL_ALIGN (16)
+
+static const struct rxe_type_info {
+ const char *name;
+ size_t size;
+ size_t elem_offset;
+ void (*cleanup)(struct rxe_pool_elem *elem);
+ u32 min_index;
+ u32 max_index;
+ u32 max_elem;
+} rxe_type_info[RXE_NUM_TYPES] = {
+ [RXE_TYPE_UC] = {
+ .name = "uc",
+ .size = sizeof(struct rxe_ucontext),
+ .elem_offset = offsetof(struct rxe_ucontext, elem),
+ .min_index = 1,
+ .max_index = RXE_MAX_UCONTEXT,
+ .max_elem = RXE_MAX_UCONTEXT,
+ },
+ [RXE_TYPE_PD] = {
+ .name = "pd",
+ .size = sizeof(struct rxe_pd),
+ .elem_offset = offsetof(struct rxe_pd, elem),
+ .min_index = 1,
+ .max_index = RXE_MAX_PD,
+ .max_elem = RXE_MAX_PD,
+ },
+ [RXE_TYPE_AH] = {
+ .name = "ah",
+ .size = sizeof(struct rxe_ah),
+ .elem_offset = offsetof(struct rxe_ah, elem),
+ .min_index = RXE_MIN_AH_INDEX,
+ .max_index = RXE_MAX_AH_INDEX,
+ .max_elem = RXE_MAX_AH,
+ },
+ [RXE_TYPE_SRQ] = {
+ .name = "srq",
+ .size = sizeof(struct rxe_srq),
+ .elem_offset = offsetof(struct rxe_srq, elem),
+ .cleanup = rxe_srq_cleanup,
+ .min_index = RXE_MIN_SRQ_INDEX,
+ .max_index = RXE_MAX_SRQ_INDEX,
+ .max_elem = RXE_MAX_SRQ,
+ },
+ [RXE_TYPE_QP] = {
+ .name = "qp",
+ .size = sizeof(struct rxe_qp),
+ .elem_offset = offsetof(struct rxe_qp, elem),
+ .cleanup = rxe_qp_cleanup,
+ .min_index = RXE_MIN_QP_INDEX,
+ .max_index = RXE_MAX_QP_INDEX,
+ .max_elem = RXE_MAX_QP,
+ },
+ [RXE_TYPE_CQ] = {
+ .name = "cq",
+ .size = sizeof(struct rxe_cq),
+ .elem_offset = offsetof(struct rxe_cq, elem),
+ .cleanup = rxe_cq_cleanup,
+ .min_index = 1,
+ .max_index = RXE_MAX_CQ,
+ .max_elem = RXE_MAX_CQ,
+ },
+ [RXE_TYPE_MR] = {
+ .name = "mr",
+ .size = sizeof(struct rxe_mr),
+ .elem_offset = offsetof(struct rxe_mr, elem),
+ .cleanup = rxe_mr_cleanup,
+ .min_index = RXE_MIN_MR_INDEX,
+ .max_index = RXE_MAX_MR_INDEX,
+ .max_elem = RXE_MAX_MR,
+ },
+ [RXE_TYPE_MW] = {
+ .name = "mw",
+ .size = sizeof(struct rxe_mw),
+ .elem_offset = offsetof(struct rxe_mw, elem),
+ .cleanup = rxe_mw_cleanup,
+ .min_index = RXE_MIN_MW_INDEX,
+ .max_index = RXE_MAX_MW_INDEX,
+ .max_elem = RXE_MAX_MW,
+ },
+};
+
+void rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool,
+ enum rxe_elem_type type)
+{
+ const struct rxe_type_info *info = &rxe_type_info[type];
+
+ memset(pool, 0, sizeof(*pool));
+
+ pool->rxe = rxe;
+ pool->name = info->name;
+ pool->type = type;
+ pool->max_elem = info->max_elem;
+ pool->elem_size = ALIGN(info->size, RXE_POOL_ALIGN);
+ pool->elem_offset = info->elem_offset;
+ pool->cleanup = info->cleanup;
+
+ atomic_set(&pool->num_elem, 0);
+
+ xa_init_flags(&pool->xa, XA_FLAGS_ALLOC);
+ pool->limit.min = info->min_index;
+ pool->limit.max = info->max_index;
+}
+
+void rxe_pool_cleanup(struct rxe_pool *pool)
+{
+ WARN_ON(!xa_empty(&pool->xa));
+}
+
+int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
+ bool sleepable)
+{
+ int err;
+ gfp_t gfp_flags;
+
+ if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
+ goto err_cnt;
+
+ elem->pool = pool;
+ elem->obj = (u8 *)elem - pool->elem_offset;
+ kref_init(&elem->ref_cnt);
+ init_completion(&elem->complete);
+
+ /* AH objects are unique in that the create_ah verb
+ * can be called in atomic context. If the create_ah
+ * call is not sleepable use GFP_ATOMIC.
+ */
+ gfp_flags = sleepable ? GFP_KERNEL : GFP_ATOMIC;
+
+ if (sleepable)
+ might_sleep();
+ err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit,
+ &pool->next, gfp_flags);
+ if (err < 0)
+ goto err_cnt;
+
+ return 0;
+
+err_cnt:
+ atomic_dec(&pool->num_elem);
+ return -EINVAL;
+}
+
+void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
+{
+ struct rxe_pool_elem *elem;
+ struct xarray *xa = &pool->xa;
+ void *obj;
+
+ rcu_read_lock();
+ elem = xa_load(xa, index);
+ if (elem && kref_get_unless_zero(&elem->ref_cnt))
+ obj = elem->obj;
+ else
+ obj = NULL;
+ rcu_read_unlock();
+
+ return obj;
+}
+
+static void rxe_elem_release(struct kref *kref)
+{
+ struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt);
+
+ complete(&elem->complete);
+}
+
+int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
+{
+ struct rxe_pool *pool = elem->pool;
+ struct xarray *xa = &pool->xa;
+ static int timeout = RXE_POOL_TIMEOUT;
+ int ret, err = 0;
+ void *xa_ret;
+
+ if (sleepable)
+ might_sleep();
+
+ /* erase xarray entry to prevent looking up
+ * the pool elem from its index
+ */
+ xa_ret = xa_erase(xa, elem->index);
+ WARN_ON(xa_err(xa_ret));
+
+ /* if this is the last call to rxe_put complete the
+ * object. It is safe to touch obj->elem after this since
+ * it is freed below
+ */
+ __rxe_put(elem);
+
+ /* wait until all references to the object have been
+ * dropped before final object specific cleanup and
+ * return to rdma-core
+ */
+ if (sleepable) {
+ if (!completion_done(&elem->complete) && timeout) {
+ ret = wait_for_completion_timeout(&elem->complete,
+ timeout);
+
+ /* Shouldn't happen. There are still references to
+ * the object but, rather than deadlock, free the
+ * object or pass back to rdma-core.
+ */
+ if (WARN_ON(!ret))
+ err = -EINVAL;
+ }
+ } else {
+ unsigned long until = jiffies + timeout;
+
+ /* AH objects are unique in that the destroy_ah verb
+ * can be called in atomic context. This delay
+ * replaces the wait_for_completion call above
+ * when the destroy_ah call is not sleepable
+ */
+ while (!completion_done(&elem->complete) &&
+ time_before(jiffies, until))
+ mdelay(1);
+
+ if (WARN_ON(!completion_done(&elem->complete)))
+ err = -EINVAL;
+ }
+
+ if (pool->cleanup)
+ pool->cleanup(elem);
+
+ atomic_dec(&pool->num_elem);
+
+ return err;
+}
+
+int __rxe_get(struct rxe_pool_elem *elem)
+{
+ return kref_get_unless_zero(&elem->ref_cnt);
+}
+
+int __rxe_put(struct rxe_pool_elem *elem)
+{
+ return kref_put(&elem->ref_cnt, rxe_elem_release);
+}
+
+void __rxe_finalize(struct rxe_pool_elem *elem)
+{
+ void *xa_ret;
+
+ xa_ret = xa_store(&elem->pool->xa, elem->index, elem, GFP_KERNEL);
+ WARN_ON(xa_err(xa_ret));
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
new file mode 100644
index 0000000000..b42e26427a
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_POOL_H
+#define RXE_POOL_H
+
+enum rxe_elem_type {
+ RXE_TYPE_UC,
+ RXE_TYPE_PD,
+ RXE_TYPE_AH,
+ RXE_TYPE_SRQ,
+ RXE_TYPE_QP,
+ RXE_TYPE_CQ,
+ RXE_TYPE_MR,
+ RXE_TYPE_MW,
+ RXE_NUM_TYPES, /* keep me last */
+};
+
+struct rxe_pool_elem {
+ struct rxe_pool *pool;
+ void *obj;
+ struct kref ref_cnt;
+ struct list_head list;
+ struct completion complete;
+ u32 index;
+};
+
+struct rxe_pool {
+ struct rxe_dev *rxe;
+ const char *name;
+ void (*cleanup)(struct rxe_pool_elem *elem);
+ enum rxe_elem_type type;
+
+ unsigned int max_elem;
+ atomic_t num_elem;
+ size_t elem_size;
+ size_t elem_offset;
+
+ struct xarray xa;
+ struct xa_limit limit;
+ u32 next;
+};
+
+/* initialize a pool of objects with given limit on
+ * number of elements. gets parameters from rxe_type_info
+ * pool elements will be allocated out of a slab cache
+ */
+void rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool,
+ enum rxe_elem_type type);
+
+/* free resources from object pool */
+void rxe_pool_cleanup(struct rxe_pool *pool);
+
+/* connect already allocated object to pool */
+int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
+ bool sleepable);
+#define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem, true)
+#define rxe_add_to_pool_ah(pool, obj, sleepable) __rxe_add_to_pool(pool, \
+ &(obj)->elem, sleepable)
+
+/* lookup an indexed object from index. takes a reference on object */
+void *rxe_pool_get_index(struct rxe_pool *pool, u32 index);
+
+int __rxe_get(struct rxe_pool_elem *elem);
+#define rxe_get(obj) __rxe_get(&(obj)->elem)
+
+int __rxe_put(struct rxe_pool_elem *elem);
+#define rxe_put(obj) __rxe_put(&(obj)->elem)
+
+int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable);
+#define rxe_cleanup(obj) __rxe_cleanup(&(obj)->elem, true)
+#define rxe_cleanup_ah(obj, sleepable) __rxe_cleanup(&(obj)->elem, sleepable)
+
+#define rxe_read(obj) kref_read(&(obj)->elem.ref_cnt)
+
+void __rxe_finalize(struct rxe_pool_elem *elem);
+#define rxe_finalize(obj) __rxe_finalize(&(obj)->elem)
+
+#endif /* RXE_POOL_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
new file mode 100644
index 0000000000..28e379c108
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -0,0 +1,880 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+#include "rxe_task.h"
+
+static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap,
+ int has_srq)
+{
+ if (cap->max_send_wr > rxe->attr.max_qp_wr) {
+ rxe_dbg_dev(rxe, "invalid send wr = %u > %d\n",
+ cap->max_send_wr, rxe->attr.max_qp_wr);
+ goto err1;
+ }
+
+ if (cap->max_send_sge > rxe->attr.max_send_sge) {
+ rxe_dbg_dev(rxe, "invalid send sge = %u > %d\n",
+ cap->max_send_sge, rxe->attr.max_send_sge);
+ goto err1;
+ }
+
+ if (!has_srq) {
+ if (cap->max_recv_wr > rxe->attr.max_qp_wr) {
+ rxe_dbg_dev(rxe, "invalid recv wr = %u > %d\n",
+ cap->max_recv_wr, rxe->attr.max_qp_wr);
+ goto err1;
+ }
+
+ if (cap->max_recv_sge > rxe->attr.max_recv_sge) {
+ rxe_dbg_dev(rxe, "invalid recv sge = %u > %d\n",
+ cap->max_recv_sge, rxe->attr.max_recv_sge);
+ goto err1;
+ }
+ }
+
+ if (cap->max_inline_data > rxe->max_inline_data) {
+ rxe_dbg_dev(rxe, "invalid max inline data = %u > %d\n",
+ cap->max_inline_data, rxe->max_inline_data);
+ goto err1;
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init)
+{
+ struct ib_qp_cap *cap = &init->cap;
+ struct rxe_port *port;
+ int port_num = init->port_num;
+
+ switch (init->qp_type) {
+ case IB_QPT_GSI:
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_UD:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (!init->recv_cq || !init->send_cq) {
+ rxe_dbg_dev(rxe, "missing cq\n");
+ goto err1;
+ }
+
+ if (rxe_qp_chk_cap(rxe, cap, !!init->srq))
+ goto err1;
+
+ if (init->qp_type == IB_QPT_GSI) {
+ if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) {
+ rxe_dbg_dev(rxe, "invalid port = %d\n", port_num);
+ goto err1;
+ }
+
+ port = &rxe->port;
+
+ if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) {
+ rxe_dbg_dev(rxe, "GSI QP exists for port %d\n", port_num);
+ goto err1;
+ }
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+static int alloc_rd_atomic_resources(struct rxe_qp *qp, unsigned int n)
+{
+ qp->resp.res_head = 0;
+ qp->resp.res_tail = 0;
+ qp->resp.resources = kcalloc(n, sizeof(struct resp_res), GFP_KERNEL);
+
+ if (!qp->resp.resources)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void free_rd_atomic_resources(struct rxe_qp *qp)
+{
+ if (qp->resp.resources) {
+ int i;
+
+ for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) {
+ struct resp_res *res = &qp->resp.resources[i];
+
+ free_rd_atomic_resource(res);
+ }
+ kfree(qp->resp.resources);
+ qp->resp.resources = NULL;
+ }
+}
+
+void free_rd_atomic_resource(struct resp_res *res)
+{
+ res->type = 0;
+}
+
+static void cleanup_rd_atomic_resources(struct rxe_qp *qp)
+{
+ int i;
+ struct resp_res *res;
+
+ if (qp->resp.resources) {
+ for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) {
+ res = &qp->resp.resources[i];
+ free_rd_atomic_resource(res);
+ }
+ }
+}
+
+static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct ib_qp_init_attr *init)
+{
+ struct rxe_port *port;
+ u32 qpn;
+
+ qp->sq_sig_type = init->sq_sig_type;
+ qp->attr.path_mtu = 1;
+ qp->mtu = ib_mtu_enum_to_int(qp->attr.path_mtu);
+
+ qpn = qp->elem.index;
+ port = &rxe->port;
+
+ switch (init->qp_type) {
+ case IB_QPT_GSI:
+ qp->ibqp.qp_num = 1;
+ port->qp_gsi_index = qpn;
+ qp->attr.port_num = init->port_num;
+ break;
+
+ default:
+ qp->ibqp.qp_num = qpn;
+ break;
+ }
+
+ spin_lock_init(&qp->state_lock);
+
+ spin_lock_init(&qp->sq.sq_lock);
+ spin_lock_init(&qp->rq.producer_lock);
+ spin_lock_init(&qp->rq.consumer_lock);
+
+ skb_queue_head_init(&qp->req_pkts);
+ skb_queue_head_init(&qp->resp_pkts);
+
+ atomic_set(&qp->ssn, 0);
+ atomic_set(&qp->skb_out, 0);
+}
+
+static int rxe_init_sq(struct rxe_qp *qp, struct ib_qp_init_attr *init,
+ struct ib_udata *udata,
+ struct rxe_create_qp_resp __user *uresp)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ int wqe_size;
+ int err;
+
+ qp->sq.max_wr = init->cap.max_send_wr;
+ wqe_size = max_t(int, init->cap.max_send_sge * sizeof(struct ib_sge),
+ init->cap.max_inline_data);
+ qp->sq.max_sge = wqe_size / sizeof(struct ib_sge);
+ qp->sq.max_inline = wqe_size;
+ wqe_size += sizeof(struct rxe_send_wqe);
+
+ qp->sq.queue = rxe_queue_init(rxe, &qp->sq.max_wr, wqe_size,
+ QUEUE_TYPE_FROM_CLIENT);
+ if (!qp->sq.queue) {
+ rxe_err_qp(qp, "Unable to allocate send queue");
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ /* prepare info for caller to mmap send queue if user space qp */
+ err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, udata,
+ qp->sq.queue->buf, qp->sq.queue->buf_size,
+ &qp->sq.queue->ip);
+ if (err) {
+ rxe_err_qp(qp, "do_mmap_info failed, err = %d", err);
+ goto err_free;
+ }
+
+ /* return actual capabilities to caller which may be larger
+ * than requested
+ */
+ init->cap.max_send_wr = qp->sq.max_wr;
+ init->cap.max_send_sge = qp->sq.max_sge;
+ init->cap.max_inline_data = qp->sq.max_inline;
+
+ return 0;
+
+err_free:
+ vfree(qp->sq.queue->buf);
+ kfree(qp->sq.queue);
+ qp->sq.queue = NULL;
+err_out:
+ return err;
+}
+
+static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct ib_qp_init_attr *init, struct ib_udata *udata,
+ struct rxe_create_qp_resp __user *uresp)
+{
+ int err;
+
+ /* if we don't finish qp create make sure queue is valid */
+ skb_queue_head_init(&qp->req_pkts);
+
+ err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk);
+ if (err < 0)
+ return err;
+ qp->sk->sk->sk_user_data = qp;
+
+ /* pick a source UDP port number for this QP based on
+ * the source QPN. this spreads traffic for different QPs
+ * across different NIC RX queues (while using a single
+ * flow for a given QP to maintain packet order).
+ * the port number must be in the Dynamic Ports range
+ * (0xc000 - 0xffff).
+ */
+ qp->src_port = RXE_ROCE_V2_SPORT + (hash_32(qp_num(qp), 14) & 0x3fff);
+
+ err = rxe_init_sq(qp, init, udata, uresp);
+ if (err)
+ return err;
+
+ qp->req.wqe_index = queue_get_producer(qp->sq.queue,
+ QUEUE_TYPE_FROM_CLIENT);
+
+ qp->req.opcode = -1;
+ qp->comp.opcode = -1;
+
+ rxe_init_task(&qp->req.task, qp, rxe_requester);
+ rxe_init_task(&qp->comp.task, qp, rxe_completer);
+
+ qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */
+ if (init->qp_type == IB_QPT_RC) {
+ timer_setup(&qp->rnr_nak_timer, rnr_nak_timer, 0);
+ timer_setup(&qp->retrans_timer, retransmit_timer, 0);
+ }
+ return 0;
+}
+
+static int rxe_init_rq(struct rxe_qp *qp, struct ib_qp_init_attr *init,
+ struct ib_udata *udata,
+ struct rxe_create_qp_resp __user *uresp)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ int wqe_size;
+ int err;
+
+ qp->rq.max_wr = init->cap.max_recv_wr;
+ qp->rq.max_sge = init->cap.max_recv_sge;
+ wqe_size = sizeof(struct rxe_recv_wqe) +
+ qp->rq.max_sge*sizeof(struct ib_sge);
+
+ qp->rq.queue = rxe_queue_init(rxe, &qp->rq.max_wr, wqe_size,
+ QUEUE_TYPE_FROM_CLIENT);
+ if (!qp->rq.queue) {
+ rxe_err_qp(qp, "Unable to allocate recv queue");
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ /* prepare info for caller to mmap recv queue if user space qp */
+ err = do_mmap_info(rxe, uresp ? &uresp->rq_mi : NULL, udata,
+ qp->rq.queue->buf, qp->rq.queue->buf_size,
+ &qp->rq.queue->ip);
+ if (err) {
+ rxe_err_qp(qp, "do_mmap_info failed, err = %d", err);
+ goto err_free;
+ }
+
+ /* return actual capabilities to caller which may be larger
+ * than requested
+ */
+ init->cap.max_recv_wr = qp->rq.max_wr;
+
+ return 0;
+
+err_free:
+ vfree(qp->rq.queue->buf);
+ kfree(qp->rq.queue);
+ qp->rq.queue = NULL;
+err_out:
+ return err;
+}
+
+static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct ib_qp_init_attr *init,
+ struct ib_udata *udata,
+ struct rxe_create_qp_resp __user *uresp)
+{
+ int err;
+
+ /* if we don't finish qp create make sure queue is valid */
+ skb_queue_head_init(&qp->resp_pkts);
+
+ if (!qp->srq) {
+ err = rxe_init_rq(qp, init, udata, uresp);
+ if (err)
+ return err;
+ }
+
+ rxe_init_task(&qp->resp.task, qp, rxe_responder);
+
+ qp->resp.opcode = OPCODE_NONE;
+ qp->resp.msn = 0;
+
+ return 0;
+}
+
+/* called by the create qp verb */
+int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
+ struct ib_qp_init_attr *init,
+ struct rxe_create_qp_resp __user *uresp,
+ struct ib_pd *ibpd,
+ struct ib_udata *udata)
+{
+ int err;
+ struct rxe_cq *rcq = to_rcq(init->recv_cq);
+ struct rxe_cq *scq = to_rcq(init->send_cq);
+ struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL;
+ unsigned long flags;
+
+ rxe_get(pd);
+ rxe_get(rcq);
+ rxe_get(scq);
+ if (srq)
+ rxe_get(srq);
+
+ qp->pd = pd;
+ qp->rcq = rcq;
+ qp->scq = scq;
+ qp->srq = srq;
+
+ atomic_inc(&rcq->num_wq);
+ atomic_inc(&scq->num_wq);
+
+ rxe_qp_init_misc(rxe, qp, init);
+
+ err = rxe_qp_init_req(rxe, qp, init, udata, uresp);
+ if (err)
+ goto err1;
+
+ err = rxe_qp_init_resp(rxe, qp, init, udata, uresp);
+ if (err)
+ goto err2;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ qp->attr.qp_state = IB_QPS_RESET;
+ qp->valid = 1;
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ return 0;
+
+err2:
+ rxe_queue_cleanup(qp->sq.queue);
+ qp->sq.queue = NULL;
+err1:
+ atomic_dec(&rcq->num_wq);
+ atomic_dec(&scq->num_wq);
+
+ qp->pd = NULL;
+ qp->rcq = NULL;
+ qp->scq = NULL;
+ qp->srq = NULL;
+
+ if (srq)
+ rxe_put(srq);
+ rxe_put(scq);
+ rxe_put(rcq);
+ rxe_put(pd);
+
+ return err;
+}
+
+/* called by the query qp verb */
+int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init)
+{
+ init->event_handler = qp->ibqp.event_handler;
+ init->qp_context = qp->ibqp.qp_context;
+ init->send_cq = qp->ibqp.send_cq;
+ init->recv_cq = qp->ibqp.recv_cq;
+ init->srq = qp->ibqp.srq;
+
+ init->cap.max_send_wr = qp->sq.max_wr;
+ init->cap.max_send_sge = qp->sq.max_sge;
+ init->cap.max_inline_data = qp->sq.max_inline;
+
+ if (!qp->srq) {
+ init->cap.max_recv_wr = qp->rq.max_wr;
+ init->cap.max_recv_sge = qp->rq.max_sge;
+ }
+
+ init->sq_sig_type = qp->sq_sig_type;
+
+ init->qp_type = qp->ibqp.qp_type;
+ init->port_num = 1;
+
+ return 0;
+}
+
+int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp,
+ struct ib_qp_attr *attr, int mask)
+{
+ if (mask & IB_QP_PORT) {
+ if (!rdma_is_port_valid(&rxe->ib_dev, attr->port_num)) {
+ rxe_dbg_qp(qp, "invalid port %d\n", attr->port_num);
+ goto err1;
+ }
+ }
+
+ if (mask & IB_QP_CAP && rxe_qp_chk_cap(rxe, &attr->cap, !!qp->srq))
+ goto err1;
+
+ if (mask & IB_QP_ACCESS_FLAGS) {
+ if (!(qp_type(qp) == IB_QPT_RC || qp_type(qp) == IB_QPT_UC))
+ goto err1;
+ if (attr->qp_access_flags & ~RXE_ACCESS_SUPPORTED_QP)
+ goto err1;
+ }
+
+ if (mask & IB_QP_AV && rxe_av_chk_attr(qp, &attr->ah_attr))
+ goto err1;
+
+ if (mask & IB_QP_ALT_PATH) {
+ if (rxe_av_chk_attr(qp, &attr->alt_ah_attr))
+ goto err1;
+ if (!rdma_is_port_valid(&rxe->ib_dev, attr->alt_port_num)) {
+ rxe_dbg_qp(qp, "invalid alt port %d\n", attr->alt_port_num);
+ goto err1;
+ }
+ if (attr->alt_timeout > 31) {
+ rxe_dbg_qp(qp, "invalid alt timeout %d > 31\n",
+ attr->alt_timeout);
+ goto err1;
+ }
+ }
+
+ if (mask & IB_QP_PATH_MTU) {
+ struct rxe_port *port = &rxe->port;
+
+ enum ib_mtu max_mtu = port->attr.max_mtu;
+ enum ib_mtu mtu = attr->path_mtu;
+
+ if (mtu > max_mtu) {
+ rxe_dbg_qp(qp, "invalid mtu (%d) > (%d)\n",
+ ib_mtu_enum_to_int(mtu),
+ ib_mtu_enum_to_int(max_mtu));
+ goto err1;
+ }
+ }
+
+ if (mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ if (attr->max_rd_atomic > rxe->attr.max_qp_rd_atom) {
+ rxe_dbg_qp(qp, "invalid max_rd_atomic %d > %d\n",
+ attr->max_rd_atomic,
+ rxe->attr.max_qp_rd_atom);
+ goto err1;
+ }
+ }
+
+ if (mask & IB_QP_TIMEOUT) {
+ if (attr->timeout > 31) {
+ rxe_dbg_qp(qp, "invalid timeout %d > 31\n",
+ attr->timeout);
+ goto err1;
+ }
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+/* move the qp to the reset state */
+static void rxe_qp_reset(struct rxe_qp *qp)
+{
+ /* stop tasks from running */
+ rxe_disable_task(&qp->resp.task);
+ rxe_disable_task(&qp->comp.task);
+ rxe_disable_task(&qp->req.task);
+
+ /* drain work and packet queuesc */
+ rxe_requester(qp);
+ rxe_completer(qp);
+ rxe_responder(qp);
+
+ if (qp->rq.queue)
+ rxe_queue_reset(qp->rq.queue);
+ if (qp->sq.queue)
+ rxe_queue_reset(qp->sq.queue);
+
+ /* cleanup attributes */
+ atomic_set(&qp->ssn, 0);
+ qp->req.opcode = -1;
+ qp->req.need_retry = 0;
+ qp->req.wait_for_rnr_timer = 0;
+ qp->req.noack_pkts = 0;
+ qp->resp.msn = 0;
+ qp->resp.opcode = -1;
+ qp->resp.drop_msg = 0;
+ qp->resp.goto_error = 0;
+ qp->resp.sent_psn_nak = 0;
+
+ if (qp->resp.mr) {
+ rxe_put(qp->resp.mr);
+ qp->resp.mr = NULL;
+ }
+
+ cleanup_rd_atomic_resources(qp);
+
+ /* reenable tasks */
+ rxe_enable_task(&qp->resp.task);
+ rxe_enable_task(&qp->comp.task);
+ rxe_enable_task(&qp->req.task);
+}
+
+/* move the qp to the error state */
+void rxe_qp_error(struct rxe_qp *qp)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ qp->attr.qp_state = IB_QPS_ERR;
+
+ /* drain work and packet queues */
+ rxe_sched_task(&qp->resp.task);
+ rxe_sched_task(&qp->comp.task);
+ rxe_sched_task(&qp->req.task);
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+}
+
+static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr,
+ int mask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ qp->attr.sq_draining = 1;
+ rxe_sched_task(&qp->comp.task);
+ rxe_sched_task(&qp->req.task);
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+}
+
+/* caller should hold qp->state_lock */
+static int __qp_chk_state(struct rxe_qp *qp, struct ib_qp_attr *attr,
+ int mask)
+{
+ enum ib_qp_state cur_state;
+ enum ib_qp_state new_state;
+
+ cur_state = (mask & IB_QP_CUR_STATE) ?
+ attr->cur_qp_state : qp->attr.qp_state;
+ new_state = (mask & IB_QP_STATE) ?
+ attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask))
+ return -EINVAL;
+
+ if (mask & IB_QP_STATE && cur_state == IB_QPS_SQD) {
+ if (qp->attr.sq_draining && new_state != IB_QPS_ERR)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const char *const qps2str[] = {
+ [IB_QPS_RESET] = "RESET",
+ [IB_QPS_INIT] = "INIT",
+ [IB_QPS_RTR] = "RTR",
+ [IB_QPS_RTS] = "RTS",
+ [IB_QPS_SQD] = "SQD",
+ [IB_QPS_SQE] = "SQE",
+ [IB_QPS_ERR] = "ERR",
+};
+
+/* called by the modify qp verb */
+int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
+ struct ib_udata *udata)
+{
+ int err;
+
+ if (mask & IB_QP_CUR_STATE)
+ qp->attr.cur_qp_state = attr->qp_state;
+
+ if (mask & IB_QP_STATE) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ err = __qp_chk_state(qp, attr, mask);
+ if (!err) {
+ qp->attr.qp_state = attr->qp_state;
+ rxe_dbg_qp(qp, "state -> %s\n",
+ qps2str[attr->qp_state]);
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ if (err)
+ return err;
+
+ switch (attr->qp_state) {
+ case IB_QPS_RESET:
+ rxe_qp_reset(qp);
+ break;
+ case IB_QPS_SQD:
+ rxe_qp_sqd(qp, attr, mask);
+ break;
+ case IB_QPS_ERR:
+ rxe_qp_error(qp);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ int max_rd_atomic = attr->max_rd_atomic ?
+ roundup_pow_of_two(attr->max_rd_atomic) : 0;
+
+ qp->attr.max_rd_atomic = max_rd_atomic;
+ atomic_set(&qp->req.rd_atomic, max_rd_atomic);
+ }
+
+ if (mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ int max_dest_rd_atomic = attr->max_dest_rd_atomic ?
+ roundup_pow_of_two(attr->max_dest_rd_atomic) : 0;
+
+ qp->attr.max_dest_rd_atomic = max_dest_rd_atomic;
+
+ free_rd_atomic_resources(qp);
+
+ err = alloc_rd_atomic_resources(qp, max_dest_rd_atomic);
+ if (err)
+ return err;
+ }
+
+ if (mask & IB_QP_EN_SQD_ASYNC_NOTIFY)
+ qp->attr.en_sqd_async_notify = attr->en_sqd_async_notify;
+
+ if (mask & IB_QP_ACCESS_FLAGS)
+ qp->attr.qp_access_flags = attr->qp_access_flags;
+
+ if (mask & IB_QP_PKEY_INDEX)
+ qp->attr.pkey_index = attr->pkey_index;
+
+ if (mask & IB_QP_PORT)
+ qp->attr.port_num = attr->port_num;
+
+ if (mask & IB_QP_QKEY)
+ qp->attr.qkey = attr->qkey;
+
+ if (mask & IB_QP_AV)
+ rxe_init_av(&attr->ah_attr, &qp->pri_av);
+
+ if (mask & IB_QP_ALT_PATH) {
+ rxe_init_av(&attr->alt_ah_attr, &qp->alt_av);
+ qp->attr.alt_port_num = attr->alt_port_num;
+ qp->attr.alt_pkey_index = attr->alt_pkey_index;
+ qp->attr.alt_timeout = attr->alt_timeout;
+ }
+
+ if (mask & IB_QP_PATH_MTU) {
+ qp->attr.path_mtu = attr->path_mtu;
+ qp->mtu = ib_mtu_enum_to_int(attr->path_mtu);
+ }
+
+ if (mask & IB_QP_TIMEOUT) {
+ qp->attr.timeout = attr->timeout;
+ if (attr->timeout == 0) {
+ qp->qp_timeout_jiffies = 0;
+ } else {
+ /* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */
+ int j = nsecs_to_jiffies(4096ULL << attr->timeout);
+
+ qp->qp_timeout_jiffies = j ? j : 1;
+ }
+ }
+
+ if (mask & IB_QP_RETRY_CNT) {
+ qp->attr.retry_cnt = attr->retry_cnt;
+ qp->comp.retry_cnt = attr->retry_cnt;
+ rxe_dbg_qp(qp, "set retry count = %d\n", attr->retry_cnt);
+ }
+
+ if (mask & IB_QP_RNR_RETRY) {
+ qp->attr.rnr_retry = attr->rnr_retry;
+ qp->comp.rnr_retry = attr->rnr_retry;
+ rxe_dbg_qp(qp, "set rnr retry count = %d\n", attr->rnr_retry);
+ }
+
+ if (mask & IB_QP_RQ_PSN) {
+ qp->attr.rq_psn = (attr->rq_psn & BTH_PSN_MASK);
+ qp->resp.psn = qp->attr.rq_psn;
+ rxe_dbg_qp(qp, "set resp psn = 0x%x\n", qp->resp.psn);
+ }
+
+ if (mask & IB_QP_MIN_RNR_TIMER) {
+ qp->attr.min_rnr_timer = attr->min_rnr_timer;
+ rxe_dbg_qp(qp, "set min rnr timer = 0x%x\n",
+ attr->min_rnr_timer);
+ }
+
+ if (mask & IB_QP_SQ_PSN) {
+ qp->attr.sq_psn = (attr->sq_psn & BTH_PSN_MASK);
+ qp->req.psn = qp->attr.sq_psn;
+ qp->comp.psn = qp->attr.sq_psn;
+ rxe_dbg_qp(qp, "set req psn = 0x%x\n", qp->req.psn);
+ }
+
+ if (mask & IB_QP_PATH_MIG_STATE)
+ qp->attr.path_mig_state = attr->path_mig_state;
+
+ if (mask & IB_QP_DEST_QPN)
+ qp->attr.dest_qp_num = attr->dest_qp_num;
+
+ return 0;
+}
+
+/* called by the query qp verb */
+int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask)
+{
+ unsigned long flags;
+
+ *attr = qp->attr;
+
+ attr->rq_psn = qp->resp.psn;
+ attr->sq_psn = qp->req.psn;
+
+ attr->cap.max_send_wr = qp->sq.max_wr;
+ attr->cap.max_send_sge = qp->sq.max_sge;
+ attr->cap.max_inline_data = qp->sq.max_inline;
+
+ if (!qp->srq) {
+ attr->cap.max_recv_wr = qp->rq.max_wr;
+ attr->cap.max_recv_sge = qp->rq.max_sge;
+ }
+
+ rxe_av_to_attr(&qp->pri_av, &attr->ah_attr);
+ rxe_av_to_attr(&qp->alt_av, &attr->alt_ah_attr);
+
+ /* Applications that get this state typically spin on it.
+ * Yield the processor
+ */
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (qp->attr.sq_draining) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ cond_resched();
+ } else {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ }
+
+ return 0;
+}
+
+int rxe_qp_chk_destroy(struct rxe_qp *qp)
+{
+ /* See IBA o10-2.2.3
+ * An attempt to destroy a QP while attached to a mcast group
+ * will fail immediately.
+ */
+ if (atomic_read(&qp->mcg_num)) {
+ rxe_dbg_qp(qp, "Attempt to destroy while attached to multicast group\n");
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+/* called when the last reference to the qp is dropped */
+static void rxe_qp_do_cleanup(struct work_struct *work)
+{
+ struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work);
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ qp->valid = 0;
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ qp->qp_timeout_jiffies = 0;
+
+ if (qp_type(qp) == IB_QPT_RC) {
+ del_timer_sync(&qp->retrans_timer);
+ del_timer_sync(&qp->rnr_nak_timer);
+ }
+
+ if (qp->resp.task.func)
+ rxe_cleanup_task(&qp->resp.task);
+
+ if (qp->req.task.func)
+ rxe_cleanup_task(&qp->req.task);
+
+ if (qp->comp.task.func)
+ rxe_cleanup_task(&qp->comp.task);
+
+ /* flush out any receive wr's or pending requests */
+ rxe_requester(qp);
+ rxe_completer(qp);
+ rxe_responder(qp);
+
+ if (qp->sq.queue)
+ rxe_queue_cleanup(qp->sq.queue);
+
+ if (qp->srq)
+ rxe_put(qp->srq);
+
+ if (qp->rq.queue)
+ rxe_queue_cleanup(qp->rq.queue);
+
+ if (qp->scq) {
+ atomic_dec(&qp->scq->num_wq);
+ rxe_put(qp->scq);
+ }
+
+ if (qp->rcq) {
+ atomic_dec(&qp->rcq->num_wq);
+ rxe_put(qp->rcq);
+ }
+
+ if (qp->pd)
+ rxe_put(qp->pd);
+
+ if (qp->resp.mr)
+ rxe_put(qp->resp.mr);
+
+ free_rd_atomic_resources(qp);
+
+ if (qp->sk) {
+ if (qp_type(qp) == IB_QPT_RC)
+ sk_dst_reset(qp->sk->sk);
+
+ kernel_sock_shutdown(qp->sk, SHUT_RDWR);
+ sock_release(qp->sk);
+ }
+}
+
+/* called when the last reference to the qp is dropped */
+void rxe_qp_cleanup(struct rxe_pool_elem *elem)
+{
+ struct rxe_qp *qp = container_of(elem, typeof(*qp), elem);
+
+ execute_in_process_context(rxe_qp_do_cleanup, &qp->cleanup_work);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c
new file mode 100644
index 0000000000..9611ee191a
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_queue.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/vmalloc.h>
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf,
+ struct ib_udata *udata, struct rxe_queue_buf *buf,
+ size_t buf_size, struct rxe_mmap_info **ip_p)
+{
+ int err;
+ struct rxe_mmap_info *ip = NULL;
+
+ if (outbuf) {
+ ip = rxe_create_mmap_info(rxe, buf_size, udata, buf);
+ if (IS_ERR(ip)) {
+ err = PTR_ERR(ip);
+ goto err1;
+ }
+
+ if (copy_to_user(outbuf, &ip->info, sizeof(ip->info))) {
+ err = -EFAULT;
+ goto err2;
+ }
+
+ spin_lock_bh(&rxe->pending_lock);
+ list_add(&ip->pending_mmaps, &rxe->pending_mmaps);
+ spin_unlock_bh(&rxe->pending_lock);
+ }
+
+ *ip_p = ip;
+
+ return 0;
+
+err2:
+ kfree(ip);
+err1:
+ return err;
+}
+
+inline void rxe_queue_reset(struct rxe_queue *q)
+{
+ /* queue is comprised from header and the memory
+ * of the actual queue. See "struct rxe_queue_buf" in rxe_queue.h
+ * reset only the queue itself and not the management header
+ */
+ memset(q->buf->data, 0, q->buf_size - sizeof(struct rxe_queue_buf));
+}
+
+struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, int *num_elem,
+ unsigned int elem_size, enum queue_type type)
+{
+ struct rxe_queue *q;
+ size_t buf_size;
+ unsigned int num_slots;
+
+ /* num_elem == 0 is allowed, but uninteresting */
+ if (*num_elem < 0)
+ return NULL;
+
+ q = kzalloc(sizeof(*q), GFP_KERNEL);
+ if (!q)
+ return NULL;
+
+ q->rxe = rxe;
+ q->type = type;
+
+ /* used in resize, only need to copy used part of queue */
+ q->elem_size = elem_size;
+
+ /* pad element up to at least a cacheline and always a power of 2 */
+ if (elem_size < cache_line_size())
+ elem_size = cache_line_size();
+ elem_size = roundup_pow_of_two(elem_size);
+
+ q->log2_elem_size = order_base_2(elem_size);
+
+ num_slots = *num_elem + 1;
+ num_slots = roundup_pow_of_two(num_slots);
+ q->index_mask = num_slots - 1;
+
+ buf_size = sizeof(struct rxe_queue_buf) + num_slots * elem_size;
+
+ q->buf = vmalloc_user(buf_size);
+ if (!q->buf)
+ goto err2;
+
+ q->buf->log2_elem_size = q->log2_elem_size;
+ q->buf->index_mask = q->index_mask;
+
+ q->buf_size = buf_size;
+
+ *num_elem = num_slots - 1;
+ return q;
+
+err2:
+ kfree(q);
+ return NULL;
+}
+
+/* copies elements from original q to new q and then swaps the contents of the
+ * two q headers. This is so that if anyone is holding a pointer to q it will
+ * still work
+ */
+static int resize_finish(struct rxe_queue *q, struct rxe_queue *new_q,
+ unsigned int num_elem)
+{
+ enum queue_type type = q->type;
+ u32 new_prod;
+ u32 prod;
+ u32 cons;
+
+ if (!queue_empty(q, q->type) && (num_elem < queue_count(q, type)))
+ return -EINVAL;
+
+ new_prod = queue_get_producer(new_q, type);
+ prod = queue_get_producer(q, type);
+ cons = queue_get_consumer(q, type);
+
+ while ((prod - cons) & q->index_mask) {
+ memcpy(queue_addr_from_index(new_q, new_prod),
+ queue_addr_from_index(q, cons), new_q->elem_size);
+ new_prod = queue_next_index(new_q, new_prod);
+ cons = queue_next_index(q, cons);
+ }
+
+ new_q->buf->producer_index = new_prod;
+ q->buf->consumer_index = cons;
+
+ /* update private index copies */
+ if (type == QUEUE_TYPE_TO_CLIENT)
+ new_q->index = new_q->buf->producer_index;
+ else
+ q->index = q->buf->consumer_index;
+
+ /* exchange rxe_queue headers */
+ swap(*q, *new_q);
+
+ return 0;
+}
+
+int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p,
+ unsigned int elem_size, struct ib_udata *udata,
+ struct mminfo __user *outbuf, spinlock_t *producer_lock,
+ spinlock_t *consumer_lock)
+{
+ struct rxe_queue *new_q;
+ unsigned int num_elem = *num_elem_p;
+ int err;
+ unsigned long producer_flags;
+ unsigned long consumer_flags;
+
+ new_q = rxe_queue_init(q->rxe, &num_elem, elem_size, q->type);
+ if (!new_q)
+ return -ENOMEM;
+
+ err = do_mmap_info(new_q->rxe, outbuf, udata, new_q->buf,
+ new_q->buf_size, &new_q->ip);
+ if (err) {
+ vfree(new_q->buf);
+ kfree(new_q);
+ goto err1;
+ }
+
+ spin_lock_irqsave(consumer_lock, consumer_flags);
+
+ if (producer_lock) {
+ spin_lock_irqsave(producer_lock, producer_flags);
+ err = resize_finish(q, new_q, num_elem);
+ spin_unlock_irqrestore(producer_lock, producer_flags);
+ } else {
+ err = resize_finish(q, new_q, num_elem);
+ }
+
+ spin_unlock_irqrestore(consumer_lock, consumer_flags);
+
+ rxe_queue_cleanup(new_q); /* new/old dep on err */
+ if (err)
+ goto err1;
+
+ *num_elem_p = num_elem;
+ return 0;
+
+err1:
+ return err;
+}
+
+void rxe_queue_cleanup(struct rxe_queue *q)
+{
+ if (q->ip)
+ kref_put(&q->ip->ref, rxe_mmap_release);
+ else
+ vfree(q->buf);
+
+ kfree(q);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h
new file mode 100644
index 0000000000..c711cb98b9
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_queue.h
@@ -0,0 +1,284 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_QUEUE_H
+#define RXE_QUEUE_H
+
+/* Implements a simple circular buffer that is shared between user
+ * and the driver and can be resized. The requested element size is
+ * rounded up to a power of 2 and the number of elements in the buffer
+ * is also rounded up to a power of 2. Since the queue is empty when
+ * the producer and consumer indices match the maximum capacity of the
+ * queue is one less than the number of element slots.
+ *
+ * Notes:
+ * - The driver indices are always masked off to q->index_mask
+ * before storing so do not need to be checked on reads.
+ * - The user whether user space or kernel is generally
+ * not trusted so its parameters are masked to make sure
+ * they do not access the queue out of bounds on reads.
+ * - The driver indices for queues must not be written
+ * by user so a local copy is used and a shared copy is
+ * stored when the local copy is changed.
+ * - By passing the type in the parameter list separate from q
+ * the compiler can eliminate the switch statement when the
+ * actual queue type is known when the function is called at
+ * compile time.
+ * - These queues are lock free. The user and driver must protect
+ * changes to their end of the queues with locks if more than one
+ * CPU can be accessing it at the same time.
+ */
+
+/**
+ * enum queue_type - type of queue
+ * @QUEUE_TYPE_TO_CLIENT: Queue is written by rxe driver and
+ * read by client which may be a user space
+ * application or a kernel ulp.
+ * Used by rxe internals only.
+ * @QUEUE_TYPE_FROM_CLIENT: Queue is written by client and
+ * read by rxe driver.
+ * Used by rxe internals only.
+ * @QUEUE_TYPE_FROM_ULP: Queue is written by kernel ulp and
+ * read by rxe driver.
+ * Used by kernel verbs APIs only on
+ * behalf of ulps.
+ * @QUEUE_TYPE_TO_ULP: Queue is written by rxe driver and
+ * read by kernel ulp.
+ * Used by kernel verbs APIs only on
+ * behalf of ulps.
+ */
+enum queue_type {
+ QUEUE_TYPE_TO_CLIENT,
+ QUEUE_TYPE_FROM_CLIENT,
+ QUEUE_TYPE_FROM_ULP,
+ QUEUE_TYPE_TO_ULP,
+};
+
+struct rxe_queue_buf;
+
+struct rxe_queue {
+ struct rxe_dev *rxe;
+ struct rxe_queue_buf *buf;
+ struct rxe_mmap_info *ip;
+ size_t buf_size;
+ size_t elem_size;
+ unsigned int log2_elem_size;
+ u32 index_mask;
+ enum queue_type type;
+ /* private copy of index for shared queues between
+ * driver and clients. Driver reads and writes
+ * this copy and then replicates to rxe_queue_buf
+ * for read access by clients.
+ */
+ u32 index;
+};
+
+int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf,
+ struct ib_udata *udata, struct rxe_queue_buf *buf,
+ size_t buf_size, struct rxe_mmap_info **ip_p);
+
+void rxe_queue_reset(struct rxe_queue *q);
+
+struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, int *num_elem,
+ unsigned int elem_size, enum queue_type type);
+
+int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p,
+ unsigned int elem_size, struct ib_udata *udata,
+ struct mminfo __user *outbuf,
+ spinlock_t *producer_lock, spinlock_t *consumer_lock);
+
+void rxe_queue_cleanup(struct rxe_queue *queue);
+
+static inline u32 queue_next_index(struct rxe_queue *q, int index)
+{
+ return (index + 1) & q->index_mask;
+}
+
+static inline u32 queue_get_producer(const struct rxe_queue *q,
+ enum queue_type type)
+{
+ u32 prod;
+
+ switch (type) {
+ case QUEUE_TYPE_FROM_CLIENT:
+ /* used by rxe, client owns the index */
+ prod = smp_load_acquire(&q->buf->producer_index);
+ break;
+ case QUEUE_TYPE_TO_CLIENT:
+ /* used by rxe which owns the index */
+ prod = q->index;
+ break;
+ case QUEUE_TYPE_FROM_ULP:
+ /* used by ulp which owns the index */
+ prod = q->buf->producer_index;
+ break;
+ case QUEUE_TYPE_TO_ULP:
+ /* used by ulp, rxe owns the index */
+ prod = smp_load_acquire(&q->buf->producer_index);
+ break;
+ }
+
+ return prod;
+}
+
+static inline u32 queue_get_consumer(const struct rxe_queue *q,
+ enum queue_type type)
+{
+ u32 cons;
+
+ switch (type) {
+ case QUEUE_TYPE_FROM_CLIENT:
+ /* used by rxe which owns the index */
+ cons = q->index;
+ break;
+ case QUEUE_TYPE_TO_CLIENT:
+ /* used by rxe, client owns the index */
+ cons = smp_load_acquire(&q->buf->consumer_index);
+ break;
+ case QUEUE_TYPE_FROM_ULP:
+ /* used by ulp, rxe owns the index */
+ cons = smp_load_acquire(&q->buf->consumer_index);
+ break;
+ case QUEUE_TYPE_TO_ULP:
+ /* used by ulp which owns the index */
+ cons = q->buf->consumer_index;
+ break;
+ }
+
+ return cons;
+}
+
+static inline int queue_empty(struct rxe_queue *q, enum queue_type type)
+{
+ u32 prod = queue_get_producer(q, type);
+ u32 cons = queue_get_consumer(q, type);
+
+ return ((prod - cons) & q->index_mask) == 0;
+}
+
+static inline int queue_full(struct rxe_queue *q, enum queue_type type)
+{
+ u32 prod = queue_get_producer(q, type);
+ u32 cons = queue_get_consumer(q, type);
+
+ return ((prod + 1 - cons) & q->index_mask) == 0;
+}
+
+static inline u32 queue_count(const struct rxe_queue *q,
+ enum queue_type type)
+{
+ u32 prod = queue_get_producer(q, type);
+ u32 cons = queue_get_consumer(q, type);
+
+ return (prod - cons) & q->index_mask;
+}
+
+static inline void queue_advance_producer(struct rxe_queue *q,
+ enum queue_type type)
+{
+ u32 prod;
+
+ switch (type) {
+ case QUEUE_TYPE_FROM_CLIENT:
+ /* used by rxe, client owns the index */
+ if (WARN_ON(1))
+ pr_warn("%s: attempt to advance client index\n",
+ __func__);
+ break;
+ case QUEUE_TYPE_TO_CLIENT:
+ /* used by rxe which owns the index */
+ prod = q->index;
+ prod = (prod + 1) & q->index_mask;
+ q->index = prod;
+ /* release so client can read it safely */
+ smp_store_release(&q->buf->producer_index, prod);
+ break;
+ case QUEUE_TYPE_FROM_ULP:
+ /* used by ulp which owns the index */
+ prod = q->buf->producer_index;
+ prod = (prod + 1) & q->index_mask;
+ /* release so rxe can read it safely */
+ smp_store_release(&q->buf->producer_index, prod);
+ break;
+ case QUEUE_TYPE_TO_ULP:
+ /* used by ulp, rxe owns the index */
+ if (WARN_ON(1))
+ pr_warn("%s: attempt to advance driver index\n",
+ __func__);
+ break;
+ }
+}
+
+static inline void queue_advance_consumer(struct rxe_queue *q,
+ enum queue_type type)
+{
+ u32 cons;
+
+ switch (type) {
+ case QUEUE_TYPE_FROM_CLIENT:
+ /* used by rxe which owns the index */
+ cons = (q->index + 1) & q->index_mask;
+ q->index = cons;
+ /* release so client can read it safely */
+ smp_store_release(&q->buf->consumer_index, cons);
+ break;
+ case QUEUE_TYPE_TO_CLIENT:
+ /* used by rxe, client owns the index */
+ if (WARN_ON(1))
+ pr_warn("%s: attempt to advance client index\n",
+ __func__);
+ break;
+ case QUEUE_TYPE_FROM_ULP:
+ /* used by ulp, rxe owns the index */
+ if (WARN_ON(1))
+ pr_warn("%s: attempt to advance driver index\n",
+ __func__);
+ break;
+ case QUEUE_TYPE_TO_ULP:
+ /* used by ulp which owns the index */
+ cons = q->buf->consumer_index;
+ cons = (cons + 1) & q->index_mask;
+ /* release so rxe can read it safely */
+ smp_store_release(&q->buf->consumer_index, cons);
+ break;
+ }
+}
+
+static inline void *queue_producer_addr(struct rxe_queue *q,
+ enum queue_type type)
+{
+ u32 prod = queue_get_producer(q, type);
+
+ return q->buf->data + (prod << q->log2_elem_size);
+}
+
+static inline void *queue_consumer_addr(struct rxe_queue *q,
+ enum queue_type type)
+{
+ u32 cons = queue_get_consumer(q, type);
+
+ return q->buf->data + (cons << q->log2_elem_size);
+}
+
+static inline void *queue_addr_from_index(struct rxe_queue *q, u32 index)
+{
+ return q->buf->data + ((index & q->index_mask)
+ << q->log2_elem_size);
+}
+
+static inline u32 queue_index_from_addr(const struct rxe_queue *q,
+ const void *addr)
+{
+ return (((u8 *)addr - q->buf->data) >> q->log2_elem_size)
+ & q->index_mask;
+}
+
+static inline void *queue_head(struct rxe_queue *q, enum queue_type type)
+{
+ return queue_empty(q, type) ? NULL : queue_consumer_addr(q, type);
+}
+
+#endif /* RXE_QUEUE_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c
new file mode 100644
index 0000000000..5861e42440
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_recv.c
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+
+/* check that QP matches packet opcode type and is in a valid state */
+static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+ struct rxe_qp *qp)
+{
+ unsigned int pkt_type;
+ unsigned long flags;
+
+ if (unlikely(!qp->valid))
+ return -EINVAL;
+
+ pkt_type = pkt->opcode & 0xe0;
+
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ if (unlikely(pkt_type != IB_OPCODE_RC))
+ return -EINVAL;
+ break;
+ case IB_QPT_UC:
+ if (unlikely(pkt_type != IB_OPCODE_UC))
+ return -EINVAL;
+ break;
+ case IB_QPT_UD:
+ case IB_QPT_GSI:
+ if (unlikely(pkt_type != IB_OPCODE_UD))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (pkt->mask & RXE_REQ_MASK) {
+ if (unlikely(qp_state(qp) < IB_QPS_RTR)) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ return -EINVAL;
+ }
+ } else {
+ if (unlikely(qp_state(qp) < IB_QPS_RTS)) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ return -EINVAL;
+ }
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ return 0;
+}
+
+static void set_bad_pkey_cntr(struct rxe_port *port)
+{
+ spin_lock_bh(&port->port_lock);
+ port->attr.bad_pkey_cntr = min((u32)0xffff,
+ port->attr.bad_pkey_cntr + 1);
+ spin_unlock_bh(&port->port_lock);
+}
+
+static void set_qkey_viol_cntr(struct rxe_port *port)
+{
+ spin_lock_bh(&port->port_lock);
+ port->attr.qkey_viol_cntr = min((u32)0xffff,
+ port->attr.qkey_viol_cntr + 1);
+ spin_unlock_bh(&port->port_lock);
+}
+
+static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+ u32 qpn, struct rxe_qp *qp)
+{
+ struct rxe_port *port = &rxe->port;
+ u16 pkey = bth_pkey(pkt);
+
+ pkt->pkey_index = 0;
+
+ if (!pkey_match(pkey, IB_DEFAULT_PKEY_FULL)) {
+ set_bad_pkey_cntr(port);
+ return -EINVAL;
+ }
+
+ if (qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) {
+ u32 qkey = (qpn == 1) ? GSI_QKEY : qp->attr.qkey;
+
+ if (unlikely(deth_qkey(pkt) != qkey)) {
+ set_qkey_viol_cntr(port);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int check_addr(struct rxe_dev *rxe, struct rxe_pkt_info *pkt,
+ struct rxe_qp *qp)
+{
+ struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+ if (qp_type(qp) != IB_QPT_RC && qp_type(qp) != IB_QPT_UC)
+ return 0;
+
+ if (unlikely(pkt->port_num != qp->attr.port_num))
+ return -EINVAL;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ struct in_addr *saddr =
+ &qp->pri_av.sgid_addr._sockaddr_in.sin_addr;
+ struct in_addr *daddr =
+ &qp->pri_av.dgid_addr._sockaddr_in.sin_addr;
+
+ if ((ip_hdr(skb)->daddr != saddr->s_addr) ||
+ (ip_hdr(skb)->saddr != daddr->s_addr))
+ return -EINVAL;
+
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct in6_addr *saddr =
+ &qp->pri_av.sgid_addr._sockaddr_in6.sin6_addr;
+ struct in6_addr *daddr =
+ &qp->pri_av.dgid_addr._sockaddr_in6.sin6_addr;
+
+ if (memcmp(&ipv6_hdr(skb)->daddr, saddr, sizeof(*saddr)) ||
+ memcmp(&ipv6_hdr(skb)->saddr, daddr, sizeof(*daddr)))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int hdr_check(struct rxe_pkt_info *pkt)
+{
+ struct rxe_dev *rxe = pkt->rxe;
+ struct rxe_port *port = &rxe->port;
+ struct rxe_qp *qp = NULL;
+ u32 qpn = bth_qpn(pkt);
+ int index;
+ int err;
+
+ if (unlikely(bth_tver(pkt) != BTH_TVER))
+ goto err1;
+
+ if (unlikely(qpn == 0))
+ goto err1;
+
+ if (qpn != IB_MULTICAST_QPN) {
+ index = (qpn == 1) ? port->qp_gsi_index : qpn;
+
+ qp = rxe_pool_get_index(&rxe->qp_pool, index);
+ if (unlikely(!qp))
+ goto err1;
+
+ err = check_type_state(rxe, pkt, qp);
+ if (unlikely(err))
+ goto err2;
+
+ err = check_addr(rxe, pkt, qp);
+ if (unlikely(err))
+ goto err2;
+
+ err = check_keys(rxe, pkt, qpn, qp);
+ if (unlikely(err))
+ goto err2;
+ } else {
+ if (unlikely((pkt->mask & RXE_GRH_MASK) == 0))
+ goto err1;
+ }
+
+ pkt->qp = qp;
+ return 0;
+
+err2:
+ rxe_put(qp);
+err1:
+ return -EINVAL;
+}
+
+static inline void rxe_rcv_pkt(struct rxe_pkt_info *pkt, struct sk_buff *skb)
+{
+ if (pkt->mask & RXE_REQ_MASK)
+ rxe_resp_queue_pkt(pkt->qp, skb);
+ else
+ rxe_comp_queue_pkt(pkt->qp, skb);
+}
+
+static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+ struct rxe_mcg *mcg;
+ struct rxe_mca *mca;
+ struct rxe_qp *qp;
+ union ib_gid dgid;
+ int err;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+ (struct in6_addr *)&dgid);
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ memcpy(&dgid, &ipv6_hdr(skb)->daddr, sizeof(dgid));
+
+ /* lookup mcast group corresponding to mgid, takes a ref */
+ mcg = rxe_lookup_mcg(rxe, &dgid);
+ if (!mcg)
+ goto drop; /* mcast group not registered */
+
+ spin_lock_bh(&rxe->mcg_lock);
+
+ /* this is unreliable datagram service so we let
+ * failures to deliver a multicast packet to a
+ * single QP happen and just move on and try
+ * the rest of them on the list
+ */
+ list_for_each_entry(mca, &mcg->qp_list, qp_list) {
+ qp = mca->qp;
+
+ /* validate qp for incoming packet */
+ err = check_type_state(rxe, pkt, qp);
+ if (err)
+ continue;
+
+ err = check_keys(rxe, pkt, bth_qpn(pkt), qp);
+ if (err)
+ continue;
+
+ /* for all but the last QP create a new clone of the
+ * skb and pass to the QP. Pass the original skb to
+ * the last QP in the list.
+ */
+ if (mca->qp_list.next != &mcg->qp_list) {
+ struct sk_buff *cskb;
+ struct rxe_pkt_info *cpkt;
+
+ cskb = skb_clone(skb, GFP_ATOMIC);
+ if (unlikely(!cskb))
+ continue;
+
+ if (WARN_ON(!ib_device_try_get(&rxe->ib_dev))) {
+ kfree_skb(cskb);
+ break;
+ }
+
+ cpkt = SKB_TO_PKT(cskb);
+ cpkt->qp = qp;
+ rxe_get(qp);
+ rxe_rcv_pkt(cpkt, cskb);
+ } else {
+ pkt->qp = qp;
+ rxe_get(qp);
+ rxe_rcv_pkt(pkt, skb);
+ skb = NULL; /* mark consumed */
+ }
+ }
+
+ spin_unlock_bh(&rxe->mcg_lock);
+
+ kref_put(&mcg->ref_cnt, rxe_cleanup_mcg);
+
+ if (likely(!skb))
+ return;
+
+ /* This only occurs if one of the checks fails on the last
+ * QP in the list above
+ */
+
+drop:
+ kfree_skb(skb);
+ ib_device_put(&rxe->ib_dev);
+}
+
+/**
+ * rxe_chk_dgid - validate destination IP address
+ * @rxe: rxe device that received packet
+ * @skb: the received packet buffer
+ *
+ * Accept any loopback packets
+ * Extract IP address from packet and
+ * Accept if multicast packet
+ * Accept if matches an SGID table entry
+ */
+static int rxe_chk_dgid(struct rxe_dev *rxe, struct sk_buff *skb)
+{
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+ const struct ib_gid_attr *gid_attr;
+ union ib_gid dgid;
+ union ib_gid *pdgid;
+
+ if (pkt->mask & RXE_LOOPBACK_MASK)
+ return 0;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr,
+ (struct in6_addr *)&dgid);
+ pdgid = &dgid;
+ } else {
+ pdgid = (union ib_gid *)&ipv6_hdr(skb)->daddr;
+ }
+
+ if (rdma_is_multicast_addr((struct in6_addr *)pdgid))
+ return 0;
+
+ gid_attr = rdma_find_gid_by_port(&rxe->ib_dev, pdgid,
+ IB_GID_TYPE_ROCE_UDP_ENCAP,
+ 1, skb->dev);
+ if (IS_ERR(gid_attr))
+ return PTR_ERR(gid_attr);
+
+ rdma_put_gid_attr(gid_attr);
+ return 0;
+}
+
+/* rxe_rcv is called from the interface driver */
+void rxe_rcv(struct sk_buff *skb)
+{
+ int err;
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+ struct rxe_dev *rxe = pkt->rxe;
+
+ if (unlikely(skb->len < RXE_BTH_BYTES))
+ goto drop;
+
+ if (rxe_chk_dgid(rxe, skb) < 0)
+ goto drop;
+
+ pkt->opcode = bth_opcode(pkt);
+ pkt->psn = bth_psn(pkt);
+ pkt->qp = NULL;
+ pkt->mask |= rxe_opcode[pkt->opcode].mask;
+
+ if (unlikely(skb->len < header_size(pkt)))
+ goto drop;
+
+ err = hdr_check(pkt);
+ if (unlikely(err))
+ goto drop;
+
+ err = rxe_icrc_check(skb, pkt);
+ if (unlikely(err))
+ goto drop;
+
+ rxe_counter_inc(rxe, RXE_CNT_RCVD_PKTS);
+
+ if (unlikely(bth_qpn(pkt) == IB_MULTICAST_QPN))
+ rxe_rcv_mcast_pkt(rxe, skb);
+ else
+ rxe_rcv_pkt(pkt, skb);
+
+ return;
+
+drop:
+ if (pkt->qp)
+ rxe_put(pkt->qp);
+
+ kfree_skb(skb);
+ ib_device_put(&rxe->ib_dev);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
new file mode 100644
index 0000000000..d8c41fd626
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -0,0 +1,880 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/skbuff.h>
+#include <crypto/hash.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+ u32 opcode);
+
+static inline void retry_first_write_send(struct rxe_qp *qp,
+ struct rxe_send_wqe *wqe, int npsn)
+{
+ int i;
+
+ for (i = 0; i < npsn; i++) {
+ int to_send = (wqe->dma.resid > qp->mtu) ?
+ qp->mtu : wqe->dma.resid;
+
+ qp->req.opcode = next_opcode(qp, wqe,
+ wqe->wr.opcode);
+
+ if (wqe->wr.send_flags & IB_SEND_INLINE) {
+ wqe->dma.resid -= to_send;
+ wqe->dma.sge_offset += to_send;
+ } else {
+ advance_dma_data(&wqe->dma, to_send);
+ }
+ }
+}
+
+static void req_retry(struct rxe_qp *qp)
+{
+ struct rxe_send_wqe *wqe;
+ unsigned int wqe_index;
+ unsigned int mask;
+ int npsn;
+ int first = 1;
+ struct rxe_queue *q = qp->sq.queue;
+ unsigned int cons;
+ unsigned int prod;
+
+ cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT);
+ prod = queue_get_producer(q, QUEUE_TYPE_FROM_CLIENT);
+
+ qp->req.wqe_index = cons;
+ qp->req.psn = qp->comp.psn;
+ qp->req.opcode = -1;
+
+ for (wqe_index = cons; wqe_index != prod;
+ wqe_index = queue_next_index(q, wqe_index)) {
+ wqe = queue_addr_from_index(qp->sq.queue, wqe_index);
+ mask = wr_opcode_mask(wqe->wr.opcode, qp);
+
+ if (wqe->state == wqe_state_posted)
+ break;
+
+ if (wqe->state == wqe_state_done)
+ continue;
+
+ wqe->iova = (mask & WR_ATOMIC_MASK) ?
+ wqe->wr.wr.atomic.remote_addr :
+ (mask & WR_READ_OR_WRITE_MASK) ?
+ wqe->wr.wr.rdma.remote_addr :
+ 0;
+
+ if (!first || (mask & WR_READ_MASK) == 0) {
+ wqe->dma.resid = wqe->dma.length;
+ wqe->dma.cur_sge = 0;
+ wqe->dma.sge_offset = 0;
+ }
+
+ if (first) {
+ first = 0;
+
+ if (mask & WR_WRITE_OR_SEND_MASK) {
+ npsn = (qp->comp.psn - wqe->first_psn) &
+ BTH_PSN_MASK;
+ retry_first_write_send(qp, wqe, npsn);
+ }
+
+ if (mask & WR_READ_MASK) {
+ npsn = (wqe->dma.length - wqe->dma.resid) /
+ qp->mtu;
+ wqe->iova += npsn * qp->mtu;
+ }
+ }
+
+ wqe->state = wqe_state_posted;
+ }
+}
+
+void rnr_nak_timer(struct timer_list *t)
+{
+ struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer);
+ unsigned long flags;
+
+ rxe_dbg_qp(qp, "nak timer fired\n");
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (qp->valid) {
+ /* request a send queue retry */
+ qp->req.need_retry = 1;
+ qp->req.wait_for_rnr_timer = 0;
+ rxe_sched_task(&qp->req.task);
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+}
+
+static void req_check_sq_drain_done(struct rxe_qp *qp)
+{
+ struct rxe_queue *q;
+ unsigned int index;
+ unsigned int cons;
+ struct rxe_send_wqe *wqe;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (qp_state(qp) == IB_QPS_SQD) {
+ q = qp->sq.queue;
+ index = qp->req.wqe_index;
+ cons = queue_get_consumer(q, QUEUE_TYPE_FROM_CLIENT);
+ wqe = queue_addr_from_index(q, cons);
+
+ /* check to see if we are drained;
+ * state_lock used by requester and completer
+ */
+ do {
+ if (!qp->attr.sq_draining)
+ /* comp just finished */
+ break;
+
+ if (wqe && ((index != cons) ||
+ (wqe->state != wqe_state_posted)))
+ /* comp not done yet */
+ break;
+
+ qp->attr.sq_draining = 0;
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_SQ_DRAINED;
+ qp->ibqp.event_handler(&ev,
+ qp->ibqp.qp_context);
+ }
+ return;
+ } while (0);
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+}
+
+static struct rxe_send_wqe *__req_next_wqe(struct rxe_qp *qp)
+{
+ struct rxe_queue *q = qp->sq.queue;
+ unsigned int index = qp->req.wqe_index;
+ unsigned int prod;
+
+ prod = queue_get_producer(q, QUEUE_TYPE_FROM_CLIENT);
+ if (index == prod)
+ return NULL;
+ else
+ return queue_addr_from_index(q, index);
+}
+
+static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
+{
+ struct rxe_send_wqe *wqe;
+ unsigned long flags;
+
+ req_check_sq_drain_done(qp);
+
+ wqe = __req_next_wqe(qp);
+ if (wqe == NULL)
+ return NULL;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (unlikely((qp_state(qp) == IB_QPS_SQD) &&
+ (wqe->state != wqe_state_processing))) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ return NULL;
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ wqe->mask = wr_opcode_mask(wqe->wr.opcode, qp);
+ return wqe;
+}
+
+/**
+ * rxe_wqe_is_fenced - check if next wqe is fenced
+ * @qp: the queue pair
+ * @wqe: the next wqe
+ *
+ * Returns: 1 if wqe needs to wait
+ * 0 if wqe is ready to go
+ */
+static int rxe_wqe_is_fenced(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+ /* Local invalidate fence (LIF) see IBA 10.6.5.1
+ * Requires ALL previous operations on the send queue
+ * are complete. Make mandatory for the rxe driver.
+ */
+ if (wqe->wr.opcode == IB_WR_LOCAL_INV)
+ return qp->req.wqe_index != queue_get_consumer(qp->sq.queue,
+ QUEUE_TYPE_FROM_CLIENT);
+
+ /* Fence see IBA 10.8.3.3
+ * Requires that all previous read and atomic operations
+ * are complete.
+ */
+ return (wqe->wr.send_flags & IB_SEND_FENCE) &&
+ atomic_read(&qp->req.rd_atomic) != qp->attr.max_rd_atomic;
+}
+
+static int next_opcode_rc(struct rxe_qp *qp, u32 opcode, int fits)
+{
+ switch (opcode) {
+ case IB_WR_RDMA_WRITE:
+ if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_LAST :
+ IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_ONLY :
+ IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+ case IB_WR_SEND:
+ if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_SEND_LAST :
+ IB_OPCODE_RC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_SEND_ONLY :
+ IB_OPCODE_RC_SEND_FIRST;
+
+ case IB_WR_SEND_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_RC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_RC_SEND_FIRST;
+
+ case IB_WR_FLUSH:
+ return IB_OPCODE_RC_FLUSH;
+
+ case IB_WR_RDMA_READ:
+ return IB_OPCODE_RC_RDMA_READ_REQUEST;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ return IB_OPCODE_RC_COMPARE_SWAP;
+
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ return IB_OPCODE_RC_FETCH_ADD;
+
+ case IB_WR_SEND_WITH_INV:
+ if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+ return fits ? IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE :
+ IB_OPCODE_RC_SEND_MIDDLE;
+ else
+ return fits ? IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE :
+ IB_OPCODE_RC_SEND_FIRST;
+
+ case IB_WR_ATOMIC_WRITE:
+ return IB_OPCODE_RC_ATOMIC_WRITE;
+
+ case IB_WR_REG_MR:
+ case IB_WR_LOCAL_INV:
+ return opcode;
+ }
+
+ return -EINVAL;
+}
+
+static int next_opcode_uc(struct rxe_qp *qp, u32 opcode, int fits)
+{
+ switch (opcode) {
+ case IB_WR_RDMA_WRITE:
+ if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_LAST :
+ IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_ONLY :
+ IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+ case IB_WR_SEND:
+ if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_SEND_LAST :
+ IB_OPCODE_UC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_SEND_ONLY :
+ IB_OPCODE_UC_SEND_FIRST;
+
+ case IB_WR_SEND_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_UC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_UC_SEND_FIRST;
+ }
+
+ return -EINVAL;
+}
+
+static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
+ u32 opcode)
+{
+ int fits = (wqe->dma.resid <= qp->mtu);
+
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ return next_opcode_rc(qp, opcode, fits);
+
+ case IB_QPT_UC:
+ return next_opcode_uc(qp, opcode, fits);
+
+ case IB_QPT_UD:
+ case IB_QPT_GSI:
+ switch (opcode) {
+ case IB_WR_SEND:
+ return IB_OPCODE_UD_SEND_ONLY;
+
+ case IB_WR_SEND_WITH_IMM:
+ return IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+static inline int check_init_depth(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+ int depth;
+
+ if (wqe->has_rd_atomic)
+ return 0;
+
+ qp->req.need_rd_atomic = 1;
+ depth = atomic_dec_return(&qp->req.rd_atomic);
+
+ if (depth >= 0) {
+ qp->req.need_rd_atomic = 0;
+ wqe->has_rd_atomic = 1;
+ return 0;
+ }
+
+ atomic_inc(&qp->req.rd_atomic);
+ return -EAGAIN;
+}
+
+static inline int get_mtu(struct rxe_qp *qp)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+
+ if ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC))
+ return qp->mtu;
+
+ return rxe->port.mtu_cap;
+}
+
+static struct sk_buff *init_req_packet(struct rxe_qp *qp,
+ struct rxe_av *av,
+ struct rxe_send_wqe *wqe,
+ int opcode, u32 payload,
+ struct rxe_pkt_info *pkt)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct sk_buff *skb;
+ struct rxe_send_wr *ibwr = &wqe->wr;
+ int pad = (-payload) & 0x3;
+ int paylen;
+ int solicited;
+ u32 qp_num;
+ int ack_req;
+
+ /* length from start of bth to end of icrc */
+ paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE;
+ pkt->paylen = paylen;
+
+ /* init skb */
+ skb = rxe_init_packet(rxe, av, paylen, pkt);
+ if (unlikely(!skb))
+ return NULL;
+
+ /* init bth */
+ solicited = (ibwr->send_flags & IB_SEND_SOLICITED) &&
+ (pkt->mask & RXE_END_MASK) &&
+ ((pkt->mask & (RXE_SEND_MASK)) ||
+ (pkt->mask & (RXE_WRITE_MASK | RXE_IMMDT_MASK)) ==
+ (RXE_WRITE_MASK | RXE_IMMDT_MASK));
+
+ qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn :
+ qp->attr.dest_qp_num;
+
+ ack_req = ((pkt->mask & RXE_END_MASK) ||
+ (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
+ if (ack_req)
+ qp->req.noack_pkts = 0;
+
+ bth_init(pkt, pkt->opcode, solicited, 0, pad, IB_DEFAULT_PKEY_FULL, qp_num,
+ ack_req, pkt->psn);
+
+ /* init optional headers */
+ if (pkt->mask & RXE_RETH_MASK) {
+ if (pkt->mask & RXE_FETH_MASK)
+ reth_set_rkey(pkt, ibwr->wr.flush.rkey);
+ else
+ reth_set_rkey(pkt, ibwr->wr.rdma.rkey);
+ reth_set_va(pkt, wqe->iova);
+ reth_set_len(pkt, wqe->dma.resid);
+ }
+
+ /* Fill Flush Extension Transport Header */
+ if (pkt->mask & RXE_FETH_MASK)
+ feth_init(pkt, ibwr->wr.flush.type, ibwr->wr.flush.level);
+
+ if (pkt->mask & RXE_IMMDT_MASK)
+ immdt_set_imm(pkt, ibwr->ex.imm_data);
+
+ if (pkt->mask & RXE_IETH_MASK)
+ ieth_set_rkey(pkt, ibwr->ex.invalidate_rkey);
+
+ if (pkt->mask & RXE_ATMETH_MASK) {
+ atmeth_set_va(pkt, wqe->iova);
+ if (opcode == IB_OPCODE_RC_COMPARE_SWAP) {
+ atmeth_set_swap_add(pkt, ibwr->wr.atomic.swap);
+ atmeth_set_comp(pkt, ibwr->wr.atomic.compare_add);
+ } else {
+ atmeth_set_swap_add(pkt, ibwr->wr.atomic.compare_add);
+ }
+ atmeth_set_rkey(pkt, ibwr->wr.atomic.rkey);
+ }
+
+ if (pkt->mask & RXE_DETH_MASK) {
+ if (qp->ibqp.qp_num == 1)
+ deth_set_qkey(pkt, GSI_QKEY);
+ else
+ deth_set_qkey(pkt, ibwr->wr.ud.remote_qkey);
+ deth_set_sqp(pkt, qp->ibqp.qp_num);
+ }
+
+ return skb;
+}
+
+static int finish_packet(struct rxe_qp *qp, struct rxe_av *av,
+ struct rxe_send_wqe *wqe, struct rxe_pkt_info *pkt,
+ struct sk_buff *skb, u32 payload)
+{
+ int err;
+
+ err = rxe_prepare(av, pkt, skb);
+ if (err)
+ return err;
+
+ if (pkt->mask & RXE_WRITE_OR_SEND_MASK) {
+ if (wqe->wr.send_flags & IB_SEND_INLINE) {
+ u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
+
+ memcpy(payload_addr(pkt), tmp, payload);
+
+ wqe->dma.resid -= payload;
+ wqe->dma.sge_offset += payload;
+ } else {
+ err = copy_data(qp->pd, 0, &wqe->dma,
+ payload_addr(pkt), payload,
+ RXE_FROM_MR_OBJ);
+ if (err)
+ return err;
+ }
+ if (bth_pad(pkt)) {
+ u8 *pad = payload_addr(pkt) + payload;
+
+ memset(pad, 0, bth_pad(pkt));
+ }
+ } else if (pkt->mask & RXE_FLUSH_MASK) {
+ /* oA19-2: shall have no payload. */
+ wqe->dma.resid = 0;
+ }
+
+ if (pkt->mask & RXE_ATOMIC_WRITE_MASK) {
+ memcpy(payload_addr(pkt), wqe->dma.atomic_wr, payload);
+ wqe->dma.resid -= payload;
+ }
+
+ return 0;
+}
+
+static void update_wqe_state(struct rxe_qp *qp,
+ struct rxe_send_wqe *wqe,
+ struct rxe_pkt_info *pkt)
+{
+ if (pkt->mask & RXE_END_MASK) {
+ if (qp_type(qp) == IB_QPT_RC)
+ wqe->state = wqe_state_pending;
+ } else {
+ wqe->state = wqe_state_processing;
+ }
+}
+
+static void update_wqe_psn(struct rxe_qp *qp,
+ struct rxe_send_wqe *wqe,
+ struct rxe_pkt_info *pkt,
+ u32 payload)
+{
+ /* number of packets left to send including current one */
+ int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu;
+
+ /* handle zero length packet case */
+ if (num_pkt == 0)
+ num_pkt = 1;
+
+ if (pkt->mask & RXE_START_MASK) {
+ wqe->first_psn = qp->req.psn;
+ wqe->last_psn = (qp->req.psn + num_pkt - 1) & BTH_PSN_MASK;
+ }
+
+ if (pkt->mask & RXE_READ_MASK)
+ qp->req.psn = (wqe->first_psn + num_pkt) & BTH_PSN_MASK;
+ else
+ qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
+}
+
+static void save_state(struct rxe_send_wqe *wqe,
+ struct rxe_qp *qp,
+ struct rxe_send_wqe *rollback_wqe,
+ u32 *rollback_psn)
+{
+ rollback_wqe->state = wqe->state;
+ rollback_wqe->first_psn = wqe->first_psn;
+ rollback_wqe->last_psn = wqe->last_psn;
+ rollback_wqe->dma = wqe->dma;
+ *rollback_psn = qp->req.psn;
+}
+
+static void rollback_state(struct rxe_send_wqe *wqe,
+ struct rxe_qp *qp,
+ struct rxe_send_wqe *rollback_wqe,
+ u32 rollback_psn)
+{
+ wqe->state = rollback_wqe->state;
+ wqe->first_psn = rollback_wqe->first_psn;
+ wqe->last_psn = rollback_wqe->last_psn;
+ wqe->dma = rollback_wqe->dma;
+ qp->req.psn = rollback_psn;
+}
+
+static void update_state(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
+{
+ qp->req.opcode = pkt->opcode;
+
+ if (pkt->mask & RXE_END_MASK)
+ qp->req.wqe_index = queue_next_index(qp->sq.queue,
+ qp->req.wqe_index);
+
+ qp->need_req_skb = 0;
+
+ if (qp->qp_timeout_jiffies && !timer_pending(&qp->retrans_timer))
+ mod_timer(&qp->retrans_timer,
+ jiffies + qp->qp_timeout_jiffies);
+}
+
+static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
+{
+ u8 opcode = wqe->wr.opcode;
+ u32 rkey;
+ int ret;
+
+ switch (opcode) {
+ case IB_WR_LOCAL_INV:
+ rkey = wqe->wr.ex.invalidate_rkey;
+ if (rkey_is_mw(rkey))
+ ret = rxe_invalidate_mw(qp, rkey);
+ else
+ ret = rxe_invalidate_mr(qp, rkey);
+
+ if (unlikely(ret)) {
+ wqe->status = IB_WC_LOC_QP_OP_ERR;
+ return ret;
+ }
+ break;
+ case IB_WR_REG_MR:
+ ret = rxe_reg_fast_mr(qp, wqe);
+ if (unlikely(ret)) {
+ wqe->status = IB_WC_LOC_QP_OP_ERR;
+ return ret;
+ }
+ break;
+ case IB_WR_BIND_MW:
+ ret = rxe_bind_mw(qp, wqe);
+ if (unlikely(ret)) {
+ wqe->status = IB_WC_MW_BIND_ERR;
+ return ret;
+ }
+ break;
+ default:
+ rxe_dbg_qp(qp, "Unexpected send wqe opcode %d\n", opcode);
+ wqe->status = IB_WC_LOC_QP_OP_ERR;
+ return -EINVAL;
+ }
+
+ wqe->state = wqe_state_done;
+ wqe->status = IB_WC_SUCCESS;
+ qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index);
+
+ /* There is no ack coming for local work requests
+ * which can lead to a deadlock. So go ahead and complete
+ * it now.
+ */
+ rxe_sched_task(&qp->comp.task);
+
+ return 0;
+}
+
+int rxe_requester(struct rxe_qp *qp)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_pkt_info pkt;
+ struct sk_buff *skb;
+ struct rxe_send_wqe *wqe;
+ enum rxe_hdr_mask mask;
+ u32 payload;
+ int mtu;
+ int opcode;
+ int err;
+ int ret;
+ struct rxe_send_wqe rollback_wqe;
+ u32 rollback_psn;
+ struct rxe_queue *q = qp->sq.queue;
+ struct rxe_ah *ah;
+ struct rxe_av *av;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (unlikely(!qp->valid)) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ goto exit;
+ }
+
+ if (unlikely(qp_state(qp) == IB_QPS_ERR)) {
+ wqe = __req_next_wqe(qp);
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ if (wqe)
+ goto err;
+ else
+ goto exit;
+ }
+
+ if (unlikely(qp_state(qp) == IB_QPS_RESET)) {
+ qp->req.wqe_index = queue_get_consumer(q,
+ QUEUE_TYPE_FROM_CLIENT);
+ qp->req.opcode = -1;
+ qp->req.need_rd_atomic = 0;
+ qp->req.wait_psn = 0;
+ qp->req.need_retry = 0;
+ qp->req.wait_for_rnr_timer = 0;
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ goto exit;
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ /* we come here if the retransmit timer has fired
+ * or if the rnr timer has fired. If the retransmit
+ * timer fires while we are processing an RNR NAK wait
+ * until the rnr timer has fired before starting the
+ * retry flow
+ */
+ if (unlikely(qp->req.need_retry && !qp->req.wait_for_rnr_timer)) {
+ req_retry(qp);
+ qp->req.need_retry = 0;
+ }
+
+ wqe = req_next_wqe(qp);
+ if (unlikely(!wqe))
+ goto exit;
+
+ if (rxe_wqe_is_fenced(qp, wqe)) {
+ qp->req.wait_fence = 1;
+ goto exit;
+ }
+
+ if (wqe->mask & WR_LOCAL_OP_MASK) {
+ err = rxe_do_local_ops(qp, wqe);
+ if (unlikely(err))
+ goto err;
+ else
+ goto done;
+ }
+
+ if (unlikely(qp_type(qp) == IB_QPT_RC &&
+ psn_compare(qp->req.psn, (qp->comp.psn +
+ RXE_MAX_UNACKED_PSNS)) > 0)) {
+ qp->req.wait_psn = 1;
+ goto exit;
+ }
+
+ /* Limit the number of inflight SKBs per QP */
+ if (unlikely(atomic_read(&qp->skb_out) >
+ RXE_INFLIGHT_SKBS_PER_QP_HIGH)) {
+ qp->need_req_skb = 1;
+ goto exit;
+ }
+
+ opcode = next_opcode(qp, wqe, wqe->wr.opcode);
+ if (unlikely(opcode < 0)) {
+ wqe->status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+ }
+
+ mask = rxe_opcode[opcode].mask;
+ if (unlikely(mask & (RXE_READ_OR_ATOMIC_MASK |
+ RXE_ATOMIC_WRITE_MASK))) {
+ if (check_init_depth(qp, wqe))
+ goto exit;
+ }
+
+ mtu = get_mtu(qp);
+ payload = (mask & (RXE_WRITE_OR_SEND_MASK | RXE_ATOMIC_WRITE_MASK)) ?
+ wqe->dma.resid : 0;
+ if (payload > mtu) {
+ if (qp_type(qp) == IB_QPT_UD) {
+ /* C10-93.1.1: If the total sum of all the buffer lengths specified for a
+ * UD message exceeds the MTU of the port as returned by QueryHCA, the CI
+ * shall not emit any packets for this message. Further, the CI shall not
+ * generate an error due to this condition.
+ */
+
+ /* fake a successful UD send */
+ wqe->first_psn = qp->req.psn;
+ wqe->last_psn = qp->req.psn;
+ qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
+ qp->req.opcode = IB_OPCODE_UD_SEND_ONLY;
+ qp->req.wqe_index = queue_next_index(qp->sq.queue,
+ qp->req.wqe_index);
+ wqe->state = wqe_state_done;
+ wqe->status = IB_WC_SUCCESS;
+ rxe_sched_task(&qp->comp.task);
+ goto done;
+ }
+ payload = mtu;
+ }
+
+ pkt.rxe = rxe;
+ pkt.opcode = opcode;
+ pkt.qp = qp;
+ pkt.psn = qp->req.psn;
+ pkt.mask = rxe_opcode[opcode].mask;
+ pkt.wqe = wqe;
+
+ /* save wqe state before we build and send packet */
+ save_state(wqe, qp, &rollback_wqe, &rollback_psn);
+
+ av = rxe_get_av(&pkt, &ah);
+ if (unlikely(!av)) {
+ rxe_dbg_qp(qp, "Failed no address vector\n");
+ wqe->status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+ }
+
+ skb = init_req_packet(qp, av, wqe, opcode, payload, &pkt);
+ if (unlikely(!skb)) {
+ rxe_dbg_qp(qp, "Failed allocating skb\n");
+ wqe->status = IB_WC_LOC_QP_OP_ERR;
+ if (ah)
+ rxe_put(ah);
+ goto err;
+ }
+
+ err = finish_packet(qp, av, wqe, &pkt, skb, payload);
+ if (unlikely(err)) {
+ rxe_dbg_qp(qp, "Error during finish packet\n");
+ if (err == -EFAULT)
+ wqe->status = IB_WC_LOC_PROT_ERR;
+ else
+ wqe->status = IB_WC_LOC_QP_OP_ERR;
+ kfree_skb(skb);
+ if (ah)
+ rxe_put(ah);
+ goto err;
+ }
+
+ if (ah)
+ rxe_put(ah);
+
+ /* update wqe state as though we had sent it */
+ update_wqe_state(qp, wqe, &pkt);
+ update_wqe_psn(qp, wqe, &pkt, payload);
+
+ err = rxe_xmit_packet(qp, &pkt, skb);
+ if (err) {
+ if (err != -EAGAIN) {
+ wqe->status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+ }
+
+ /* the packet was dropped so reset wqe to the state
+ * before we sent it so we can try to resend
+ */
+ rollback_state(wqe, qp, &rollback_wqe, rollback_psn);
+
+ /* force a delay until the dropped packet is freed and
+ * the send queue is drained below the low water mark
+ */
+ qp->need_req_skb = 1;
+
+ rxe_sched_task(&qp->req.task);
+ goto exit;
+ }
+
+ update_state(qp, &pkt);
+
+ /* A non-zero return value will cause rxe_do_task to
+ * exit its loop and end the work item. A zero return
+ * will continue looping and return to rxe_requester
+ */
+done:
+ ret = 0;
+ goto out;
+err:
+ /* update wqe_index for each wqe completion */
+ qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index);
+ wqe->state = wqe_state_error;
+ rxe_qp_error(qp);
+exit:
+ ret = -EAGAIN;
+out:
+ return ret;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
new file mode 100644
index 0000000000..da470a925e
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -0,0 +1,1691 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/skbuff.h>
+
+#include "rxe.h"
+#include "rxe_loc.h"
+#include "rxe_queue.h"
+
+static char *resp_state_name[] = {
+ [RESPST_NONE] = "NONE",
+ [RESPST_GET_REQ] = "GET_REQ",
+ [RESPST_CHK_PSN] = "CHK_PSN",
+ [RESPST_CHK_OP_SEQ] = "CHK_OP_SEQ",
+ [RESPST_CHK_OP_VALID] = "CHK_OP_VALID",
+ [RESPST_CHK_RESOURCE] = "CHK_RESOURCE",
+ [RESPST_CHK_LENGTH] = "CHK_LENGTH",
+ [RESPST_CHK_RKEY] = "CHK_RKEY",
+ [RESPST_EXECUTE] = "EXECUTE",
+ [RESPST_READ_REPLY] = "READ_REPLY",
+ [RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY",
+ [RESPST_ATOMIC_WRITE_REPLY] = "ATOMIC_WRITE_REPLY",
+ [RESPST_PROCESS_FLUSH] = "PROCESS_FLUSH",
+ [RESPST_COMPLETE] = "COMPLETE",
+ [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE",
+ [RESPST_CLEANUP] = "CLEANUP",
+ [RESPST_DUPLICATE_REQUEST] = "DUPLICATE_REQUEST",
+ [RESPST_ERR_MALFORMED_WQE] = "ERR_MALFORMED_WQE",
+ [RESPST_ERR_UNSUPPORTED_OPCODE] = "ERR_UNSUPPORTED_OPCODE",
+ [RESPST_ERR_MISALIGNED_ATOMIC] = "ERR_MISALIGNED_ATOMIC",
+ [RESPST_ERR_PSN_OUT_OF_SEQ] = "ERR_PSN_OUT_OF_SEQ",
+ [RESPST_ERR_MISSING_OPCODE_FIRST] = "ERR_MISSING_OPCODE_FIRST",
+ [RESPST_ERR_MISSING_OPCODE_LAST_C] = "ERR_MISSING_OPCODE_LAST_C",
+ [RESPST_ERR_MISSING_OPCODE_LAST_D1E] = "ERR_MISSING_OPCODE_LAST_D1E",
+ [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ] = "ERR_TOO_MANY_RDMA_ATM_REQ",
+ [RESPST_ERR_RNR] = "ERR_RNR",
+ [RESPST_ERR_RKEY_VIOLATION] = "ERR_RKEY_VIOLATION",
+ [RESPST_ERR_INVALIDATE_RKEY] = "ERR_INVALIDATE_RKEY_VIOLATION",
+ [RESPST_ERR_LENGTH] = "ERR_LENGTH",
+ [RESPST_ERR_CQ_OVERFLOW] = "ERR_CQ_OVERFLOW",
+ [RESPST_ERROR] = "ERROR",
+ [RESPST_DONE] = "DONE",
+ [RESPST_EXIT] = "EXIT",
+};
+
+/* rxe_recv calls here to add a request packet to the input queue */
+void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
+{
+ int must_sched;
+ struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
+
+ skb_queue_tail(&qp->req_pkts, skb);
+
+ must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) ||
+ (skb_queue_len(&qp->req_pkts) > 1);
+
+ if (must_sched)
+ rxe_sched_task(&qp->resp.task);
+ else
+ rxe_run_task(&qp->resp.task);
+}
+
+static inline enum resp_states get_req(struct rxe_qp *qp,
+ struct rxe_pkt_info **pkt_p)
+{
+ struct sk_buff *skb;
+
+ skb = skb_peek(&qp->req_pkts);
+ if (!skb)
+ return RESPST_EXIT;
+
+ *pkt_p = SKB_TO_PKT(skb);
+
+ return (qp->resp.res) ? RESPST_READ_REPLY : RESPST_CHK_PSN;
+}
+
+static enum resp_states check_psn(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ int diff = psn_compare(pkt->psn, qp->resp.psn);
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ if (diff > 0) {
+ if (qp->resp.sent_psn_nak)
+ return RESPST_CLEANUP;
+
+ qp->resp.sent_psn_nak = 1;
+ rxe_counter_inc(rxe, RXE_CNT_OUT_OF_SEQ_REQ);
+ return RESPST_ERR_PSN_OUT_OF_SEQ;
+
+ } else if (diff < 0) {
+ rxe_counter_inc(rxe, RXE_CNT_DUP_REQ);
+ return RESPST_DUPLICATE_REQUEST;
+ }
+
+ if (qp->resp.sent_psn_nak)
+ qp->resp.sent_psn_nak = 0;
+
+ break;
+
+ case IB_QPT_UC:
+ if (qp->resp.drop_msg || diff != 0) {
+ if (pkt->mask & RXE_START_MASK) {
+ qp->resp.drop_msg = 0;
+ return RESPST_CHK_OP_SEQ;
+ }
+
+ qp->resp.drop_msg = 1;
+ return RESPST_CLEANUP;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return RESPST_CHK_OP_SEQ;
+}
+
+static enum resp_states check_op_seq(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ switch (qp->resp.opcode) {
+ case IB_OPCODE_RC_SEND_FIRST:
+ case IB_OPCODE_RC_SEND_MIDDLE:
+ switch (pkt->opcode) {
+ case IB_OPCODE_RC_SEND_MIDDLE:
+ case IB_OPCODE_RC_SEND_LAST:
+ case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE:
+ return RESPST_CHK_OP_VALID;
+ default:
+ return RESPST_ERR_MISSING_OPCODE_LAST_C;
+ }
+
+ case IB_OPCODE_RC_RDMA_WRITE_FIRST:
+ case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+ switch (pkt->opcode) {
+ case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+ case IB_OPCODE_RC_RDMA_WRITE_LAST:
+ case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ return RESPST_CHK_OP_VALID;
+ default:
+ return RESPST_ERR_MISSING_OPCODE_LAST_C;
+ }
+
+ default:
+ switch (pkt->opcode) {
+ case IB_OPCODE_RC_SEND_MIDDLE:
+ case IB_OPCODE_RC_SEND_LAST:
+ case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE:
+ case IB_OPCODE_RC_RDMA_WRITE_MIDDLE:
+ case IB_OPCODE_RC_RDMA_WRITE_LAST:
+ case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ return RESPST_ERR_MISSING_OPCODE_FIRST;
+ default:
+ return RESPST_CHK_OP_VALID;
+ }
+ }
+ break;
+
+ case IB_QPT_UC:
+ switch (qp->resp.opcode) {
+ case IB_OPCODE_UC_SEND_FIRST:
+ case IB_OPCODE_UC_SEND_MIDDLE:
+ switch (pkt->opcode) {
+ case IB_OPCODE_UC_SEND_MIDDLE:
+ case IB_OPCODE_UC_SEND_LAST:
+ case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE:
+ return RESPST_CHK_OP_VALID;
+ default:
+ return RESPST_ERR_MISSING_OPCODE_LAST_D1E;
+ }
+
+ case IB_OPCODE_UC_RDMA_WRITE_FIRST:
+ case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+ switch (pkt->opcode) {
+ case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+ case IB_OPCODE_UC_RDMA_WRITE_LAST:
+ case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ return RESPST_CHK_OP_VALID;
+ default:
+ return RESPST_ERR_MISSING_OPCODE_LAST_D1E;
+ }
+
+ default:
+ switch (pkt->opcode) {
+ case IB_OPCODE_UC_SEND_MIDDLE:
+ case IB_OPCODE_UC_SEND_LAST:
+ case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_UC_RDMA_WRITE_MIDDLE:
+ case IB_OPCODE_UC_RDMA_WRITE_LAST:
+ case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ qp->resp.drop_msg = 1;
+ return RESPST_CLEANUP;
+ default:
+ return RESPST_CHK_OP_VALID;
+ }
+ }
+ break;
+
+ default:
+ return RESPST_CHK_OP_VALID;
+ }
+}
+
+static bool check_qp_attr_access(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ if (((pkt->mask & RXE_READ_MASK) &&
+ !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) ||
+ ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) &&
+ !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) ||
+ ((pkt->mask & RXE_ATOMIC_MASK) &&
+ !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ return false;
+
+ if (pkt->mask & RXE_FLUSH_MASK) {
+ u32 flush_type = feth_plt(pkt);
+
+ if ((flush_type & IB_FLUSH_GLOBAL &&
+ !(qp->attr.qp_access_flags & IB_ACCESS_FLUSH_GLOBAL)) ||
+ (flush_type & IB_FLUSH_PERSISTENT &&
+ !(qp->attr.qp_access_flags & IB_ACCESS_FLUSH_PERSISTENT)))
+ return false;
+ }
+
+ return true;
+}
+
+static enum resp_states check_op_valid(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ switch (qp_type(qp)) {
+ case IB_QPT_RC:
+ if (!check_qp_attr_access(qp, pkt))
+ return RESPST_ERR_UNSUPPORTED_OPCODE;
+
+ break;
+
+ case IB_QPT_UC:
+ if ((pkt->mask & RXE_WRITE_MASK) &&
+ !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) {
+ qp->resp.drop_msg = 1;
+ return RESPST_CLEANUP;
+ }
+
+ break;
+
+ case IB_QPT_UD:
+ case IB_QPT_GSI:
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return RESPST_CHK_RESOURCE;
+}
+
+static enum resp_states get_srq_wqe(struct rxe_qp *qp)
+{
+ struct rxe_srq *srq = qp->srq;
+ struct rxe_queue *q = srq->rq.queue;
+ struct rxe_recv_wqe *wqe;
+ struct ib_event ev;
+ unsigned int count;
+ size_t size;
+ unsigned long flags;
+
+ if (srq->error)
+ return RESPST_ERR_RNR;
+
+ spin_lock_irqsave(&srq->rq.consumer_lock, flags);
+
+ wqe = queue_head(q, QUEUE_TYPE_FROM_CLIENT);
+ if (!wqe) {
+ spin_unlock_irqrestore(&srq->rq.consumer_lock, flags);
+ return RESPST_ERR_RNR;
+ }
+
+ /* don't trust user space data */
+ if (unlikely(wqe->dma.num_sge > srq->rq.max_sge)) {
+ spin_unlock_irqrestore(&srq->rq.consumer_lock, flags);
+ rxe_dbg_qp(qp, "invalid num_sge in SRQ entry\n");
+ return RESPST_ERR_MALFORMED_WQE;
+ }
+ size = sizeof(*wqe) + wqe->dma.num_sge*sizeof(struct rxe_sge);
+ memcpy(&qp->resp.srq_wqe, wqe, size);
+
+ qp->resp.wqe = &qp->resp.srq_wqe.wqe;
+ queue_advance_consumer(q, QUEUE_TYPE_FROM_CLIENT);
+ count = queue_count(q, QUEUE_TYPE_FROM_CLIENT);
+
+ if (srq->limit && srq->ibsrq.event_handler && (count < srq->limit)) {
+ srq->limit = 0;
+ goto event;
+ }
+
+ spin_unlock_irqrestore(&srq->rq.consumer_lock, flags);
+ return RESPST_CHK_LENGTH;
+
+event:
+ spin_unlock_irqrestore(&srq->rq.consumer_lock, flags);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context);
+ return RESPST_CHK_LENGTH;
+}
+
+static enum resp_states check_resource(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct rxe_srq *srq = qp->srq;
+
+ if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) {
+ /* it is the requesters job to not send
+ * too many read/atomic ops, we just
+ * recycle the responder resource queue
+ */
+ if (likely(qp->attr.max_dest_rd_atomic > 0))
+ return RESPST_CHK_LENGTH;
+ else
+ return RESPST_ERR_TOO_MANY_RDMA_ATM_REQ;
+ }
+
+ if (pkt->mask & RXE_RWR_MASK) {
+ if (srq)
+ return get_srq_wqe(qp);
+
+ qp->resp.wqe = queue_head(qp->rq.queue,
+ QUEUE_TYPE_FROM_CLIENT);
+ return (qp->resp.wqe) ? RESPST_CHK_LENGTH : RESPST_ERR_RNR;
+ }
+
+ return RESPST_CHK_LENGTH;
+}
+
+static enum resp_states rxe_resp_check_length(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ /*
+ * See IBA C9-92
+ * For UD QPs we only check if the packet will fit in the
+ * receive buffer later. For rmda operations additional
+ * length checks are performed in check_rkey.
+ */
+ if (pkt->mask & RXE_PAYLOAD_MASK && ((qp_type(qp) == IB_QPT_RC) ||
+ (qp_type(qp) == IB_QPT_UC))) {
+ unsigned int mtu = qp->mtu;
+ unsigned int payload = payload_size(pkt);
+
+ if ((pkt->mask & RXE_START_MASK) &&
+ (pkt->mask & RXE_END_MASK)) {
+ if (unlikely(payload > mtu)) {
+ rxe_dbg_qp(qp, "only packet too long");
+ return RESPST_ERR_LENGTH;
+ }
+ } else if ((pkt->mask & RXE_START_MASK) ||
+ (pkt->mask & RXE_MIDDLE_MASK)) {
+ if (unlikely(payload != mtu)) {
+ rxe_dbg_qp(qp, "first or middle packet not mtu");
+ return RESPST_ERR_LENGTH;
+ }
+ } else if (pkt->mask & RXE_END_MASK) {
+ if (unlikely((payload == 0) || (payload > mtu))) {
+ rxe_dbg_qp(qp, "last packet zero or too long");
+ return RESPST_ERR_LENGTH;
+ }
+ }
+ }
+
+ /* See IBA C9-94 */
+ if (pkt->mask & RXE_RETH_MASK) {
+ if (reth_len(pkt) > (1U << 31)) {
+ rxe_dbg_qp(qp, "dma length too long");
+ return RESPST_ERR_LENGTH;
+ }
+ }
+
+ if (pkt->mask & RXE_RDMA_OP_MASK)
+ return RESPST_CHK_RKEY;
+ else
+ return RESPST_EXECUTE;
+}
+
+/* if the reth length field is zero we can assume nothing
+ * about the rkey value and should not validate or use it.
+ * Instead set qp->resp.rkey to 0 which is an invalid rkey
+ * value since the minimum index part is 1.
+ */
+static void qp_resp_from_reth(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
+{
+ unsigned int length = reth_len(pkt);
+
+ qp->resp.va = reth_va(pkt);
+ qp->resp.offset = 0;
+ qp->resp.resid = length;
+ qp->resp.length = length;
+ if (pkt->mask & RXE_READ_OR_WRITE_MASK && length == 0)
+ qp->resp.rkey = 0;
+ else
+ qp->resp.rkey = reth_rkey(pkt);
+}
+
+static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
+{
+ qp->resp.va = atmeth_va(pkt);
+ qp->resp.offset = 0;
+ qp->resp.rkey = atmeth_rkey(pkt);
+ qp->resp.resid = sizeof(u64);
+}
+
+/* resolve the packet rkey to qp->resp.mr or set qp->resp.mr to NULL
+ * if an invalid rkey is received or the rdma length is zero. For middle
+ * or last packets use the stored value of mr.
+ */
+static enum resp_states check_rkey(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct rxe_mr *mr = NULL;
+ struct rxe_mw *mw = NULL;
+ u64 va;
+ u32 rkey;
+ u32 resid;
+ u32 pktlen;
+ int mtu = qp->mtu;
+ enum resp_states state;
+ int access = 0;
+
+ /* parse RETH or ATMETH header for first/only packets
+ * for va, length, rkey, etc. or use current value for
+ * middle/last packets.
+ */
+ if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
+ if (pkt->mask & RXE_RETH_MASK)
+ qp_resp_from_reth(qp, pkt);
+
+ access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
+ : IB_ACCESS_REMOTE_WRITE;
+ } else if (pkt->mask & RXE_FLUSH_MASK) {
+ u32 flush_type = feth_plt(pkt);
+
+ if (pkt->mask & RXE_RETH_MASK)
+ qp_resp_from_reth(qp, pkt);
+
+ if (flush_type & IB_FLUSH_GLOBAL)
+ access |= IB_ACCESS_FLUSH_GLOBAL;
+ if (flush_type & IB_FLUSH_PERSISTENT)
+ access |= IB_ACCESS_FLUSH_PERSISTENT;
+ } else if (pkt->mask & RXE_ATOMIC_MASK) {
+ qp_resp_from_atmeth(qp, pkt);
+ access = IB_ACCESS_REMOTE_ATOMIC;
+ } else {
+ /* shouldn't happen */
+ WARN_ON(1);
+ }
+
+ /* A zero-byte read or write op is not required to
+ * set an addr or rkey. See C9-88
+ */
+ if ((pkt->mask & RXE_READ_OR_WRITE_MASK) &&
+ (pkt->mask & RXE_RETH_MASK) && reth_len(pkt) == 0) {
+ qp->resp.mr = NULL;
+ return RESPST_EXECUTE;
+ }
+
+ va = qp->resp.va;
+ rkey = qp->resp.rkey;
+ resid = qp->resp.resid;
+ pktlen = payload_size(pkt);
+
+ if (rkey_is_mw(rkey)) {
+ mw = rxe_lookup_mw(qp, access, rkey);
+ if (!mw) {
+ rxe_dbg_qp(qp, "no MW matches rkey %#x\n", rkey);
+ state = RESPST_ERR_RKEY_VIOLATION;
+ goto err;
+ }
+
+ mr = mw->mr;
+ if (!mr) {
+ rxe_dbg_qp(qp, "MW doesn't have an MR\n");
+ state = RESPST_ERR_RKEY_VIOLATION;
+ goto err;
+ }
+
+ if (mw->access & IB_ZERO_BASED)
+ qp->resp.offset = mw->addr;
+
+ rxe_get(mr);
+ rxe_put(mw);
+ mw = NULL;
+ } else {
+ mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE);
+ if (!mr) {
+ rxe_dbg_qp(qp, "no MR matches rkey %#x\n", rkey);
+ state = RESPST_ERR_RKEY_VIOLATION;
+ goto err;
+ }
+ }
+
+ if (pkt->mask & RXE_FLUSH_MASK) {
+ /* FLUSH MR may not set va or resid
+ * no need to check range since we will flush whole mr
+ */
+ if (feth_sel(pkt) == IB_FLUSH_MR)
+ goto skip_check_range;
+ }
+
+ if (mr_check_range(mr, va + qp->resp.offset, resid)) {
+ state = RESPST_ERR_RKEY_VIOLATION;
+ goto err;
+ }
+
+skip_check_range:
+ if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) {
+ if (resid > mtu) {
+ if (pktlen != mtu || bth_pad(pkt)) {
+ state = RESPST_ERR_LENGTH;
+ goto err;
+ }
+ } else {
+ if (pktlen != resid) {
+ state = RESPST_ERR_LENGTH;
+ goto err;
+ }
+ if ((bth_pad(pkt) != (0x3 & (-resid)))) {
+ /* This case may not be exactly that
+ * but nothing else fits.
+ */
+ state = RESPST_ERR_LENGTH;
+ goto err;
+ }
+ }
+ }
+
+ WARN_ON_ONCE(qp->resp.mr);
+
+ qp->resp.mr = mr;
+ return RESPST_EXECUTE;
+
+err:
+ qp->resp.mr = NULL;
+ if (mr)
+ rxe_put(mr);
+ if (mw)
+ rxe_put(mw);
+
+ return state;
+}
+
+static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr,
+ int data_len)
+{
+ int err;
+
+ err = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma,
+ data_addr, data_len, RXE_TO_MR_OBJ);
+ if (unlikely(err))
+ return (err == -ENOSPC) ? RESPST_ERR_LENGTH
+ : RESPST_ERR_MALFORMED_WQE;
+
+ return RESPST_NONE;
+}
+
+static enum resp_states write_data_in(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ enum resp_states rc = RESPST_NONE;
+ int err;
+ int data_len = payload_size(pkt);
+
+ err = rxe_mr_copy(qp->resp.mr, qp->resp.va + qp->resp.offset,
+ payload_addr(pkt), data_len, RXE_TO_MR_OBJ);
+ if (err) {
+ rc = RESPST_ERR_RKEY_VIOLATION;
+ goto out;
+ }
+
+ qp->resp.va += data_len;
+ qp->resp.resid -= data_len;
+
+out:
+ return rc;
+}
+
+static struct resp_res *rxe_prepare_res(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt,
+ int type)
+{
+ struct resp_res *res;
+ u32 pkts;
+
+ res = &qp->resp.resources[qp->resp.res_head];
+ rxe_advance_resp_resource(qp);
+ free_rd_atomic_resource(res);
+
+ res->type = type;
+ res->replay = 0;
+
+ switch (type) {
+ case RXE_READ_MASK:
+ res->read.va = qp->resp.va + qp->resp.offset;
+ res->read.va_org = qp->resp.va + qp->resp.offset;
+ res->read.resid = qp->resp.resid;
+ res->read.length = qp->resp.resid;
+ res->read.rkey = qp->resp.rkey;
+
+ pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1);
+ res->first_psn = pkt->psn;
+ res->cur_psn = pkt->psn;
+ res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK;
+
+ res->state = rdatm_res_state_new;
+ break;
+ case RXE_ATOMIC_MASK:
+ case RXE_ATOMIC_WRITE_MASK:
+ res->first_psn = pkt->psn;
+ res->last_psn = pkt->psn;
+ res->cur_psn = pkt->psn;
+ break;
+ case RXE_FLUSH_MASK:
+ res->flush.va = qp->resp.va + qp->resp.offset;
+ res->flush.length = qp->resp.length;
+ res->flush.type = feth_plt(pkt);
+ res->flush.level = feth_sel(pkt);
+ }
+
+ return res;
+}
+
+static enum resp_states process_flush(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ u64 length, start;
+ struct rxe_mr *mr = qp->resp.mr;
+ struct resp_res *res = qp->resp.res;
+
+ /* oA19-14, oA19-15 */
+ if (res && res->replay)
+ return RESPST_ACKNOWLEDGE;
+ else if (!res) {
+ res = rxe_prepare_res(qp, pkt, RXE_FLUSH_MASK);
+ qp->resp.res = res;
+ }
+
+ if (res->flush.level == IB_FLUSH_RANGE) {
+ start = res->flush.va;
+ length = res->flush.length;
+ } else { /* level == IB_FLUSH_MR */
+ start = mr->ibmr.iova;
+ length = mr->ibmr.length;
+ }
+
+ if (res->flush.type & IB_FLUSH_PERSISTENT) {
+ if (rxe_flush_pmem_iova(mr, start, length))
+ return RESPST_ERR_RKEY_VIOLATION;
+ /* Make data persistent. */
+ wmb();
+ } else if (res->flush.type & IB_FLUSH_GLOBAL) {
+ /* Make data global visibility. */
+ wmb();
+ }
+
+ qp->resp.msn++;
+
+ /* next expected psn, read handles this separately */
+ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+ qp->resp.ack_psn = qp->resp.psn;
+
+ qp->resp.opcode = pkt->opcode;
+ qp->resp.status = IB_WC_SUCCESS;
+
+ return RESPST_ACKNOWLEDGE;
+}
+
+static enum resp_states atomic_reply(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct rxe_mr *mr = qp->resp.mr;
+ struct resp_res *res = qp->resp.res;
+ int err;
+
+ if (!res) {
+ res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK);
+ qp->resp.res = res;
+ }
+
+ if (!res->replay) {
+ u64 iova = qp->resp.va + qp->resp.offset;
+
+ err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode,
+ atmeth_comp(pkt),
+ atmeth_swap_add(pkt),
+ &res->atomic.orig_val);
+ if (err)
+ return err;
+
+ qp->resp.msn++;
+
+ /* next expected psn, read handles this separately */
+ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+ qp->resp.ack_psn = qp->resp.psn;
+
+ qp->resp.opcode = pkt->opcode;
+ qp->resp.status = IB_WC_SUCCESS;
+ }
+
+ return RESPST_ACKNOWLEDGE;
+}
+
+static enum resp_states atomic_write_reply(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct resp_res *res = qp->resp.res;
+ struct rxe_mr *mr;
+ u64 value;
+ u64 iova;
+ int err;
+
+ if (!res) {
+ res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK);
+ qp->resp.res = res;
+ }
+
+ if (res->replay)
+ return RESPST_ACKNOWLEDGE;
+
+ mr = qp->resp.mr;
+ value = *(u64 *)payload_addr(pkt);
+ iova = qp->resp.va + qp->resp.offset;
+
+ err = rxe_mr_do_atomic_write(mr, iova, value);
+ if (err)
+ return err;
+
+ qp->resp.resid = 0;
+ qp->resp.msn++;
+
+ /* next expected psn, read handles this separately */
+ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+ qp->resp.ack_psn = qp->resp.psn;
+
+ qp->resp.opcode = pkt->opcode;
+ qp->resp.status = IB_WC_SUCCESS;
+
+ return RESPST_ACKNOWLEDGE;
+}
+
+static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp,
+ struct rxe_pkt_info *ack,
+ int opcode,
+ int payload,
+ u32 psn,
+ u8 syndrome)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct sk_buff *skb;
+ int paylen;
+ int pad;
+ int err;
+
+ /*
+ * allocate packet
+ */
+ pad = (-payload) & 0x3;
+ paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE;
+
+ skb = rxe_init_packet(rxe, &qp->pri_av, paylen, ack);
+ if (!skb)
+ return NULL;
+
+ ack->qp = qp;
+ ack->opcode = opcode;
+ ack->mask = rxe_opcode[opcode].mask;
+ ack->paylen = paylen;
+ ack->psn = psn;
+
+ bth_init(ack, opcode, 0, 0, pad, IB_DEFAULT_PKEY_FULL,
+ qp->attr.dest_qp_num, 0, psn);
+
+ if (ack->mask & RXE_AETH_MASK) {
+ aeth_set_syn(ack, syndrome);
+ aeth_set_msn(ack, qp->resp.msn);
+ }
+
+ if (ack->mask & RXE_ATMACK_MASK)
+ atmack_set_orig(ack, qp->resp.res->atomic.orig_val);
+
+ err = rxe_prepare(&qp->pri_av, ack, skb);
+ if (err) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ return skb;
+}
+
+/**
+ * rxe_recheck_mr - revalidate MR from rkey and get a reference
+ * @qp: the qp
+ * @rkey: the rkey
+ *
+ * This code allows the MR to be invalidated or deregistered or
+ * the MW if one was used to be invalidated or deallocated.
+ * It is assumed that the access permissions if originally good
+ * are OK and the mappings to be unchanged.
+ *
+ * TODO: If someone reregisters an MR to change its size or
+ * access permissions during the processing of an RDMA read
+ * we should kill the responder resource and complete the
+ * operation with an error.
+ *
+ * Return: mr on success else NULL
+ */
+static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ struct rxe_mr *mr;
+ struct rxe_mw *mw;
+
+ if (rkey_is_mw(rkey)) {
+ mw = rxe_pool_get_index(&rxe->mw_pool, rkey >> 8);
+ if (!mw)
+ return NULL;
+
+ mr = mw->mr;
+ if (mw->rkey != rkey || mw->state != RXE_MW_STATE_VALID ||
+ !mr || mr->state != RXE_MR_STATE_VALID) {
+ rxe_put(mw);
+ return NULL;
+ }
+
+ rxe_get(mr);
+ rxe_put(mw);
+
+ return mr;
+ }
+
+ mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8);
+ if (!mr)
+ return NULL;
+
+ if (mr->rkey != rkey || mr->state != RXE_MR_STATE_VALID) {
+ rxe_put(mr);
+ return NULL;
+ }
+
+ return mr;
+}
+
+/* RDMA read response. If res is not NULL, then we have a current RDMA request
+ * being processed or replayed.
+ */
+static enum resp_states read_reply(struct rxe_qp *qp,
+ struct rxe_pkt_info *req_pkt)
+{
+ struct rxe_pkt_info ack_pkt;
+ struct sk_buff *skb;
+ int mtu = qp->mtu;
+ enum resp_states state;
+ int payload;
+ int opcode;
+ int err;
+ struct resp_res *res = qp->resp.res;
+ struct rxe_mr *mr;
+
+ if (!res) {
+ res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK);
+ qp->resp.res = res;
+ }
+
+ if (res->state == rdatm_res_state_new) {
+ if (!res->replay || qp->resp.length == 0) {
+ /* if length == 0 mr will be NULL (is ok)
+ * otherwise qp->resp.mr holds a ref on mr
+ * which we transfer to mr and drop below.
+ */
+ mr = qp->resp.mr;
+ qp->resp.mr = NULL;
+ } else {
+ mr = rxe_recheck_mr(qp, res->read.rkey);
+ if (!mr)
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ if (res->read.resid <= mtu)
+ opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY;
+ else
+ opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST;
+ } else {
+ /* re-lookup mr from rkey on all later packets.
+ * length will be non-zero. This can fail if someone
+ * modifies or destroys the mr since the first packet.
+ */
+ mr = rxe_recheck_mr(qp, res->read.rkey);
+ if (!mr)
+ return RESPST_ERR_RKEY_VIOLATION;
+
+ if (res->read.resid > mtu)
+ opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE;
+ else
+ opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST;
+ }
+
+ res->state = rdatm_res_state_next;
+
+ payload = min_t(int, res->read.resid, mtu);
+
+ skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload,
+ res->cur_psn, AETH_ACK_UNLIMITED);
+ if (!skb) {
+ state = RESPST_ERR_RNR;
+ goto err_out;
+ }
+
+ err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt),
+ payload, RXE_FROM_MR_OBJ);
+ if (err) {
+ kfree_skb(skb);
+ state = RESPST_ERR_RKEY_VIOLATION;
+ goto err_out;
+ }
+
+ if (bth_pad(&ack_pkt)) {
+ u8 *pad = payload_addr(&ack_pkt) + payload;
+
+ memset(pad, 0, bth_pad(&ack_pkt));
+ }
+
+ /* rxe_xmit_packet always consumes the skb */
+ err = rxe_xmit_packet(qp, &ack_pkt, skb);
+ if (err) {
+ state = RESPST_ERR_RNR;
+ goto err_out;
+ }
+
+ res->read.va += payload;
+ res->read.resid -= payload;
+ res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK;
+
+ if (res->read.resid > 0) {
+ state = RESPST_DONE;
+ } else {
+ qp->resp.res = NULL;
+ if (!res->replay)
+ qp->resp.opcode = -1;
+ if (psn_compare(res->cur_psn, qp->resp.psn) >= 0)
+ qp->resp.psn = res->cur_psn;
+ state = RESPST_CLEANUP;
+ }
+
+err_out:
+ if (mr)
+ rxe_put(mr);
+ return state;
+}
+
+static int invalidate_rkey(struct rxe_qp *qp, u32 rkey)
+{
+ if (rkey_is_mw(rkey))
+ return rxe_invalidate_mw(qp, rkey);
+ else
+ return rxe_invalidate_mr(qp, rkey);
+}
+
+/* Executes a new request. A retried request never reach that function (send
+ * and writes are discarded, and reads and atomics are retried elsewhere.
+ */
+static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
+{
+ enum resp_states err;
+ struct sk_buff *skb = PKT_TO_SKB(pkt);
+ union rdma_network_hdr hdr;
+
+ if (pkt->mask & RXE_SEND_MASK) {
+ if (qp_type(qp) == IB_QPT_UD ||
+ qp_type(qp) == IB_QPT_GSI) {
+ if (skb->protocol == htons(ETH_P_IP)) {
+ memset(&hdr.reserved, 0,
+ sizeof(hdr.reserved));
+ memcpy(&hdr.roce4grh, ip_hdr(skb),
+ sizeof(hdr.roce4grh));
+ err = send_data_in(qp, &hdr, sizeof(hdr));
+ } else {
+ err = send_data_in(qp, ipv6_hdr(skb),
+ sizeof(hdr));
+ }
+ if (err)
+ return err;
+ }
+ err = send_data_in(qp, payload_addr(pkt), payload_size(pkt));
+ if (err)
+ return err;
+ } else if (pkt->mask & RXE_WRITE_MASK) {
+ err = write_data_in(qp, pkt);
+ if (err)
+ return err;
+ } else if (pkt->mask & RXE_READ_MASK) {
+ /* For RDMA Read we can increment the msn now. See C9-148. */
+ qp->resp.msn++;
+ return RESPST_READ_REPLY;
+ } else if (pkt->mask & RXE_ATOMIC_MASK) {
+ return RESPST_ATOMIC_REPLY;
+ } else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) {
+ return RESPST_ATOMIC_WRITE_REPLY;
+ } else if (pkt->mask & RXE_FLUSH_MASK) {
+ return RESPST_PROCESS_FLUSH;
+ } else {
+ /* Unreachable */
+ WARN_ON_ONCE(1);
+ }
+
+ if (pkt->mask & RXE_IETH_MASK) {
+ u32 rkey = ieth_rkey(pkt);
+
+ err = invalidate_rkey(qp, rkey);
+ if (err)
+ return RESPST_ERR_INVALIDATE_RKEY;
+ }
+
+ if (pkt->mask & RXE_END_MASK)
+ /* We successfully processed this new request. */
+ qp->resp.msn++;
+
+ /* next expected psn, read handles this separately */
+ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+ qp->resp.ack_psn = qp->resp.psn;
+
+ qp->resp.opcode = pkt->opcode;
+ qp->resp.status = IB_WC_SUCCESS;
+
+ if (pkt->mask & RXE_COMP_MASK)
+ return RESPST_COMPLETE;
+ else if (qp_type(qp) == IB_QPT_RC)
+ return RESPST_ACKNOWLEDGE;
+ else
+ return RESPST_CLEANUP;
+}
+
+static enum resp_states do_complete(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct rxe_cqe cqe;
+ struct ib_wc *wc = &cqe.ibwc;
+ struct ib_uverbs_wc *uwc = &cqe.uibwc;
+ struct rxe_recv_wqe *wqe = qp->resp.wqe;
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ unsigned long flags;
+
+ if (!wqe)
+ goto finish;
+
+ memset(&cqe, 0, sizeof(cqe));
+
+ if (qp->rcq->is_user) {
+ uwc->status = qp->resp.status;
+ uwc->qp_num = qp->ibqp.qp_num;
+ uwc->wr_id = wqe->wr_id;
+ } else {
+ wc->status = qp->resp.status;
+ wc->qp = &qp->ibqp;
+ wc->wr_id = wqe->wr_id;
+ }
+
+ if (wc->status == IB_WC_SUCCESS) {
+ rxe_counter_inc(rxe, RXE_CNT_RDMA_RECV);
+ wc->opcode = (pkt->mask & RXE_IMMDT_MASK &&
+ pkt->mask & RXE_WRITE_MASK) ?
+ IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
+ wc->byte_len = (pkt->mask & RXE_IMMDT_MASK &&
+ pkt->mask & RXE_WRITE_MASK) ?
+ qp->resp.length : wqe->dma.length - wqe->dma.resid;
+
+ /* fields after byte_len are different between kernel and user
+ * space
+ */
+ if (qp->rcq->is_user) {
+ uwc->wc_flags = IB_WC_GRH;
+
+ if (pkt->mask & RXE_IMMDT_MASK) {
+ uwc->wc_flags |= IB_WC_WITH_IMM;
+ uwc->ex.imm_data = immdt_imm(pkt);
+ }
+
+ if (pkt->mask & RXE_IETH_MASK) {
+ uwc->wc_flags |= IB_WC_WITH_INVALIDATE;
+ uwc->ex.invalidate_rkey = ieth_rkey(pkt);
+ }
+
+ if (pkt->mask & RXE_DETH_MASK)
+ uwc->src_qp = deth_sqp(pkt);
+
+ uwc->port_num = qp->attr.port_num;
+ } else {
+ struct sk_buff *skb = PKT_TO_SKB(pkt);
+
+ wc->wc_flags = IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE;
+ if (skb->protocol == htons(ETH_P_IP))
+ wc->network_hdr_type = RDMA_NETWORK_IPV4;
+ else
+ wc->network_hdr_type = RDMA_NETWORK_IPV6;
+
+ if (is_vlan_dev(skb->dev)) {
+ wc->wc_flags |= IB_WC_WITH_VLAN;
+ wc->vlan_id = vlan_dev_vlan_id(skb->dev);
+ }
+
+ if (pkt->mask & RXE_IMMDT_MASK) {
+ wc->wc_flags |= IB_WC_WITH_IMM;
+ wc->ex.imm_data = immdt_imm(pkt);
+ }
+
+ if (pkt->mask & RXE_IETH_MASK) {
+ wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+ wc->ex.invalidate_rkey = ieth_rkey(pkt);
+ }
+
+ if (pkt->mask & RXE_DETH_MASK)
+ wc->src_qp = deth_sqp(pkt);
+
+ wc->port_num = qp->attr.port_num;
+ }
+ } else {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ rxe_err_qp(qp, "non-flush error status = %d",
+ wc->status);
+ }
+
+ /* have copy for srq and reference for !srq */
+ if (!qp->srq)
+ queue_advance_consumer(qp->rq.queue, QUEUE_TYPE_FROM_CLIENT);
+
+ qp->resp.wqe = NULL;
+
+ if (rxe_cq_post(qp->rcq, &cqe, pkt ? bth_se(pkt) : 1))
+ return RESPST_ERR_CQ_OVERFLOW;
+
+finish:
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (unlikely(qp_state(qp) == IB_QPS_ERR)) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ return RESPST_CHK_RESOURCE;
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ if (unlikely(!pkt))
+ return RESPST_DONE;
+ if (qp_type(qp) == IB_QPT_RC)
+ return RESPST_ACKNOWLEDGE;
+ else
+ return RESPST_CLEANUP;
+}
+
+
+static int send_common_ack(struct rxe_qp *qp, u8 syndrome, u32 psn,
+ int opcode, const char *msg)
+{
+ int err;
+ struct rxe_pkt_info ack_pkt;
+ struct sk_buff *skb;
+
+ skb = prepare_ack_packet(qp, &ack_pkt, opcode, 0, psn, syndrome);
+ if (!skb)
+ return -ENOMEM;
+
+ err = rxe_xmit_packet(qp, &ack_pkt, skb);
+ if (err)
+ rxe_dbg_qp(qp, "Failed sending %s\n", msg);
+
+ return err;
+}
+
+static int send_ack(struct rxe_qp *qp, u8 syndrome, u32 psn)
+{
+ return send_common_ack(qp, syndrome, psn,
+ IB_OPCODE_RC_ACKNOWLEDGE, "ACK");
+}
+
+static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn)
+{
+ int ret = send_common_ack(qp, syndrome, psn,
+ IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, "ATOMIC ACK");
+
+ /* have to clear this since it is used to trigger
+ * long read replies
+ */
+ qp->resp.res = NULL;
+ return ret;
+}
+
+static int send_read_response_ack(struct rxe_qp *qp, u8 syndrome, u32 psn)
+{
+ int ret = send_common_ack(qp, syndrome, psn,
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY,
+ "RDMA READ response of length zero ACK");
+
+ /* have to clear this since it is used to trigger
+ * long read replies
+ */
+ qp->resp.res = NULL;
+ return ret;
+}
+
+static enum resp_states acknowledge(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ if (qp_type(qp) != IB_QPT_RC)
+ return RESPST_CLEANUP;
+
+ if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED)
+ send_ack(qp, qp->resp.aeth_syndrome, pkt->psn);
+ else if (pkt->mask & RXE_ATOMIC_MASK)
+ send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
+ else if (pkt->mask & (RXE_FLUSH_MASK | RXE_ATOMIC_WRITE_MASK))
+ send_read_response_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
+ else if (bth_ack(pkt))
+ send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn);
+
+ return RESPST_CLEANUP;
+}
+
+static enum resp_states cleanup(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ struct sk_buff *skb;
+
+ if (pkt) {
+ skb = skb_dequeue(&qp->req_pkts);
+ rxe_put(qp);
+ kfree_skb(skb);
+ ib_device_put(qp->ibqp.device);
+ }
+
+ if (qp->resp.mr) {
+ rxe_put(qp->resp.mr);
+ qp->resp.mr = NULL;
+ }
+
+ return RESPST_DONE;
+}
+
+static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn)
+{
+ int i;
+
+ for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) {
+ struct resp_res *res = &qp->resp.resources[i];
+
+ if (res->type == 0)
+ continue;
+
+ if (psn_compare(psn, res->first_psn) >= 0 &&
+ psn_compare(psn, res->last_psn) <= 0) {
+ return res;
+ }
+ }
+
+ return NULL;
+}
+
+static enum resp_states duplicate_request(struct rxe_qp *qp,
+ struct rxe_pkt_info *pkt)
+{
+ enum resp_states rc;
+ u32 prev_psn = (qp->resp.ack_psn - 1) & BTH_PSN_MASK;
+
+ if (pkt->mask & RXE_SEND_MASK ||
+ pkt->mask & RXE_WRITE_MASK) {
+ /* SEND. Ack again and cleanup. C9-105. */
+ send_ack(qp, AETH_ACK_UNLIMITED, prev_psn);
+ return RESPST_CLEANUP;
+ } else if (pkt->mask & RXE_FLUSH_MASK) {
+ struct resp_res *res;
+
+ /* Find the operation in our list of responder resources. */
+ res = find_resource(qp, pkt->psn);
+ if (res) {
+ res->replay = 1;
+ res->cur_psn = pkt->psn;
+ qp->resp.res = res;
+ rc = RESPST_PROCESS_FLUSH;
+ goto out;
+ }
+
+ /* Resource not found. Class D error. Drop the request. */
+ rc = RESPST_CLEANUP;
+ goto out;
+ } else if (pkt->mask & RXE_READ_MASK) {
+ struct resp_res *res;
+
+ res = find_resource(qp, pkt->psn);
+ if (!res) {
+ /* Resource not found. Class D error. Drop the
+ * request.
+ */
+ rc = RESPST_CLEANUP;
+ goto out;
+ } else {
+ /* Ensure this new request is the same as the previous
+ * one or a subset of it.
+ */
+ u64 iova = reth_va(pkt);
+ u32 resid = reth_len(pkt);
+
+ if (iova < res->read.va_org ||
+ resid > res->read.length ||
+ (iova + resid) > (res->read.va_org +
+ res->read.length)) {
+ rc = RESPST_CLEANUP;
+ goto out;
+ }
+
+ if (reth_rkey(pkt) != res->read.rkey) {
+ rc = RESPST_CLEANUP;
+ goto out;
+ }
+
+ res->cur_psn = pkt->psn;
+ res->state = (pkt->psn == res->first_psn) ?
+ rdatm_res_state_new :
+ rdatm_res_state_replay;
+ res->replay = 1;
+
+ /* Reset the resource, except length. */
+ res->read.va_org = iova;
+ res->read.va = iova;
+ res->read.resid = resid;
+
+ /* Replay the RDMA read reply. */
+ qp->resp.res = res;
+ rc = RESPST_READ_REPLY;
+ goto out;
+ }
+ } else {
+ struct resp_res *res;
+
+ /* Find the operation in our list of responder resources. */
+ res = find_resource(qp, pkt->psn);
+ if (res) {
+ res->replay = 1;
+ res->cur_psn = pkt->psn;
+ qp->resp.res = res;
+ rc = pkt->mask & RXE_ATOMIC_MASK ?
+ RESPST_ATOMIC_REPLY :
+ RESPST_ATOMIC_WRITE_REPLY;
+ goto out;
+ }
+
+ /* Resource not found. Class D error. Drop the request. */
+ rc = RESPST_CLEANUP;
+ goto out;
+ }
+out:
+ return rc;
+}
+
+/* Process a class A or C. Both are treated the same in this implementation. */
+static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome,
+ enum ib_wc_status status)
+{
+ qp->resp.aeth_syndrome = syndrome;
+ qp->resp.status = status;
+
+ /* indicate that we should go through the ERROR state */
+ qp->resp.goto_error = 1;
+}
+
+static enum resp_states do_class_d1e_error(struct rxe_qp *qp)
+{
+ /* UC */
+ if (qp->srq) {
+ /* Class E */
+ qp->resp.drop_msg = 1;
+ if (qp->resp.wqe) {
+ qp->resp.status = IB_WC_REM_INV_REQ_ERR;
+ return RESPST_COMPLETE;
+ } else {
+ return RESPST_CLEANUP;
+ }
+ } else {
+ /* Class D1. This packet may be the start of a
+ * new message and could be valid. The previous
+ * message is invalid and ignored. reset the
+ * recv wr to its original state
+ */
+ if (qp->resp.wqe) {
+ qp->resp.wqe->dma.resid = qp->resp.wqe->dma.length;
+ qp->resp.wqe->dma.cur_sge = 0;
+ qp->resp.wqe->dma.sge_offset = 0;
+ qp->resp.opcode = -1;
+ }
+
+ if (qp->resp.mr) {
+ rxe_put(qp->resp.mr);
+ qp->resp.mr = NULL;
+ }
+
+ return RESPST_CLEANUP;
+ }
+}
+
+/* drain incoming request packet queue */
+static void drain_req_pkts(struct rxe_qp *qp)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&qp->req_pkts))) {
+ rxe_put(qp);
+ kfree_skb(skb);
+ ib_device_put(qp->ibqp.device);
+ }
+}
+
+/* complete receive wqe with flush error */
+static int flush_recv_wqe(struct rxe_qp *qp, struct rxe_recv_wqe *wqe)
+{
+ struct rxe_cqe cqe = {};
+ struct ib_wc *wc = &cqe.ibwc;
+ struct ib_uverbs_wc *uwc = &cqe.uibwc;
+ int err;
+
+ if (qp->rcq->is_user) {
+ uwc->wr_id = wqe->wr_id;
+ uwc->status = IB_WC_WR_FLUSH_ERR;
+ uwc->qp_num = qp_num(qp);
+ } else {
+ wc->wr_id = wqe->wr_id;
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ wc->qp = &qp->ibqp;
+ }
+
+ err = rxe_cq_post(qp->rcq, &cqe, 0);
+ if (err)
+ rxe_dbg_cq(qp->rcq, "post cq failed err = %d", err);
+
+ return err;
+}
+
+/* drain and optionally complete the recive queue
+ * if unable to complete a wqe stop completing and
+ * just flush the remaining wqes
+ */
+static void flush_recv_queue(struct rxe_qp *qp, bool notify)
+{
+ struct rxe_queue *q = qp->rq.queue;
+ struct rxe_recv_wqe *wqe;
+ int err;
+
+ if (qp->srq) {
+ if (notify && qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ return;
+ }
+
+ /* recv queue not created. nothing to do. */
+ if (!qp->rq.queue)
+ return;
+
+ while ((wqe = queue_head(q, q->type))) {
+ if (notify) {
+ err = flush_recv_wqe(qp, wqe);
+ if (err)
+ notify = 0;
+ }
+ queue_advance_consumer(q, q->type);
+ }
+
+ qp->resp.wqe = NULL;
+}
+
+int rxe_responder(struct rxe_qp *qp)
+{
+ struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
+ enum resp_states state;
+ struct rxe_pkt_info *pkt = NULL;
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (!qp->valid || qp_state(qp) == IB_QPS_ERR ||
+ qp_state(qp) == IB_QPS_RESET) {
+ bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR);
+
+ drain_req_pkts(qp);
+ flush_recv_queue(qp, notify);
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ goto exit;
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED;
+
+ state = RESPST_GET_REQ;
+
+ while (1) {
+ rxe_dbg_qp(qp, "state = %s\n", resp_state_name[state]);
+ switch (state) {
+ case RESPST_GET_REQ:
+ state = get_req(qp, &pkt);
+ break;
+ case RESPST_CHK_PSN:
+ state = check_psn(qp, pkt);
+ break;
+ case RESPST_CHK_OP_SEQ:
+ state = check_op_seq(qp, pkt);
+ break;
+ case RESPST_CHK_OP_VALID:
+ state = check_op_valid(qp, pkt);
+ break;
+ case RESPST_CHK_RESOURCE:
+ state = check_resource(qp, pkt);
+ break;
+ case RESPST_CHK_LENGTH:
+ state = rxe_resp_check_length(qp, pkt);
+ break;
+ case RESPST_CHK_RKEY:
+ state = check_rkey(qp, pkt);
+ break;
+ case RESPST_EXECUTE:
+ state = execute(qp, pkt);
+ break;
+ case RESPST_COMPLETE:
+ state = do_complete(qp, pkt);
+ break;
+ case RESPST_READ_REPLY:
+ state = read_reply(qp, pkt);
+ break;
+ case RESPST_ATOMIC_REPLY:
+ state = atomic_reply(qp, pkt);
+ break;
+ case RESPST_ATOMIC_WRITE_REPLY:
+ state = atomic_write_reply(qp, pkt);
+ break;
+ case RESPST_PROCESS_FLUSH:
+ state = process_flush(qp, pkt);
+ break;
+ case RESPST_ACKNOWLEDGE:
+ state = acknowledge(qp, pkt);
+ break;
+ case RESPST_CLEANUP:
+ state = cleanup(qp, pkt);
+ break;
+ case RESPST_DUPLICATE_REQUEST:
+ state = duplicate_request(qp, pkt);
+ break;
+ case RESPST_ERR_PSN_OUT_OF_SEQ:
+ /* RC only - Class B. Drop packet. */
+ send_ack(qp, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn);
+ state = RESPST_CLEANUP;
+ break;
+
+ case RESPST_ERR_TOO_MANY_RDMA_ATM_REQ:
+ case RESPST_ERR_MISSING_OPCODE_FIRST:
+ case RESPST_ERR_MISSING_OPCODE_LAST_C:
+ case RESPST_ERR_UNSUPPORTED_OPCODE:
+ case RESPST_ERR_MISALIGNED_ATOMIC:
+ /* RC Only - Class C. */
+ do_class_ac_error(qp, AETH_NAK_INVALID_REQ,
+ IB_WC_REM_INV_REQ_ERR);
+ state = RESPST_COMPLETE;
+ break;
+
+ case RESPST_ERR_MISSING_OPCODE_LAST_D1E:
+ state = do_class_d1e_error(qp);
+ break;
+ case RESPST_ERR_RNR:
+ if (qp_type(qp) == IB_QPT_RC) {
+ rxe_counter_inc(rxe, RXE_CNT_SND_RNR);
+ /* RC - class B */
+ send_ack(qp, AETH_RNR_NAK |
+ (~AETH_TYPE_MASK &
+ qp->attr.min_rnr_timer),
+ pkt->psn);
+ } else {
+ /* UD/UC - class D */
+ qp->resp.drop_msg = 1;
+ }
+ state = RESPST_CLEANUP;
+ break;
+
+ case RESPST_ERR_RKEY_VIOLATION:
+ if (qp_type(qp) == IB_QPT_RC) {
+ /* Class C */
+ do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR,
+ IB_WC_REM_ACCESS_ERR);
+ state = RESPST_COMPLETE;
+ } else {
+ qp->resp.drop_msg = 1;
+ if (qp->srq) {
+ /* UC/SRQ Class D */
+ qp->resp.status = IB_WC_REM_ACCESS_ERR;
+ state = RESPST_COMPLETE;
+ } else {
+ /* UC/non-SRQ Class E. */
+ state = RESPST_CLEANUP;
+ }
+ }
+ break;
+
+ case RESPST_ERR_INVALIDATE_RKEY:
+ /* RC - Class J. */
+ qp->resp.goto_error = 1;
+ qp->resp.status = IB_WC_REM_INV_REQ_ERR;
+ state = RESPST_COMPLETE;
+ break;
+
+ case RESPST_ERR_LENGTH:
+ if (qp_type(qp) == IB_QPT_RC) {
+ /* Class C */
+ do_class_ac_error(qp, AETH_NAK_INVALID_REQ,
+ IB_WC_REM_INV_REQ_ERR);
+ state = RESPST_COMPLETE;
+ } else if (qp->srq) {
+ /* UC/UD - class E */
+ qp->resp.status = IB_WC_REM_INV_REQ_ERR;
+ state = RESPST_COMPLETE;
+ } else {
+ /* UC/UD - class D */
+ qp->resp.drop_msg = 1;
+ state = RESPST_CLEANUP;
+ }
+ break;
+
+ case RESPST_ERR_MALFORMED_WQE:
+ /* All, Class A. */
+ do_class_ac_error(qp, AETH_NAK_REM_OP_ERR,
+ IB_WC_LOC_QP_OP_ERR);
+ state = RESPST_COMPLETE;
+ break;
+
+ case RESPST_ERR_CQ_OVERFLOW:
+ /* All - Class G */
+ state = RESPST_ERROR;
+ break;
+
+ case RESPST_DONE:
+ if (qp->resp.goto_error) {
+ state = RESPST_ERROR;
+ break;
+ }
+
+ goto done;
+
+ case RESPST_EXIT:
+ if (qp->resp.goto_error) {
+ state = RESPST_ERROR;
+ break;
+ }
+
+ goto exit;
+
+ case RESPST_ERROR:
+ qp->resp.goto_error = 0;
+ rxe_dbg_qp(qp, "moved to error state\n");
+ rxe_qp_error(qp);
+ goto exit;
+
+ default:
+ WARN_ON_ONCE(1);
+ }
+ }
+
+ /* A non-zero return value will cause rxe_do_task to
+ * exit its loop and end the work item. A zero return
+ * will continue looping and return to rxe_responder
+ */
+done:
+ ret = 0;
+ goto out;
+exit:
+ ret = -EAGAIN;
+out:
+ return ret;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c
new file mode 100644
index 0000000000..3661cb627d
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_srq.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/vmalloc.h>
+#include "rxe.h"
+#include "rxe_queue.h"
+
+int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init)
+{
+ struct ib_srq_attr *attr = &init->attr;
+
+ if (attr->max_wr > rxe->attr.max_srq_wr) {
+ rxe_dbg_dev(rxe, "max_wr(%d) > max_srq_wr(%d)\n",
+ attr->max_wr, rxe->attr.max_srq_wr);
+ goto err1;
+ }
+
+ if (attr->max_wr <= 0) {
+ rxe_dbg_dev(rxe, "max_wr(%d) <= 0\n", attr->max_wr);
+ goto err1;
+ }
+
+ if (attr->max_wr < RXE_MIN_SRQ_WR)
+ attr->max_wr = RXE_MIN_SRQ_WR;
+
+ if (attr->max_sge > rxe->attr.max_srq_sge) {
+ rxe_dbg_dev(rxe, "max_sge(%d) > max_srq_sge(%d)\n",
+ attr->max_sge, rxe->attr.max_srq_sge);
+ goto err1;
+ }
+
+ if (attr->max_sge < RXE_MIN_SRQ_SGE)
+ attr->max_sge = RXE_MIN_SRQ_SGE;
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_init_attr *init, struct ib_udata *udata,
+ struct rxe_create_srq_resp __user *uresp)
+{
+ struct rxe_queue *q;
+ int wqe_size;
+ int err;
+
+ srq->ibsrq.event_handler = init->event_handler;
+ srq->ibsrq.srq_context = init->srq_context;
+ srq->limit = init->attr.srq_limit;
+ srq->srq_num = srq->elem.index;
+ srq->rq.max_wr = init->attr.max_wr;
+ srq->rq.max_sge = init->attr.max_sge;
+
+ wqe_size = sizeof(struct rxe_recv_wqe) +
+ srq->rq.max_sge*sizeof(struct ib_sge);
+
+ spin_lock_init(&srq->rq.producer_lock);
+ spin_lock_init(&srq->rq.consumer_lock);
+
+ q = rxe_queue_init(rxe, &srq->rq.max_wr, wqe_size,
+ QUEUE_TYPE_FROM_CLIENT);
+ if (!q) {
+ rxe_dbg_srq(srq, "Unable to allocate queue\n");
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, q->buf,
+ q->buf_size, &q->ip);
+ if (err) {
+ rxe_dbg_srq(srq, "Unable to init mmap info for caller\n");
+ goto err_free;
+ }
+
+ srq->rq.queue = q;
+ init->attr.max_wr = srq->rq.max_wr;
+
+ if (uresp) {
+ if (copy_to_user(&uresp->srq_num, &srq->srq_num,
+ sizeof(uresp->srq_num))) {
+ rxe_queue_cleanup(q);
+ return -EFAULT;
+ }
+ }
+
+ return 0;
+
+err_free:
+ vfree(q->buf);
+ kfree(q);
+err_out:
+ return err;
+}
+
+int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_attr *attr, enum ib_srq_attr_mask mask)
+{
+ if (srq->error) {
+ rxe_dbg_srq(srq, "in error state\n");
+ goto err1;
+ }
+
+ if (mask & IB_SRQ_MAX_WR) {
+ if (attr->max_wr > rxe->attr.max_srq_wr) {
+ rxe_dbg_srq(srq, "max_wr(%d) > max_srq_wr(%d)\n",
+ attr->max_wr, rxe->attr.max_srq_wr);
+ goto err1;
+ }
+
+ if (attr->max_wr <= 0) {
+ rxe_dbg_srq(srq, "max_wr(%d) <= 0\n", attr->max_wr);
+ goto err1;
+ }
+
+ if (srq->limit && (attr->max_wr < srq->limit)) {
+ rxe_dbg_srq(srq, "max_wr (%d) < srq->limit (%d)\n",
+ attr->max_wr, srq->limit);
+ goto err1;
+ }
+
+ if (attr->max_wr < RXE_MIN_SRQ_WR)
+ attr->max_wr = RXE_MIN_SRQ_WR;
+ }
+
+ if (mask & IB_SRQ_LIMIT) {
+ if (attr->srq_limit > rxe->attr.max_srq_wr) {
+ rxe_dbg_srq(srq, "srq_limit(%d) > max_srq_wr(%d)\n",
+ attr->srq_limit, rxe->attr.max_srq_wr);
+ goto err1;
+ }
+
+ if (attr->srq_limit > srq->rq.queue->buf->index_mask) {
+ rxe_dbg_srq(srq, "srq_limit (%d) > cur limit(%d)\n",
+ attr->srq_limit,
+ srq->rq.queue->buf->index_mask);
+ goto err1;
+ }
+ }
+
+ return 0;
+
+err1:
+ return -EINVAL;
+}
+
+int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
+ struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
+ struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata)
+{
+ struct rxe_queue *q = srq->rq.queue;
+ struct mminfo __user *mi = NULL;
+ int wqe_size;
+ int err;
+
+ if (mask & IB_SRQ_MAX_WR) {
+ /*
+ * This is completely screwed up, the response is supposed to
+ * be in the outbuf not like this.
+ */
+ mi = u64_to_user_ptr(ucmd->mmap_info_addr);
+
+ wqe_size = sizeof(struct rxe_recv_wqe) +
+ srq->rq.max_sge*sizeof(struct ib_sge);
+
+ err = rxe_queue_resize(q, &attr->max_wr, wqe_size,
+ udata, mi, &srq->rq.producer_lock,
+ &srq->rq.consumer_lock);
+ if (err)
+ goto err_free;
+
+ srq->rq.max_wr = attr->max_wr;
+ }
+
+ if (mask & IB_SRQ_LIMIT)
+ srq->limit = attr->srq_limit;
+
+ return 0;
+
+err_free:
+ rxe_queue_cleanup(q);
+ srq->rq.queue = NULL;
+ return err;
+}
+
+void rxe_srq_cleanup(struct rxe_pool_elem *elem)
+{
+ struct rxe_srq *srq = container_of(elem, typeof(*srq), elem);
+
+ if (srq->pd)
+ rxe_put(srq->pd);
+
+ if (srq->rq.queue)
+ rxe_queue_cleanup(srq->rq.queue);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c
new file mode 100644
index 0000000000..1501120d4f
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_task.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include "rxe.h"
+
+static struct workqueue_struct *rxe_wq;
+
+int rxe_alloc_wq(void)
+{
+ rxe_wq = alloc_workqueue("rxe_wq", WQ_UNBOUND, WQ_MAX_ACTIVE);
+ if (!rxe_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void rxe_destroy_wq(void)
+{
+ destroy_workqueue(rxe_wq);
+}
+
+/* Check if task is idle i.e. not running, not scheduled in
+ * work queue and not draining. If so move to busy to
+ * reserve a slot in do_task() by setting to busy and taking
+ * a qp reference to cover the gap from now until the task finishes.
+ * state will move out of busy if task returns a non zero value
+ * in do_task(). If state is already busy it is raised to armed
+ * to indicate to do_task that additional pass should be made
+ * over the task.
+ * Context: caller should hold task->lock.
+ * Returns: true if state transitioned from idle to busy else false.
+ */
+static bool __reserve_if_idle(struct rxe_task *task)
+{
+ WARN_ON(rxe_read(task->qp) <= 0);
+
+ if (task->state == TASK_STATE_IDLE) {
+ rxe_get(task->qp);
+ task->state = TASK_STATE_BUSY;
+ task->num_sched++;
+ return true;
+ }
+
+ if (task->state == TASK_STATE_BUSY)
+ task->state = TASK_STATE_ARMED;
+
+ return false;
+}
+
+/* check if task is idle or drained and not currently
+ * scheduled in the work queue. This routine is
+ * called by rxe_cleanup_task or rxe_disable_task to
+ * see if the queue is empty.
+ * Context: caller should hold task->lock.
+ * Returns true if done else false.
+ */
+static bool __is_done(struct rxe_task *task)
+{
+ if (work_pending(&task->work))
+ return false;
+
+ if (task->state == TASK_STATE_IDLE ||
+ task->state == TASK_STATE_DRAINED) {
+ return true;
+ }
+
+ return false;
+}
+
+/* a locked version of __is_done */
+static bool is_done(struct rxe_task *task)
+{
+ unsigned long flags;
+ int done;
+
+ spin_lock_irqsave(&task->lock, flags);
+ done = __is_done(task);
+ spin_unlock_irqrestore(&task->lock, flags);
+
+ return done;
+}
+
+/* do_task is a wrapper for the three tasks (requester,
+ * completer, responder) and calls them in a loop until
+ * they return a non-zero value. It is called either
+ * directly by rxe_run_task or indirectly if rxe_sched_task
+ * schedules the task. They must call __reserve_if_idle to
+ * move the task to busy before calling or scheduling.
+ * The task can also be moved to drained or invalid
+ * by calls to rxe_cleanup_task or rxe_disable_task.
+ * In that case tasks which get here are not executed but
+ * just flushed. The tasks are designed to look to see if
+ * there is work to do and then do part of it before returning
+ * here with a return value of zero until all the work
+ * has been consumed then it returns a non-zero value.
+ * The number of times the task can be run is limited by
+ * max iterations so one task cannot hold the cpu forever.
+ * If the limit is hit and work remains the task is rescheduled.
+ */
+static void do_task(struct rxe_task *task)
+{
+ unsigned int iterations;
+ unsigned long flags;
+ int resched = 0;
+ int cont;
+ int ret;
+
+ WARN_ON(rxe_read(task->qp) <= 0);
+
+ spin_lock_irqsave(&task->lock, flags);
+ if (task->state >= TASK_STATE_DRAINED) {
+ rxe_put(task->qp);
+ task->num_done++;
+ spin_unlock_irqrestore(&task->lock, flags);
+ return;
+ }
+ spin_unlock_irqrestore(&task->lock, flags);
+
+ do {
+ iterations = RXE_MAX_ITERATIONS;
+ cont = 0;
+
+ do {
+ ret = task->func(task->qp);
+ } while (ret == 0 && iterations-- > 0);
+
+ spin_lock_irqsave(&task->lock, flags);
+ /* we're not done yet but we ran out of iterations.
+ * yield the cpu and reschedule the task
+ */
+ if (!ret) {
+ task->state = TASK_STATE_IDLE;
+ resched = 1;
+ goto exit;
+ }
+
+ switch (task->state) {
+ case TASK_STATE_BUSY:
+ task->state = TASK_STATE_IDLE;
+ break;
+
+ /* someone tried to schedule the task while we
+ * were running, keep going
+ */
+ case TASK_STATE_ARMED:
+ task->state = TASK_STATE_BUSY;
+ cont = 1;
+ break;
+
+ case TASK_STATE_DRAINING:
+ task->state = TASK_STATE_DRAINED;
+ break;
+
+ default:
+ WARN_ON(1);
+ rxe_dbg_qp(task->qp, "unexpected task state = %d",
+ task->state);
+ task->state = TASK_STATE_IDLE;
+ }
+
+exit:
+ if (!cont) {
+ task->num_done++;
+ if (WARN_ON(task->num_done != task->num_sched))
+ rxe_dbg_qp(
+ task->qp,
+ "%ld tasks scheduled, %ld tasks done",
+ task->num_sched, task->num_done);
+ }
+ spin_unlock_irqrestore(&task->lock, flags);
+ } while (cont);
+
+ task->ret = ret;
+
+ if (resched)
+ rxe_sched_task(task);
+
+ rxe_put(task->qp);
+}
+
+/* wrapper around do_task to fix argument for work queue */
+static void do_work(struct work_struct *work)
+{
+ do_task(container_of(work, struct rxe_task, work));
+}
+
+int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp,
+ int (*func)(struct rxe_qp *))
+{
+ WARN_ON(rxe_read(qp) <= 0);
+
+ task->qp = qp;
+ task->func = func;
+ task->state = TASK_STATE_IDLE;
+ spin_lock_init(&task->lock);
+ INIT_WORK(&task->work, do_work);
+
+ return 0;
+}
+
+/* rxe_cleanup_task is only called from rxe_do_qp_cleanup in
+ * process context. The qp is already completed with no
+ * remaining references. Once the queue is drained the
+ * task is moved to invalid and returns. The qp cleanup
+ * code then calls the task functions directly without
+ * using the task struct to drain any late arriving packets
+ * or work requests.
+ */
+void rxe_cleanup_task(struct rxe_task *task)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task->lock, flags);
+ if (!__is_done(task) && task->state < TASK_STATE_DRAINED) {
+ task->state = TASK_STATE_DRAINING;
+ } else {
+ task->state = TASK_STATE_INVALID;
+ spin_unlock_irqrestore(&task->lock, flags);
+ return;
+ }
+ spin_unlock_irqrestore(&task->lock, flags);
+
+ /* now the task cannot be scheduled or run just wait
+ * for the previously scheduled tasks to finish.
+ */
+ while (!is_done(task))
+ cond_resched();
+
+ spin_lock_irqsave(&task->lock, flags);
+ task->state = TASK_STATE_INVALID;
+ spin_unlock_irqrestore(&task->lock, flags);
+}
+
+/* run the task inline if it is currently idle
+ * cannot call do_task holding the lock
+ */
+void rxe_run_task(struct rxe_task *task)
+{
+ unsigned long flags;
+ bool run;
+
+ WARN_ON(rxe_read(task->qp) <= 0);
+
+ spin_lock_irqsave(&task->lock, flags);
+ run = __reserve_if_idle(task);
+ spin_unlock_irqrestore(&task->lock, flags);
+
+ if (run)
+ do_task(task);
+}
+
+/* schedule the task to run later as a work queue entry.
+ * the queue_work call can be called holding
+ * the lock.
+ */
+void rxe_sched_task(struct rxe_task *task)
+{
+ unsigned long flags;
+
+ WARN_ON(rxe_read(task->qp) <= 0);
+
+ spin_lock_irqsave(&task->lock, flags);
+ if (__reserve_if_idle(task))
+ queue_work(rxe_wq, &task->work);
+ spin_unlock_irqrestore(&task->lock, flags);
+}
+
+/* rxe_disable/enable_task are only called from
+ * rxe_modify_qp in process context. Task is moved
+ * to the drained state by do_task.
+ */
+void rxe_disable_task(struct rxe_task *task)
+{
+ unsigned long flags;
+
+ WARN_ON(rxe_read(task->qp) <= 0);
+
+ spin_lock_irqsave(&task->lock, flags);
+ if (!__is_done(task) && task->state < TASK_STATE_DRAINED) {
+ task->state = TASK_STATE_DRAINING;
+ } else {
+ task->state = TASK_STATE_DRAINED;
+ spin_unlock_irqrestore(&task->lock, flags);
+ return;
+ }
+ spin_unlock_irqrestore(&task->lock, flags);
+
+ while (!is_done(task))
+ cond_resched();
+
+ spin_lock_irqsave(&task->lock, flags);
+ task->state = TASK_STATE_DRAINED;
+ spin_unlock_irqrestore(&task->lock, flags);
+}
+
+void rxe_enable_task(struct rxe_task *task)
+{
+ unsigned long flags;
+
+ WARN_ON(rxe_read(task->qp) <= 0);
+
+ spin_lock_irqsave(&task->lock, flags);
+ if (task->state == TASK_STATE_INVALID) {
+ spin_unlock_irqrestore(&task->lock, flags);
+ return;
+ }
+
+ task->state = TASK_STATE_IDLE;
+ spin_unlock_irqrestore(&task->lock, flags);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h
new file mode 100644
index 0000000000..a63e258b3d
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_task.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_TASK_H
+#define RXE_TASK_H
+
+enum {
+ TASK_STATE_IDLE = 0,
+ TASK_STATE_BUSY = 1,
+ TASK_STATE_ARMED = 2,
+ TASK_STATE_DRAINING = 3,
+ TASK_STATE_DRAINED = 4,
+ TASK_STATE_INVALID = 5,
+};
+
+/*
+ * data structure to describe a 'task' which is a short
+ * function that returns 0 as long as it needs to be
+ * called again.
+ */
+struct rxe_task {
+ struct work_struct work;
+ int state;
+ spinlock_t lock;
+ struct rxe_qp *qp;
+ int (*func)(struct rxe_qp *qp);
+ int ret;
+ long num_sched;
+ long num_done;
+};
+
+int rxe_alloc_wq(void);
+
+void rxe_destroy_wq(void);
+
+/*
+ * init rxe_task structure
+ * qp => parameter to pass to func
+ * func => function to call until it returns != 0
+ */
+int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp,
+ int (*func)(struct rxe_qp *));
+
+/* cleanup task */
+void rxe_cleanup_task(struct rxe_task *task);
+
+void rxe_run_task(struct rxe_task *task);
+
+void rxe_sched_task(struct rxe_task *task);
+
+/* keep a task from scheduling */
+void rxe_disable_task(struct rxe_task *task);
+
+/* allow task to run */
+void rxe_enable_task(struct rxe_task *task);
+
+#endif /* RXE_TASK_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
new file mode 100644
index 0000000000..48f86839d3
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -0,0 +1,1533 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#include <linux/dma-mapping.h>
+#include <net/addrconf.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "rxe.h"
+#include "rxe_queue.h"
+#include "rxe_hw_counters.h"
+
+static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr);
+
+/* dev */
+static int rxe_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *attr,
+ struct ib_udata *udata)
+{
+ struct rxe_dev *rxe = to_rdev(ibdev);
+ int err;
+
+ if (udata->inlen || udata->outlen) {
+ rxe_dbg_dev(rxe, "malformed udata");
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ memcpy(attr, &rxe->attr, sizeof(*attr));
+
+ return 0;
+
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_query_port(struct ib_device *ibdev,
+ u32 port_num, struct ib_port_attr *attr)
+{
+ struct rxe_dev *rxe = to_rdev(ibdev);
+ int err, ret;
+
+ if (port_num != 1) {
+ err = -EINVAL;
+ rxe_dbg_dev(rxe, "bad port_num = %d", port_num);
+ goto err_out;
+ }
+
+ memcpy(attr, &rxe->port.attr, sizeof(*attr));
+
+ mutex_lock(&rxe->usdev_lock);
+ ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed,
+ &attr->active_width);
+
+ if (attr->state == IB_PORT_ACTIVE)
+ attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
+ else if (dev_get_flags(rxe->ndev) & IFF_UP)
+ attr->phys_state = IB_PORT_PHYS_STATE_POLLING;
+ else
+ attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
+
+ mutex_unlock(&rxe->usdev_lock);
+
+ return ret;
+
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_query_pkey(struct ib_device *ibdev,
+ u32 port_num, u16 index, u16 *pkey)
+{
+ struct rxe_dev *rxe = to_rdev(ibdev);
+ int err;
+
+ if (index != 0) {
+ err = -EINVAL;
+ rxe_dbg_dev(rxe, "bad pkey index = %d", index);
+ goto err_out;
+ }
+
+ *pkey = IB_DEFAULT_PKEY_FULL;
+ return 0;
+
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_modify_device(struct ib_device *ibdev,
+ int mask, struct ib_device_modify *attr)
+{
+ struct rxe_dev *rxe = to_rdev(ibdev);
+ int err;
+
+ if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+ IB_DEVICE_MODIFY_NODE_DESC)) {
+ err = -EOPNOTSUPP;
+ rxe_dbg_dev(rxe, "unsupported mask = 0x%x", mask);
+ goto err_out;
+ }
+
+ if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+ rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid);
+
+ if (mask & IB_DEVICE_MODIFY_NODE_DESC) {
+ memcpy(rxe->ib_dev.node_desc,
+ attr->node_desc, sizeof(rxe->ib_dev.node_desc));
+ }
+
+ return 0;
+
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_modify_port(struct ib_device *ibdev, u32 port_num,
+ int mask, struct ib_port_modify *attr)
+{
+ struct rxe_dev *rxe = to_rdev(ibdev);
+ struct rxe_port *port;
+ int err;
+
+ if (port_num != 1) {
+ err = -EINVAL;
+ rxe_dbg_dev(rxe, "bad port_num = %d", port_num);
+ goto err_out;
+ }
+
+ //TODO is shutdown useful
+ if (mask & ~(IB_PORT_RESET_QKEY_CNTR)) {
+ err = -EOPNOTSUPP;
+ rxe_dbg_dev(rxe, "unsupported mask = 0x%x", mask);
+ goto err_out;
+ }
+
+ port = &rxe->port;
+ port->attr.port_cap_flags |= attr->set_port_cap_mask;
+ port->attr.port_cap_flags &= ~attr->clr_port_cap_mask;
+
+ if (mask & IB_PORT_RESET_QKEY_CNTR)
+ port->attr.qkey_viol_cntr = 0;
+
+ return 0;
+
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+static enum rdma_link_layer rxe_get_link_layer(struct ib_device *ibdev,
+ u32 port_num)
+{
+ struct rxe_dev *rxe = to_rdev(ibdev);
+ int err;
+
+ if (port_num != 1) {
+ err = -EINVAL;
+ rxe_dbg_dev(rxe, "bad port_num = %d", port_num);
+ goto err_out;
+ }
+
+ return IB_LINK_LAYER_ETHERNET;
+
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_port_immutable(struct ib_device *ibdev, u32 port_num,
+ struct ib_port_immutable *immutable)
+{
+ struct rxe_dev *rxe = to_rdev(ibdev);
+ struct ib_port_attr attr = {};
+ int err;
+
+ if (port_num != 1) {
+ err = -EINVAL;
+ rxe_dbg_dev(rxe, "bad port_num = %d", port_num);
+ goto err_out;
+ }
+
+ err = ib_query_port(ibdev, port_num, &attr);
+ if (err)
+ goto err_out;
+
+ immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+ immutable->pkey_tbl_len = attr.pkey_tbl_len;
+ immutable->gid_tbl_len = attr.gid_tbl_len;
+ immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+
+ return 0;
+
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+/* uc */
+static int rxe_alloc_ucontext(struct ib_ucontext *ibuc, struct ib_udata *udata)
+{
+ struct rxe_dev *rxe = to_rdev(ibuc->device);
+ struct rxe_ucontext *uc = to_ruc(ibuc);
+ int err;
+
+ err = rxe_add_to_pool(&rxe->uc_pool, uc);
+ if (err)
+ rxe_err_dev(rxe, "unable to create uc");
+
+ return err;
+}
+
+static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc)
+{
+ struct rxe_ucontext *uc = to_ruc(ibuc);
+ int err;
+
+ err = rxe_cleanup(uc);
+ if (err)
+ rxe_err_uc(uc, "cleanup failed, err = %d", err);
+}
+
+/* pd */
+static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
+ int err;
+
+ err = rxe_add_to_pool(&rxe->pd_pool, pd);
+ if (err) {
+ rxe_dbg_dev(rxe, "unable to alloc pd");
+ goto err_out;
+ }
+
+ return 0;
+
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+ struct rxe_pd *pd = to_rpd(ibpd);
+ int err;
+
+ err = rxe_cleanup(pd);
+ if (err)
+ rxe_err_pd(pd, "cleanup failed, err = %d", err);
+
+ return 0;
+}
+
+/* ah */
+static int rxe_create_ah(struct ib_ah *ibah,
+ struct rdma_ah_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct rxe_dev *rxe = to_rdev(ibah->device);
+ struct rxe_ah *ah = to_rah(ibah);
+ struct rxe_create_ah_resp __user *uresp = NULL;
+ int err, cleanup_err;
+
+ if (udata) {
+ /* test if new user provider */
+ if (udata->outlen >= sizeof(*uresp))
+ uresp = udata->outbuf;
+ ah->is_user = true;
+ } else {
+ ah->is_user = false;
+ }
+
+ err = rxe_add_to_pool_ah(&rxe->ah_pool, ah,
+ init_attr->flags & RDMA_CREATE_AH_SLEEPABLE);
+ if (err) {
+ rxe_dbg_dev(rxe, "unable to create ah");
+ goto err_out;
+ }
+
+ /* create index > 0 */
+ ah->ah_num = ah->elem.index;
+
+ err = rxe_ah_chk_attr(ah, init_attr->ah_attr);
+ if (err) {
+ rxe_dbg_ah(ah, "bad attr");
+ goto err_cleanup;
+ }
+
+ if (uresp) {
+ /* only if new user provider */
+ err = copy_to_user(&uresp->ah_num, &ah->ah_num,
+ sizeof(uresp->ah_num));
+ if (err) {
+ err = -EFAULT;
+ rxe_dbg_ah(ah, "unable to copy to user");
+ goto err_cleanup;
+ }
+ } else if (ah->is_user) {
+ /* only if old user provider */
+ ah->ah_num = 0;
+ }
+
+ rxe_init_av(init_attr->ah_attr, &ah->av);
+ rxe_finalize(ah);
+
+ return 0;
+
+err_cleanup:
+ cleanup_err = rxe_cleanup(ah);
+ if (cleanup_err)
+ rxe_err_ah(ah, "cleanup failed, err = %d", cleanup_err);
+err_out:
+ rxe_err_ah(ah, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
+{
+ struct rxe_ah *ah = to_rah(ibah);
+ int err;
+
+ err = rxe_ah_chk_attr(ah, attr);
+ if (err) {
+ rxe_dbg_ah(ah, "bad attr");
+ goto err_out;
+ }
+
+ rxe_init_av(attr, &ah->av);
+
+ return 0;
+
+err_out:
+ rxe_err_ah(ah, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
+{
+ struct rxe_ah *ah = to_rah(ibah);
+
+ memset(attr, 0, sizeof(*attr));
+ attr->type = ibah->type;
+ rxe_av_to_attr(&ah->av, attr);
+
+ return 0;
+}
+
+static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags)
+{
+ struct rxe_ah *ah = to_rah(ibah);
+ int err;
+
+ err = rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE);
+ if (err)
+ rxe_err_ah(ah, "cleanup failed, err = %d", err);
+
+ return 0;
+}
+
+/* srq */
+static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init,
+ struct ib_udata *udata)
+{
+ struct rxe_dev *rxe = to_rdev(ibsrq->device);
+ struct rxe_pd *pd = to_rpd(ibsrq->pd);
+ struct rxe_srq *srq = to_rsrq(ibsrq);
+ struct rxe_create_srq_resp __user *uresp = NULL;
+ int err, cleanup_err;
+
+ if (udata) {
+ if (udata->outlen < sizeof(*uresp)) {
+ err = -EINVAL;
+ rxe_err_dev(rxe, "malformed udata");
+ goto err_out;
+ }
+ uresp = udata->outbuf;
+ }
+
+ if (init->srq_type != IB_SRQT_BASIC) {
+ err = -EOPNOTSUPP;
+ rxe_dbg_dev(rxe, "srq type = %d, not supported",
+ init->srq_type);
+ goto err_out;
+ }
+
+ err = rxe_srq_chk_init(rxe, init);
+ if (err) {
+ rxe_dbg_dev(rxe, "invalid init attributes");
+ goto err_out;
+ }
+
+ err = rxe_add_to_pool(&rxe->srq_pool, srq);
+ if (err) {
+ rxe_dbg_dev(rxe, "unable to create srq, err = %d", err);
+ goto err_out;
+ }
+
+ rxe_get(pd);
+ srq->pd = pd;
+
+ err = rxe_srq_from_init(rxe, srq, init, udata, uresp);
+ if (err) {
+ rxe_dbg_srq(srq, "create srq failed, err = %d", err);
+ goto err_cleanup;
+ }
+
+ return 0;
+
+err_cleanup:
+ cleanup_err = rxe_cleanup(srq);
+ if (cleanup_err)
+ rxe_err_srq(srq, "cleanup failed, err = %d", cleanup_err);
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask mask,
+ struct ib_udata *udata)
+{
+ struct rxe_srq *srq = to_rsrq(ibsrq);
+ struct rxe_dev *rxe = to_rdev(ibsrq->device);
+ struct rxe_modify_srq_cmd cmd = {};
+ int err;
+
+ if (udata) {
+ if (udata->inlen < sizeof(cmd)) {
+ err = -EINVAL;
+ rxe_dbg_srq(srq, "malformed udata");
+ goto err_out;
+ }
+
+ err = ib_copy_from_udata(&cmd, udata, sizeof(cmd));
+ if (err) {
+ err = -EFAULT;
+ rxe_dbg_srq(srq, "unable to read udata");
+ goto err_out;
+ }
+ }
+
+ err = rxe_srq_chk_attr(rxe, srq, attr, mask);
+ if (err) {
+ rxe_dbg_srq(srq, "bad init attributes");
+ goto err_out;
+ }
+
+ err = rxe_srq_from_attr(rxe, srq, attr, mask, &cmd, udata);
+ if (err) {
+ rxe_dbg_srq(srq, "bad attr");
+ goto err_out;
+ }
+
+ return 0;
+
+err_out:
+ rxe_err_srq(srq, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+ struct rxe_srq *srq = to_rsrq(ibsrq);
+ int err;
+
+ if (srq->error) {
+ err = -EINVAL;
+ rxe_dbg_srq(srq, "srq in error state");
+ goto err_out;
+ }
+
+ attr->max_wr = srq->rq.queue->buf->index_mask;
+ attr->max_sge = srq->rq.max_sge;
+ attr->srq_limit = srq->limit;
+ return 0;
+
+err_out:
+ rxe_err_srq(srq, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ int err = 0;
+ struct rxe_srq *srq = to_rsrq(ibsrq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&srq->rq.producer_lock, flags);
+
+ while (wr) {
+ err = post_one_recv(&srq->rq, wr);
+ if (unlikely(err))
+ break;
+ wr = wr->next;
+ }
+
+ spin_unlock_irqrestore(&srq->rq.producer_lock, flags);
+
+ if (err) {
+ *bad_wr = wr;
+ rxe_err_srq(srq, "returned err = %d", err);
+ }
+
+ return err;
+}
+
+static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
+{
+ struct rxe_srq *srq = to_rsrq(ibsrq);
+ int err;
+
+ err = rxe_cleanup(srq);
+ if (err)
+ rxe_err_srq(srq, "cleanup failed, err = %d", err);
+
+ return 0;
+}
+
+/* qp */
+static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init,
+ struct ib_udata *udata)
+{
+ struct rxe_dev *rxe = to_rdev(ibqp->device);
+ struct rxe_pd *pd = to_rpd(ibqp->pd);
+ struct rxe_qp *qp = to_rqp(ibqp);
+ struct rxe_create_qp_resp __user *uresp = NULL;
+ int err, cleanup_err;
+
+ if (udata) {
+ if (udata->inlen) {
+ err = -EINVAL;
+ rxe_dbg_dev(rxe, "malformed udata, err = %d", err);
+ goto err_out;
+ }
+
+ if (udata->outlen < sizeof(*uresp)) {
+ err = -EINVAL;
+ rxe_dbg_dev(rxe, "malformed udata, err = %d", err);
+ goto err_out;
+ }
+
+ qp->is_user = true;
+ uresp = udata->outbuf;
+ } else {
+ qp->is_user = false;
+ }
+
+ if (init->create_flags) {
+ err = -EOPNOTSUPP;
+ rxe_dbg_dev(rxe, "unsupported create_flags, err = %d", err);
+ goto err_out;
+ }
+
+ err = rxe_qp_chk_init(rxe, init);
+ if (err) {
+ rxe_dbg_dev(rxe, "bad init attr, err = %d", err);
+ goto err_out;
+ }
+
+ err = rxe_add_to_pool(&rxe->qp_pool, qp);
+ if (err) {
+ rxe_dbg_dev(rxe, "unable to create qp, err = %d", err);
+ goto err_out;
+ }
+
+ err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibqp->pd, udata);
+ if (err) {
+ rxe_dbg_qp(qp, "create qp failed, err = %d", err);
+ goto err_cleanup;
+ }
+
+ rxe_finalize(qp);
+ return 0;
+
+err_cleanup:
+ cleanup_err = rxe_cleanup(qp);
+ if (cleanup_err)
+ rxe_err_qp(qp, "cleanup failed, err = %d", cleanup_err);
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int mask, struct ib_udata *udata)
+{
+ struct rxe_dev *rxe = to_rdev(ibqp->device);
+ struct rxe_qp *qp = to_rqp(ibqp);
+ int err;
+
+ if (mask & ~IB_QP_ATTR_STANDARD_BITS) {
+ err = -EOPNOTSUPP;
+ rxe_dbg_qp(qp, "unsupported mask = 0x%x, err = %d",
+ mask, err);
+ goto err_out;
+ }
+
+ err = rxe_qp_chk_attr(rxe, qp, attr, mask);
+ if (err) {
+ rxe_dbg_qp(qp, "bad mask/attr, err = %d", err);
+ goto err_out;
+ }
+
+ err = rxe_qp_from_attr(qp, attr, mask, udata);
+ if (err) {
+ rxe_dbg_qp(qp, "modify qp failed, err = %d", err);
+ goto err_out;
+ }
+
+ if ((mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH))
+ qp->src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label,
+ qp->ibqp.qp_num,
+ qp->attr.dest_qp_num);
+
+ return 0;
+
+err_out:
+ rxe_err_qp(qp, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int mask, struct ib_qp_init_attr *init)
+{
+ struct rxe_qp *qp = to_rqp(ibqp);
+
+ rxe_qp_to_init(qp, init);
+ rxe_qp_to_attr(qp, attr, mask);
+
+ return 0;
+}
+
+static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
+{
+ struct rxe_qp *qp = to_rqp(ibqp);
+ int err;
+
+ err = rxe_qp_chk_destroy(qp);
+ if (err) {
+ rxe_dbg_qp(qp, "unable to destroy qp, err = %d", err);
+ goto err_out;
+ }
+
+ err = rxe_cleanup(qp);
+ if (err)
+ rxe_err_qp(qp, "cleanup failed, err = %d", err);
+
+ return 0;
+
+err_out:
+ rxe_err_qp(qp, "returned err = %d", err);
+ return err;
+}
+
+/* send wr */
+
+/* sanity check incoming send work request */
+static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
+ unsigned int *maskp, unsigned int *lengthp)
+{
+ int num_sge = ibwr->num_sge;
+ struct rxe_sq *sq = &qp->sq;
+ unsigned int mask = 0;
+ unsigned long length = 0;
+ int err = -EINVAL;
+ int i;
+
+ do {
+ mask = wr_opcode_mask(ibwr->opcode, qp);
+ if (!mask) {
+ rxe_err_qp(qp, "bad wr opcode for qp type");
+ break;
+ }
+
+ if (num_sge > sq->max_sge) {
+ rxe_err_qp(qp, "num_sge > max_sge");
+ break;
+ }
+
+ length = 0;
+ for (i = 0; i < ibwr->num_sge; i++)
+ length += ibwr->sg_list[i].length;
+
+ if (length > (1UL << 31)) {
+ rxe_err_qp(qp, "message length too long");
+ break;
+ }
+
+ if (mask & WR_ATOMIC_MASK) {
+ if (length != 8) {
+ rxe_err_qp(qp, "atomic length != 8");
+ break;
+ }
+ if (atomic_wr(ibwr)->remote_addr & 0x7) {
+ rxe_err_qp(qp, "misaligned atomic address");
+ break;
+ }
+ }
+ if (ibwr->send_flags & IB_SEND_INLINE) {
+ if (!(mask & WR_INLINE_MASK)) {
+ rxe_err_qp(qp, "opcode doesn't support inline data");
+ break;
+ }
+ if (length > sq->max_inline) {
+ rxe_err_qp(qp, "inline length too big");
+ break;
+ }
+ }
+
+ err = 0;
+ } while (0);
+
+ *maskp = mask;
+ *lengthp = (int)length;
+
+ return err;
+}
+
+static int init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr,
+ const struct ib_send_wr *ibwr)
+{
+ wr->wr_id = ibwr->wr_id;
+ wr->opcode = ibwr->opcode;
+ wr->send_flags = ibwr->send_flags;
+
+ if (qp_type(qp) == IB_QPT_UD ||
+ qp_type(qp) == IB_QPT_GSI) {
+ struct ib_ah *ibah = ud_wr(ibwr)->ah;
+
+ wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn;
+ wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey;
+ wr->wr.ud.ah_num = to_rah(ibah)->ah_num;
+ if (qp_type(qp) == IB_QPT_GSI)
+ wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index;
+
+ switch (wr->opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ wr->ex.imm_data = ibwr->ex.imm_data;
+ break;
+ case IB_WR_SEND:
+ break;
+ default:
+ rxe_err_qp(qp, "bad wr opcode %d for UD/GSI QP",
+ wr->opcode);
+ return -EINVAL;
+ }
+ } else {
+ switch (wr->opcode) {
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ wr->ex.imm_data = ibwr->ex.imm_data;
+ fallthrough;
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_WRITE:
+ wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr;
+ wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ wr->ex.imm_data = ibwr->ex.imm_data;
+ break;
+ case IB_WR_SEND_WITH_INV:
+ wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey;
+ break;
+ case IB_WR_RDMA_READ_WITH_INV:
+ wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey;
+ wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr;
+ wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey;
+ break;
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ wr->wr.atomic.remote_addr =
+ atomic_wr(ibwr)->remote_addr;
+ wr->wr.atomic.compare_add =
+ atomic_wr(ibwr)->compare_add;
+ wr->wr.atomic.swap = atomic_wr(ibwr)->swap;
+ wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey;
+ break;
+ case IB_WR_LOCAL_INV:
+ wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey;
+ break;
+ case IB_WR_REG_MR:
+ wr->wr.reg.mr = reg_wr(ibwr)->mr;
+ wr->wr.reg.key = reg_wr(ibwr)->key;
+ wr->wr.reg.access = reg_wr(ibwr)->access;
+ break;
+ case IB_WR_SEND:
+ case IB_WR_BIND_MW:
+ case IB_WR_FLUSH:
+ case IB_WR_ATOMIC_WRITE:
+ break;
+ default:
+ rxe_err_qp(qp, "unsupported wr opcode %d",
+ wr->opcode);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe,
+ const struct ib_send_wr *ibwr)
+{
+ struct ib_sge *sge = ibwr->sg_list;
+ u8 *p = wqe->dma.inline_data;
+ int i;
+
+ for (i = 0; i < ibwr->num_sge; i++, sge++) {
+ memcpy(p, ib_virt_dma_to_page(sge->addr), sge->length);
+ p += sge->length;
+ }
+}
+
+static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
+ unsigned int mask, unsigned int length,
+ struct rxe_send_wqe *wqe)
+{
+ int num_sge = ibwr->num_sge;
+ int err;
+
+ err = init_send_wr(qp, &wqe->wr, ibwr);
+ if (err)
+ return err;
+
+ /* local operation */
+ if (unlikely(mask & WR_LOCAL_OP_MASK)) {
+ wqe->mask = mask;
+ wqe->state = wqe_state_posted;
+ return 0;
+ }
+
+ if (unlikely(ibwr->send_flags & IB_SEND_INLINE))
+ copy_inline_data_to_wqe(wqe, ibwr);
+ else
+ memcpy(wqe->dma.sge, ibwr->sg_list,
+ num_sge * sizeof(struct ib_sge));
+
+ wqe->iova = mask & WR_ATOMIC_MASK ? atomic_wr(ibwr)->remote_addr :
+ mask & WR_READ_OR_WRITE_MASK ? rdma_wr(ibwr)->remote_addr : 0;
+ wqe->mask = mask;
+ wqe->dma.length = length;
+ wqe->dma.resid = length;
+ wqe->dma.num_sge = num_sge;
+ wqe->dma.cur_sge = 0;
+ wqe->dma.sge_offset = 0;
+ wqe->state = wqe_state_posted;
+ wqe->ssn = atomic_add_return(1, &qp->ssn);
+
+ return 0;
+}
+
+static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr)
+{
+ int err;
+ struct rxe_sq *sq = &qp->sq;
+ struct rxe_send_wqe *send_wqe;
+ unsigned int mask;
+ unsigned int length;
+ int full;
+
+ err = validate_send_wr(qp, ibwr, &mask, &length);
+ if (err)
+ return err;
+
+ full = queue_full(sq->queue, QUEUE_TYPE_FROM_ULP);
+ if (unlikely(full)) {
+ rxe_err_qp(qp, "send queue full");
+ return -ENOMEM;
+ }
+
+ send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_FROM_ULP);
+ err = init_send_wqe(qp, ibwr, mask, length, send_wqe);
+ if (!err)
+ queue_advance_producer(sq->queue, QUEUE_TYPE_FROM_ULP);
+
+ return err;
+}
+
+static int rxe_post_send_kernel(struct rxe_qp *qp,
+ const struct ib_send_wr *ibwr,
+ const struct ib_send_wr **bad_wr)
+{
+ int err = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->sq.sq_lock, flags);
+ while (ibwr) {
+ err = post_one_send(qp, ibwr);
+ if (err) {
+ *bad_wr = ibwr;
+ break;
+ }
+ ibwr = ibwr->next;
+ }
+ spin_unlock_irqrestore(&qp->sq.sq_lock, flags);
+
+ if (!err)
+ rxe_sched_task(&qp->req.task);
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (qp_state(qp) == IB_QPS_ERR)
+ rxe_sched_task(&qp->comp.task);
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ return err;
+}
+
+static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr)
+{
+ struct rxe_qp *qp = to_rqp(ibqp);
+ int err;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ /* caller has already called destroy_qp */
+ if (WARN_ON_ONCE(!qp->valid)) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ rxe_err_qp(qp, "qp has been destroyed");
+ return -EINVAL;
+ }
+
+ if (unlikely(qp_state(qp) < IB_QPS_RTS)) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ *bad_wr = wr;
+ rxe_err_qp(qp, "qp not ready to send");
+ return -EINVAL;
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ if (qp->is_user) {
+ /* Utilize process context to do protocol processing */
+ rxe_run_task(&qp->req.task);
+ } else {
+ err = rxe_post_send_kernel(qp, wr, bad_wr);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/* recv wr */
+static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr)
+{
+ int i;
+ unsigned long length;
+ struct rxe_recv_wqe *recv_wqe;
+ int num_sge = ibwr->num_sge;
+ int full;
+ int err;
+
+ full = queue_full(rq->queue, QUEUE_TYPE_FROM_ULP);
+ if (unlikely(full)) {
+ err = -ENOMEM;
+ rxe_dbg("queue full");
+ goto err_out;
+ }
+
+ if (unlikely(num_sge > rq->max_sge)) {
+ err = -EINVAL;
+ rxe_dbg("bad num_sge > max_sge");
+ goto err_out;
+ }
+
+ length = 0;
+ for (i = 0; i < num_sge; i++)
+ length += ibwr->sg_list[i].length;
+
+ /* IBA max message size is 2^31 */
+ if (length >= (1UL<<31)) {
+ err = -EINVAL;
+ rxe_dbg("message length too long");
+ goto err_out;
+ }
+
+ recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_FROM_ULP);
+
+ recv_wqe->wr_id = ibwr->wr_id;
+ recv_wqe->dma.length = length;
+ recv_wqe->dma.resid = length;
+ recv_wqe->dma.num_sge = num_sge;
+ recv_wqe->dma.cur_sge = 0;
+ recv_wqe->dma.sge_offset = 0;
+ memcpy(recv_wqe->dma.sge, ibwr->sg_list,
+ num_sge * sizeof(struct ib_sge));
+
+ queue_advance_producer(rq->queue, QUEUE_TYPE_FROM_ULP);
+
+ return 0;
+
+err_out:
+ rxe_dbg("returned err = %d", err);
+ return err;
+}
+
+static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ int err = 0;
+ struct rxe_qp *qp = to_rqp(ibqp);
+ struct rxe_rq *rq = &qp->rq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ /* caller has already called destroy_qp */
+ if (WARN_ON_ONCE(!qp->valid)) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ rxe_err_qp(qp, "qp has been destroyed");
+ return -EINVAL;
+ }
+
+ /* see C10-97.2.1 */
+ if (unlikely((qp_state(qp) < IB_QPS_INIT))) {
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+ *bad_wr = wr;
+ rxe_dbg_qp(qp, "qp not ready to post recv");
+ return -EINVAL;
+ }
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ if (unlikely(qp->srq)) {
+ *bad_wr = wr;
+ rxe_dbg_qp(qp, "qp has srq, use post_srq_recv instead");
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&rq->producer_lock, flags);
+
+ while (wr) {
+ err = post_one_recv(rq, wr);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ break;
+ }
+ wr = wr->next;
+ }
+
+ spin_unlock_irqrestore(&rq->producer_lock, flags);
+
+ spin_lock_irqsave(&qp->state_lock, flags);
+ if (qp_state(qp) == IB_QPS_ERR)
+ rxe_sched_task(&qp->resp.task);
+ spin_unlock_irqrestore(&qp->state_lock, flags);
+
+ return err;
+}
+
+/* cq */
+static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata)
+{
+ struct ib_device *dev = ibcq->device;
+ struct rxe_dev *rxe = to_rdev(dev);
+ struct rxe_cq *cq = to_rcq(ibcq);
+ struct rxe_create_cq_resp __user *uresp = NULL;
+ int err, cleanup_err;
+
+ if (udata) {
+ if (udata->outlen < sizeof(*uresp)) {
+ err = -EINVAL;
+ rxe_dbg_dev(rxe, "malformed udata, err = %d", err);
+ goto err_out;
+ }
+ uresp = udata->outbuf;
+ }
+
+ if (attr->flags) {
+ err = -EOPNOTSUPP;
+ rxe_dbg_dev(rxe, "bad attr->flags, err = %d", err);
+ goto err_out;
+ }
+
+ err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector);
+ if (err) {
+ rxe_dbg_dev(rxe, "bad init attributes, err = %d", err);
+ goto err_out;
+ }
+
+ err = rxe_add_to_pool(&rxe->cq_pool, cq);
+ if (err) {
+ rxe_dbg_dev(rxe, "unable to create cq, err = %d", err);
+ goto err_out;
+ }
+
+ err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata,
+ uresp);
+ if (err) {
+ rxe_dbg_cq(cq, "create cq failed, err = %d", err);
+ goto err_cleanup;
+ }
+
+ return 0;
+
+err_cleanup:
+ cleanup_err = rxe_cleanup(cq);
+ if (cleanup_err)
+ rxe_err_cq(cq, "cleanup failed, err = %d", cleanup_err);
+err_out:
+ rxe_err_dev(rxe, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+ struct rxe_cq *cq = to_rcq(ibcq);
+ struct rxe_dev *rxe = to_rdev(ibcq->device);
+ struct rxe_resize_cq_resp __user *uresp = NULL;
+ int err;
+
+ if (udata) {
+ if (udata->outlen < sizeof(*uresp)) {
+ err = -EINVAL;
+ rxe_dbg_cq(cq, "malformed udata");
+ goto err_out;
+ }
+ uresp = udata->outbuf;
+ }
+
+ err = rxe_cq_chk_attr(rxe, cq, cqe, 0);
+ if (err) {
+ rxe_dbg_cq(cq, "bad attr, err = %d", err);
+ goto err_out;
+ }
+
+ err = rxe_cq_resize_queue(cq, cqe, uresp, udata);
+ if (err) {
+ rxe_dbg_cq(cq, "resize cq failed, err = %d", err);
+ goto err_out;
+ }
+
+ return 0;
+
+err_out:
+ rxe_err_cq(cq, "returned err = %d", err);
+ return err;
+}
+
+static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+ int i;
+ struct rxe_cq *cq = to_rcq(ibcq);
+ struct rxe_cqe *cqe;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->cq_lock, flags);
+ for (i = 0; i < num_entries; i++) {
+ cqe = queue_head(cq->queue, QUEUE_TYPE_TO_ULP);
+ if (!cqe)
+ break; /* queue empty */
+
+ memcpy(wc++, &cqe->ibwc, sizeof(*wc));
+ queue_advance_consumer(cq->queue, QUEUE_TYPE_TO_ULP);
+ }
+ spin_unlock_irqrestore(&cq->cq_lock, flags);
+
+ return i;
+}
+
+static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt)
+{
+ struct rxe_cq *cq = to_rcq(ibcq);
+ int count;
+
+ count = queue_count(cq->queue, QUEUE_TYPE_TO_ULP);
+
+ return (count > wc_cnt) ? wc_cnt : count;
+}
+
+static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ struct rxe_cq *cq = to_rcq(ibcq);
+ int ret = 0;
+ int empty;
+ unsigned long irq_flags;
+
+ spin_lock_irqsave(&cq->cq_lock, irq_flags);
+ cq->notify |= flags & IB_CQ_SOLICITED_MASK;
+ empty = queue_empty(cq->queue, QUEUE_TYPE_TO_ULP);
+
+ if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty)
+ ret = 1;
+
+ spin_unlock_irqrestore(&cq->cq_lock, irq_flags);
+
+ return ret;
+}
+
+static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+{
+ struct rxe_cq *cq = to_rcq(ibcq);
+ int err;
+
+ /* See IBA C11-17: The CI shall return an error if this Verb is
+ * invoked while a Work Queue is still associated with the CQ.
+ */
+ if (atomic_read(&cq->num_wq)) {
+ err = -EINVAL;
+ rxe_dbg_cq(cq, "still in use");
+ goto err_out;
+ }
+
+ err = rxe_cleanup(cq);
+ if (err)
+ rxe_err_cq(cq, "cleanup failed, err = %d", err);
+
+ return 0;
+
+err_out:
+ rxe_err_cq(cq, "returned err = %d", err);
+ return err;
+}
+
+/* mr */
+static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access)
+{
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
+ struct rxe_mr *mr;
+ int err;
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = rxe_add_to_pool(&rxe->mr_pool, mr);
+ if (err) {
+ rxe_dbg_dev(rxe, "unable to create mr");
+ goto err_free;
+ }
+
+ rxe_get(pd);
+ mr->ibmr.pd = ibpd;
+ mr->ibmr.device = ibpd->device;
+
+ rxe_mr_init_dma(access, mr);
+ rxe_finalize(mr);
+ return &mr->ibmr;
+
+err_free:
+ kfree(mr);
+ rxe_err_pd(pd, "returned err = %d", err);
+ return ERR_PTR(err);
+}
+
+static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
+ u64 length, u64 iova, int access,
+ struct ib_udata *udata)
+{
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
+ struct rxe_mr *mr;
+ int err, cleanup_err;
+
+ if (access & ~RXE_ACCESS_SUPPORTED_MR) {
+ rxe_err_pd(pd, "access = %#x not supported (%#x)", access,
+ RXE_ACCESS_SUPPORTED_MR);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = rxe_add_to_pool(&rxe->mr_pool, mr);
+ if (err) {
+ rxe_dbg_pd(pd, "unable to create mr");
+ goto err_free;
+ }
+
+ rxe_get(pd);
+ mr->ibmr.pd = ibpd;
+ mr->ibmr.device = ibpd->device;
+
+ err = rxe_mr_init_user(rxe, start, length, iova, access, mr);
+ if (err) {
+ rxe_dbg_mr(mr, "reg_user_mr failed, err = %d", err);
+ goto err_cleanup;
+ }
+
+ rxe_finalize(mr);
+ return &mr->ibmr;
+
+err_cleanup:
+ cleanup_err = rxe_cleanup(mr);
+ if (cleanup_err)
+ rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err);
+err_free:
+ kfree(mr);
+ rxe_err_pd(pd, "returned err = %d", err);
+ return ERR_PTR(err);
+}
+
+static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags,
+ u64 start, u64 length, u64 iova,
+ int access, struct ib_pd *ibpd,
+ struct ib_udata *udata)
+{
+ struct rxe_mr *mr = to_rmr(ibmr);
+ struct rxe_pd *old_pd = to_rpd(ibmr->pd);
+ struct rxe_pd *pd = to_rpd(ibpd);
+
+ /* for now only support the two easy cases:
+ * rereg_pd and rereg_access
+ */
+ if (flags & ~RXE_MR_REREG_SUPPORTED) {
+ rxe_err_mr(mr, "flags = %#x not supported", flags);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ if (flags & IB_MR_REREG_PD) {
+ rxe_put(old_pd);
+ rxe_get(pd);
+ mr->ibmr.pd = ibpd;
+ }
+
+ if (flags & IB_MR_REREG_ACCESS) {
+ if (access & ~RXE_ACCESS_SUPPORTED_MR) {
+ rxe_err_mr(mr, "access = %#x not supported", access);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ mr->access = access;
+ }
+
+ return NULL;
+}
+
+static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
+ u32 max_num_sg)
+{
+ struct rxe_dev *rxe = to_rdev(ibpd->device);
+ struct rxe_pd *pd = to_rpd(ibpd);
+ struct rxe_mr *mr;
+ int err, cleanup_err;
+
+ if (mr_type != IB_MR_TYPE_MEM_REG) {
+ err = -EINVAL;
+ rxe_dbg_pd(pd, "mr type %d not supported, err = %d",
+ mr_type, err);
+ goto err_out;
+ }
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = rxe_add_to_pool(&rxe->mr_pool, mr);
+ if (err)
+ goto err_free;
+
+ rxe_get(pd);
+ mr->ibmr.pd = ibpd;
+ mr->ibmr.device = ibpd->device;
+
+ err = rxe_mr_init_fast(max_num_sg, mr);
+ if (err) {
+ rxe_dbg_mr(mr, "alloc_mr failed, err = %d", err);
+ goto err_cleanup;
+ }
+
+ rxe_finalize(mr);
+ return &mr->ibmr;
+
+err_cleanup:
+ cleanup_err = rxe_cleanup(mr);
+ if (cleanup_err)
+ rxe_err_mr(mr, "cleanup failed, err = %d", err);
+err_free:
+ kfree(mr);
+err_out:
+ rxe_err_pd(pd, "returned err = %d", err);
+ return ERR_PTR(err);
+}
+
+static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+ struct rxe_mr *mr = to_rmr(ibmr);
+ int err, cleanup_err;
+
+ /* See IBA 10.6.7.2.6 */
+ if (atomic_read(&mr->num_mw) > 0) {
+ err = -EINVAL;
+ rxe_dbg_mr(mr, "mr has mw's bound");
+ goto err_out;
+ }
+
+ cleanup_err = rxe_cleanup(mr);
+ if (cleanup_err)
+ rxe_err_mr(mr, "cleanup failed, err = %d", cleanup_err);
+
+ kfree_rcu_mightsleep(mr);
+ return 0;
+
+err_out:
+ rxe_err_mr(mr, "returned err = %d", err);
+ return err;
+}
+
+static ssize_t parent_show(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct rxe_dev *rxe =
+ rdma_device_to_drv_device(device, struct rxe_dev, ib_dev);
+
+ return sysfs_emit(buf, "%s\n", rxe_parent_name(rxe, 1));
+}
+
+static DEVICE_ATTR_RO(parent);
+
+static struct attribute *rxe_dev_attributes[] = {
+ &dev_attr_parent.attr,
+ NULL
+};
+
+static const struct attribute_group rxe_attr_group = {
+ .attrs = rxe_dev_attributes,
+};
+
+static int rxe_enable_driver(struct ib_device *ib_dev)
+{
+ struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+
+ rxe_set_port_state(rxe);
+ dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev));
+ return 0;
+}
+
+static const struct ib_device_ops rxe_dev_ops = {
+ .owner = THIS_MODULE,
+ .driver_id = RDMA_DRIVER_RXE,
+ .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION,
+
+ .alloc_hw_port_stats = rxe_ib_alloc_hw_port_stats,
+ .alloc_mr = rxe_alloc_mr,
+ .alloc_mw = rxe_alloc_mw,
+ .alloc_pd = rxe_alloc_pd,
+ .alloc_ucontext = rxe_alloc_ucontext,
+ .attach_mcast = rxe_attach_mcast,
+ .create_ah = rxe_create_ah,
+ .create_cq = rxe_create_cq,
+ .create_qp = rxe_create_qp,
+ .create_srq = rxe_create_srq,
+ .create_user_ah = rxe_create_ah,
+ .dealloc_driver = rxe_dealloc,
+ .dealloc_mw = rxe_dealloc_mw,
+ .dealloc_pd = rxe_dealloc_pd,
+ .dealloc_ucontext = rxe_dealloc_ucontext,
+ .dereg_mr = rxe_dereg_mr,
+ .destroy_ah = rxe_destroy_ah,
+ .destroy_cq = rxe_destroy_cq,
+ .destroy_qp = rxe_destroy_qp,
+ .destroy_srq = rxe_destroy_srq,
+ .detach_mcast = rxe_detach_mcast,
+ .device_group = &rxe_attr_group,
+ .enable_driver = rxe_enable_driver,
+ .get_dma_mr = rxe_get_dma_mr,
+ .get_hw_stats = rxe_ib_get_hw_stats,
+ .get_link_layer = rxe_get_link_layer,
+ .get_port_immutable = rxe_port_immutable,
+ .map_mr_sg = rxe_map_mr_sg,
+ .mmap = rxe_mmap,
+ .modify_ah = rxe_modify_ah,
+ .modify_device = rxe_modify_device,
+ .modify_port = rxe_modify_port,
+ .modify_qp = rxe_modify_qp,
+ .modify_srq = rxe_modify_srq,
+ .peek_cq = rxe_peek_cq,
+ .poll_cq = rxe_poll_cq,
+ .post_recv = rxe_post_recv,
+ .post_send = rxe_post_send,
+ .post_srq_recv = rxe_post_srq_recv,
+ .query_ah = rxe_query_ah,
+ .query_device = rxe_query_device,
+ .query_pkey = rxe_query_pkey,
+ .query_port = rxe_query_port,
+ .query_qp = rxe_query_qp,
+ .query_srq = rxe_query_srq,
+ .reg_user_mr = rxe_reg_user_mr,
+ .req_notify_cq = rxe_req_notify_cq,
+ .rereg_user_mr = rxe_rereg_user_mr,
+ .resize_cq = rxe_resize_cq,
+
+ INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah),
+ INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq),
+ INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_qp, rxe_qp, ibqp),
+ INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
+ INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw),
+};
+
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
+{
+ int err;
+ struct ib_device *dev = &rxe->ib_dev;
+
+ strscpy(dev->node_desc, "rxe", sizeof(dev->node_desc));
+
+ dev->node_type = RDMA_NODE_IB_CA;
+ dev->phys_port_cnt = 1;
+ dev->num_comp_vectors = num_possible_cpus();
+ dev->local_dma_lkey = 0;
+ addrconf_addr_eui48((unsigned char *)&dev->node_guid,
+ rxe->ndev->dev_addr);
+
+ dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) |
+ BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ);
+
+ ib_set_device_ops(dev, &rxe_dev_ops);
+ err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1);
+ if (err)
+ return err;
+
+ err = rxe_icrc_init(rxe);
+ if (err)
+ return err;
+
+ err = ib_register_device(dev, ibdev_name, NULL);
+ if (err)
+ rxe_dbg_dev(rxe, "failed with error %d\n", err);
+
+ /*
+ * Note that rxe may be invalid at this point if another thread
+ * unregistered it.
+ */
+ return err;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
new file mode 100644
index 0000000000..ccb9d19ffe
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -0,0 +1,475 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
+ */
+
+#ifndef RXE_VERBS_H
+#define RXE_VERBS_H
+
+#include <linux/interrupt.h>
+#include <linux/workqueue.h>
+#include "rxe_pool.h"
+#include "rxe_task.h"
+#include "rxe_hw_counters.h"
+
+static inline int pkey_match(u16 key1, u16 key2)
+{
+ return (((key1 & 0x7fff) != 0) &&
+ ((key1 & 0x7fff) == (key2 & 0x7fff)) &&
+ ((key1 & 0x8000) || (key2 & 0x8000))) ? 1 : 0;
+}
+
+/* Return >0 if psn_a > psn_b
+ * 0 if psn_a == psn_b
+ * <0 if psn_a < psn_b
+ */
+static inline int psn_compare(u32 psn_a, u32 psn_b)
+{
+ s32 diff;
+
+ diff = (psn_a - psn_b) << 8;
+ return diff;
+}
+
+struct rxe_ucontext {
+ struct ib_ucontext ibuc;
+ struct rxe_pool_elem elem;
+};
+
+struct rxe_pd {
+ struct ib_pd ibpd;
+ struct rxe_pool_elem elem;
+};
+
+struct rxe_ah {
+ struct ib_ah ibah;
+ struct rxe_pool_elem elem;
+ struct rxe_av av;
+ bool is_user;
+ int ah_num;
+};
+
+struct rxe_cqe {
+ union {
+ struct ib_wc ibwc;
+ struct ib_uverbs_wc uibwc;
+ };
+};
+
+struct rxe_cq {
+ struct ib_cq ibcq;
+ struct rxe_pool_elem elem;
+ struct rxe_queue *queue;
+ spinlock_t cq_lock;
+ u8 notify;
+ bool is_user;
+ atomic_t num_wq;
+};
+
+enum wqe_state {
+ wqe_state_posted,
+ wqe_state_processing,
+ wqe_state_pending,
+ wqe_state_done,
+ wqe_state_error,
+};
+
+struct rxe_sq {
+ int max_wr;
+ int max_sge;
+ int max_inline;
+ spinlock_t sq_lock; /* guard queue */
+ struct rxe_queue *queue;
+};
+
+struct rxe_rq {
+ int max_wr;
+ int max_sge;
+ spinlock_t producer_lock; /* guard queue producer */
+ spinlock_t consumer_lock; /* guard queue consumer */
+ struct rxe_queue *queue;
+};
+
+struct rxe_srq {
+ struct ib_srq ibsrq;
+ struct rxe_pool_elem elem;
+ struct rxe_pd *pd;
+ struct rxe_rq rq;
+ u32 srq_num;
+
+ int limit;
+ int error;
+};
+
+struct rxe_req_info {
+ int wqe_index;
+ u32 psn;
+ int opcode;
+ atomic_t rd_atomic;
+ int wait_fence;
+ int need_rd_atomic;
+ int wait_psn;
+ int need_retry;
+ int wait_for_rnr_timer;
+ int noack_pkts;
+ struct rxe_task task;
+};
+
+struct rxe_comp_info {
+ u32 psn;
+ int opcode;
+ int timeout;
+ int timeout_retry;
+ int started_retry;
+ u32 retry_cnt;
+ u32 rnr_retry;
+ struct rxe_task task;
+};
+
+enum rdatm_res_state {
+ rdatm_res_state_next,
+ rdatm_res_state_new,
+ rdatm_res_state_replay,
+};
+
+struct resp_res {
+ int type;
+ int replay;
+ u32 first_psn;
+ u32 last_psn;
+ u32 cur_psn;
+ enum rdatm_res_state state;
+
+ union {
+ struct {
+ u64 orig_val;
+ } atomic;
+ struct {
+ u64 va_org;
+ u32 rkey;
+ u32 length;
+ u64 va;
+ u32 resid;
+ } read;
+ struct {
+ u32 length;
+ u64 va;
+ u8 type;
+ u8 level;
+ } flush;
+ };
+};
+
+struct rxe_resp_info {
+ u32 msn;
+ u32 psn;
+ u32 ack_psn;
+ int opcode;
+ int drop_msg;
+ int goto_error;
+ int sent_psn_nak;
+ enum ib_wc_status status;
+ u8 aeth_syndrome;
+
+ /* Receive only */
+ struct rxe_recv_wqe *wqe;
+
+ /* RDMA read / atomic only */
+ u64 va;
+ u64 offset;
+ struct rxe_mr *mr;
+ u32 resid;
+ u32 rkey;
+ u32 length;
+
+ /* SRQ only */
+ struct {
+ struct rxe_recv_wqe wqe;
+ struct ib_sge sge[RXE_MAX_SGE];
+ } srq_wqe;
+
+ /* Responder resources. It's a circular list where the oldest
+ * resource is dropped first.
+ */
+ struct resp_res *resources;
+ unsigned int res_head;
+ unsigned int res_tail;
+ struct resp_res *res;
+ struct rxe_task task;
+};
+
+struct rxe_qp {
+ struct ib_qp ibqp;
+ struct rxe_pool_elem elem;
+ struct ib_qp_attr attr;
+ unsigned int valid;
+ unsigned int mtu;
+ bool is_user;
+
+ struct rxe_pd *pd;
+ struct rxe_srq *srq;
+ struct rxe_cq *scq;
+ struct rxe_cq *rcq;
+
+ enum ib_sig_type sq_sig_type;
+
+ struct rxe_sq sq;
+ struct rxe_rq rq;
+
+ struct socket *sk;
+ u32 dst_cookie;
+ u16 src_port;
+
+ struct rxe_av pri_av;
+ struct rxe_av alt_av;
+
+ atomic_t mcg_num;
+
+ struct sk_buff_head req_pkts;
+ struct sk_buff_head resp_pkts;
+
+ struct rxe_req_info req;
+ struct rxe_comp_info comp;
+ struct rxe_resp_info resp;
+
+ atomic_t ssn;
+ atomic_t skb_out;
+ int need_req_skb;
+
+ /* Timer for retranmitting packet when ACKs have been lost. RC
+ * only. The requester sets it when it is not already
+ * started. The responder resets it whenever an ack is
+ * received.
+ */
+ struct timer_list retrans_timer;
+ u64 qp_timeout_jiffies;
+
+ /* Timer for handling RNR NAKS. */
+ struct timer_list rnr_nak_timer;
+
+ spinlock_t state_lock; /* guard requester and completer */
+
+ struct execute_work cleanup_work;
+};
+
+enum {
+ RXE_ACCESS_REMOTE = IB_ACCESS_REMOTE_READ
+ | IB_ACCESS_REMOTE_WRITE
+ | IB_ACCESS_REMOTE_ATOMIC,
+ RXE_ACCESS_SUPPORTED_MR = RXE_ACCESS_REMOTE
+ | IB_ACCESS_LOCAL_WRITE
+ | IB_ACCESS_MW_BIND
+ | IB_ACCESS_ON_DEMAND
+ | IB_ACCESS_FLUSH_GLOBAL
+ | IB_ACCESS_FLUSH_PERSISTENT
+ | IB_ACCESS_OPTIONAL,
+ RXE_ACCESS_SUPPORTED_QP = RXE_ACCESS_SUPPORTED_MR,
+ RXE_ACCESS_SUPPORTED_MW = RXE_ACCESS_SUPPORTED_MR
+ | IB_ZERO_BASED,
+};
+
+enum rxe_mr_state {
+ RXE_MR_STATE_INVALID,
+ RXE_MR_STATE_FREE,
+ RXE_MR_STATE_VALID,
+};
+
+enum rxe_mr_copy_dir {
+ RXE_TO_MR_OBJ,
+ RXE_FROM_MR_OBJ,
+};
+
+enum rxe_mr_lookup_type {
+ RXE_LOOKUP_LOCAL,
+ RXE_LOOKUP_REMOTE,
+};
+
+enum rxe_rereg {
+ RXE_MR_REREG_SUPPORTED = IB_MR_REREG_PD
+ | IB_MR_REREG_ACCESS,
+};
+
+static inline int rkey_is_mw(u32 rkey)
+{
+ u32 index = rkey >> 8;
+
+ return (index >= RXE_MIN_MW_INDEX) && (index <= RXE_MAX_MW_INDEX);
+}
+
+struct rxe_mr {
+ struct rxe_pool_elem elem;
+ struct ib_mr ibmr;
+
+ struct ib_umem *umem;
+
+ u32 lkey;
+ u32 rkey;
+ enum rxe_mr_state state;
+ int access;
+ atomic_t num_mw;
+
+ unsigned int page_offset;
+ unsigned int page_shift;
+ u64 page_mask;
+
+ u32 num_buf;
+ u32 nbuf;
+
+ struct xarray page_list;
+};
+
+static inline unsigned int mr_page_size(struct rxe_mr *mr)
+{
+ return mr ? mr->ibmr.page_size : PAGE_SIZE;
+}
+
+enum rxe_mw_state {
+ RXE_MW_STATE_INVALID = RXE_MR_STATE_INVALID,
+ RXE_MW_STATE_FREE = RXE_MR_STATE_FREE,
+ RXE_MW_STATE_VALID = RXE_MR_STATE_VALID,
+};
+
+struct rxe_mw {
+ struct ib_mw ibmw;
+ struct rxe_pool_elem elem;
+ spinlock_t lock;
+ enum rxe_mw_state state;
+ struct rxe_qp *qp; /* Type 2 only */
+ struct rxe_mr *mr;
+ u32 rkey;
+ int access;
+ u64 addr;
+ u64 length;
+};
+
+struct rxe_mcg {
+ struct rb_node node;
+ struct kref ref_cnt;
+ struct rxe_dev *rxe;
+ struct list_head qp_list;
+ union ib_gid mgid;
+ atomic_t qp_num;
+ u32 qkey;
+ u16 pkey;
+};
+
+struct rxe_mca {
+ struct list_head qp_list;
+ struct rxe_qp *qp;
+};
+
+struct rxe_port {
+ struct ib_port_attr attr;
+ __be64 port_guid;
+ __be64 subnet_prefix;
+ spinlock_t port_lock; /* guard port */
+ unsigned int mtu_cap;
+ /* special QPs */
+ u32 qp_gsi_index;
+};
+
+struct rxe_dev {
+ struct ib_device ib_dev;
+ struct ib_device_attr attr;
+ int max_ucontext;
+ int max_inline_data;
+ struct mutex usdev_lock;
+
+ struct net_device *ndev;
+
+ struct rxe_pool uc_pool;
+ struct rxe_pool pd_pool;
+ struct rxe_pool ah_pool;
+ struct rxe_pool srq_pool;
+ struct rxe_pool qp_pool;
+ struct rxe_pool cq_pool;
+ struct rxe_pool mr_pool;
+ struct rxe_pool mw_pool;
+
+ /* multicast support */
+ spinlock_t mcg_lock;
+ struct rb_root mcg_tree;
+ atomic_t mcg_num;
+ atomic_t mcg_attach;
+
+ spinlock_t pending_lock; /* guard pending_mmaps */
+ struct list_head pending_mmaps;
+
+ spinlock_t mmap_offset_lock; /* guard mmap_offset */
+ u64 mmap_offset;
+
+ atomic64_t stats_counters[RXE_NUM_OF_COUNTERS];
+
+ struct rxe_port port;
+ struct crypto_shash *tfm;
+};
+
+static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
+{
+ atomic64_inc(&rxe->stats_counters[index]);
+}
+
+static inline struct rxe_dev *to_rdev(struct ib_device *dev)
+{
+ return dev ? container_of(dev, struct rxe_dev, ib_dev) : NULL;
+}
+
+static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc)
+{
+ return uc ? container_of(uc, struct rxe_ucontext, ibuc) : NULL;
+}
+
+static inline struct rxe_pd *to_rpd(struct ib_pd *pd)
+{
+ return pd ? container_of(pd, struct rxe_pd, ibpd) : NULL;
+}
+
+static inline struct rxe_ah *to_rah(struct ib_ah *ah)
+{
+ return ah ? container_of(ah, struct rxe_ah, ibah) : NULL;
+}
+
+static inline struct rxe_srq *to_rsrq(struct ib_srq *srq)
+{
+ return srq ? container_of(srq, struct rxe_srq, ibsrq) : NULL;
+}
+
+static inline struct rxe_qp *to_rqp(struct ib_qp *qp)
+{
+ return qp ? container_of(qp, struct rxe_qp, ibqp) : NULL;
+}
+
+static inline struct rxe_cq *to_rcq(struct ib_cq *cq)
+{
+ return cq ? container_of(cq, struct rxe_cq, ibcq) : NULL;
+}
+
+static inline struct rxe_mr *to_rmr(struct ib_mr *mr)
+{
+ return mr ? container_of(mr, struct rxe_mr, ibmr) : NULL;
+}
+
+static inline struct rxe_mw *to_rmw(struct ib_mw *mw)
+{
+ return mw ? container_of(mw, struct rxe_mw, ibmw) : NULL;
+}
+
+static inline struct rxe_pd *rxe_ah_pd(struct rxe_ah *ah)
+{
+ return to_rpd(ah->ibah.pd);
+}
+
+static inline struct rxe_pd *mr_pd(struct rxe_mr *mr)
+{
+ return to_rpd(mr->ibmr.pd);
+}
+
+static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw)
+{
+ return to_rpd(mw->ibmw.pd);
+}
+
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name);
+
+#endif /* RXE_VERBS_H */
diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig
new file mode 100644
index 0000000000..81b70a3eeb
--- /dev/null
+++ b/drivers/infiniband/sw/siw/Kconfig
@@ -0,0 +1,21 @@
+config RDMA_SIW
+ tristate "Software RDMA over TCP/IP (iWARP) driver"
+ depends on INET && INFINIBAND
+ depends on INFINIBAND_VIRT_DMA
+ select LIBCRC32C
+ select CRYPTO
+ select CRYPTO_CRC32C
+ help
+ This driver implements the iWARP RDMA transport over
+ the Linux TCP/IP network stack. It enables a system with a
+ standard Ethernet adapter to interoperate with a iWARP
+ adapter or with another system running the SIW driver.
+ (See also RXE which is a similar software driver for RoCE.)
+
+ The driver interfaces with the Linux RDMA stack and
+ implements both a kernel and user space RDMA verbs API.
+ The user space verbs API requires a support
+ library named libsiw which is loaded by the generic user
+ space verbs API, libibverbs. To implement RDMA over
+ TCP/IP, the driver further interfaces with the Linux
+ in-kernel TCP socket layer.
diff --git a/drivers/infiniband/sw/siw/Makefile b/drivers/infiniband/sw/siw/Makefile
new file mode 100644
index 0000000000..f5f7e38678
--- /dev/null
+++ b/drivers/infiniband/sw/siw/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_RDMA_SIW) += siw.o
+
+siw-y := \
+ siw_cm.o \
+ siw_cq.o \
+ siw_main.o \
+ siw_mem.o \
+ siw_qp.o \
+ siw_qp_tx.o \
+ siw_qp_rx.o \
+ siw_verbs.o
diff --git a/drivers/infiniband/sw/siw/iwarp.h b/drivers/infiniband/sw/siw/iwarp.h
new file mode 100644
index 0000000000..3f1dedb50a
--- /dev/null
+++ b/drivers/infiniband/sw/siw/iwarp.h
@@ -0,0 +1,367 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _IWARP_H
+#define _IWARP_H
+
+#include <rdma/rdma_user_cm.h> /* RDMA_MAX_PRIVATE_DATA */
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+#define RDMAP_VERSION 1
+#define DDP_VERSION 1
+#define MPA_REVISION_1 1
+#define MPA_REVISION_2 2
+#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+#define MPA_IRD_ORD_MASK 0x3fff
+
+struct mpa_rr_params {
+ __be16 bits;
+ __be16 pd_len;
+};
+
+/*
+ * MPA request/response header bits & fields
+ */
+enum {
+ MPA_RR_FLAG_MARKERS = cpu_to_be16(0x8000),
+ MPA_RR_FLAG_CRC = cpu_to_be16(0x4000),
+ MPA_RR_FLAG_REJECT = cpu_to_be16(0x2000),
+ MPA_RR_FLAG_ENHANCED = cpu_to_be16(0x1000),
+ MPA_RR_FLAG_GSO_EXP = cpu_to_be16(0x0800),
+ MPA_RR_MASK_REVISION = cpu_to_be16(0x00ff)
+};
+
+/*
+ * MPA request/reply header
+ */
+struct mpa_rr {
+ __u8 key[16];
+ struct mpa_rr_params params;
+};
+
+static inline void __mpa_rr_set_revision(__be16 *bits, u8 rev)
+{
+ *bits = (*bits & ~MPA_RR_MASK_REVISION) |
+ (cpu_to_be16(rev) & MPA_RR_MASK_REVISION);
+}
+
+static inline u8 __mpa_rr_revision(__be16 mpa_rr_bits)
+{
+ __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION;
+
+ return be16_to_cpu(rev);
+}
+
+enum mpa_v2_ctrl {
+ MPA_V2_PEER_TO_PEER = cpu_to_be16(0x8000),
+ MPA_V2_ZERO_LENGTH_RTR = cpu_to_be16(0x4000),
+ MPA_V2_RDMA_WRITE_RTR = cpu_to_be16(0x8000),
+ MPA_V2_RDMA_READ_RTR = cpu_to_be16(0x4000),
+ MPA_V2_RDMA_NO_RTR = cpu_to_be16(0x0000),
+ MPA_V2_MASK_IRD_ORD = cpu_to_be16(0x3fff)
+};
+
+struct mpa_v2_data {
+ __be16 ird;
+ __be16 ord;
+};
+
+struct mpa_marker {
+ __be16 rsvd;
+ __be16 fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */
+};
+
+/*
+ * maximum MPA trailer
+ */
+struct mpa_trailer {
+ __u8 pad[4];
+ __be32 crc;
+};
+
+#define MPA_HDR_SIZE 2
+#define MPA_CRC_SIZE 4
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for any FPDU
+ */
+struct iwarp_ctrl {
+ __be16 mpa_len;
+ __be16 ddp_rdmap_ctrl;
+};
+
+/*
+ * DDP/RDMAP Hdr bits & fields
+ */
+enum {
+ DDP_FLAG_TAGGED = cpu_to_be16(0x8000),
+ DDP_FLAG_LAST = cpu_to_be16(0x4000),
+ DDP_MASK_RESERVED = cpu_to_be16(0x3C00),
+ DDP_MASK_VERSION = cpu_to_be16(0x0300),
+ RDMAP_MASK_VERSION = cpu_to_be16(0x00C0),
+ RDMAP_MASK_RESERVED = cpu_to_be16(0x0030),
+ RDMAP_MASK_OPCODE = cpu_to_be16(0x000f)
+};
+
+static inline u8 __ddp_get_version(struct iwarp_ctrl *ctrl)
+{
+ return be16_to_cpu(ctrl->ddp_rdmap_ctrl & DDP_MASK_VERSION) >> 8;
+}
+
+static inline u8 __rdmap_get_version(struct iwarp_ctrl *ctrl)
+{
+ __be16 ver = ctrl->ddp_rdmap_ctrl & RDMAP_MASK_VERSION;
+
+ return be16_to_cpu(ver) >> 6;
+}
+
+static inline u8 __rdmap_get_opcode(struct iwarp_ctrl *ctrl)
+{
+ return be16_to_cpu(ctrl->ddp_rdmap_ctrl & RDMAP_MASK_OPCODE);
+}
+
+static inline void __rdmap_set_opcode(struct iwarp_ctrl *ctrl, u8 opcode)
+{
+ ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_OPCODE) |
+ (cpu_to_be16(opcode) & RDMAP_MASK_OPCODE);
+}
+
+struct iwarp_rdma_write {
+ struct iwarp_ctrl ctrl;
+ __be32 sink_stag;
+ __be64 sink_to;
+};
+
+struct iwarp_rdma_rreq {
+ struct iwarp_ctrl ctrl;
+ __be32 rsvd;
+ __be32 ddp_qn;
+ __be32 ddp_msn;
+ __be32 ddp_mo;
+ __be32 sink_stag;
+ __be64 sink_to;
+ __be32 read_size;
+ __be32 source_stag;
+ __be64 source_to;
+};
+
+struct iwarp_rdma_rresp {
+ struct iwarp_ctrl ctrl;
+ __be32 sink_stag;
+ __be64 sink_to;
+};
+
+struct iwarp_send {
+ struct iwarp_ctrl ctrl;
+ __be32 rsvd;
+ __be32 ddp_qn;
+ __be32 ddp_msn;
+ __be32 ddp_mo;
+};
+
+struct iwarp_send_inv {
+ struct iwarp_ctrl ctrl;
+ __be32 inval_stag;
+ __be32 ddp_qn;
+ __be32 ddp_msn;
+ __be32 ddp_mo;
+};
+
+struct iwarp_terminate {
+ struct iwarp_ctrl ctrl;
+ __be32 rsvd;
+ __be32 ddp_qn;
+ __be32 ddp_msn;
+ __be32 ddp_mo;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __be32 layer : 4;
+ __be32 etype : 4;
+ __be32 ecode : 8;
+ __be32 flag_m : 1;
+ __be32 flag_d : 1;
+ __be32 flag_r : 1;
+ __be32 reserved : 13;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __be32 reserved : 13;
+ __be32 flag_r : 1;
+ __be32 flag_d : 1;
+ __be32 flag_m : 1;
+ __be32 ecode : 8;
+ __be32 etype : 4;
+ __be32 layer : 4;
+#else
+#error "undefined byte order"
+#endif
+};
+
+/*
+ * Terminate Hdr bits & fields
+ */
+enum {
+ TERM_MASK_LAYER = cpu_to_be32(0xf0000000),
+ TERM_MASK_ETYPE = cpu_to_be32(0x0f000000),
+ TERM_MASK_ECODE = cpu_to_be32(0x00ff0000),
+ TERM_FLAG_M = cpu_to_be32(0x00008000),
+ TERM_FLAG_D = cpu_to_be32(0x00004000),
+ TERM_FLAG_R = cpu_to_be32(0x00002000),
+ TERM_MASK_RESVD = cpu_to_be32(0x00001fff)
+};
+
+static inline u8 __rdmap_term_layer(struct iwarp_terminate *term)
+{
+ return term->layer;
+}
+
+static inline void __rdmap_term_set_layer(struct iwarp_terminate *term,
+ u8 layer)
+{
+ term->layer = layer & 0xf;
+}
+
+static inline u8 __rdmap_term_etype(struct iwarp_terminate *term)
+{
+ return term->etype;
+}
+
+static inline void __rdmap_term_set_etype(struct iwarp_terminate *term,
+ u8 etype)
+{
+ term->etype = etype & 0xf;
+}
+
+static inline u8 __rdmap_term_ecode(struct iwarp_terminate *term)
+{
+ return term->ecode;
+}
+
+static inline void __rdmap_term_set_ecode(struct iwarp_terminate *term,
+ u8 ecode)
+{
+ term->ecode = ecode;
+}
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying an untagged DDP segment
+ */
+struct iwarp_ctrl_untagged {
+ struct iwarp_ctrl ctrl;
+ __be32 rsvd;
+ __be32 ddp_qn;
+ __be32 ddp_msn;
+ __be32 ddp_mo;
+};
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying a tagged DDP segment
+ */
+struct iwarp_ctrl_tagged {
+ struct iwarp_ctrl ctrl;
+ __be32 ddp_stag;
+ __be64 ddp_to;
+};
+
+union iwarp_hdr {
+ struct iwarp_ctrl ctrl;
+ struct iwarp_ctrl_untagged c_untagged;
+ struct iwarp_ctrl_tagged c_tagged;
+ struct iwarp_rdma_write rwrite;
+ struct iwarp_rdma_rreq rreq;
+ struct iwarp_rdma_rresp rresp;
+ struct iwarp_terminate terminate;
+ struct iwarp_send send;
+ struct iwarp_send_inv send_inv;
+};
+
+enum term_elayer {
+ TERM_ERROR_LAYER_RDMAP = 0x00,
+ TERM_ERROR_LAYER_DDP = 0x01,
+ TERM_ERROR_LAYER_LLP = 0x02 /* eg., MPA */
+};
+
+enum ddp_etype {
+ DDP_ETYPE_CATASTROPHIC = 0x0,
+ DDP_ETYPE_TAGGED_BUF = 0x1,
+ DDP_ETYPE_UNTAGGED_BUF = 0x2,
+ DDP_ETYPE_RSVD = 0x3
+};
+
+enum ddp_ecode {
+ /* unspecified, set to zero */
+ DDP_ECODE_CATASTROPHIC = 0x00,
+ /* Tagged Buffer Errors */
+ DDP_ECODE_T_INVALID_STAG = 0x00,
+ DDP_ECODE_T_BASE_BOUNDS = 0x01,
+ DDP_ECODE_T_STAG_NOT_ASSOC = 0x02,
+ DDP_ECODE_T_TO_WRAP = 0x03,
+ DDP_ECODE_T_VERSION = 0x04,
+ /* Untagged Buffer Errors */
+ DDP_ECODE_UT_INVALID_QN = 0x01,
+ DDP_ECODE_UT_INVALID_MSN_NOBUF = 0x02,
+ DDP_ECODE_UT_INVALID_MSN_RANGE = 0x03,
+ DDP_ECODE_UT_INVALID_MO = 0x04,
+ DDP_ECODE_UT_MSG_TOOLONG = 0x05,
+ DDP_ECODE_UT_VERSION = 0x06
+};
+
+enum rdmap_untagged_qn {
+ RDMAP_UNTAGGED_QN_SEND = 0,
+ RDMAP_UNTAGGED_QN_RDMA_READ = 1,
+ RDMAP_UNTAGGED_QN_TERMINATE = 2,
+ RDMAP_UNTAGGED_QN_COUNT = 3
+};
+
+enum rdmap_etype {
+ RDMAP_ETYPE_CATASTROPHIC = 0x0,
+ RDMAP_ETYPE_REMOTE_PROTECTION = 0x1,
+ RDMAP_ETYPE_REMOTE_OPERATION = 0x2
+};
+
+enum rdmap_ecode {
+ RDMAP_ECODE_INVALID_STAG = 0x00,
+ RDMAP_ECODE_BASE_BOUNDS = 0x01,
+ RDMAP_ECODE_ACCESS_RIGHTS = 0x02,
+ RDMAP_ECODE_STAG_NOT_ASSOC = 0x03,
+ RDMAP_ECODE_TO_WRAP = 0x04,
+ RDMAP_ECODE_VERSION = 0x05,
+ RDMAP_ECODE_OPCODE = 0x06,
+ RDMAP_ECODE_CATASTROPHIC_STREAM = 0x07,
+ RDMAP_ECODE_CATASTROPHIC_GLOBAL = 0x08,
+ RDMAP_ECODE_CANNOT_INVALIDATE = 0x09,
+ RDMAP_ECODE_UNSPECIFIED = 0xff
+};
+
+enum llp_ecode {
+ LLP_ECODE_TCP_STREAM_LOST = 0x01, /* How to transfer this ?? */
+ LLP_ECODE_RECEIVED_CRC = 0x02,
+ LLP_ECODE_FPDU_START = 0x03,
+ LLP_ECODE_INVALID_REQ_RESP = 0x04,
+
+ /* Errors for Enhanced Connection Establishment only */
+ LLP_ECODE_LOCAL_CATASTROPHIC = 0x05,
+ LLP_ECODE_INSUFFICIENT_IRD = 0x06,
+ LLP_ECODE_NO_MATCHING_RTR = 0x07
+};
+
+enum llp_etype { LLP_ETYPE_MPA = 0x00 };
+
+enum rdma_opcode {
+ RDMAP_RDMA_WRITE = 0x0,
+ RDMAP_RDMA_READ_REQ = 0x1,
+ RDMAP_RDMA_READ_RESP = 0x2,
+ RDMAP_SEND = 0x3,
+ RDMAP_SEND_INVAL = 0x4,
+ RDMAP_SEND_SE = 0x5,
+ RDMAP_SEND_SE_INVAL = 0x6,
+ RDMAP_TERMINATE = 0x7,
+ RDMAP_NOT_SUPPORTED = RDMAP_TERMINATE + 1
+};
+
+#endif
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
new file mode 100644
index 0000000000..58dddb143b
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -0,0 +1,729 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_H
+#define _SIW_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/restrack.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <crypto/hash.h>
+#include <linux/crc32.h>
+#include <linux/crc32c.h>
+
+#include <rdma/siw-abi.h>
+#include "iwarp.h"
+
+#define SIW_VENDOR_ID 0x626d74 /* ascii 'bmt' for now */
+#define SIW_VENDORT_PART_ID 0
+#define SIW_MAX_QP (1024 * 100)
+#define SIW_MAX_QP_WR (1024 * 32)
+#define SIW_MAX_ORD_QP 128
+#define SIW_MAX_IRD_QP 128
+#define SIW_MAX_SGE_PBL 256 /* max num sge's for PBL */
+#define SIW_MAX_SGE_RD 1 /* iwarp limitation. we could relax */
+#define SIW_MAX_CQ (1024 * 100)
+#define SIW_MAX_CQE (SIW_MAX_QP_WR * 100)
+#define SIW_MAX_MR (SIW_MAX_QP * 10)
+#define SIW_MAX_PD SIW_MAX_QP
+#define SIW_MAX_MW 0 /* to be set if MW's are supported */
+#define SIW_MAX_SRQ SIW_MAX_QP
+#define SIW_MAX_SRQ_WR (SIW_MAX_QP_WR * 10)
+#define SIW_MAX_CONTEXT SIW_MAX_PD
+
+/* Min number of bytes for using zero copy transmit */
+#define SENDPAGE_THRESH PAGE_SIZE
+
+/* Maximum number of frames which can be send in one SQ processing */
+#define SQ_USER_MAXBURST 100
+
+/* Maximum number of consecutive IRQ elements which get served
+ * if SQ has pending work. Prevents starving local SQ processing
+ * by serving peer Read Requests.
+ */
+#define SIW_IRQ_MAXBURST_SQ_ACTIVE 4
+
+struct siw_dev_cap {
+ int max_qp;
+ int max_qp_wr;
+ int max_ord; /* max. outbound read queue depth */
+ int max_ird; /* max. inbound read queue depth */
+ int max_sge;
+ int max_sge_rd;
+ int max_cq;
+ int max_cqe;
+ int max_mr;
+ int max_pd;
+ int max_mw;
+ int max_srq;
+ int max_srq_wr;
+ int max_srq_sge;
+};
+
+struct siw_pd {
+ struct ib_pd base_pd;
+};
+
+struct siw_device {
+ struct ib_device base_dev;
+ struct net_device *netdev;
+ struct siw_dev_cap attrs;
+
+ u32 vendor_part_id;
+ int numa_node;
+ char raw_gid[ETH_ALEN];
+
+ /* physical port state (only one port per device) */
+ enum ib_port_state state;
+
+ spinlock_t lock;
+
+ struct xarray qp_xa;
+ struct xarray mem_xa;
+
+ struct list_head cep_list;
+ struct list_head qp_list;
+
+ /* active objects statistics to enforce limits */
+ atomic_t num_qp;
+ atomic_t num_cq;
+ atomic_t num_pd;
+ atomic_t num_mr;
+ atomic_t num_srq;
+ atomic_t num_ctx;
+
+ struct work_struct netdev_down;
+};
+
+struct siw_ucontext {
+ struct ib_ucontext base_ucontext;
+ struct siw_device *sdev;
+};
+
+/*
+ * The RDMA core does not define LOCAL_READ access, which is always
+ * enabled implictely.
+ */
+#define IWARP_ACCESS_MASK \
+ (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | \
+ IB_ACCESS_REMOTE_READ)
+
+/*
+ * siw presentation of user memory registered as source
+ * or target of RDMA operations.
+ */
+
+struct siw_page_chunk {
+ struct page **plist;
+};
+
+struct siw_umem {
+ struct siw_page_chunk *page_chunk;
+ int num_pages;
+ bool writable;
+ u64 fp_addr; /* First page base address */
+ struct mm_struct *owning_mm;
+};
+
+struct siw_pble {
+ dma_addr_t addr; /* Address of assigned buffer */
+ unsigned int size; /* Size of this entry */
+ unsigned long pbl_off; /* Total offset from start of PBL */
+};
+
+struct siw_pbl {
+ unsigned int num_buf;
+ unsigned int max_buf;
+ struct siw_pble pbe[];
+};
+
+/*
+ * Generic memory representation for registered siw memory.
+ * Memory lookup always via higher 24 bit of STag (STag index).
+ */
+struct siw_mem {
+ struct siw_device *sdev;
+ struct kref ref;
+ u64 va; /* VA of memory */
+ u64 len; /* length of the memory buffer in bytes */
+ u32 stag; /* iWarp memory access steering tag */
+ u8 stag_valid; /* VALID or INVALID */
+ u8 is_pbl; /* PBL or user space mem */
+ u8 is_mw; /* Memory Region or Memory Window */
+ enum ib_access_flags perms; /* local/remote READ & WRITE */
+ union {
+ struct siw_umem *umem;
+ struct siw_pbl *pbl;
+ void *mem_obj;
+ };
+ struct ib_pd *pd;
+};
+
+struct siw_mr {
+ struct ib_mr base_mr;
+ struct siw_mem *mem;
+ struct rcu_head rcu;
+};
+
+/*
+ * Error codes for local or remote
+ * access to registered memory
+ */
+enum siw_access_state {
+ E_ACCESS_OK,
+ E_STAG_INVALID,
+ E_BASE_BOUNDS,
+ E_ACCESS_PERM,
+ E_PD_MISMATCH
+};
+
+enum siw_wr_state {
+ SIW_WR_IDLE,
+ SIW_WR_QUEUED, /* processing has not started yet */
+ SIW_WR_INPROGRESS /* initiated processing of the WR */
+};
+
+/* The WQE currently being processed (RX or TX) */
+struct siw_wqe {
+ /* Copy of applications SQE or RQE */
+ union {
+ struct siw_sqe sqe;
+ struct siw_rqe rqe;
+ };
+ struct siw_mem *mem[SIW_MAX_SGE]; /* per sge's resolved mem */
+ enum siw_wr_state wr_status;
+ enum siw_wc_status wc_status;
+ u32 bytes; /* total bytes to process */
+ u32 processed; /* bytes processed */
+};
+
+struct siw_cq {
+ struct ib_cq base_cq;
+ spinlock_t lock;
+ struct siw_cq_ctrl *notify;
+ struct siw_cqe *queue;
+ u32 cq_put;
+ u32 cq_get;
+ u32 num_cqe;
+ struct rdma_user_mmap_entry *cq_entry; /* mmap info for CQE array */
+ u32 id; /* For debugging only */
+};
+
+enum siw_qp_state {
+ SIW_QP_STATE_IDLE,
+ SIW_QP_STATE_RTR,
+ SIW_QP_STATE_RTS,
+ SIW_QP_STATE_CLOSING,
+ SIW_QP_STATE_TERMINATE,
+ SIW_QP_STATE_ERROR,
+ SIW_QP_STATE_COUNT
+};
+
+enum siw_qp_flags {
+ SIW_RDMA_BIND_ENABLED = (1 << 0),
+ SIW_RDMA_WRITE_ENABLED = (1 << 1),
+ SIW_RDMA_READ_ENABLED = (1 << 2),
+ SIW_SIGNAL_ALL_WR = (1 << 3),
+ SIW_MPA_CRC = (1 << 4),
+ SIW_QP_IN_DESTROY = (1 << 5)
+};
+
+enum siw_qp_attr_mask {
+ SIW_QP_ATTR_STATE = (1 << 0),
+ SIW_QP_ATTR_ACCESS_FLAGS = (1 << 1),
+ SIW_QP_ATTR_LLP_HANDLE = (1 << 2),
+ SIW_QP_ATTR_ORD = (1 << 3),
+ SIW_QP_ATTR_IRD = (1 << 4),
+ SIW_QP_ATTR_SQ_SIZE = (1 << 5),
+ SIW_QP_ATTR_RQ_SIZE = (1 << 6),
+ SIW_QP_ATTR_MPA = (1 << 7)
+};
+
+struct siw_srq {
+ struct ib_srq base_srq;
+ spinlock_t lock;
+ u32 max_sge;
+ u32 limit; /* low watermark for async event */
+ struct siw_rqe *recvq;
+ u32 rq_put;
+ u32 rq_get;
+ u32 num_rqe; /* max # of wqe's allowed */
+ struct rdma_user_mmap_entry *srq_entry; /* mmap info for SRQ array */
+ bool armed:1; /* inform user if limit hit */
+ bool is_kernel_res:1; /* true if kernel client */
+};
+
+struct siw_qp_attrs {
+ enum siw_qp_state state;
+ u32 sq_size;
+ u32 rq_size;
+ u32 orq_size;
+ u32 irq_size;
+ u32 sq_max_sges;
+ u32 rq_max_sges;
+ enum siw_qp_flags flags;
+
+ struct socket *sk;
+};
+
+enum siw_tx_ctx {
+ SIW_SEND_HDR, /* start or continue sending HDR */
+ SIW_SEND_DATA, /* start or continue sending DDP payload */
+ SIW_SEND_TRAILER, /* start or continue sending TRAILER */
+ SIW_SEND_SHORT_FPDU/* send whole FPDU hdr|data|trailer at once */
+};
+
+enum siw_rx_state {
+ SIW_GET_HDR, /* await new hdr or within hdr */
+ SIW_GET_DATA_START, /* start of inbound DDP payload */
+ SIW_GET_DATA_MORE, /* continuation of (misaligned) DDP payload */
+ SIW_GET_TRAILER/* await new trailer or within trailer */
+};
+
+struct siw_rx_stream {
+ struct sk_buff *skb;
+ int skb_new; /* pending unread bytes in skb */
+ int skb_offset; /* offset in skb */
+ int skb_copied; /* processed bytes in skb */
+
+ union iwarp_hdr hdr;
+ struct mpa_trailer trailer;
+
+ enum siw_rx_state state;
+
+ /*
+ * For each FPDU, main RX loop runs through 3 stages:
+ * Receiving protocol headers, placing DDP payload and receiving
+ * trailer information (CRC + possibly padding).
+ * Next two variables keep state on receive status of the
+ * current FPDU part (hdr, data, trailer).
+ */
+ int fpdu_part_rcvd; /* bytes in pkt part copied */
+ int fpdu_part_rem; /* bytes in pkt part not seen */
+
+ /*
+ * Next expected DDP MSN for each QN +
+ * expected steering tag +
+ * expected DDP tagget offset (all HBO)
+ */
+ u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+ u32 ddp_stag;
+ u64 ddp_to;
+ u32 inval_stag; /* Stag to be invalidated */
+
+ struct shash_desc *mpa_crc_hd;
+ u8 rx_suspend : 1;
+ u8 pad : 2; /* # of pad bytes expected */
+ u8 rdmap_op : 4; /* opcode of current frame */
+};
+
+struct siw_rx_fpdu {
+ /*
+ * Local destination memory of inbound RDMA operation.
+ * Valid, according to wqe->wr_status
+ */
+ struct siw_wqe wqe_active;
+
+ unsigned int pbl_idx; /* Index into current PBL */
+ unsigned int sge_idx; /* current sge in rx */
+ unsigned int sge_off; /* already rcvd in curr. sge */
+
+ char first_ddp_seg; /* this is the first DDP seg */
+ char more_ddp_segs; /* more DDP segs expected */
+ u8 prev_rdmap_op : 4; /* opcode of prev frame */
+};
+
+/*
+ * Shorthands for short packets w/o payload
+ * to be transmitted more efficient.
+ */
+struct siw_send_pkt {
+ struct iwarp_send send;
+ __be32 crc;
+};
+
+struct siw_write_pkt {
+ struct iwarp_rdma_write write;
+ __be32 crc;
+};
+
+struct siw_rreq_pkt {
+ struct iwarp_rdma_rreq rreq;
+ __be32 crc;
+};
+
+struct siw_rresp_pkt {
+ struct iwarp_rdma_rresp rresp;
+ __be32 crc;
+};
+
+struct siw_iwarp_tx {
+ union {
+ union iwarp_hdr hdr;
+
+ /* Generic part of FPDU header */
+ struct iwarp_ctrl ctrl;
+ struct iwarp_ctrl_untagged c_untagged;
+ struct iwarp_ctrl_tagged c_tagged;
+
+ /* FPDU headers */
+ struct iwarp_rdma_write rwrite;
+ struct iwarp_rdma_rreq rreq;
+ struct iwarp_rdma_rresp rresp;
+ struct iwarp_terminate terminate;
+ struct iwarp_send send;
+ struct iwarp_send_inv send_inv;
+
+ /* complete short FPDUs */
+ struct siw_send_pkt send_pkt;
+ struct siw_write_pkt write_pkt;
+ struct siw_rreq_pkt rreq_pkt;
+ struct siw_rresp_pkt rresp_pkt;
+ } pkt;
+
+ struct mpa_trailer trailer;
+ /* DDP MSN for untagged messages */
+ u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+
+ enum siw_tx_ctx state;
+ u16 ctrl_len; /* ddp+rdmap hdr */
+ u16 ctrl_sent;
+ int burst;
+ int bytes_unsent; /* ddp payload bytes */
+
+ struct shash_desc *mpa_crc_hd;
+
+ u8 do_crc : 1; /* do crc for segment */
+ u8 use_sendpage : 1; /* send w/o copy */
+ u8 tx_suspend : 1; /* stop sending DDP segs. */
+ u8 pad : 2; /* # pad in current fpdu */
+ u8 orq_fence : 1; /* ORQ full or Send fenced */
+ u8 in_syscall : 1; /* TX out of user context */
+ u8 zcopy_tx : 1; /* Use TCP_SENDPAGE if possible */
+ u8 gso_seg_limit; /* Maximum segments for GSO, 0 = unbound */
+
+ u16 fpdu_len; /* len of FPDU to tx */
+ unsigned int tcp_seglen; /* remaining tcp seg space */
+
+ struct siw_wqe wqe_active;
+
+ int pbl_idx; /* Index into current PBL */
+ int sge_idx; /* current sge in tx */
+ u32 sge_off; /* already sent in curr. sge */
+};
+
+struct siw_qp {
+ struct ib_qp base_qp;
+ struct siw_device *sdev;
+ struct kref ref;
+ struct completion qp_free;
+ struct list_head devq;
+ int tx_cpu;
+ struct siw_qp_attrs attrs;
+
+ struct siw_cep *cep;
+ struct rw_semaphore state_lock;
+
+ struct ib_pd *pd;
+ struct siw_cq *scq;
+ struct siw_cq *rcq;
+ struct siw_srq *srq;
+
+ struct siw_iwarp_tx tx_ctx; /* Transmit context */
+ spinlock_t sq_lock;
+ struct siw_sqe *sendq; /* send queue element array */
+ uint32_t sq_get; /* consumer index into sq array */
+ uint32_t sq_put; /* kernel prod. index into sq array */
+ struct llist_node tx_list;
+
+ struct siw_sqe *orq; /* outbound read queue element array */
+ spinlock_t orq_lock;
+ uint32_t orq_get; /* consumer index into orq array */
+ uint32_t orq_put; /* shared producer index for ORQ */
+
+ struct siw_rx_stream rx_stream;
+ struct siw_rx_fpdu *rx_fpdu;
+ struct siw_rx_fpdu rx_tagged;
+ struct siw_rx_fpdu rx_untagged;
+ spinlock_t rq_lock;
+ struct siw_rqe *recvq; /* recv queue element array */
+ uint32_t rq_get; /* consumer index into rq array */
+ uint32_t rq_put; /* kernel prod. index into rq array */
+
+ struct siw_sqe *irq; /* inbound read queue element array */
+ uint32_t irq_get; /* consumer index into irq array */
+ uint32_t irq_put; /* producer index into irq array */
+ int irq_burst;
+
+ struct { /* information to be carried in TERMINATE pkt, if valid */
+ u8 valid;
+ u8 in_tx;
+ u8 layer : 4, etype : 4;
+ u8 ecode;
+ } term_info;
+ struct rdma_user_mmap_entry *sq_entry; /* mmap info for SQE array */
+ struct rdma_user_mmap_entry *rq_entry; /* mmap info for RQE array */
+ struct rcu_head rcu;
+};
+
+/* helper macros */
+#define rx_qp(rx) container_of(rx, struct siw_qp, rx_stream)
+#define tx_qp(tx) container_of(tx, struct siw_qp, tx_ctx)
+#define tx_wqe(qp) (&(qp)->tx_ctx.wqe_active)
+#define rx_wqe(rctx) (&(rctx)->wqe_active)
+#define rx_mem(rctx) ((rctx)->wqe_active.mem[0])
+#define tx_type(wqe) ((wqe)->sqe.opcode)
+#define rx_type(wqe) ((wqe)->rqe.opcode)
+#define tx_flags(wqe) ((wqe)->sqe.flags)
+
+struct iwarp_msg_info {
+ int hdr_len;
+ struct iwarp_ctrl ctrl;
+ int (*rx_data)(struct siw_qp *qp);
+};
+
+struct siw_user_mmap_entry {
+ struct rdma_user_mmap_entry rdma_entry;
+ void *address;
+};
+
+/* Global siw parameters. Currently set in siw_main.c */
+extern const bool zcopy_tx;
+extern const bool try_gso;
+extern const bool loopback_enabled;
+extern const bool mpa_crc_required;
+extern const bool mpa_crc_strict;
+extern const bool siw_tcp_nagle;
+extern u_char mpa_version;
+extern const bool peer_to_peer;
+extern struct task_struct *siw_tx_thread[];
+
+extern struct crypto_shash *siw_crypto_shash;
+extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1];
+
+/* QP general functions */
+int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attr,
+ enum siw_qp_attr_mask mask);
+int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl);
+void siw_qp_llp_close(struct siw_qp *qp);
+void siw_qp_cm_drop(struct siw_qp *qp, int schedule);
+void siw_send_terminate(struct siw_qp *qp);
+
+void siw_qp_get_ref(struct ib_qp *qp);
+void siw_qp_put_ref(struct ib_qp *qp);
+int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp);
+void siw_free_qp(struct kref *ref);
+
+void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer,
+ u8 etype, u8 ecode, int in_tx);
+enum ddp_ecode siw_tagged_error(enum siw_access_state state);
+enum rdmap_ecode siw_rdmap_error(enum siw_access_state state);
+
+void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe);
+int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
+ enum siw_wc_status status);
+int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
+ u32 inval_stag, enum siw_wc_status status);
+void siw_qp_llp_data_ready(struct sock *sk);
+void siw_qp_llp_write_space(struct sock *sk);
+
+/* QP TX path functions */
+int siw_create_tx_threads(void);
+void siw_stop_tx_threads(void);
+int siw_run_sq(void *arg);
+int siw_qp_sq_process(struct siw_qp *qp);
+int siw_sq_start(struct siw_qp *qp);
+int siw_activate_tx(struct siw_qp *qp);
+int siw_get_tx_cpu(struct siw_device *sdev);
+void siw_put_tx_cpu(int cpu);
+
+/* QP RX path functions */
+int siw_proc_send(struct siw_qp *qp);
+int siw_proc_rreq(struct siw_qp *qp);
+int siw_proc_rresp(struct siw_qp *qp);
+int siw_proc_write(struct siw_qp *qp);
+int siw_proc_terminate(struct siw_qp *qp);
+
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int off, size_t len);
+
+static inline void set_rx_fpdu_context(struct siw_qp *qp, u8 opcode)
+{
+ if (opcode == RDMAP_RDMA_WRITE || opcode == RDMAP_RDMA_READ_RESP)
+ qp->rx_fpdu = &qp->rx_tagged;
+ else
+ qp->rx_fpdu = &qp->rx_untagged;
+
+ qp->rx_stream.rdmap_op = opcode;
+}
+
+static inline struct siw_ucontext *to_siw_ctx(struct ib_ucontext *base_ctx)
+{
+ return container_of(base_ctx, struct siw_ucontext, base_ucontext);
+}
+
+static inline struct siw_qp *to_siw_qp(struct ib_qp *base_qp)
+{
+ return container_of(base_qp, struct siw_qp, base_qp);
+}
+
+static inline struct siw_cq *to_siw_cq(struct ib_cq *base_cq)
+{
+ return container_of(base_cq, struct siw_cq, base_cq);
+}
+
+static inline struct siw_srq *to_siw_srq(struct ib_srq *base_srq)
+{
+ return container_of(base_srq, struct siw_srq, base_srq);
+}
+
+static inline struct siw_device *to_siw_dev(struct ib_device *base_dev)
+{
+ return container_of(base_dev, struct siw_device, base_dev);
+}
+
+static inline struct siw_mr *to_siw_mr(struct ib_mr *base_mr)
+{
+ return container_of(base_mr, struct siw_mr, base_mr);
+}
+
+static inline struct siw_user_mmap_entry *
+to_siw_mmap_entry(struct rdma_user_mmap_entry *rdma_mmap)
+{
+ return container_of(rdma_mmap, struct siw_user_mmap_entry, rdma_entry);
+}
+
+static inline struct siw_qp *siw_qp_id2obj(struct siw_device *sdev, int id)
+{
+ struct siw_qp *qp;
+
+ rcu_read_lock();
+ qp = xa_load(&sdev->qp_xa, id);
+ if (likely(qp && kref_get_unless_zero(&qp->ref))) {
+ rcu_read_unlock();
+ return qp;
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+static inline u32 qp_id(struct siw_qp *qp)
+{
+ return qp->base_qp.qp_num;
+}
+
+static inline void siw_qp_get(struct siw_qp *qp)
+{
+ kref_get(&qp->ref);
+}
+
+static inline void siw_qp_put(struct siw_qp *qp)
+{
+ kref_put(&qp->ref, siw_free_qp);
+}
+
+static inline int siw_sq_empty(struct siw_qp *qp)
+{
+ struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+
+ return READ_ONCE(sqe->flags) == 0;
+}
+
+static inline struct siw_sqe *sq_get_next(struct siw_qp *qp)
+{
+ struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+
+ if (READ_ONCE(sqe->flags) & SIW_WQE_VALID)
+ return sqe;
+
+ return NULL;
+}
+
+static inline struct siw_sqe *orq_get_current(struct siw_qp *qp)
+{
+ return &qp->orq[qp->orq_get % qp->attrs.orq_size];
+}
+
+static inline struct siw_sqe *orq_get_free(struct siw_qp *qp)
+{
+ struct siw_sqe *orq_e = &qp->orq[qp->orq_put % qp->attrs.orq_size];
+
+ if (READ_ONCE(orq_e->flags) == 0)
+ return orq_e;
+
+ return NULL;
+}
+
+static inline int siw_orq_empty(struct siw_qp *qp)
+{
+ return qp->orq[qp->orq_get % qp->attrs.orq_size].flags == 0 ? 1 : 0;
+}
+
+static inline struct siw_sqe *irq_alloc_free(struct siw_qp *qp)
+{
+ struct siw_sqe *irq_e = &qp->irq[qp->irq_put % qp->attrs.irq_size];
+
+ if (READ_ONCE(irq_e->flags) == 0) {
+ qp->irq_put++;
+ return irq_e;
+ }
+ return NULL;
+}
+
+static inline __wsum siw_csum_update(const void *buff, int len, __wsum sum)
+{
+ return (__force __wsum)crc32c((__force __u32)sum, buff, len);
+}
+
+static inline __wsum siw_csum_combine(__wsum csum, __wsum csum2, int offset,
+ int len)
+{
+ return (__force __wsum)__crc32c_le_combine((__force __u32)csum,
+ (__force __u32)csum2, len);
+}
+
+static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len)
+{
+ const struct skb_checksum_ops siw_cs_ops = {
+ .update = siw_csum_update,
+ .combine = siw_csum_combine,
+ };
+ __wsum crc = *(u32 *)shash_desc_ctx(srx->mpa_crc_hd);
+
+ crc = __skb_checksum(srx->skb, srx->skb_offset, len, crc,
+ &siw_cs_ops);
+ *(u32 *)shash_desc_ctx(srx->mpa_crc_hd) = crc;
+}
+
+#define siw_dbg(ibdev, fmt, ...) \
+ ibdev_dbg(ibdev, "%s: " fmt, __func__, ##__VA_ARGS__)
+
+#define siw_dbg_qp(qp, fmt, ...) \
+ ibdev_dbg(&qp->sdev->base_dev, "QP[%u] %s: " fmt, qp_id(qp), __func__, \
+ ##__VA_ARGS__)
+
+#define siw_dbg_cq(cq, fmt, ...) \
+ ibdev_dbg(cq->base_cq.device, "CQ[%u] %s: " fmt, cq->id, __func__, \
+ ##__VA_ARGS__)
+
+#define siw_dbg_pd(pd, fmt, ...) \
+ ibdev_dbg(pd->device, "PD[%u] %s: " fmt, pd->res.id, __func__, \
+ ##__VA_ARGS__)
+
+#define siw_dbg_mem(mem, fmt, ...) \
+ ibdev_dbg(&mem->sdev->base_dev, \
+ "MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__)
+
+#define siw_dbg_cep(cep, fmt, ...) \
+ ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%pK] %s: " fmt, \
+ cep, __func__, ##__VA_ARGS__)
+
+void siw_cq_flush(struct siw_cq *cq);
+void siw_sq_flush(struct siw_qp *qp);
+void siw_rq_flush(struct siw_qp *qp);
+int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc);
+
+#endif
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
new file mode 100644
index 0000000000..43e776073f
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -0,0 +1,1970 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Fredy Neeser */
+/* Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/inet.h>
+#include <linux/tcp.h>
+#include <trace/events/sock.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+
+/*
+ * Set to any combination of
+ * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR
+ */
+static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR;
+static const bool relaxed_ird_negotiation = true;
+
+static void siw_cm_llp_state_change(struct sock *s);
+static void siw_cm_llp_data_ready(struct sock *s);
+static void siw_cm_llp_write_space(struct sock *s);
+static void siw_cm_llp_error_report(struct sock *s);
+static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
+ int status);
+
+static void siw_sk_assign_cm_upcalls(struct sock *sk)
+{
+ write_lock_bh(&sk->sk_callback_lock);
+ sk->sk_state_change = siw_cm_llp_state_change;
+ sk->sk_data_ready = siw_cm_llp_data_ready;
+ sk->sk_write_space = siw_cm_llp_write_space;
+ sk->sk_error_report = siw_cm_llp_error_report;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_sk_save_upcalls(struct sock *sk)
+{
+ struct siw_cep *cep = sk_to_cep(sk);
+
+ write_lock_bh(&sk->sk_callback_lock);
+ cep->sk_state_change = sk->sk_state_change;
+ cep->sk_data_ready = sk->sk_data_ready;
+ cep->sk_write_space = sk->sk_write_space;
+ cep->sk_error_report = sk->sk_error_report;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
+{
+ sk->sk_state_change = cep->sk_state_change;
+ sk->sk_data_ready = cep->sk_data_ready;
+ sk->sk_write_space = cep->sk_write_space;
+ sk->sk_error_report = cep->sk_error_report;
+ sk->sk_user_data = NULL;
+}
+
+static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp)
+{
+ struct socket *s = cep->sock;
+ struct sock *sk = s->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ qp->attrs.sk = s;
+ sk->sk_data_ready = siw_qp_llp_data_ready;
+ sk->sk_write_space = siw_qp_llp_write_space;
+
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_socket_disassoc(struct socket *s)
+{
+ struct sock *sk = s->sk;
+ struct siw_cep *cep;
+
+ if (sk) {
+ write_lock_bh(&sk->sk_callback_lock);
+ cep = sk_to_cep(sk);
+ if (cep) {
+ siw_sk_restore_upcalls(sk, cep);
+ siw_cep_put(cep);
+ } else {
+ pr_warn("siw: cannot restore sk callbacks: no ep\n");
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ } else {
+ pr_warn("siw: cannot restore sk callbacks: no sk\n");
+ }
+}
+
+static void siw_rtr_data_ready(struct sock *sk)
+{
+ struct siw_cep *cep;
+ struct siw_qp *qp = NULL;
+ read_descriptor_t rd_desc;
+
+ trace_sk_data_ready(sk);
+
+ read_lock(&sk->sk_callback_lock);
+
+ cep = sk_to_cep(sk);
+ if (!cep) {
+ WARN(1, "No connection endpoint\n");
+ goto out;
+ }
+ qp = sk_to_qp(sk);
+
+ memset(&rd_desc, 0, sizeof(rd_desc));
+ rd_desc.arg.data = qp;
+ rd_desc.count = 1;
+
+ tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+ /*
+ * Check if first frame was successfully processed.
+ * Signal connection full establishment if yes.
+ * Failed data processing would have already scheduled
+ * connection drop.
+ */
+ if (!qp->rx_stream.rx_suspend)
+ siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
+out:
+ read_unlock(&sk->sk_callback_lock);
+ if (qp)
+ siw_qp_socket_assoc(cep, qp);
+}
+
+static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep)
+{
+ struct sock *sk = cep->sock->sk;
+
+ write_lock_bh(&sk->sk_callback_lock);
+ sk->sk_data_ready = siw_rtr_data_ready;
+ sk->sk_write_space = siw_qp_llp_write_space;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
+{
+ cep->sock = s;
+ siw_cep_get(cep);
+ s->sk->sk_user_data = cep;
+
+ siw_sk_save_upcalls(s->sk);
+ siw_sk_assign_cm_upcalls(s->sk);
+}
+
+static struct siw_cep *siw_cep_alloc(struct siw_device *sdev)
+{
+ struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
+ unsigned long flags;
+
+ if (!cep)
+ return NULL;
+
+ INIT_LIST_HEAD(&cep->listenq);
+ INIT_LIST_HEAD(&cep->devq);
+ INIT_LIST_HEAD(&cep->work_freelist);
+
+ kref_init(&cep->ref);
+ cep->state = SIW_EPSTATE_IDLE;
+ init_waitqueue_head(&cep->waitq);
+ spin_lock_init(&cep->lock);
+ cep->sdev = sdev;
+ cep->enhanced_rdma_conn_est = false;
+
+ spin_lock_irqsave(&sdev->lock, flags);
+ list_add_tail(&cep->devq, &sdev->cep_list);
+ spin_unlock_irqrestore(&sdev->lock, flags);
+
+ siw_dbg_cep(cep, "new endpoint\n");
+ return cep;
+}
+
+static void siw_cm_free_work(struct siw_cep *cep)
+{
+ struct list_head *w, *tmp;
+ struct siw_cm_work *work;
+
+ list_for_each_safe(w, tmp, &cep->work_freelist) {
+ work = list_entry(w, struct siw_cm_work, list);
+ list_del(&work->list);
+ kfree(work);
+ }
+}
+
+static void siw_cancel_mpatimer(struct siw_cep *cep)
+{
+ spin_lock_bh(&cep->lock);
+ if (cep->mpa_timer) {
+ if (cancel_delayed_work(&cep->mpa_timer->work)) {
+ siw_cep_put(cep);
+ kfree(cep->mpa_timer); /* not needed again */
+ }
+ cep->mpa_timer = NULL;
+ }
+ spin_unlock_bh(&cep->lock);
+}
+
+static void siw_put_work(struct siw_cm_work *work)
+{
+ INIT_LIST_HEAD(&work->list);
+ spin_lock_bh(&work->cep->lock);
+ list_add(&work->list, &work->cep->work_freelist);
+ spin_unlock_bh(&work->cep->lock);
+}
+
+static void siw_cep_set_inuse(struct siw_cep *cep)
+{
+ unsigned long flags;
+retry:
+ spin_lock_irqsave(&cep->lock, flags);
+
+ if (cep->in_use) {
+ spin_unlock_irqrestore(&cep->lock, flags);
+ wait_event_interruptible(cep->waitq, !cep->in_use);
+ if (signal_pending(current))
+ flush_signals(current);
+ goto retry;
+ } else {
+ cep->in_use = 1;
+ spin_unlock_irqrestore(&cep->lock, flags);
+ }
+}
+
+static void siw_cep_set_free(struct siw_cep *cep)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cep->lock, flags);
+ cep->in_use = 0;
+ spin_unlock_irqrestore(&cep->lock, flags);
+
+ wake_up(&cep->waitq);
+}
+
+static void __siw_cep_dealloc(struct kref *ref)
+{
+ struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
+ struct siw_device *sdev = cep->sdev;
+ unsigned long flags;
+
+ WARN_ON(cep->listen_cep);
+
+ /* kfree(NULL) is safe */
+ kfree(cep->mpa.pdata);
+ spin_lock_bh(&cep->lock);
+ if (!list_empty(&cep->work_freelist))
+ siw_cm_free_work(cep);
+ spin_unlock_bh(&cep->lock);
+
+ spin_lock_irqsave(&sdev->lock, flags);
+ list_del(&cep->devq);
+ spin_unlock_irqrestore(&sdev->lock, flags);
+
+ siw_dbg_cep(cep, "free endpoint\n");
+ kfree(cep);
+}
+
+static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
+{
+ struct siw_cm_work *work = NULL;
+
+ spin_lock_bh(&cep->lock);
+ if (!list_empty(&cep->work_freelist)) {
+ work = list_entry(cep->work_freelist.next, struct siw_cm_work,
+ list);
+ list_del_init(&work->list);
+ }
+ spin_unlock_bh(&cep->lock);
+ return work;
+}
+
+static int siw_cm_alloc_work(struct siw_cep *cep, int num)
+{
+ struct siw_cm_work *work;
+
+ while (num--) {
+ work = kmalloc(sizeof(*work), GFP_KERNEL);
+ if (!work) {
+ if (!(list_empty(&cep->work_freelist)))
+ siw_cm_free_work(cep);
+ return -ENOMEM;
+ }
+ work->cep = cep;
+ INIT_LIST_HEAD(&work->list);
+ list_add(&work->list, &cep->work_freelist);
+ }
+ return 0;
+}
+
+/*
+ * siw_cm_upcall()
+ *
+ * Upcall to IWCM to inform about async connection events
+ */
+static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
+ int status)
+{
+ struct iw_cm_event event;
+ struct iw_cm_id *id;
+
+ memset(&event, 0, sizeof(event));
+ event.status = status;
+ event.event = reason;
+
+ if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+ event.provider_data = cep;
+ id = cep->listen_cep->cm_id;
+ } else {
+ id = cep->cm_id;
+ }
+ /* Signal IRD and ORD */
+ if (reason == IW_CM_EVENT_ESTABLISHED ||
+ reason == IW_CM_EVENT_CONNECT_REPLY) {
+ /* Signal negotiated IRD/ORD values we will use */
+ event.ird = cep->ird;
+ event.ord = cep->ord;
+ } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+ event.ird = cep->ord;
+ event.ord = cep->ird;
+ }
+ /* Signal private data and address information */
+ if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
+ reason == IW_CM_EVENT_CONNECT_REPLY) {
+ u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
+
+ if (pd_len) {
+ /*
+ * hand over MPA private data
+ */
+ event.private_data_len = pd_len;
+ event.private_data = cep->mpa.pdata;
+
+ /* Hide MPA V2 IRD/ORD control */
+ if (cep->enhanced_rdma_conn_est) {
+ event.private_data_len -=
+ sizeof(struct mpa_v2_data);
+ event.private_data +=
+ sizeof(struct mpa_v2_data);
+ }
+ }
+ getname_local(cep->sock, &event.local_addr);
+ getname_peer(cep->sock, &event.remote_addr);
+ }
+ siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n",
+ cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status);
+
+ return id->event_handler(id, &event);
+}
+
+/*
+ * siw_qp_cm_drop()
+ *
+ * Drops established LLP connection if present and not already
+ * scheduled for dropping. Called from user context, SQ workqueue
+ * or receive IRQ. Caller signals if socket can be immediately
+ * closed (basically, if not in IRQ).
+ */
+void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
+{
+ struct siw_cep *cep = qp->cep;
+
+ qp->rx_stream.rx_suspend = 1;
+ qp->tx_ctx.tx_suspend = 1;
+
+ if (!qp->cep)
+ return;
+
+ if (schedule) {
+ siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
+ } else {
+ siw_cep_set_inuse(cep);
+
+ if (cep->state == SIW_EPSTATE_CLOSED) {
+ siw_dbg_cep(cep, "already closed\n");
+ goto out;
+ }
+ siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);
+
+ if (qp->term_info.valid)
+ siw_send_terminate(qp);
+
+ if (cep->cm_id) {
+ switch (cep->state) {
+ case SIW_EPSTATE_AWAIT_MPAREP:
+ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+ -EINVAL);
+ break;
+
+ case SIW_EPSTATE_RDMA_MODE:
+ siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+ break;
+
+ case SIW_EPSTATE_IDLE:
+ case SIW_EPSTATE_LISTENING:
+ case SIW_EPSTATE_CONNECTING:
+ case SIW_EPSTATE_AWAIT_MPAREQ:
+ case SIW_EPSTATE_RECVD_MPAREQ:
+ case SIW_EPSTATE_CLOSED:
+ default:
+ break;
+ }
+ cep->cm_id->rem_ref(cep->cm_id);
+ cep->cm_id = NULL;
+ siw_cep_put(cep);
+ }
+ cep->state = SIW_EPSTATE_CLOSED;
+
+ if (cep->sock) {
+ siw_socket_disassoc(cep->sock);
+ /*
+ * Immediately close socket
+ */
+ sock_release(cep->sock);
+ cep->sock = NULL;
+ }
+ if (cep->qp) {
+ cep->qp = NULL;
+ siw_qp_put(qp);
+ }
+out:
+ siw_cep_set_free(cep);
+ }
+}
+
+void siw_cep_put(struct siw_cep *cep)
+{
+ WARN_ON(kref_read(&cep->ref) < 1);
+ kref_put(&cep->ref, __siw_cep_dealloc);
+}
+
+void siw_cep_get(struct siw_cep *cep)
+{
+ kref_get(&cep->ref);
+}
+
+/*
+ * Expects params->pd_len in host byte order
+ */
+static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len)
+{
+ struct socket *s = cep->sock;
+ struct mpa_rr *rr = &cep->mpa.hdr;
+ struct kvec iov[3];
+ struct msghdr msg;
+ int rv;
+ int iovec_num = 0;
+ int mpa_len;
+
+ memset(&msg, 0, sizeof(msg));
+
+ iov[iovec_num].iov_base = rr;
+ iov[iovec_num].iov_len = sizeof(*rr);
+ mpa_len = sizeof(*rr);
+
+ if (cep->enhanced_rdma_conn_est) {
+ iovec_num++;
+ iov[iovec_num].iov_base = &cep->mpa.v2_ctrl;
+ iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl);
+ mpa_len += sizeof(cep->mpa.v2_ctrl);
+ }
+ if (pd_len) {
+ iovec_num++;
+ iov[iovec_num].iov_base = (char *)pdata;
+ iov[iovec_num].iov_len = pd_len;
+ mpa_len += pd_len;
+ }
+ if (cep->enhanced_rdma_conn_est)
+ pd_len += sizeof(cep->mpa.v2_ctrl);
+
+ rr->params.pd_len = cpu_to_be16(pd_len);
+
+ rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len);
+
+ return rv < 0 ? rv : 0;
+}
+
+/*
+ * Receive MPA Request/Reply header.
+ *
+ * Returns 0 if complete MPA Request/Reply header including
+ * eventual private data was received. Returns -EAGAIN if
+ * header was partially received or negative error code otherwise.
+ *
+ * Context: May be called in process context only
+ */
+static int siw_recv_mpa_rr(struct siw_cep *cep)
+{
+ struct mpa_rr *hdr = &cep->mpa.hdr;
+ struct socket *s = cep->sock;
+ u16 pd_len;
+ int rcvd, to_rcv;
+
+ if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
+ rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd,
+ sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd,
+ 0);
+ if (rcvd <= 0)
+ return -ECONNABORTED;
+
+ cep->mpa.bytes_rcvd += rcvd;
+
+ if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
+ return -EAGAIN;
+
+ if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA)
+ return -EPROTO;
+ }
+ pd_len = be16_to_cpu(hdr->params.pd_len);
+
+ /*
+ * At least the MPA Request/Reply header (frame not including
+ * private data) has been received.
+ * Receive (or continue receiving) any private data.
+ */
+ to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
+
+ if (!to_rcv) {
+ /*
+ * We must have hdr->params.pd_len == 0 and thus received a
+ * complete MPA Request/Reply frame.
+ * Check against peer protocol violation.
+ */
+ u32 word;
+
+ rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT);
+ if (rcvd == -EAGAIN)
+ return 0;
+
+ if (rcvd == 0) {
+ siw_dbg_cep(cep, "peer EOF\n");
+ return -EPIPE;
+ }
+ if (rcvd < 0) {
+ siw_dbg_cep(cep, "error: %d\n", rcvd);
+ return rcvd;
+ }
+ siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);
+
+ return -EPROTO;
+ }
+
+ /*
+ * At this point, we must have hdr->params.pd_len != 0.
+ * A private data buffer gets allocated if hdr->params.pd_len != 0.
+ */
+ if (!cep->mpa.pdata) {
+ cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
+ if (!cep->mpa.pdata)
+ return -ENOMEM;
+ }
+ rcvd = ksock_recv(
+ s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
+ to_rcv + 4, MSG_DONTWAIT);
+
+ if (rcvd < 0)
+ return rcvd;
+
+ if (rcvd > to_rcv)
+ return -EPROTO;
+
+ cep->mpa.bytes_rcvd += rcvd;
+
+ if (to_rcv == rcvd) {
+ siw_dbg_cep(cep, "%d bytes private data received\n", pd_len);
+ return 0;
+ }
+ return -EAGAIN;
+}
+
+/*
+ * siw_proc_mpareq()
+ *
+ * Read MPA Request from socket and signal new connection to IWCM
+ * if success. Caller must hold lock on corresponding listening CEP.
+ */
+static int siw_proc_mpareq(struct siw_cep *cep)
+{
+ struct mpa_rr *req;
+ int version, rv;
+ u16 pd_len;
+
+ rv = siw_recv_mpa_rr(cep);
+ if (rv)
+ return rv;
+
+ req = &cep->mpa.hdr;
+
+ version = __mpa_rr_revision(req->params.bits);
+ pd_len = be16_to_cpu(req->params.pd_len);
+
+ if (version > MPA_REVISION_2)
+ /* allow for 0, 1, and 2 only */
+ return -EPROTO;
+
+ if (memcmp(req->key, MPA_KEY_REQ, 16))
+ return -EPROTO;
+
+ /* Prepare for sending MPA reply */
+ memcpy(req->key, MPA_KEY_REP, 16);
+
+ if (version == MPA_REVISION_2 &&
+ (req->params.bits & MPA_RR_FLAG_ENHANCED)) {
+ /*
+ * MPA version 2 must signal IRD/ORD values and P2P mode
+ * in private data if header flag MPA_RR_FLAG_ENHANCED
+ * is set.
+ */
+ if (pd_len < sizeof(struct mpa_v2_data))
+ goto reject_conn;
+
+ cep->enhanced_rdma_conn_est = true;
+ }
+
+ /* MPA Markers: currently not supported. Marker TX to be added. */
+ if (req->params.bits & MPA_RR_FLAG_MARKERS)
+ goto reject_conn;
+
+ if (req->params.bits & MPA_RR_FLAG_CRC) {
+ /*
+ * RFC 5044, page 27: CRC MUST be used if peer requests it.
+ * siw specific: 'mpa_crc_strict' parameter to reject
+ * connection with CRC if local CRC off enforced by
+ * 'mpa_crc_strict' module parameter.
+ */
+ if (!mpa_crc_required && mpa_crc_strict)
+ goto reject_conn;
+
+ /* Enable CRC if requested by module parameter */
+ if (mpa_crc_required)
+ req->params.bits |= MPA_RR_FLAG_CRC;
+ }
+ if (cep->enhanced_rdma_conn_est) {
+ struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;
+
+ /*
+ * Peer requested ORD becomes requested local IRD,
+ * peer requested IRD becomes requested local ORD.
+ * IRD and ORD get limited by global maximum values.
+ */
+ cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
+ cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
+ cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
+ cep->ird = min(cep->ird, SIW_MAX_IRD_QP);
+
+ /* May get overwritten by locally negotiated values */
+ cep->mpa.v2_ctrl.ird = htons(cep->ird);
+ cep->mpa.v2_ctrl.ord = htons(cep->ord);
+
+ /*
+ * Support for peer sent zero length Write or Read to
+ * let local side enter RTS. Writes are preferred.
+ * Sends would require pre-posting a Receive and are
+ * not supported.
+ * Propose zero length Write if none of Read and Write
+ * is indicated.
+ */
+ if (v2->ird & MPA_V2_PEER_TO_PEER) {
+ cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
+
+ if (v2->ord & MPA_V2_RDMA_WRITE_RTR)
+ cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
+ else if (v2->ord & MPA_V2_RDMA_READ_RTR)
+ cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR;
+ else
+ cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
+ }
+ }
+
+ cep->state = SIW_EPSTATE_RECVD_MPAREQ;
+
+ /* Keep reference until IWCM accepts/rejects */
+ siw_cep_get(cep);
+ rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
+ if (rv)
+ siw_cep_put(cep);
+
+ return rv;
+
+reject_conn:
+ siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n",
+ req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
+ mpa_crc_required, mpa_crc_strict,
+ req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
+
+ req->params.bits &= ~MPA_RR_FLAG_MARKERS;
+ req->params.bits |= MPA_RR_FLAG_REJECT;
+
+ if (!mpa_crc_required && mpa_crc_strict)
+ req->params.bits &= ~MPA_RR_FLAG_CRC;
+
+ if (pd_len)
+ kfree(cep->mpa.pdata);
+
+ cep->mpa.pdata = NULL;
+
+ siw_send_mpareqrep(cep, NULL, 0);
+
+ return -EOPNOTSUPP;
+}
+
+static int siw_proc_mpareply(struct siw_cep *cep)
+{
+ struct siw_qp_attrs qp_attrs;
+ enum siw_qp_attr_mask qp_attr_mask;
+ struct siw_qp *qp = cep->qp;
+ struct mpa_rr *rep;
+ int rv;
+ u16 rep_ord;
+ u16 rep_ird;
+ bool ird_insufficient = false;
+ enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
+
+ rv = siw_recv_mpa_rr(cep);
+ if (rv)
+ goto out_err;
+
+ siw_cancel_mpatimer(cep);
+
+ rep = &cep->mpa.hdr;
+
+ if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
+ /* allow for 0, 1, and 2 only */
+ rv = -EPROTO;
+ goto out_err;
+ }
+ if (memcmp(rep->key, MPA_KEY_REP, 16)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA,
+ LLP_ECODE_INVALID_REQ_RESP, 0);
+ siw_send_terminate(qp);
+ rv = -EPROTO;
+ goto out_err;
+ }
+ if (rep->params.bits & MPA_RR_FLAG_REJECT) {
+ siw_dbg_cep(cep, "got mpa reject\n");
+ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
+
+ return -ECONNRESET;
+ }
+ if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
+ siw_dbg_cep(cep, "peer allows GSO on TX\n");
+ qp->tx_ctx.gso_seg_limit = 0;
+ }
+ if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
+ (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
+ (mpa_crc_strict && !mpa_crc_required &&
+ (rep->params.bits & MPA_RR_FLAG_CRC))) {
+ siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n",
+ rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
+ mpa_crc_required, mpa_crc_strict,
+ rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
+
+ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
+
+ return -EINVAL;
+ }
+ if (cep->enhanced_rdma_conn_est) {
+ struct mpa_v2_data *v2;
+
+ if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 ||
+ !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) {
+ /*
+ * Protocol failure: The responder MUST reply with
+ * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
+ */
+ siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
+ __mpa_rr_revision(rep->params.bits),
+ rep->params.bits & MPA_RR_FLAG_ENHANCED ?
+ 1 :
+ 0);
+
+ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+ -ECONNRESET);
+ return -EINVAL;
+ }
+ v2 = (struct mpa_v2_data *)cep->mpa.pdata;
+ rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
+ rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
+
+ if (cep->ird < rep_ord &&
+ (relaxed_ird_negotiation == false ||
+ rep_ord > cep->sdev->attrs.max_ird)) {
+ siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n",
+ cep->ird, rep_ord,
+ cep->sdev->attrs.max_ord);
+ ird_insufficient = true;
+ }
+ if (cep->ord > rep_ird && relaxed_ird_negotiation == false) {
+ siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord,
+ rep_ird);
+ ird_insufficient = true;
+ }
+ /*
+ * Always report negotiated peer values to user,
+ * even if IRD/ORD negotiation failed
+ */
+ cep->ird = rep_ord;
+ cep->ord = rep_ird;
+
+ if (ird_insufficient) {
+ /*
+ * If the initiator IRD is insuffient for the
+ * responder ORD, send a TERM.
+ */
+ siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+ LLP_ETYPE_MPA,
+ LLP_ECODE_INSUFFICIENT_IRD, 0);
+ siw_send_terminate(qp);
+ rv = -ENOMEM;
+ goto out_err;
+ }
+ if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
+ mpa_p2p_mode =
+ cep->mpa.v2_ctrl_req.ord &
+ (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);
+
+ /*
+ * Check if we requested P2P mode, and if peer agrees
+ */
+ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
+ if ((mpa_p2p_mode & v2->ord) == 0) {
+ /*
+ * We requested RTR mode(s), but the peer
+ * did not pick any mode we support.
+ */
+ siw_dbg_cep(cep,
+ "rtr mode: req %2x, got %2x\n",
+ mpa_p2p_mode,
+ v2->ord & (MPA_V2_RDMA_WRITE_RTR |
+ MPA_V2_RDMA_READ_RTR));
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+ LLP_ETYPE_MPA,
+ LLP_ECODE_NO_MATCHING_RTR,
+ 0);
+ siw_send_terminate(qp);
+ rv = -EPROTO;
+ goto out_err;
+ }
+ mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR |
+ MPA_V2_RDMA_READ_RTR);
+ }
+ }
+ memset(&qp_attrs, 0, sizeof(qp_attrs));
+
+ if (rep->params.bits & MPA_RR_FLAG_CRC)
+ qp_attrs.flags = SIW_MPA_CRC;
+
+ qp_attrs.irq_size = cep->ird;
+ qp_attrs.orq_size = cep->ord;
+ qp_attrs.sk = cep->sock;
+ qp_attrs.state = SIW_QP_STATE_RTS;
+
+ qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
+ SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA;
+
+ /* Move socket RX/TX under QP control */
+ down_write(&qp->state_lock);
+ if (qp->attrs.state > SIW_QP_STATE_RTR) {
+ rv = -EINVAL;
+ up_write(&qp->state_lock);
+ goto out_err;
+ }
+ rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask);
+
+ siw_qp_socket_assoc(cep, qp);
+
+ up_write(&qp->state_lock);
+
+ /* Send extra RDMA frame to trigger peer RTS if negotiated */
+ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
+ rv = siw_qp_mpa_rts(qp, mpa_p2p_mode);
+ if (rv)
+ goto out_err;
+ }
+ if (!rv) {
+ rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
+ if (!rv)
+ cep->state = SIW_EPSTATE_RDMA_MODE;
+
+ return 0;
+ }
+
+out_err:
+ if (rv != -EAGAIN)
+ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
+
+ return rv;
+}
+
+/*
+ * siw_accept_newconn - accept an incoming pending connection
+ *
+ */
+static void siw_accept_newconn(struct siw_cep *cep)
+{
+ struct socket *s = cep->sock;
+ struct socket *new_s = NULL;
+ struct siw_cep *new_cep = NULL;
+ int rv = 0; /* debug only. should disappear */
+
+ if (cep->state != SIW_EPSTATE_LISTENING)
+ goto error;
+
+ new_cep = siw_cep_alloc(cep->sdev);
+ if (!new_cep)
+ goto error;
+
+ /*
+ * 4: Allocate a sufficient number of work elements
+ * to allow concurrent handling of local + peer close
+ * events, MPA header processing + MPA timeout.
+ */
+ if (siw_cm_alloc_work(new_cep, 4) != 0)
+ goto error;
+
+ /*
+ * Copy saved socket callbacks from listening CEP
+ * and assign new socket with new CEP
+ */
+ new_cep->sk_state_change = cep->sk_state_change;
+ new_cep->sk_data_ready = cep->sk_data_ready;
+ new_cep->sk_write_space = cep->sk_write_space;
+ new_cep->sk_error_report = cep->sk_error_report;
+
+ rv = kernel_accept(s, &new_s, O_NONBLOCK);
+ if (rv != 0) {
+ /*
+ * Connection already aborted by peer..?
+ */
+ siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv);
+ goto error;
+ }
+ new_cep->sock = new_s;
+ siw_cep_get(new_cep);
+ new_s->sk->sk_user_data = new_cep;
+
+ if (siw_tcp_nagle == false)
+ tcp_sock_set_nodelay(new_s->sk);
+ new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
+
+ rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT);
+ if (rv)
+ goto error;
+ /*
+ * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
+ */
+ new_cep->listen_cep = cep;
+ siw_cep_get(cep);
+
+ if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
+ /*
+ * MPA REQ already queued
+ */
+ siw_dbg_cep(cep, "immediate mpa request\n");
+
+ siw_cep_set_inuse(new_cep);
+ rv = siw_proc_mpareq(new_cep);
+ if (rv != -EAGAIN) {
+ siw_cep_put(cep);
+ new_cep->listen_cep = NULL;
+ if (rv) {
+ siw_cancel_mpatimer(new_cep);
+ siw_cep_set_free(new_cep);
+ goto error;
+ }
+ }
+ siw_cep_set_free(new_cep);
+ }
+ return;
+
+error:
+ if (new_cep)
+ siw_cep_put(new_cep);
+
+ if (new_s) {
+ siw_socket_disassoc(new_s);
+ sock_release(new_s);
+ new_cep->sock = NULL;
+ }
+ siw_dbg_cep(cep, "error %d\n", rv);
+}
+
+static void siw_cm_work_handler(struct work_struct *w)
+{
+ struct siw_cm_work *work;
+ struct siw_cep *cep;
+ int release_cep = 0, rv = 0;
+
+ work = container_of(w, struct siw_cm_work, work.work);
+ cep = work->cep;
+
+ siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n",
+ cep->qp ? qp_id(cep->qp) : UINT_MAX,
+ work->type, cep->state);
+
+ siw_cep_set_inuse(cep);
+
+ switch (work->type) {
+ case SIW_CM_WORK_ACCEPT:
+ siw_accept_newconn(cep);
+ break;
+
+ case SIW_CM_WORK_READ_MPAHDR:
+ if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+ if (cep->listen_cep) {
+ siw_cep_set_inuse(cep->listen_cep);
+
+ if (cep->listen_cep->state ==
+ SIW_EPSTATE_LISTENING)
+ rv = siw_proc_mpareq(cep);
+ else
+ rv = -EFAULT;
+
+ siw_cep_set_free(cep->listen_cep);
+
+ if (rv != -EAGAIN) {
+ siw_cep_put(cep->listen_cep);
+ cep->listen_cep = NULL;
+ if (rv)
+ siw_cep_put(cep);
+ }
+ }
+ } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+ rv = siw_proc_mpareply(cep);
+ } else {
+ /*
+ * CEP already moved out of MPA handshake.
+ * any connection management already done.
+ * silently ignore the mpa packet.
+ */
+ if (cep->state == SIW_EPSTATE_RDMA_MODE) {
+ cep->sock->sk->sk_data_ready(cep->sock->sk);
+ siw_dbg_cep(cep, "already in RDMA mode");
+ } else {
+ siw_dbg_cep(cep, "out of state: %d\n",
+ cep->state);
+ }
+ }
+ if (rv && rv != -EAGAIN)
+ release_cep = 1;
+ break;
+
+ case SIW_CM_WORK_CLOSE_LLP:
+ /*
+ * QP scheduled LLP close
+ */
+ if (cep->qp && cep->qp->term_info.valid)
+ siw_send_terminate(cep->qp);
+
+ if (cep->cm_id)
+ siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+
+ release_cep = 1;
+ break;
+
+ case SIW_CM_WORK_PEER_CLOSE:
+ if (cep->cm_id) {
+ if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+ /*
+ * MPA reply not received, but connection drop
+ */
+ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+ -ECONNRESET);
+ } else if (cep->state == SIW_EPSTATE_RDMA_MODE) {
+ /*
+ * NOTE: IW_CM_EVENT_DISCONNECT is given just
+ * to transition IWCM into CLOSING.
+ */
+ siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
+ siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+ }
+ /*
+ * for other states there is no connection
+ * known to the IWCM.
+ */
+ } else {
+ if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) {
+ /*
+ * Wait for the ulp/CM to call accept/reject
+ */
+ siw_dbg_cep(cep,
+ "mpa req recvd, wait for ULP\n");
+ } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+ /*
+ * Socket close before MPA request received.
+ */
+ if (cep->listen_cep) {
+ siw_dbg_cep(cep,
+ "no mpareq: drop listener\n");
+ siw_cep_put(cep->listen_cep);
+ cep->listen_cep = NULL;
+ }
+ }
+ }
+ release_cep = 1;
+ break;
+
+ case SIW_CM_WORK_MPATIMEOUT:
+ cep->mpa_timer = NULL;
+
+ if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+ /*
+ * MPA request timed out:
+ * Hide any partially received private data and signal
+ * timeout
+ */
+ cep->mpa.hdr.params.pd_len = 0;
+
+ if (cep->cm_id)
+ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+ -ETIMEDOUT);
+ release_cep = 1;
+
+ } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+ /*
+ * No MPA request received after peer TCP stream setup.
+ */
+ if (cep->listen_cep) {
+ siw_cep_put(cep->listen_cep);
+ cep->listen_cep = NULL;
+ }
+ release_cep = 1;
+ }
+ break;
+
+ default:
+ WARN(1, "Undefined CM work type: %d\n", work->type);
+ }
+ if (release_cep) {
+ siw_dbg_cep(cep,
+ "release: timer=%s, QP[%u]\n",
+ cep->mpa_timer ? "y" : "n",
+ cep->qp ? qp_id(cep->qp) : UINT_MAX);
+
+ siw_cancel_mpatimer(cep);
+
+ cep->state = SIW_EPSTATE_CLOSED;
+
+ if (cep->qp) {
+ struct siw_qp *qp = cep->qp;
+ /*
+ * Serialize a potential race with application
+ * closing the QP and calling siw_qp_cm_drop()
+ */
+ siw_qp_get(qp);
+ siw_cep_set_free(cep);
+
+ siw_qp_llp_close(qp);
+ siw_qp_put(qp);
+
+ siw_cep_set_inuse(cep);
+ cep->qp = NULL;
+ siw_qp_put(qp);
+ }
+ if (cep->sock) {
+ siw_socket_disassoc(cep->sock);
+ sock_release(cep->sock);
+ cep->sock = NULL;
+ }
+ if (cep->cm_id) {
+ cep->cm_id->rem_ref(cep->cm_id);
+ cep->cm_id = NULL;
+ siw_cep_put(cep);
+ }
+ }
+ siw_cep_set_free(cep);
+ siw_put_work(work);
+ siw_cep_put(cep);
+}
+
+static struct workqueue_struct *siw_cm_wq;
+
+int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
+{
+ struct siw_cm_work *work = siw_get_work(cep);
+ unsigned long delay = 0;
+
+ if (!work) {
+ siw_dbg_cep(cep, "failed with no work available\n");
+ return -ENOMEM;
+ }
+ work->type = type;
+ work->cep = cep;
+
+ siw_cep_get(cep);
+
+ INIT_DELAYED_WORK(&work->work, siw_cm_work_handler);
+
+ if (type == SIW_CM_WORK_MPATIMEOUT) {
+ cep->mpa_timer = work;
+
+ if (cep->state == SIW_EPSTATE_AWAIT_MPAREP)
+ delay = MPAREQ_TIMEOUT;
+ else
+ delay = MPAREP_TIMEOUT;
+ }
+ siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n",
+ cep->qp ? qp_id(cep->qp) : -1, type, delay);
+
+ queue_delayed_work(siw_cm_wq, &work->work, delay);
+
+ return 0;
+}
+
+static void siw_cm_llp_data_ready(struct sock *sk)
+{
+ struct siw_cep *cep;
+
+ trace_sk_data_ready(sk);
+
+ read_lock(&sk->sk_callback_lock);
+
+ cep = sk_to_cep(sk);
+ if (!cep)
+ goto out;
+
+ siw_dbg_cep(cep, "cep state: %d, socket state %d\n",
+ cep->state, sk->sk_state);
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+
+ switch (cep->state) {
+ case SIW_EPSTATE_RDMA_MODE:
+ case SIW_EPSTATE_LISTENING:
+ break;
+
+ case SIW_EPSTATE_AWAIT_MPAREQ:
+ case SIW_EPSTATE_AWAIT_MPAREP:
+ siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR);
+ break;
+
+ default:
+ siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state);
+ break;
+ }
+out:
+ read_unlock(&sk->sk_callback_lock);
+}
+
+static void siw_cm_llp_write_space(struct sock *sk)
+{
+ struct siw_cep *cep = sk_to_cep(sk);
+
+ if (cep)
+ siw_dbg_cep(cep, "state: %d\n", cep->state);
+}
+
+static void siw_cm_llp_error_report(struct sock *sk)
+{
+ struct siw_cep *cep = sk_to_cep(sk);
+
+ if (cep) {
+ siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n",
+ sk->sk_err, sk->sk_state, cep->state);
+ cep->sk_error_report(sk);
+ }
+}
+
+static void siw_cm_llp_state_change(struct sock *sk)
+{
+ struct siw_cep *cep;
+ void (*orig_state_change)(struct sock *s);
+
+ read_lock(&sk->sk_callback_lock);
+
+ cep = sk_to_cep(sk);
+ if (!cep) {
+ /* endpoint already disassociated */
+ read_unlock(&sk->sk_callback_lock);
+ return;
+ }
+ orig_state_change = cep->sk_state_change;
+
+ siw_dbg_cep(cep, "state: %d\n", cep->state);
+
+ switch (sk->sk_state) {
+ case TCP_ESTABLISHED:
+ /*
+ * handle accepting socket as special case where only
+ * new connection is possible
+ */
+ siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
+ break;
+
+ case TCP_CLOSE:
+ case TCP_CLOSE_WAIT:
+ if (cep->qp)
+ cep->qp->tx_ctx.tx_suspend = 1;
+ siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE);
+ break;
+
+ default:
+ siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state);
+ }
+ read_unlock(&sk->sk_callback_lock);
+ orig_state_change(sk);
+}
+
+static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
+ struct sockaddr *raddr, bool afonly)
+{
+ int rv, flags = 0;
+ size_t size = laddr->sa_family == AF_INET ?
+ sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
+
+ /*
+ * Make address available again asap.
+ */
+ sock_set_reuseaddr(s->sk);
+
+ if (afonly) {
+ rv = ip6_sock_set_v6only(s->sk);
+ if (rv)
+ return rv;
+ }
+
+ rv = s->ops->bind(s, laddr, size);
+ if (rv < 0)
+ return rv;
+
+ rv = s->ops->connect(s, raddr, size, flags);
+
+ return rv < 0 ? rv : 0;
+}
+
+int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+ struct siw_device *sdev = to_siw_dev(id->device);
+ struct siw_qp *qp;
+ struct siw_cep *cep = NULL;
+ struct socket *s = NULL;
+ struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
+ *raddr = (struct sockaddr *)&id->remote_addr;
+ bool p2p_mode = peer_to_peer, v4 = true;
+ u16 pd_len = params->private_data_len;
+ int version = mpa_version, rv;
+
+ if (pd_len > MPA_MAX_PRIVDATA)
+ return -EINVAL;
+
+ if (params->ird > sdev->attrs.max_ird ||
+ params->ord > sdev->attrs.max_ord)
+ return -ENOMEM;
+
+ if (laddr->sa_family == AF_INET6)
+ v4 = false;
+ else if (laddr->sa_family != AF_INET)
+ return -EAFNOSUPPORT;
+
+ /*
+ * Respect any iwarp port mapping: Use mapped remote address
+ * if valid. Local address must not be mapped, since siw
+ * uses kernel TCP stack.
+ */
+ if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
+ to_sockaddr_in6(id->remote_addr).sin6_port != 0)
+ raddr = (struct sockaddr *)&id->m_remote_addr;
+
+ qp = siw_qp_id2obj(sdev, params->qpn);
+ if (!qp) {
+ WARN(1, "[QP %u] does not exist\n", params->qpn);
+ rv = -EINVAL;
+ goto error;
+ }
+ siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr,
+ raddr);
+
+ rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
+ if (rv < 0)
+ goto error;
+
+ /*
+ * NOTE: For simplification, connect() is called in blocking
+ * mode. Might be reconsidered for async connection setup at
+ * TCP level.
+ */
+ rv = kernel_bindconnect(s, laddr, raddr, id->afonly);
+ if (rv != 0) {
+ siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
+ goto error;
+ }
+ if (siw_tcp_nagle == false)
+ tcp_sock_set_nodelay(s->sk);
+ cep = siw_cep_alloc(sdev);
+ if (!cep) {
+ rv = -ENOMEM;
+ goto error;
+ }
+ siw_cep_set_inuse(cep);
+
+ /* Associate QP with CEP */
+ siw_cep_get(cep);
+ qp->cep = cep;
+
+ /* siw_qp_get(qp) already done by QP lookup */
+ cep->qp = qp;
+
+ id->add_ref(id);
+ cep->cm_id = id;
+
+ /*
+ * 4: Allocate a sufficient number of work elements
+ * to allow concurrent handling of local + peer close
+ * events, MPA header processing + MPA timeout.
+ */
+ rv = siw_cm_alloc_work(cep, 4);
+ if (rv != 0) {
+ rv = -ENOMEM;
+ goto error;
+ }
+ cep->ird = params->ird;
+ cep->ord = params->ord;
+
+ if (p2p_mode && cep->ord == 0)
+ cep->ord = 1;
+
+ cep->state = SIW_EPSTATE_CONNECTING;
+
+ /*
+ * Associate CEP with socket
+ */
+ siw_cep_socket_assoc(cep, s);
+
+ cep->state = SIW_EPSTATE_AWAIT_MPAREP;
+
+ /*
+ * Set MPA Request bits: CRC if required, no MPA Markers,
+ * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
+ */
+ cep->mpa.hdr.params.bits = 0;
+ if (version > MPA_REVISION_2) {
+ pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
+ version = MPA_REVISION_2;
+ /* Adjust also module parameter */
+ mpa_version = MPA_REVISION_2;
+ }
+ __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
+
+ if (try_gso)
+ cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
+
+ if (mpa_crc_required)
+ cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
+
+ /*
+ * If MPA version == 2:
+ * o Include ORD and IRD.
+ * o Indicate peer-to-peer mode, if required by module
+ * parameter 'peer_to_peer'.
+ */
+ if (version == MPA_REVISION_2) {
+ cep->enhanced_rdma_conn_est = true;
+ cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;
+
+ cep->mpa.v2_ctrl.ird = htons(cep->ird);
+ cep->mpa.v2_ctrl.ord = htons(cep->ord);
+
+ if (p2p_mode) {
+ cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
+ cep->mpa.v2_ctrl.ord |= rtr_type;
+ }
+ /* Remember own P2P mode requested */
+ cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
+ cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
+ }
+ memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);
+
+ rv = siw_send_mpareqrep(cep, params->private_data, pd_len);
+ /*
+ * Reset private data.
+ */
+ cep->mpa.hdr.params.pd_len = 0;
+
+ if (rv >= 0) {
+ rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT);
+ if (!rv) {
+ siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp));
+ siw_cep_set_free(cep);
+ return 0;
+ }
+ }
+error:
+ siw_dbg(id->device, "failed: %d\n", rv);
+
+ if (cep) {
+ siw_socket_disassoc(s);
+ sock_release(s);
+ cep->sock = NULL;
+
+ cep->qp = NULL;
+
+ cep->cm_id = NULL;
+ id->rem_ref(id);
+
+ qp->cep = NULL;
+ siw_cep_put(cep);
+
+ cep->state = SIW_EPSTATE_CLOSED;
+
+ siw_cep_set_free(cep);
+
+ siw_cep_put(cep);
+
+ } else if (s) {
+ sock_release(s);
+ }
+ if (qp)
+ siw_qp_put(qp);
+
+ return rv;
+}
+
+/*
+ * siw_accept - Let SoftiWARP accept an RDMA connection request
+ *
+ * @id: New connection management id to be used for accepted
+ * connection request
+ * @params: Connection parameters provided by ULP for accepting connection
+ *
+ * Transition QP to RTS state, associate new CM id @id with accepted CEP
+ * and get prepared for TCP input by installing socket callbacks.
+ * Then send MPA Reply and generate the "connection established" event.
+ * Socket callbacks must be installed before sending MPA Reply, because
+ * the latter may cause a first RDMA message to arrive from the RDMA Initiator
+ * side very quickly, at which time the socket callbacks must be ready.
+ */
+int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+ struct siw_device *sdev = to_siw_dev(id->device);
+ struct siw_cep *cep = (struct siw_cep *)id->provider_data;
+ struct siw_qp *qp;
+ struct siw_qp_attrs qp_attrs;
+ int rv, max_priv_data = MPA_MAX_PRIVDATA;
+ bool wait_for_peer_rts = false;
+
+ siw_cep_set_inuse(cep);
+ siw_cep_put(cep);
+
+ /* Free lingering inbound private data */
+ if (cep->mpa.hdr.params.pd_len) {
+ cep->mpa.hdr.params.pd_len = 0;
+ kfree(cep->mpa.pdata);
+ cep->mpa.pdata = NULL;
+ }
+ siw_cancel_mpatimer(cep);
+
+ if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
+ siw_dbg_cep(cep, "out of state\n");
+
+ siw_cep_set_free(cep);
+ siw_cep_put(cep);
+
+ return -ECONNRESET;
+ }
+ qp = siw_qp_id2obj(sdev, params->qpn);
+ if (!qp) {
+ WARN(1, "[QP %d] does not exist\n", params->qpn);
+ siw_cep_set_free(cep);
+ siw_cep_put(cep);
+
+ return -EINVAL;
+ }
+ down_write(&qp->state_lock);
+ if (qp->attrs.state > SIW_QP_STATE_RTR) {
+ rv = -EINVAL;
+ up_write(&qp->state_lock);
+ goto error;
+ }
+ siw_dbg_cep(cep, "[QP %d]\n", params->qpn);
+
+ if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
+ siw_dbg_cep(cep, "peer allows GSO on TX\n");
+ qp->tx_ctx.gso_seg_limit = 0;
+ }
+ if (params->ord > sdev->attrs.max_ord ||
+ params->ird > sdev->attrs.max_ird) {
+ siw_dbg_cep(
+ cep,
+ "[QP %u]: ord %d (max %d), ird %d (max %d)\n",
+ qp_id(qp), params->ord, sdev->attrs.max_ord,
+ params->ird, sdev->attrs.max_ird);
+ rv = -EINVAL;
+ up_write(&qp->state_lock);
+ goto error;
+ }
+ if (cep->enhanced_rdma_conn_est)
+ max_priv_data -= sizeof(struct mpa_v2_data);
+
+ if (params->private_data_len > max_priv_data) {
+ siw_dbg_cep(
+ cep,
+ "[QP %u]: private data length: %d (max %d)\n",
+ qp_id(qp), params->private_data_len, max_priv_data);
+ rv = -EINVAL;
+ up_write(&qp->state_lock);
+ goto error;
+ }
+ if (cep->enhanced_rdma_conn_est) {
+ if (params->ord > cep->ord) {
+ if (relaxed_ird_negotiation) {
+ params->ord = cep->ord;
+ } else {
+ cep->ird = params->ird;
+ cep->ord = params->ord;
+ rv = -EINVAL;
+ up_write(&qp->state_lock);
+ goto error;
+ }
+ }
+ if (params->ird < cep->ird) {
+ if (relaxed_ird_negotiation &&
+ cep->ird <= sdev->attrs.max_ird)
+ params->ird = cep->ird;
+ else {
+ rv = -ENOMEM;
+ up_write(&qp->state_lock);
+ goto error;
+ }
+ }
+ if (cep->mpa.v2_ctrl.ord &
+ (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR))
+ wait_for_peer_rts = true;
+ /*
+ * Signal back negotiated IRD and ORD values
+ */
+ cep->mpa.v2_ctrl.ord =
+ htons(params->ord & MPA_IRD_ORD_MASK) |
+ (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD);
+ cep->mpa.v2_ctrl.ird =
+ htons(params->ird & MPA_IRD_ORD_MASK) |
+ (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD);
+ }
+ cep->ird = params->ird;
+ cep->ord = params->ord;
+
+ cep->cm_id = id;
+ id->add_ref(id);
+
+ memset(&qp_attrs, 0, sizeof(qp_attrs));
+ qp_attrs.orq_size = cep->ord;
+ qp_attrs.irq_size = cep->ird;
+ qp_attrs.sk = cep->sock;
+ if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
+ qp_attrs.flags = SIW_MPA_CRC;
+ qp_attrs.state = SIW_QP_STATE_RTS;
+
+ siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp));
+
+ /* Associate QP with CEP */
+ siw_cep_get(cep);
+ qp->cep = cep;
+
+ /* siw_qp_get(qp) already done by QP lookup */
+ cep->qp = qp;
+
+ cep->state = SIW_EPSTATE_RDMA_MODE;
+
+ /* Move socket RX/TX under QP control */
+ rv = siw_qp_modify(qp, &qp_attrs,
+ SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
+ SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD |
+ SIW_QP_ATTR_MPA);
+ up_write(&qp->state_lock);
+
+ if (rv)
+ goto error;
+
+ siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n",
+ qp_id(qp), params->private_data_len);
+
+ rv = siw_send_mpareqrep(cep, params->private_data,
+ params->private_data_len);
+ if (rv != 0)
+ goto error;
+
+ if (wait_for_peer_rts) {
+ siw_sk_assign_rtr_upcalls(cep);
+ } else {
+ siw_qp_socket_assoc(cep, qp);
+ rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
+ if (rv)
+ goto error;
+ }
+ siw_cep_set_free(cep);
+
+ return 0;
+error:
+ siw_socket_disassoc(cep->sock);
+ sock_release(cep->sock);
+ cep->sock = NULL;
+
+ cep->state = SIW_EPSTATE_CLOSED;
+
+ if (cep->cm_id) {
+ cep->cm_id->rem_ref(id);
+ cep->cm_id = NULL;
+ }
+ if (qp->cep) {
+ siw_cep_put(cep);
+ qp->cep = NULL;
+ }
+ cep->qp = NULL;
+ siw_qp_put(qp);
+
+ siw_cep_set_free(cep);
+ siw_cep_put(cep);
+
+ return rv;
+}
+
+/*
+ * siw_reject()
+ *
+ * Local connection reject case. Send private data back to peer,
+ * close connection and dereference connection id.
+ */
+int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len)
+{
+ struct siw_cep *cep = (struct siw_cep *)id->provider_data;
+
+ siw_cep_set_inuse(cep);
+ siw_cep_put(cep);
+
+ siw_cancel_mpatimer(cep);
+
+ if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
+ siw_dbg_cep(cep, "out of state\n");
+
+ siw_cep_set_free(cep);
+ siw_cep_put(cep); /* put last reference */
+
+ return -ECONNRESET;
+ }
+ siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state,
+ pd_len);
+
+ if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) {
+ cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
+ siw_send_mpareqrep(cep, pdata, pd_len);
+ }
+ siw_socket_disassoc(cep->sock);
+ sock_release(cep->sock);
+ cep->sock = NULL;
+
+ cep->state = SIW_EPSTATE_CLOSED;
+
+ siw_cep_set_free(cep);
+ siw_cep_put(cep);
+
+ return 0;
+}
+
+/*
+ * siw_create_listen - Create resources for a listener's IWCM ID @id
+ *
+ * Starts listen on the socket address id->local_addr.
+ *
+ */
+int siw_create_listen(struct iw_cm_id *id, int backlog)
+{
+ struct socket *s;
+ struct siw_cep *cep = NULL;
+ struct siw_device *sdev = to_siw_dev(id->device);
+ int addr_family = id->local_addr.ss_family;
+ int rv = 0;
+
+ if (addr_family != AF_INET && addr_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
+ if (rv < 0)
+ return rv;
+
+ /*
+ * Allow binding local port when still in TIME_WAIT from last close.
+ */
+ sock_set_reuseaddr(s->sk);
+
+ if (addr_family == AF_INET) {
+ struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
+
+ /* For wildcard addr, limit binding to current device only */
+ if (ipv4_is_zeronet(laddr->sin_addr.s_addr))
+ s->sk->sk_bound_dev_if = sdev->netdev->ifindex;
+
+ rv = s->ops->bind(s, (struct sockaddr *)laddr,
+ sizeof(struct sockaddr_in));
+ } else {
+ struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr);
+
+ if (id->afonly) {
+ rv = ip6_sock_set_v6only(s->sk);
+ if (rv) {
+ siw_dbg(id->device,
+ "ip6_sock_set_v6only erro: %d\n", rv);
+ goto error;
+ }
+ }
+
+ /* For wildcard addr, limit binding to current device only */
+ if (ipv6_addr_any(&laddr->sin6_addr))
+ s->sk->sk_bound_dev_if = sdev->netdev->ifindex;
+
+ rv = s->ops->bind(s, (struct sockaddr *)laddr,
+ sizeof(struct sockaddr_in6));
+ }
+ if (rv) {
+ siw_dbg(id->device, "socket bind error: %d\n", rv);
+ goto error;
+ }
+ cep = siw_cep_alloc(sdev);
+ if (!cep) {
+ rv = -ENOMEM;
+ goto error;
+ }
+ siw_cep_socket_assoc(cep, s);
+
+ rv = siw_cm_alloc_work(cep, backlog);
+ if (rv) {
+ siw_dbg(id->device,
+ "alloc_work error %d, backlog %d\n",
+ rv, backlog);
+ goto error;
+ }
+ rv = s->ops->listen(s, backlog);
+ if (rv) {
+ siw_dbg(id->device, "listen error %d\n", rv);
+ goto error;
+ }
+ cep->cm_id = id;
+ id->add_ref(id);
+
+ /*
+ * In case of a wildcard rdma_listen on a multi-homed device,
+ * a listener's IWCM id is associated with more than one listening CEP.
+ *
+ * We currently use id->provider_data in three different ways:
+ *
+ * o For a listener's IWCM id, id->provider_data points to
+ * the list_head of the list of listening CEPs.
+ * Uses: siw_create_listen(), siw_destroy_listen()
+ *
+ * o For each accepted passive-side IWCM id, id->provider_data
+ * points to the CEP itself. This is a consequence of
+ * - siw_cm_upcall() setting event.provider_data = cep and
+ * - the IWCM's cm_conn_req_handler() setting provider_data of the
+ * new passive-side IWCM id equal to event.provider_data
+ * Uses: siw_accept(), siw_reject()
+ *
+ * o For an active-side IWCM id, id->provider_data is not used at all.
+ *
+ */
+ if (!id->provider_data) {
+ id->provider_data =
+ kmalloc(sizeof(struct list_head), GFP_KERNEL);
+ if (!id->provider_data) {
+ rv = -ENOMEM;
+ goto error;
+ }
+ INIT_LIST_HEAD((struct list_head *)id->provider_data);
+ }
+ list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
+ cep->state = SIW_EPSTATE_LISTENING;
+
+ siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr);
+
+ return 0;
+
+error:
+ siw_dbg(id->device, "failed: %d\n", rv);
+
+ if (cep) {
+ siw_cep_set_inuse(cep);
+
+ if (cep->cm_id) {
+ cep->cm_id->rem_ref(cep->cm_id);
+ cep->cm_id = NULL;
+ }
+ cep->sock = NULL;
+ siw_socket_disassoc(s);
+ cep->state = SIW_EPSTATE_CLOSED;
+
+ siw_cep_set_free(cep);
+ siw_cep_put(cep);
+ }
+ sock_release(s);
+
+ return rv;
+}
+
+static void siw_drop_listeners(struct iw_cm_id *id)
+{
+ struct list_head *p, *tmp;
+
+ /*
+ * In case of a wildcard rdma_listen on a multi-homed device,
+ * a listener's IWCM id is associated with more than one listening CEP.
+ */
+ list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
+ struct siw_cep *cep = list_entry(p, struct siw_cep, listenq);
+
+ list_del(p);
+
+ siw_dbg_cep(cep, "drop cep, state %d\n", cep->state);
+
+ siw_cep_set_inuse(cep);
+
+ if (cep->cm_id) {
+ cep->cm_id->rem_ref(cep->cm_id);
+ cep->cm_id = NULL;
+ }
+ if (cep->sock) {
+ siw_socket_disassoc(cep->sock);
+ sock_release(cep->sock);
+ cep->sock = NULL;
+ }
+ cep->state = SIW_EPSTATE_CLOSED;
+ siw_cep_set_free(cep);
+ siw_cep_put(cep);
+ }
+}
+
+int siw_destroy_listen(struct iw_cm_id *id)
+{
+ if (!id->provider_data) {
+ siw_dbg(id->device, "no cep(s)\n");
+ return 0;
+ }
+ siw_drop_listeners(id);
+ kfree(id->provider_data);
+ id->provider_data = NULL;
+
+ return 0;
+}
+
+int siw_cm_init(void)
+{
+ /*
+ * create_single_workqueue for strict ordering
+ */
+ siw_cm_wq = create_singlethread_workqueue("siw_cm_wq");
+ if (!siw_cm_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void siw_cm_exit(void)
+{
+ if (siw_cm_wq)
+ destroy_workqueue(siw_cm_wq);
+}
diff --git a/drivers/infiniband/sw/siw/siw_cm.h b/drivers/infiniband/sw/siw/siw_cm.h
new file mode 100644
index 0000000000..8c59cb3e28
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_cm.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#ifndef _SIW_CM_H
+#define _SIW_CM_H
+
+#include <net/sock.h>
+#include <linux/tcp.h>
+
+#include <rdma/iw_cm.h>
+
+enum siw_cep_state {
+ SIW_EPSTATE_IDLE = 1,
+ SIW_EPSTATE_LISTENING,
+ SIW_EPSTATE_CONNECTING,
+ SIW_EPSTATE_AWAIT_MPAREQ,
+ SIW_EPSTATE_RECVD_MPAREQ,
+ SIW_EPSTATE_AWAIT_MPAREP,
+ SIW_EPSTATE_RDMA_MODE,
+ SIW_EPSTATE_CLOSED
+};
+
+struct siw_mpa_info {
+ struct mpa_rr hdr; /* peer mpa hdr in host byte order */
+ struct mpa_v2_data v2_ctrl;
+ struct mpa_v2_data v2_ctrl_req;
+ char *pdata;
+ int bytes_rcvd;
+};
+
+struct siw_device;
+
+struct siw_cep {
+ struct iw_cm_id *cm_id;
+ struct siw_device *sdev;
+ struct list_head devq;
+ spinlock_t lock;
+ struct kref ref;
+ int in_use;
+ wait_queue_head_t waitq;
+ enum siw_cep_state state;
+
+ struct list_head listenq;
+ struct siw_cep *listen_cep;
+
+ struct siw_qp *qp;
+ struct socket *sock;
+
+ struct siw_cm_work *mpa_timer;
+ struct list_head work_freelist;
+
+ struct siw_mpa_info mpa;
+ int ord;
+ int ird;
+ bool enhanced_rdma_conn_est;
+
+ /* Saved upcalls of socket */
+ void (*sk_state_change)(struct sock *sk);
+ void (*sk_data_ready)(struct sock *sk);
+ void (*sk_write_space)(struct sock *sk);
+ void (*sk_error_report)(struct sock *sk);
+};
+
+/*
+ * Connection initiator waits 10 seconds to receive an
+ * MPA reply after sending out MPA request. Reponder waits for
+ * 5 seconds for MPA request to arrive if new TCP connection
+ * was set up.
+ */
+#define MPAREQ_TIMEOUT (HZ * 10)
+#define MPAREP_TIMEOUT (HZ * 5)
+
+enum siw_work_type {
+ SIW_CM_WORK_ACCEPT = 1,
+ SIW_CM_WORK_READ_MPAHDR,
+ SIW_CM_WORK_CLOSE_LLP, /* close socket */
+ SIW_CM_WORK_PEER_CLOSE, /* socket indicated peer close */
+ SIW_CM_WORK_MPATIMEOUT
+};
+
+struct siw_cm_work {
+ struct delayed_work work;
+ struct list_head list;
+ enum siw_work_type type;
+ struct siw_cep *cep;
+};
+
+#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a)))
+#define to_sockaddr_in6(a) (*(struct sockaddr_in6 *)(&(a)))
+
+static inline int getname_peer(struct socket *s, struct sockaddr_storage *a)
+{
+ return s->ops->getname(s, (struct sockaddr *)a, 1);
+}
+
+static inline int getname_local(struct socket *s, struct sockaddr_storage *a)
+{
+ return s->ops->getname(s, (struct sockaddr *)a, 0);
+}
+
+static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
+ int flags)
+{
+ struct kvec iov = { buf, size };
+ struct msghdr msg = { .msg_name = NULL, .msg_flags = flags };
+
+ return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
+}
+
+int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *parm);
+int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param);
+int siw_reject(struct iw_cm_id *id, const void *data, u8 len);
+int siw_create_listen(struct iw_cm_id *id, int backlog);
+int siw_destroy_listen(struct iw_cm_id *id);
+
+void siw_cep_get(struct siw_cep *cep);
+void siw_cep_put(struct siw_cep *cep);
+int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type);
+
+int siw_cm_init(void);
+void siw_cm_exit(void);
+
+/*
+ * TCP socket interface
+ */
+#define sk_to_qp(sk) (((struct siw_cep *)((sk)->sk_user_data))->qp)
+#define sk_to_cep(sk) ((struct siw_cep *)((sk)->sk_user_data))
+
+#endif
diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c
new file mode 100644
index 0000000000..403029de6b
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_cq.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "siw.h"
+
+static int map_wc_opcode[SIW_NUM_OPCODES] = {
+ [SIW_OP_WRITE] = IB_WC_RDMA_WRITE,
+ [SIW_OP_SEND] = IB_WC_SEND,
+ [SIW_OP_SEND_WITH_IMM] = IB_WC_SEND,
+ [SIW_OP_READ] = IB_WC_RDMA_READ,
+ [SIW_OP_READ_LOCAL_INV] = IB_WC_RDMA_READ,
+ [SIW_OP_COMP_AND_SWAP] = IB_WC_COMP_SWAP,
+ [SIW_OP_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
+ [SIW_OP_INVAL_STAG] = IB_WC_LOCAL_INV,
+ [SIW_OP_REG_MR] = IB_WC_REG_MR,
+ [SIW_OP_RECEIVE] = IB_WC_RECV,
+ [SIW_OP_READ_RESPONSE] = -1 /* not used */
+};
+
+static struct {
+ enum siw_wc_status siw;
+ enum ib_wc_status ib;
+} map_cqe_status[SIW_NUM_WC_STATUS] = {
+ { SIW_WC_SUCCESS, IB_WC_SUCCESS },
+ { SIW_WC_LOC_LEN_ERR, IB_WC_LOC_LEN_ERR },
+ { SIW_WC_LOC_PROT_ERR, IB_WC_LOC_PROT_ERR },
+ { SIW_WC_LOC_QP_OP_ERR, IB_WC_LOC_QP_OP_ERR },
+ { SIW_WC_WR_FLUSH_ERR, IB_WC_WR_FLUSH_ERR },
+ { SIW_WC_BAD_RESP_ERR, IB_WC_BAD_RESP_ERR },
+ { SIW_WC_LOC_ACCESS_ERR, IB_WC_LOC_ACCESS_ERR },
+ { SIW_WC_REM_ACCESS_ERR, IB_WC_REM_ACCESS_ERR },
+ { SIW_WC_REM_INV_REQ_ERR, IB_WC_REM_INV_REQ_ERR },
+ { SIW_WC_GENERAL_ERR, IB_WC_GENERAL_ERR }
+};
+
+/*
+ * Reap one CQE from the CQ. Only used by kernel clients
+ * during CQ normal operation. Might be called during CQ
+ * flush for user mapped CQE array as well.
+ */
+int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
+{
+ struct siw_cqe *cqe;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ cqe = &cq->queue[cq->cq_get % cq->num_cqe];
+ if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) {
+ memset(wc, 0, sizeof(*wc));
+ wc->wr_id = cqe->id;
+ wc->byte_len = cqe->bytes;
+
+ /*
+ * During CQ flush, also user land CQE's may get
+ * reaped here, which do not hold a QP reference
+ * and do not qualify for memory extension verbs.
+ */
+ if (likely(rdma_is_kernel_res(&cq->base_cq.res))) {
+ if (cqe->flags & SIW_WQE_REM_INVAL) {
+ wc->ex.invalidate_rkey = cqe->inval_stag;
+ wc->wc_flags = IB_WC_WITH_INVALIDATE;
+ }
+ wc->qp = cqe->base_qp;
+ wc->opcode = map_wc_opcode[cqe->opcode];
+ wc->status = map_cqe_status[cqe->status].ib;
+ siw_dbg_cq(cq,
+ "idx %u, type %d, flags %2x, id 0x%pK\n",
+ cq->cq_get % cq->num_cqe, cqe->opcode,
+ cqe->flags, (void *)(uintptr_t)cqe->id);
+ } else {
+ /*
+ * A malicious user may set invalid opcode or
+ * status in the user mmapped CQE array.
+ * Sanity check and correct values in that case
+ * to avoid out-of-bounds access to global arrays
+ * for opcode and status mapping.
+ */
+ u8 opcode = cqe->opcode;
+ u16 status = cqe->status;
+
+ if (opcode >= SIW_NUM_OPCODES) {
+ opcode = 0;
+ status = SIW_WC_GENERAL_ERR;
+ } else if (status >= SIW_NUM_WC_STATUS) {
+ status = SIW_WC_GENERAL_ERR;
+ }
+ wc->opcode = map_wc_opcode[opcode];
+ wc->status = map_cqe_status[status].ib;
+
+ }
+ WRITE_ONCE(cqe->flags, 0);
+ cq->cq_get++;
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return 1;
+ }
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return 0;
+}
+
+/*
+ * siw_cq_flush()
+ *
+ * Flush all CQ elements.
+ */
+void siw_cq_flush(struct siw_cq *cq)
+{
+ struct ib_wc wc;
+
+ while (siw_reap_cqe(cq, &wc))
+ ;
+}
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
new file mode 100644
index 0000000000..d4b6e01068
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -0,0 +1,587 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <net/net_namespace.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+#include <net/addrconf.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/rdma_netlink.h>
+#include <linux/kthread.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+
+MODULE_AUTHOR("Bernard Metzler");
+MODULE_DESCRIPTION("Software iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* transmit from user buffer, if possible */
+const bool zcopy_tx = true;
+
+/* Restrict usage of GSO, if hardware peer iwarp is unable to process
+ * large packets. try_gso = true lets siw try to use local GSO,
+ * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth.
+ */
+const bool try_gso;
+
+/* Attach siw also with loopback devices */
+const bool loopback_enabled = true;
+
+/* We try to negotiate CRC on, if true */
+const bool mpa_crc_required;
+
+/* MPA CRC on/off enforced */
+const bool mpa_crc_strict;
+
+/* Control TCP_NODELAY socket option */
+const bool siw_tcp_nagle;
+
+/* Select MPA version to be used during connection setup */
+u_char mpa_version = MPA_REVISION_2;
+
+/* Selects MPA P2P mode (additional handshake during connection
+ * setup, if true.
+ */
+const bool peer_to_peer;
+
+struct task_struct *siw_tx_thread[NR_CPUS];
+struct crypto_shash *siw_crypto_shash;
+
+static int siw_device_register(struct siw_device *sdev, const char *name)
+{
+ struct ib_device *base_dev = &sdev->base_dev;
+ static int dev_id = 1;
+ int rv;
+
+ sdev->vendor_part_id = dev_id++;
+
+ rv = ib_register_device(base_dev, name, NULL);
+ if (rv) {
+ pr_warn("siw: device registration error %d\n", rv);
+ return rv;
+ }
+
+ siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid);
+ return 0;
+}
+
+static void siw_device_cleanup(struct ib_device *base_dev)
+{
+ struct siw_device *sdev = to_siw_dev(base_dev);
+
+ xa_destroy(&sdev->qp_xa);
+ xa_destroy(&sdev->mem_xa);
+}
+
+static int siw_dev_qualified(struct net_device *netdev)
+{
+ /*
+ * Additional hardware support can be added here
+ * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
+ * <linux/if_arp.h> for type identifiers.
+ */
+ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 ||
+ netdev->type == ARPHRD_NONE ||
+ (netdev->type == ARPHRD_LOOPBACK && loopback_enabled))
+ return 1;
+
+ return 0;
+}
+
+static DEFINE_PER_CPU(atomic_t, siw_use_cnt);
+
+static struct {
+ struct cpumask **tx_valid_cpus;
+ int num_nodes;
+} siw_cpu_info;
+
+static int siw_init_cpulist(void)
+{
+ int i, num_nodes = nr_node_ids;
+
+ memset(siw_tx_thread, 0, sizeof(siw_tx_thread));
+
+ siw_cpu_info.num_nodes = num_nodes;
+
+ siw_cpu_info.tx_valid_cpus =
+ kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL);
+ if (!siw_cpu_info.tx_valid_cpus) {
+ siw_cpu_info.num_nodes = 0;
+ return -ENOMEM;
+ }
+ for (i = 0; i < siw_cpu_info.num_nodes; i++) {
+ siw_cpu_info.tx_valid_cpus[i] =
+ kzalloc(sizeof(struct cpumask), GFP_KERNEL);
+ if (!siw_cpu_info.tx_valid_cpus[i])
+ goto out_err;
+
+ cpumask_clear(siw_cpu_info.tx_valid_cpus[i]);
+ }
+ for_each_possible_cpu(i)
+ cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]);
+
+ return 0;
+
+out_err:
+ siw_cpu_info.num_nodes = 0;
+ while (--i >= 0)
+ kfree(siw_cpu_info.tx_valid_cpus[i]);
+ kfree(siw_cpu_info.tx_valid_cpus);
+ siw_cpu_info.tx_valid_cpus = NULL;
+
+ return -ENOMEM;
+}
+
+static void siw_destroy_cpulist(void)
+{
+ int i = 0;
+
+ while (i < siw_cpu_info.num_nodes)
+ kfree(siw_cpu_info.tx_valid_cpus[i++]);
+
+ kfree(siw_cpu_info.tx_valid_cpus);
+}
+
+/*
+ * Choose CPU with least number of active QP's from NUMA node of
+ * TX interface.
+ */
+int siw_get_tx_cpu(struct siw_device *sdev)
+{
+ const struct cpumask *tx_cpumask;
+ int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1;
+
+ if (node < 0)
+ tx_cpumask = cpu_online_mask;
+ else
+ tx_cpumask = siw_cpu_info.tx_valid_cpus[node];
+
+ num_cpus = cpumask_weight(tx_cpumask);
+ if (!num_cpus) {
+ /* no CPU on this NUMA node */
+ tx_cpumask = cpu_online_mask;
+ num_cpus = cpumask_weight(tx_cpumask);
+ }
+ if (!num_cpus)
+ goto out;
+
+ cpu = cpumask_first(tx_cpumask);
+
+ for (i = 0, min_use = SIW_MAX_QP; i < num_cpus;
+ i++, cpu = cpumask_next(cpu, tx_cpumask)) {
+ int usage;
+
+ /* Skip any cores which have no TX thread */
+ if (!siw_tx_thread[cpu])
+ continue;
+
+ usage = atomic_read(&per_cpu(siw_use_cnt, cpu));
+ if (usage <= min_use) {
+ tx_cpu = cpu;
+ min_use = usage;
+ }
+ }
+ siw_dbg(&sdev->base_dev,
+ "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use);
+
+out:
+ if (tx_cpu >= 0)
+ atomic_inc(&per_cpu(siw_use_cnt, tx_cpu));
+ else
+ pr_warn("siw: no tx cpu found\n");
+
+ return tx_cpu;
+}
+
+void siw_put_tx_cpu(int cpu)
+{
+ atomic_dec(&per_cpu(siw_use_cnt, cpu));
+}
+
+static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
+{
+ struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id);
+
+ if (qp) {
+ /*
+ * siw_qp_id2obj() increments object reference count
+ */
+ siw_qp_put(qp);
+ return &qp->base_qp;
+ }
+ return NULL;
+}
+
+static const struct ib_device_ops siw_device_ops = {
+ .owner = THIS_MODULE,
+ .uverbs_abi_ver = SIW_ABI_VERSION,
+ .driver_id = RDMA_DRIVER_SIW,
+
+ .alloc_mr = siw_alloc_mr,
+ .alloc_pd = siw_alloc_pd,
+ .alloc_ucontext = siw_alloc_ucontext,
+ .create_cq = siw_create_cq,
+ .create_qp = siw_create_qp,
+ .create_srq = siw_create_srq,
+ .dealloc_driver = siw_device_cleanup,
+ .dealloc_pd = siw_dealloc_pd,
+ .dealloc_ucontext = siw_dealloc_ucontext,
+ .dereg_mr = siw_dereg_mr,
+ .destroy_cq = siw_destroy_cq,
+ .destroy_qp = siw_destroy_qp,
+ .destroy_srq = siw_destroy_srq,
+ .get_dma_mr = siw_get_dma_mr,
+ .get_port_immutable = siw_get_port_immutable,
+ .iw_accept = siw_accept,
+ .iw_add_ref = siw_qp_get_ref,
+ .iw_connect = siw_connect,
+ .iw_create_listen = siw_create_listen,
+ .iw_destroy_listen = siw_destroy_listen,
+ .iw_get_qp = siw_get_base_qp,
+ .iw_reject = siw_reject,
+ .iw_rem_ref = siw_qp_put_ref,
+ .map_mr_sg = siw_map_mr_sg,
+ .mmap = siw_mmap,
+ .mmap_free = siw_mmap_free,
+ .modify_qp = siw_verbs_modify_qp,
+ .modify_srq = siw_modify_srq,
+ .poll_cq = siw_poll_cq,
+ .post_recv = siw_post_receive,
+ .post_send = siw_post_send,
+ .post_srq_recv = siw_post_srq_recv,
+ .query_device = siw_query_device,
+ .query_gid = siw_query_gid,
+ .query_port = siw_query_port,
+ .query_qp = siw_query_qp,
+ .query_srq = siw_query_srq,
+ .req_notify_cq = siw_req_notify_cq,
+ .reg_user_mr = siw_reg_user_mr,
+
+ INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
+ INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
+ INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp),
+ INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
+};
+
+static struct siw_device *siw_device_create(struct net_device *netdev)
+{
+ struct siw_device *sdev = NULL;
+ struct ib_device *base_dev;
+ int rv;
+
+ sdev = ib_alloc_device(siw_device, base_dev);
+ if (!sdev)
+ return NULL;
+
+ base_dev = &sdev->base_dev;
+ sdev->netdev = netdev;
+
+ if (netdev->addr_len) {
+ memcpy(sdev->raw_gid, netdev->dev_addr,
+ min_t(unsigned int, netdev->addr_len, ETH_ALEN));
+ } else {
+ /*
+ * This device does not have a HW address, but
+ * connection mangagement requires a unique gid.
+ */
+ eth_random_addr(sdev->raw_gid);
+ }
+ addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid);
+
+ base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND);
+
+ base_dev->node_type = RDMA_NODE_RNIC;
+ memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
+ sizeof(SIW_NODE_DESC_COMMON));
+
+ /*
+ * Current model (one-to-one device association):
+ * One Softiwarp device per net_device or, equivalently,
+ * per physical port.
+ */
+ base_dev->phys_port_cnt = 1;
+ base_dev->num_comp_vectors = num_possible_cpus();
+
+ xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
+ xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);
+
+ ib_set_device_ops(base_dev, &siw_device_ops);
+ rv = ib_device_set_netdev(base_dev, netdev, 1);
+ if (rv)
+ goto error;
+
+ memcpy(base_dev->iw_ifname, netdev->name,
+ sizeof(base_dev->iw_ifname));
+
+ /* Disable TCP port mapping */
+ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;
+
+ sdev->attrs.max_qp = SIW_MAX_QP;
+ sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
+ sdev->attrs.max_ord = SIW_MAX_ORD_QP;
+ sdev->attrs.max_ird = SIW_MAX_IRD_QP;
+ sdev->attrs.max_sge = SIW_MAX_SGE;
+ sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
+ sdev->attrs.max_cq = SIW_MAX_CQ;
+ sdev->attrs.max_cqe = SIW_MAX_CQE;
+ sdev->attrs.max_mr = SIW_MAX_MR;
+ sdev->attrs.max_pd = SIW_MAX_PD;
+ sdev->attrs.max_mw = SIW_MAX_MW;
+ sdev->attrs.max_srq = SIW_MAX_SRQ;
+ sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
+ sdev->attrs.max_srq_sge = SIW_MAX_SGE;
+
+ INIT_LIST_HEAD(&sdev->cep_list);
+ INIT_LIST_HEAD(&sdev->qp_list);
+
+ atomic_set(&sdev->num_ctx, 0);
+ atomic_set(&sdev->num_srq, 0);
+ atomic_set(&sdev->num_qp, 0);
+ atomic_set(&sdev->num_cq, 0);
+ atomic_set(&sdev->num_mr, 0);
+ atomic_set(&sdev->num_pd, 0);
+
+ sdev->numa_node = dev_to_node(&netdev->dev);
+ spin_lock_init(&sdev->lock);
+
+ return sdev;
+error:
+ ib_dealloc_device(base_dev);
+
+ return NULL;
+}
+
+/*
+ * Network link becomes unavailable. Mark all
+ * affected QP's accordingly.
+ */
+static void siw_netdev_down(struct work_struct *work)
+{
+ struct siw_device *sdev =
+ container_of(work, struct siw_device, netdev_down);
+
+ struct siw_qp_attrs qp_attrs;
+ struct list_head *pos, *tmp;
+
+ memset(&qp_attrs, 0, sizeof(qp_attrs));
+ qp_attrs.state = SIW_QP_STATE_ERROR;
+
+ list_for_each_safe(pos, tmp, &sdev->qp_list) {
+ struct siw_qp *qp = list_entry(pos, struct siw_qp, devq);
+
+ down_write(&qp->state_lock);
+ WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE));
+ up_write(&qp->state_lock);
+ }
+ ib_device_put(&sdev->base_dev);
+}
+
+static void siw_device_goes_down(struct siw_device *sdev)
+{
+ if (ib_device_try_get(&sdev->base_dev)) {
+ INIT_WORK(&sdev->netdev_down, siw_netdev_down);
+ schedule_work(&sdev->netdev_down);
+ }
+}
+
+static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
+ void *arg)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(arg);
+ struct ib_device *base_dev;
+ struct siw_device *sdev;
+
+ dev_dbg(&netdev->dev, "siw: event %lu\n", event);
+
+ base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
+ if (!base_dev)
+ return NOTIFY_OK;
+
+ sdev = to_siw_dev(base_dev);
+
+ switch (event) {
+ case NETDEV_UP:
+ sdev->state = IB_PORT_ACTIVE;
+ siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE);
+ break;
+
+ case NETDEV_GOING_DOWN:
+ siw_device_goes_down(sdev);
+ break;
+
+ case NETDEV_DOWN:
+ sdev->state = IB_PORT_DOWN;
+ siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
+ break;
+
+ case NETDEV_REGISTER:
+ /*
+ * Device registration now handled only by
+ * rdma netlink commands. So it shall be impossible
+ * to end up here with a valid siw device.
+ */
+ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n");
+ break;
+
+ case NETDEV_UNREGISTER:
+ ib_unregister_device_queued(&sdev->base_dev);
+ break;
+
+ case NETDEV_CHANGEADDR:
+ siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE);
+ break;
+ /*
+ * Todo: Below netdev events are currently not handled.
+ */
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGE:
+ break;
+
+ default:
+ break;
+ }
+ ib_device_put(&sdev->base_dev);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block siw_netdev_nb = {
+ .notifier_call = siw_netdev_event,
+};
+
+static int siw_newlink(const char *basedev_name, struct net_device *netdev)
+{
+ struct ib_device *base_dev;
+ struct siw_device *sdev = NULL;
+ int rv = -ENOMEM;
+
+ if (!siw_dev_qualified(netdev))
+ return -EINVAL;
+
+ base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
+ if (base_dev) {
+ ib_device_put(base_dev);
+ return -EEXIST;
+ }
+ sdev = siw_device_create(netdev);
+ if (sdev) {
+ dev_dbg(&netdev->dev, "siw: new device\n");
+
+ if (netif_running(netdev) && netif_carrier_ok(netdev))
+ sdev->state = IB_PORT_ACTIVE;
+ else
+ sdev->state = IB_PORT_DOWN;
+
+ rv = siw_device_register(sdev, basedev_name);
+ if (rv)
+ ib_dealloc_device(&sdev->base_dev);
+ }
+ return rv;
+}
+
+static struct rdma_link_ops siw_link_ops = {
+ .type = "siw",
+ .newlink = siw_newlink,
+};
+
+/*
+ * siw_init_module - Initialize Softiwarp module and register with netdev
+ * subsystem.
+ */
+static __init int siw_init_module(void)
+{
+ int rv;
+
+ if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
+ pr_info("siw: sendpage threshold too small: %u\n",
+ (int)SENDPAGE_THRESH);
+ rv = -EINVAL;
+ goto out_error;
+ }
+ rv = siw_init_cpulist();
+ if (rv)
+ goto out_error;
+
+ rv = siw_cm_init();
+ if (rv)
+ goto out_error;
+
+ if (!siw_create_tx_threads()) {
+ pr_info("siw: Could not start any TX thread\n");
+ rv = -ENOMEM;
+ goto out_error;
+ }
+ /*
+ * Locate CRC32 algorithm. If unsuccessful, fail
+ * loading siw only, if CRC is required.
+ */
+ siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(siw_crypto_shash)) {
+ pr_info("siw: Loading CRC32c failed: %ld\n",
+ PTR_ERR(siw_crypto_shash));
+ siw_crypto_shash = NULL;
+ if (mpa_crc_required) {
+ rv = -EOPNOTSUPP;
+ goto out_error;
+ }
+ }
+ rv = register_netdevice_notifier(&siw_netdev_nb);
+ if (rv)
+ goto out_error;
+
+ rdma_link_register(&siw_link_ops);
+
+ pr_info("SoftiWARP attached\n");
+ return 0;
+
+out_error:
+ siw_stop_tx_threads();
+
+ if (siw_crypto_shash)
+ crypto_free_shash(siw_crypto_shash);
+
+ pr_info("SoftIWARP attach failed. Error: %d\n", rv);
+
+ siw_cm_exit();
+ siw_destroy_cpulist();
+
+ return rv;
+}
+
+static void __exit siw_exit_module(void)
+{
+ siw_stop_tx_threads();
+
+ unregister_netdevice_notifier(&siw_netdev_nb);
+ rdma_link_unregister(&siw_link_ops);
+ ib_unregister_driver(RDMA_DRIVER_SIW);
+
+ siw_cm_exit();
+
+ siw_destroy_cpulist();
+
+ if (siw_crypto_shash)
+ crypto_free_shash(siw_crypto_shash);
+
+ pr_info("SoftiWARP detached\n");
+}
+
+module_init(siw_init_module);
+module_exit(siw_exit_module);
+
+MODULE_ALIAS_RDMA_LINK("siw");
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
new file mode 100644
index 0000000000..e6e25f1556
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -0,0 +1,449 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/gfp.h>
+#include <rdma/ib_verbs.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include <linux/resource.h>
+
+#include "siw.h"
+#include "siw_mem.h"
+
+/*
+ * Stag lookup is based on its index part only (24 bits).
+ * The code avoids special Stag of zero and tries to randomize
+ * STag values between 1 and SIW_STAG_MAX_INDEX.
+ */
+int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
+{
+ struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
+ u32 id, next;
+
+ get_random_bytes(&next, 4);
+ next &= 0x00ffffff;
+
+ if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
+ GFP_KERNEL) < 0)
+ return -ENOMEM;
+
+ /* Set the STag index part */
+ m->stag = id << 8;
+
+ siw_dbg_mem(m, "new MEM object\n");
+
+ return 0;
+}
+
+/*
+ * siw_mem_id2obj()
+ *
+ * resolves memory from stag given by id. might be called from:
+ * o process context before sending out of sgl, or
+ * o in softirq when resolving target memory
+ */
+struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
+{
+ struct siw_mem *mem;
+
+ rcu_read_lock();
+ mem = xa_load(&sdev->mem_xa, stag_index);
+ if (likely(mem && kref_get_unless_zero(&mem->ref))) {
+ rcu_read_unlock();
+ return mem;
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
+ bool dirty)
+{
+ unpin_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
+}
+
+void siw_umem_release(struct siw_umem *umem, bool dirty)
+{
+ struct mm_struct *mm_s = umem->owning_mm;
+ int i, num_pages = umem->num_pages;
+
+ for (i = 0; num_pages; i++) {
+ int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);
+
+ siw_free_plist(&umem->page_chunk[i], to_free,
+ umem->writable && dirty);
+ kfree(umem->page_chunk[i].plist);
+ num_pages -= to_free;
+ }
+ atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
+
+ mmdrop(mm_s);
+ kfree(umem->page_chunk);
+ kfree(umem);
+}
+
+int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
+ u64 start, u64 len, int rights)
+{
+ struct siw_device *sdev = to_siw_dev(pd->device);
+ struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
+ u32 id, next;
+
+ if (!mem)
+ return -ENOMEM;
+
+ mem->mem_obj = mem_obj;
+ mem->stag_valid = 0;
+ mem->sdev = sdev;
+ mem->va = start;
+ mem->len = len;
+ mem->pd = pd;
+ mem->perms = rights & IWARP_ACCESS_MASK;
+ kref_init(&mem->ref);
+
+ get_random_bytes(&next, 4);
+ next &= 0x00ffffff;
+
+ if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
+ GFP_KERNEL) < 0) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ mr->mem = mem;
+ /* Set the STag index part */
+ mem->stag = id << 8;
+ mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;
+
+ return 0;
+}
+
+void siw_mr_drop_mem(struct siw_mr *mr)
+{
+ struct siw_mem *mem = mr->mem, *found;
+
+ mem->stag_valid = 0;
+
+ /* make STag invalid visible asap */
+ smp_mb();
+
+ found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
+ WARN_ON(found != mem);
+ siw_mem_put(mem);
+}
+
+void siw_free_mem(struct kref *ref)
+{
+ struct siw_mem *mem = container_of(ref, struct siw_mem, ref);
+
+ siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");
+
+ if (!mem->is_mw && mem->mem_obj) {
+ if (mem->is_pbl == 0)
+ siw_umem_release(mem->umem, true);
+ else
+ kfree(mem->pbl);
+ }
+ kfree(mem);
+}
+
+/*
+ * siw_check_mem()
+ *
+ * Check protection domain, STAG state, access permissions and
+ * address range for memory object.
+ *
+ * @pd: Protection Domain memory should belong to
+ * @mem: memory to be checked
+ * @addr: starting addr of mem
+ * @perms: requested access permissions
+ * @len: len of memory interval to be checked
+ *
+ */
+int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
+ enum ib_access_flags perms, int len)
+{
+ if (!mem->stag_valid) {
+ siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
+ return -E_STAG_INVALID;
+ }
+ if (mem->pd != pd) {
+ siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
+ return -E_PD_MISMATCH;
+ }
+ /*
+ * check access permissions
+ */
+ if ((mem->perms & perms) < perms) {
+ siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
+ mem->perms, perms);
+ return -E_ACCESS_PERM;
+ }
+ /*
+ * Check if access falls into valid memory interval.
+ */
+ if (addr < mem->va || addr + len > mem->va + mem->len) {
+ siw_dbg_pd(pd, "MEM interval len %d\n", len);
+ siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n",
+ (void *)(uintptr_t)addr,
+ (void *)(uintptr_t)(addr + len));
+ siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n",
+ (void *)(uintptr_t)mem->va,
+ (void *)(uintptr_t)(mem->va + mem->len),
+ mem->stag);
+
+ return -E_BASE_BOUNDS;
+ }
+ return E_ACCESS_OK;
+}
+
+/*
+ * siw_check_sge()
+ *
+ * Check SGE for access rights in given interval
+ *
+ * @pd: Protection Domain memory should belong to
+ * @sge: SGE to be checked
+ * @mem: location of memory reference within array
+ * @perms: requested access permissions
+ * @off: starting offset in SGE
+ * @len: len of memory interval to be checked
+ *
+ * NOTE: Function references SGE's memory object (mem->obj)
+ * if not yet done. New reference is kept if check went ok and
+ * released if check failed. If mem->obj is already valid, no new
+ * lookup is being done and mem is not released it check fails.
+ */
+int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
+ enum ib_access_flags perms, u32 off, int len)
+{
+ struct siw_device *sdev = to_siw_dev(pd->device);
+ struct siw_mem *new = NULL;
+ int rv = E_ACCESS_OK;
+
+ if (len + off > sge->length) {
+ rv = -E_BASE_BOUNDS;
+ goto fail;
+ }
+ if (*mem == NULL) {
+ new = siw_mem_id2obj(sdev, sge->lkey >> 8);
+ if (unlikely(!new)) {
+ siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
+ rv = -E_STAG_INVALID;
+ goto fail;
+ }
+ *mem = new;
+ }
+ /* Check if user re-registered with different STag key */
+ if (unlikely((*mem)->stag != sge->lkey)) {
+ siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
+ rv = -E_STAG_INVALID;
+ goto fail;
+ }
+ rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
+ if (unlikely(rv))
+ goto fail;
+
+ return 0;
+
+fail:
+ if (new) {
+ *mem = NULL;
+ siw_mem_put(new);
+ }
+ return rv;
+}
+
+void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
+{
+ switch (op) {
+ case SIW_OP_SEND:
+ case SIW_OP_WRITE:
+ case SIW_OP_SEND_WITH_IMM:
+ case SIW_OP_SEND_REMOTE_INV:
+ case SIW_OP_READ:
+ case SIW_OP_READ_LOCAL_INV:
+ if (!(wqe->sqe.flags & SIW_WQE_INLINE))
+ siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
+ break;
+
+ case SIW_OP_RECEIVE:
+ siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
+ break;
+
+ case SIW_OP_READ_RESPONSE:
+ siw_unref_mem_sgl(wqe->mem, 1);
+ break;
+
+ default:
+ /*
+ * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
+ * do not hold memory references
+ */
+ break;
+ }
+}
+
+int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
+{
+ struct siw_device *sdev = to_siw_dev(pd->device);
+ struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
+ int rv = 0;
+
+ if (unlikely(!mem)) {
+ siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
+ return -EINVAL;
+ }
+ if (unlikely(mem->pd != pd)) {
+ siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
+ rv = -EACCES;
+ goto out;
+ }
+ /*
+ * Per RDMA verbs definition, an STag may already be in invalid
+ * state if invalidation is requested. So no state check here.
+ */
+ mem->stag_valid = 0;
+
+ siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
+out:
+ siw_mem_put(mem);
+ return rv;
+}
+
+/*
+ * Gets physical address backed by PBL element. Address is referenced
+ * by linear byte offset into list of variably sized PB elements.
+ * Optionally, provides remaining len within current element, and
+ * current PBL index for later resume at same element.
+ */
+dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
+{
+ int i = idx ? *idx : 0;
+
+ while (i < pbl->num_buf) {
+ struct siw_pble *pble = &pbl->pbe[i];
+
+ if (pble->pbl_off + pble->size > off) {
+ u64 pble_off = off - pble->pbl_off;
+
+ if (len)
+ *len = pble->size - pble_off;
+ if (idx)
+ *idx = i;
+
+ return pble->addr + pble_off;
+ }
+ i++;
+ }
+ if (len)
+ *len = 0;
+ return 0;
+}
+
+struct siw_pbl *siw_pbl_alloc(u32 num_buf)
+{
+ struct siw_pbl *pbl;
+
+ if (num_buf == 0)
+ return ERR_PTR(-EINVAL);
+
+ pbl = kzalloc(struct_size(pbl, pbe, num_buf), GFP_KERNEL);
+ if (!pbl)
+ return ERR_PTR(-ENOMEM);
+
+ pbl->max_buf = num_buf;
+
+ return pbl;
+}
+
+struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
+{
+ struct siw_umem *umem;
+ struct mm_struct *mm_s;
+ u64 first_page_va;
+ unsigned long mlock_limit;
+ unsigned int foll_flags = FOLL_LONGTERM;
+ int num_pages, num_chunks, i, rv = 0;
+
+ if (!can_do_mlock())
+ return ERR_PTR(-EPERM);
+
+ if (!len)
+ return ERR_PTR(-EINVAL);
+
+ first_page_va = start & PAGE_MASK;
+ num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
+ num_chunks = (num_pages >> CHUNK_SHIFT) + 1;
+
+ umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+ mm_s = current->mm;
+ umem->owning_mm = mm_s;
+ umem->writable = writable;
+
+ mmgrab(mm_s);
+
+ if (writable)
+ foll_flags |= FOLL_WRITE;
+
+ mmap_read_lock(mm_s);
+
+ mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ if (atomic64_add_return(num_pages, &mm_s->pinned_vm) > mlock_limit) {
+ rv = -ENOMEM;
+ goto out_sem_up;
+ }
+ umem->fp_addr = first_page_va;
+
+ umem->page_chunk =
+ kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
+ if (!umem->page_chunk) {
+ rv = -ENOMEM;
+ goto out_sem_up;
+ }
+ for (i = 0; num_pages; i++) {
+ int nents = min_t(int, num_pages, PAGES_PER_CHUNK);
+ struct page **plist =
+ kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
+
+ if (!plist) {
+ rv = -ENOMEM;
+ goto out_sem_up;
+ }
+ umem->page_chunk[i].plist = plist;
+ while (nents) {
+ rv = pin_user_pages(first_page_va, nents, foll_flags,
+ plist);
+ if (rv < 0)
+ goto out_sem_up;
+
+ umem->num_pages += rv;
+ first_page_va += rv * PAGE_SIZE;
+ plist += rv;
+ nents -= rv;
+ num_pages -= rv;
+ }
+ }
+out_sem_up:
+ mmap_read_unlock(mm_s);
+
+ if (rv > 0)
+ return umem;
+
+ /* Adjust accounting for pages not pinned */
+ if (num_pages)
+ atomic64_sub(num_pages, &mm_s->pinned_vm);
+
+ siw_umem_release(umem, false);
+
+ return ERR_PTR(rv);
+}
diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h
new file mode 100644
index 0000000000..f911287576
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_mem.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_MEM_H
+#define _SIW_MEM_H
+
+struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable);
+void siw_umem_release(struct siw_umem *umem, bool dirty);
+struct siw_pbl *siw_pbl_alloc(u32 num_buf);
+dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx);
+struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index);
+int siw_mem_add(struct siw_device *sdev, struct siw_mem *m);
+int siw_invalidate_stag(struct ib_pd *pd, u32 stag);
+int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
+ enum ib_access_flags perms, int len);
+int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge,
+ struct siw_mem *mem[], enum ib_access_flags perms,
+ u32 off, int len);
+void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op);
+int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
+ u64 start, u64 len, int rights);
+void siw_mr_drop_mem(struct siw_mr *mr);
+void siw_free_mem(struct kref *ref);
+
+static inline void siw_mem_put(struct siw_mem *mem)
+{
+ kref_put(&mem->ref, siw_free_mem);
+}
+
+static inline void siw_unref_mem_sgl(struct siw_mem **mem, unsigned int num_sge)
+{
+ while (num_sge) {
+ if (*mem == NULL)
+ break;
+
+ siw_mem_put(*mem);
+ *mem = NULL;
+ mem++;
+ num_sge--;
+ }
+}
+
+#define CHUNK_SHIFT 9 /* sets number of pages per chunk */
+#define PAGES_PER_CHUNK (_AC(1, UL) << CHUNK_SHIFT)
+#define CHUNK_MASK (~(PAGES_PER_CHUNK - 1))
+#define PAGE_CHUNK_SIZE (PAGES_PER_CHUNK * sizeof(struct page *))
+
+/*
+ * siw_get_upage()
+ *
+ * Get page pointer for address on given umem.
+ *
+ * @umem: two dimensional list of page pointers
+ * @addr: user virtual address
+ */
+static inline struct page *siw_get_upage(struct siw_umem *umem, u64 addr)
+{
+ unsigned int page_idx = (addr - umem->fp_addr) >> PAGE_SHIFT,
+ chunk_idx = page_idx >> CHUNK_SHIFT,
+ page_in_chunk = page_idx & ~CHUNK_MASK;
+
+ if (likely(page_idx < umem->num_pages))
+ return umem->page_chunk[chunk_idx].plist[page_in_chunk];
+
+ return NULL;
+}
+#endif
diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c
new file mode 100644
index 0000000000..47d0197db9
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp.c
@@ -0,0 +1,1350 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/llist.h>
+#include <asm/barrier.h>
+#include <net/tcp.h>
+#include <trace/events/sock.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
+ [SIW_QP_STATE_IDLE] = "IDLE",
+ [SIW_QP_STATE_RTR] = "RTR",
+ [SIW_QP_STATE_RTS] = "RTS",
+ [SIW_QP_STATE_CLOSING] = "CLOSING",
+ [SIW_QP_STATE_TERMINATE] = "TERMINATE",
+ [SIW_QP_STATE_ERROR] = "ERROR"
+};
+
+/*
+ * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
+ * per-RDMAP message basis. Please keep order of initializer. All MPA len
+ * is initialized to minimum packet size.
+ */
+struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = {
+ { /* RDMAP_RDMA_WRITE */
+ .hdr_len = sizeof(struct iwarp_rdma_write),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
+ cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_RDMA_WRITE),
+ .rx_data = siw_proc_write },
+ { /* RDMAP_RDMA_READ_REQ */
+ .hdr_len = sizeof(struct iwarp_rdma_rreq),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_RDMA_READ_REQ),
+ .rx_data = siw_proc_rreq },
+ { /* RDMAP_RDMA_READ_RESP */
+ .hdr_len = sizeof(struct iwarp_rdma_rresp),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
+ cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_RDMA_READ_RESP),
+ .rx_data = siw_proc_rresp },
+ { /* RDMAP_SEND */
+ .hdr_len = sizeof(struct iwarp_send),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_SEND),
+ .rx_data = siw_proc_send },
+ { /* RDMAP_SEND_INVAL */
+ .hdr_len = sizeof(struct iwarp_send_inv),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_SEND_INVAL),
+ .rx_data = siw_proc_send },
+ { /* RDMAP_SEND_SE */
+ .hdr_len = sizeof(struct iwarp_send),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_SEND_SE),
+ .rx_data = siw_proc_send },
+ { /* RDMAP_SEND_SE_INVAL */
+ .hdr_len = sizeof(struct iwarp_send_inv),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_SEND_SE_INVAL),
+ .rx_data = siw_proc_send },
+ { /* RDMAP_TERMINATE */
+ .hdr_len = sizeof(struct iwarp_terminate),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_TERMINATE),
+ .rx_data = siw_proc_terminate }
+};
+
+void siw_qp_llp_data_ready(struct sock *sk)
+{
+ struct siw_qp *qp;
+
+ trace_sk_data_ready(sk);
+
+ read_lock(&sk->sk_callback_lock);
+
+ if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
+ goto done;
+
+ qp = sk_to_qp(sk);
+
+ if (likely(!qp->rx_stream.rx_suspend &&
+ down_read_trylock(&qp->state_lock))) {
+ read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 };
+
+ if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
+ /*
+ * Implements data receive operation during
+ * socket callback. TCP gracefully catches
+ * the case where there is nothing to receive
+ * (not calling siw_tcp_rx_data() then).
+ */
+ tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+
+ up_read(&qp->state_lock);
+ } else {
+ siw_dbg_qp(qp, "unable to process RX, suspend: %d\n",
+ qp->rx_stream.rx_suspend);
+ }
+done:
+ read_unlock(&sk->sk_callback_lock);
+}
+
+void siw_qp_llp_close(struct siw_qp *qp)
+{
+ siw_dbg_qp(qp, "enter llp close, state = %s\n",
+ siw_qp_state_to_string[qp->attrs.state]);
+
+ down_write(&qp->state_lock);
+
+ qp->rx_stream.rx_suspend = 1;
+ qp->tx_ctx.tx_suspend = 1;
+ qp->attrs.sk = NULL;
+
+ switch (qp->attrs.state) {
+ case SIW_QP_STATE_RTS:
+ case SIW_QP_STATE_RTR:
+ case SIW_QP_STATE_IDLE:
+ case SIW_QP_STATE_TERMINATE:
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+ break;
+ /*
+ * SIW_QP_STATE_CLOSING:
+ *
+ * This is a forced close. shall the QP be moved to
+ * ERROR or IDLE ?
+ */
+ case SIW_QP_STATE_CLOSING:
+ if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+ else
+ qp->attrs.state = SIW_QP_STATE_IDLE;
+ break;
+
+ default:
+ siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
+ siw_qp_state_to_string[qp->attrs.state]);
+ break;
+ }
+ siw_sq_flush(qp);
+ siw_rq_flush(qp);
+
+ /*
+ * Dereference closing CEP
+ */
+ if (qp->cep) {
+ siw_cep_put(qp->cep);
+ qp->cep = NULL;
+ }
+
+ up_write(&qp->state_lock);
+
+ siw_dbg_qp(qp, "llp close exit: state %s\n",
+ siw_qp_state_to_string[qp->attrs.state]);
+}
+
+/*
+ * socket callback routine informing about newly available send space.
+ * Function schedules SQ work for processing SQ items.
+ */
+void siw_qp_llp_write_space(struct sock *sk)
+{
+ struct siw_cep *cep;
+
+ read_lock(&sk->sk_callback_lock);
+
+ cep = sk_to_cep(sk);
+ if (cep) {
+ cep->sk_write_space(sk);
+
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ (void)siw_sq_start(cep->qp);
+ }
+
+ read_unlock(&sk->sk_callback_lock);
+}
+
+static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
+{
+ if (irq_size) {
+ irq_size = roundup_pow_of_two(irq_size);
+ qp->irq = vcalloc(irq_size, sizeof(struct siw_sqe));
+ if (!qp->irq) {
+ qp->attrs.irq_size = 0;
+ return -ENOMEM;
+ }
+ }
+ if (orq_size) {
+ orq_size = roundup_pow_of_two(orq_size);
+ qp->orq = vcalloc(orq_size, sizeof(struct siw_sqe));
+ if (!qp->orq) {
+ qp->attrs.orq_size = 0;
+ qp->attrs.irq_size = 0;
+ vfree(qp->irq);
+ return -ENOMEM;
+ }
+ }
+ qp->attrs.irq_size = irq_size;
+ qp->attrs.orq_size = orq_size;
+ siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size);
+ return 0;
+}
+
+static int siw_qp_enable_crc(struct siw_qp *qp)
+{
+ struct siw_rx_stream *c_rx = &qp->rx_stream;
+ struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+ int size;
+
+ if (siw_crypto_shash == NULL)
+ return -ENOENT;
+
+ size = crypto_shash_descsize(siw_crypto_shash) +
+ sizeof(struct shash_desc);
+
+ c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
+ c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
+ if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
+ kfree(c_tx->mpa_crc_hd);
+ kfree(c_rx->mpa_crc_hd);
+ c_tx->mpa_crc_hd = NULL;
+ c_rx->mpa_crc_hd = NULL;
+ return -ENOMEM;
+ }
+ c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
+ c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
+
+ return 0;
+}
+
+/*
+ * Send a non signalled READ or WRITE to peer side as negotiated
+ * with MPAv2 P2P setup protocol. The work request is only created
+ * as a current active WR and does not consume Send Queue space.
+ *
+ * Caller must hold QP state lock.
+ */
+int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
+{
+ struct siw_wqe *wqe = tx_wqe(qp);
+ unsigned long flags;
+ int rv = 0;
+
+ spin_lock_irqsave(&qp->sq_lock, flags);
+
+ if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+ return -EIO;
+ }
+ memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+
+ wqe->wr_status = SIW_WR_QUEUED;
+ wqe->sqe.flags = 0;
+ wqe->sqe.num_sge = 1;
+ wqe->sqe.sge[0].length = 0;
+ wqe->sqe.sge[0].laddr = 0;
+ wqe->sqe.sge[0].lkey = 0;
+ /*
+ * While it must not be checked for inbound zero length
+ * READ/WRITE, some HW may treat STag 0 special.
+ */
+ wqe->sqe.rkey = 1;
+ wqe->sqe.raddr = 0;
+ wqe->processed = 0;
+
+ if (ctrl & MPA_V2_RDMA_WRITE_RTR)
+ wqe->sqe.opcode = SIW_OP_WRITE;
+ else if (ctrl & MPA_V2_RDMA_READ_RTR) {
+ struct siw_sqe *rreq = NULL;
+
+ wqe->sqe.opcode = SIW_OP_READ;
+
+ spin_lock(&qp->orq_lock);
+
+ if (qp->attrs.orq_size)
+ rreq = orq_get_free(qp);
+ if (rreq) {
+ siw_read_to_orq(rreq, &wqe->sqe);
+ qp->orq_put++;
+ } else
+ rv = -EIO;
+
+ spin_unlock(&qp->orq_lock);
+ } else
+ rv = -EINVAL;
+
+ if (rv)
+ wqe->wr_status = SIW_WR_IDLE;
+
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+ if (!rv)
+ rv = siw_sq_start(qp);
+
+ return rv;
+}
+
+/*
+ * Map memory access error to DDP tagged error
+ */
+enum ddp_ecode siw_tagged_error(enum siw_access_state state)
+{
+ switch (state) {
+ case E_STAG_INVALID:
+ return DDP_ECODE_T_INVALID_STAG;
+ case E_BASE_BOUNDS:
+ return DDP_ECODE_T_BASE_BOUNDS;
+ case E_PD_MISMATCH:
+ return DDP_ECODE_T_STAG_NOT_ASSOC;
+ case E_ACCESS_PERM:
+ /*
+ * RFC 5041 (DDP) lacks an ecode for insufficient access
+ * permissions. 'Invalid STag' seem to be the closest
+ * match though.
+ */
+ return DDP_ECODE_T_INVALID_STAG;
+ default:
+ WARN_ON(1);
+ return DDP_ECODE_T_INVALID_STAG;
+ }
+}
+
+/*
+ * Map memory access error to RDMAP protection error
+ */
+enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
+{
+ switch (state) {
+ case E_STAG_INVALID:
+ return RDMAP_ECODE_INVALID_STAG;
+ case E_BASE_BOUNDS:
+ return RDMAP_ECODE_BASE_BOUNDS;
+ case E_PD_MISMATCH:
+ return RDMAP_ECODE_STAG_NOT_ASSOC;
+ case E_ACCESS_PERM:
+ return RDMAP_ECODE_ACCESS_RIGHTS;
+ default:
+ return RDMAP_ECODE_UNSPECIFIED;
+ }
+}
+
+void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype,
+ u8 ecode, int in_tx)
+{
+ if (!qp->term_info.valid) {
+ memset(&qp->term_info, 0, sizeof(qp->term_info));
+ qp->term_info.layer = layer;
+ qp->term_info.etype = etype;
+ qp->term_info.ecode = ecode;
+ qp->term_info.in_tx = in_tx;
+ qp->term_info.valid = 1;
+ }
+ siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n",
+ layer, etype, ecode, in_tx ? "yes" : "no");
+}
+
+/*
+ * Send a TERMINATE message, as defined in RFC's 5040/5041/5044/6581.
+ * Sending TERMINATE messages is best effort - such messages
+ * can only be send if the QP is still connected and it does
+ * not have another outbound message in-progress, i.e. the
+ * TERMINATE message must not interfer with an incomplete current
+ * transmit operation.
+ */
+void siw_send_terminate(struct siw_qp *qp)
+{
+ struct kvec iov[3];
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+ struct iwarp_terminate *term = NULL;
+ union iwarp_hdr *err_hdr = NULL;
+ struct socket *s = qp->attrs.sk;
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ union iwarp_hdr *rx_hdr = &srx->hdr;
+ u32 crc = 0;
+ int num_frags, len_terminate, rv;
+
+ if (!qp->term_info.valid)
+ return;
+
+ qp->term_info.valid = 0;
+
+ if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
+ siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
+ tx_type(tx_wqe(qp)));
+ return;
+ }
+ if (!s && qp->cep)
+ /* QP not yet in RTS. Take socket from connection end point */
+ s = qp->cep->sock;
+
+ if (!s) {
+ siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
+ return;
+ }
+
+ term = kzalloc(sizeof(*term), GFP_KERNEL);
+ if (!term)
+ return;
+
+ term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
+ term->ddp_mo = 0;
+ term->ddp_msn = cpu_to_be32(1);
+
+ iov[0].iov_base = term;
+ iov[0].iov_len = sizeof(*term);
+
+ if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
+ ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
+ (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
+ err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
+ if (!err_hdr) {
+ kfree(term);
+ return;
+ }
+ }
+ memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
+ sizeof(struct iwarp_ctrl));
+
+ __rdmap_term_set_layer(term, qp->term_info.layer);
+ __rdmap_term_set_etype(term, qp->term_info.etype);
+ __rdmap_term_set_ecode(term, qp->term_info.ecode);
+
+ switch (qp->term_info.layer) {
+ case TERM_ERROR_LAYER_RDMAP:
+ if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
+ /* No additional DDP/RDMAP header to be included */
+ break;
+
+ if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
+ /*
+ * Complete RDMAP frame will get attached, and
+ * DDP segment length is valid
+ */
+ term->flag_m = 1;
+ term->flag_d = 1;
+ term->flag_r = 1;
+
+ if (qp->term_info.in_tx) {
+ struct iwarp_rdma_rreq *rreq;
+ struct siw_wqe *wqe = tx_wqe(qp);
+
+ /* Inbound RREQ error, detected during
+ * RRESP creation. Take state from
+ * current TX work queue element to
+ * reconstruct peers RREQ.
+ */
+ rreq = (struct iwarp_rdma_rreq *)err_hdr;
+
+ memcpy(&rreq->ctrl,
+ &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+ sizeof(struct iwarp_ctrl));
+
+ rreq->rsvd = 0;
+ rreq->ddp_qn =
+ htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+
+ /* Provide RREQ's MSN as kept aside */
+ rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
+
+ rreq->ddp_mo = htonl(wqe->processed);
+ rreq->sink_stag = htonl(wqe->sqe.rkey);
+ rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
+ rreq->read_size = htonl(wqe->sqe.sge[0].length);
+ rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
+ rreq->source_to =
+ cpu_to_be64(wqe->sqe.sge[0].laddr);
+
+ iov[1].iov_base = rreq;
+ iov[1].iov_len = sizeof(*rreq);
+
+ rx_hdr = (union iwarp_hdr *)rreq;
+ } else {
+ /* Take RDMAP/DDP information from
+ * current (failed) inbound frame.
+ */
+ iov[1].iov_base = rx_hdr;
+
+ if (__rdmap_get_opcode(&rx_hdr->ctrl) ==
+ RDMAP_RDMA_READ_REQ)
+ iov[1].iov_len =
+ sizeof(struct iwarp_rdma_rreq);
+ else /* SEND type */
+ iov[1].iov_len =
+ sizeof(struct iwarp_send);
+ }
+ } else {
+ /* Do not report DDP hdr information if packet
+ * layout is unknown
+ */
+ if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
+ (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
+ break;
+
+ iov[1].iov_base = rx_hdr;
+
+ /* Only DDP frame will get attached */
+ if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+ iov[1].iov_len =
+ sizeof(struct iwarp_rdma_write);
+ else
+ iov[1].iov_len = sizeof(struct iwarp_send);
+
+ term->flag_m = 1;
+ term->flag_d = 1;
+ }
+ term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
+ break;
+
+ case TERM_ERROR_LAYER_DDP:
+ /* Report error encountered while DDP processing.
+ * This can only happen as a result of inbound
+ * DDP processing
+ */
+
+ /* Do not report DDP hdr information if packet
+ * layout is unknown
+ */
+ if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
+ (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
+ ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
+ (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
+ break;
+
+ iov[1].iov_base = rx_hdr;
+
+ if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+ iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
+ else
+ iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
+
+ term->flag_m = 1;
+ term->flag_d = 1;
+ break;
+
+ default:
+ break;
+ }
+ if (term->flag_m || term->flag_d || term->flag_r) {
+ iov[2].iov_base = &crc;
+ iov[2].iov_len = sizeof(crc);
+ len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
+ num_frags = 3;
+ } else {
+ iov[1].iov_base = &crc;
+ iov[1].iov_len = sizeof(crc);
+ len_terminate = sizeof(*term) + MPA_CRC_SIZE;
+ num_frags = 2;
+ }
+
+ /* Adjust DDP Segment Length parameter, if valid */
+ if (term->flag_m) {
+ u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
+ enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl);
+
+ real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
+ rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
+ }
+
+ term->ctrl.mpa_len =
+ cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE));
+ if (qp->tx_ctx.mpa_crc_hd) {
+ crypto_shash_init(qp->tx_ctx.mpa_crc_hd);
+ if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
+ (u8 *)iov[0].iov_base,
+ iov[0].iov_len))
+ goto out;
+
+ if (num_frags == 3) {
+ if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
+ (u8 *)iov[1].iov_base,
+ iov[1].iov_len))
+ goto out;
+ }
+ crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc);
+ }
+
+ rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
+ siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
+ rv == len_terminate ? "success" : "failure",
+ __rdmap_term_layer(term), __rdmap_term_etype(term),
+ __rdmap_term_ecode(term), rv);
+out:
+ kfree(term);
+ kfree(err_hdr);
+}
+
+/*
+ * Handle all attrs other than state
+ */
+static void siw_qp_modify_nonstate(struct siw_qp *qp,
+ struct siw_qp_attrs *attrs,
+ enum siw_qp_attr_mask mask)
+{
+ if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
+ if (attrs->flags & SIW_RDMA_BIND_ENABLED)
+ qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
+ else
+ qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
+
+ if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
+ qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+ else
+ qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
+
+ if (attrs->flags & SIW_RDMA_READ_ENABLED)
+ qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
+ else
+ qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
+ }
+}
+
+static int siw_qp_nextstate_from_idle(struct siw_qp *qp,
+ struct siw_qp_attrs *attrs,
+ enum siw_qp_attr_mask mask)
+{
+ int rv = 0;
+
+ switch (attrs->state) {
+ case SIW_QP_STATE_RTS:
+ if (attrs->flags & SIW_MPA_CRC) {
+ rv = siw_qp_enable_crc(qp);
+ if (rv)
+ break;
+ }
+ if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
+ siw_dbg_qp(qp, "no socket\n");
+ rv = -EINVAL;
+ break;
+ }
+ if (!(mask & SIW_QP_ATTR_MPA)) {
+ siw_dbg_qp(qp, "no MPA\n");
+ rv = -EINVAL;
+ break;
+ }
+ /*
+ * Initialize iWARP TX state
+ */
+ qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
+ qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
+ qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
+
+ /*
+ * Initialize iWARP RX state
+ */
+ qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
+ qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
+ qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
+
+ /*
+ * init IRD free queue, caller has already checked
+ * limits.
+ */
+ rv = siw_qp_readq_init(qp, attrs->irq_size,
+ attrs->orq_size);
+ if (rv)
+ break;
+
+ qp->attrs.sk = attrs->sk;
+ qp->attrs.state = SIW_QP_STATE_RTS;
+
+ siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n",
+ attrs->flags & SIW_MPA_CRC ? "y" : "n",
+ qp->attrs.orq_size, qp->attrs.irq_size);
+ break;
+
+ case SIW_QP_STATE_ERROR:
+ siw_rq_flush(qp);
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+ if (qp->cep) {
+ siw_cep_put(qp->cep);
+ qp->cep = NULL;
+ }
+ break;
+
+ default:
+ break;
+ }
+ return rv;
+}
+
+static int siw_qp_nextstate_from_rts(struct siw_qp *qp,
+ struct siw_qp_attrs *attrs)
+{
+ int drop_conn = 0;
+
+ switch (attrs->state) {
+ case SIW_QP_STATE_CLOSING:
+ /*
+ * Verbs: move to IDLE if SQ and ORQ are empty.
+ * Move to ERROR otherwise. But first of all we must
+ * close the connection. So we keep CLOSING or ERROR
+ * as a transient state, schedule connection drop work
+ * and wait for the socket state change upcall to
+ * come back closed.
+ */
+ if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
+ qp->attrs.state = SIW_QP_STATE_CLOSING;
+ } else {
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+ siw_sq_flush(qp);
+ }
+ siw_rq_flush(qp);
+
+ drop_conn = 1;
+ break;
+
+ case SIW_QP_STATE_TERMINATE:
+ qp->attrs.state = SIW_QP_STATE_TERMINATE;
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_CATASTROPHIC,
+ RDMAP_ECODE_UNSPECIFIED, 1);
+ drop_conn = 1;
+ break;
+
+ case SIW_QP_STATE_ERROR:
+ /*
+ * This is an emergency close.
+ *
+ * Any in progress transmit operation will get
+ * cancelled.
+ * This will likely result in a protocol failure,
+ * if a TX operation is in transit. The caller
+ * could unconditional wait to give the current
+ * operation a chance to complete.
+ * Esp., how to handle the non-empty IRQ case?
+ * The peer was asking for data transfer at a valid
+ * point in time.
+ */
+ siw_sq_flush(qp);
+ siw_rq_flush(qp);
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+ drop_conn = 1;
+ break;
+
+ default:
+ break;
+ }
+ return drop_conn;
+}
+
+static void siw_qp_nextstate_from_term(struct siw_qp *qp,
+ struct siw_qp_attrs *attrs)
+{
+ switch (attrs->state) {
+ case SIW_QP_STATE_ERROR:
+ siw_rq_flush(qp);
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+
+ if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
+ siw_sq_flush(qp);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static int siw_qp_nextstate_from_close(struct siw_qp *qp,
+ struct siw_qp_attrs *attrs)
+{
+ int rv = 0;
+
+ switch (attrs->state) {
+ case SIW_QP_STATE_IDLE:
+ WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
+ qp->attrs.state = SIW_QP_STATE_IDLE;
+ break;
+
+ case SIW_QP_STATE_CLOSING:
+ /*
+ * The LLP may already moved the QP to closing
+ * due to graceful peer close init
+ */
+ break;
+
+ case SIW_QP_STATE_ERROR:
+ /*
+ * QP was moved to CLOSING by LLP event
+ * not yet seen by user.
+ */
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+
+ if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
+ siw_sq_flush(qp);
+
+ siw_rq_flush(qp);
+ break;
+
+ default:
+ siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
+ siw_qp_state_to_string[qp->attrs.state],
+ siw_qp_state_to_string[attrs->state]);
+
+ rv = -ECONNABORTED;
+ }
+ return rv;
+}
+
+/*
+ * Caller must hold qp->state_lock
+ */
+int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
+ enum siw_qp_attr_mask mask)
+{
+ int drop_conn = 0, rv = 0;
+
+ if (!mask)
+ return 0;
+
+ siw_dbg_qp(qp, "state: %s => %s\n",
+ siw_qp_state_to_string[qp->attrs.state],
+ siw_qp_state_to_string[attrs->state]);
+
+ if (mask != SIW_QP_ATTR_STATE)
+ siw_qp_modify_nonstate(qp, attrs, mask);
+
+ if (!(mask & SIW_QP_ATTR_STATE))
+ return 0;
+
+ switch (qp->attrs.state) {
+ case SIW_QP_STATE_IDLE:
+ case SIW_QP_STATE_RTR:
+ rv = siw_qp_nextstate_from_idle(qp, attrs, mask);
+ break;
+
+ case SIW_QP_STATE_RTS:
+ drop_conn = siw_qp_nextstate_from_rts(qp, attrs);
+ break;
+
+ case SIW_QP_STATE_TERMINATE:
+ siw_qp_nextstate_from_term(qp, attrs);
+ break;
+
+ case SIW_QP_STATE_CLOSING:
+ siw_qp_nextstate_from_close(qp, attrs);
+ break;
+ default:
+ break;
+ }
+ if (drop_conn)
+ siw_qp_cm_drop(qp, 0);
+
+ return rv;
+}
+
+void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
+{
+ rreq->id = sqe->id;
+ rreq->opcode = sqe->opcode;
+ rreq->sge[0].laddr = sqe->sge[0].laddr;
+ rreq->sge[0].length = sqe->sge[0].length;
+ rreq->sge[0].lkey = sqe->sge[0].lkey;
+ rreq->sge[1].lkey = sqe->sge[1].lkey;
+ rreq->flags = sqe->flags | SIW_WQE_VALID;
+ rreq->num_sge = 1;
+}
+
+static int siw_activate_tx_from_sq(struct siw_qp *qp)
+{
+ struct siw_sqe *sqe;
+ struct siw_wqe *wqe = tx_wqe(qp);
+ int rv = 1;
+
+ sqe = sq_get_next(qp);
+ if (!sqe)
+ return 0;
+
+ memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+ wqe->wr_status = SIW_WR_QUEUED;
+
+ /* First copy SQE to kernel private memory */
+ memcpy(&wqe->sqe, sqe, sizeof(*sqe));
+
+ if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
+ rv = -EINVAL;
+ goto out;
+ }
+ if (wqe->sqe.flags & SIW_WQE_INLINE) {
+ if (wqe->sqe.opcode != SIW_OP_SEND &&
+ wqe->sqe.opcode != SIW_OP_WRITE) {
+ rv = -EINVAL;
+ goto out;
+ }
+ if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
+ rv = -EINVAL;
+ goto out;
+ }
+ wqe->sqe.sge[0].laddr = (uintptr_t)&wqe->sqe.sge[1];
+ wqe->sqe.sge[0].lkey = 0;
+ wqe->sqe.num_sge = 1;
+ }
+ if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
+ /* A READ cannot be fenced */
+ if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
+ wqe->sqe.opcode ==
+ SIW_OP_READ_LOCAL_INV)) {
+ siw_dbg_qp(qp, "cannot fence read\n");
+ rv = -EINVAL;
+ goto out;
+ }
+ spin_lock(&qp->orq_lock);
+
+ if (qp->attrs.orq_size && !siw_orq_empty(qp)) {
+ qp->tx_ctx.orq_fence = 1;
+ rv = 0;
+ }
+ spin_unlock(&qp->orq_lock);
+
+ } else if (wqe->sqe.opcode == SIW_OP_READ ||
+ wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+ struct siw_sqe *rreq;
+
+ if (unlikely(!qp->attrs.orq_size)) {
+ /* We negotiated not to send READ req's */
+ rv = -EINVAL;
+ goto out;
+ }
+ wqe->sqe.num_sge = 1;
+
+ spin_lock(&qp->orq_lock);
+
+ rreq = orq_get_free(qp);
+ if (rreq) {
+ /*
+ * Make an immediate copy in ORQ to be ready
+ * to process loopback READ reply
+ */
+ siw_read_to_orq(rreq, &wqe->sqe);
+ qp->orq_put++;
+ } else {
+ qp->tx_ctx.orq_fence = 1;
+ rv = 0;
+ }
+ spin_unlock(&qp->orq_lock);
+ }
+
+ /* Clear SQE, can be re-used by application */
+ smp_store_mb(sqe->flags, 0);
+ qp->sq_get++;
+out:
+ if (unlikely(rv < 0)) {
+ siw_dbg_qp(qp, "error %d\n", rv);
+ wqe->wr_status = SIW_WR_IDLE;
+ }
+ return rv;
+}
+
+/*
+ * Must be called with SQ locked.
+ * To avoid complete SQ starvation by constant inbound READ requests,
+ * the active IRQ will not be served after qp->irq_burst, if the
+ * SQ has pending work.
+ */
+int siw_activate_tx(struct siw_qp *qp)
+{
+ struct siw_sqe *irqe;
+ struct siw_wqe *wqe = tx_wqe(qp);
+
+ if (!qp->attrs.irq_size)
+ return siw_activate_tx_from_sq(qp);
+
+ irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
+
+ if (!(irqe->flags & SIW_WQE_VALID))
+ return siw_activate_tx_from_sq(qp);
+
+ /*
+ * Avoid local WQE processing starvation in case
+ * of constant inbound READ request stream
+ */
+ if (sq_get_next(qp) && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
+ qp->irq_burst = 0;
+ return siw_activate_tx_from_sq(qp);
+ }
+ memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+ wqe->wr_status = SIW_WR_QUEUED;
+
+ /* start READ RESPONSE */
+ wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
+ wqe->sqe.flags = 0;
+ if (irqe->num_sge) {
+ wqe->sqe.num_sge = 1;
+ wqe->sqe.sge[0].length = irqe->sge[0].length;
+ wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
+ wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
+ } else {
+ wqe->sqe.num_sge = 0;
+ }
+
+ /* Retain original RREQ's message sequence number for
+ * potential error reporting cases.
+ */
+ wqe->sqe.sge[1].length = irqe->sge[1].length;
+
+ wqe->sqe.rkey = irqe->rkey;
+ wqe->sqe.raddr = irqe->raddr;
+
+ wqe->processed = 0;
+ qp->irq_get++;
+
+ /* mark current IRQ entry free */
+ smp_store_mb(irqe->flags, 0);
+
+ return 1;
+}
+
+/*
+ * Check if current CQ state qualifies for calling CQ completion
+ * handler. Must be called with CQ lock held.
+ */
+static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
+{
+ u32 cq_notify;
+
+ if (!cq->base_cq.comp_handler)
+ return false;
+
+ /* Read application shared notification state */
+ cq_notify = READ_ONCE(cq->notify->flags);
+
+ if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
+ ((cq_notify & SIW_NOTIFY_SOLICITED) &&
+ (flags & SIW_WQE_SOLICITED))) {
+ /*
+ * CQ notification is one-shot: Since the
+ * current CQE causes user notification,
+ * the CQ gets dis-aremd and must be re-aremd
+ * by the user for a new notification.
+ */
+ WRITE_ONCE(cq->notify->flags, SIW_NOTIFY_NOT);
+
+ return true;
+ }
+ return false;
+}
+
+int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
+ enum siw_wc_status status)
+{
+ struct siw_cq *cq = qp->scq;
+ int rv = 0;
+
+ if (cq) {
+ u32 sqe_flags = sqe->flags;
+ struct siw_cqe *cqe;
+ u32 idx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ idx = cq->cq_put % cq->num_cqe;
+ cqe = &cq->queue[idx];
+
+ if (!READ_ONCE(cqe->flags)) {
+ bool notify;
+
+ cqe->id = sqe->id;
+ cqe->opcode = sqe->opcode;
+ cqe->status = status;
+ cqe->imm_data = 0;
+ cqe->bytes = bytes;
+
+ if (rdma_is_kernel_res(&cq->base_cq.res))
+ cqe->base_qp = &qp->base_qp;
+ else
+ cqe->qp_id = qp_id(qp);
+
+ /* mark CQE valid for application */
+ WRITE_ONCE(cqe->flags, SIW_WQE_VALID);
+ /* recycle SQE */
+ smp_store_mb(sqe->flags, 0);
+
+ cq->cq_put++;
+ notify = siw_cq_notify_now(cq, sqe_flags);
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ if (notify) {
+ siw_dbg_cq(cq, "Call completion handler\n");
+ cq->base_cq.comp_handler(&cq->base_cq,
+ cq->base_cq.cq_context);
+ }
+ } else {
+ spin_unlock_irqrestore(&cq->lock, flags);
+ rv = -ENOMEM;
+ siw_cq_event(cq, IB_EVENT_CQ_ERR);
+ }
+ } else {
+ /* recycle SQE */
+ smp_store_mb(sqe->flags, 0);
+ }
+ return rv;
+}
+
+int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
+ u32 inval_stag, enum siw_wc_status status)
+{
+ struct siw_cq *cq = qp->rcq;
+ int rv = 0;
+
+ if (cq) {
+ struct siw_cqe *cqe;
+ u32 idx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ idx = cq->cq_put % cq->num_cqe;
+ cqe = &cq->queue[idx];
+
+ if (!READ_ONCE(cqe->flags)) {
+ bool notify;
+ u8 cqe_flags = SIW_WQE_VALID;
+
+ cqe->id = rqe->id;
+ cqe->opcode = SIW_OP_RECEIVE;
+ cqe->status = status;
+ cqe->imm_data = 0;
+ cqe->bytes = bytes;
+
+ if (rdma_is_kernel_res(&cq->base_cq.res)) {
+ cqe->base_qp = &qp->base_qp;
+ if (inval_stag) {
+ cqe_flags |= SIW_WQE_REM_INVAL;
+ cqe->inval_stag = inval_stag;
+ }
+ } else {
+ cqe->qp_id = qp_id(qp);
+ }
+ /* mark CQE valid for application */
+ WRITE_ONCE(cqe->flags, cqe_flags);
+ /* recycle RQE */
+ smp_store_mb(rqe->flags, 0);
+
+ cq->cq_put++;
+ notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED);
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ if (notify) {
+ siw_dbg_cq(cq, "Call completion handler\n");
+ cq->base_cq.comp_handler(&cq->base_cq,
+ cq->base_cq.cq_context);
+ }
+ } else {
+ spin_unlock_irqrestore(&cq->lock, flags);
+ rv = -ENOMEM;
+ siw_cq_event(cq, IB_EVENT_CQ_ERR);
+ }
+ } else {
+ /* recycle RQE */
+ smp_store_mb(rqe->flags, 0);
+ }
+ return rv;
+}
+
+/*
+ * siw_sq_flush()
+ *
+ * Flush SQ and ORRQ entries to CQ.
+ *
+ * Must be called with QP state write lock held.
+ * Therefore, SQ and ORQ lock must not be taken.
+ */
+void siw_sq_flush(struct siw_qp *qp)
+{
+ struct siw_sqe *sqe;
+ struct siw_wqe *wqe = tx_wqe(qp);
+ int async_event = 0;
+
+ /*
+ * Start with completing any work currently on the ORQ
+ */
+ while (qp->attrs.orq_size) {
+ sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size];
+ if (!READ_ONCE(sqe->flags))
+ break;
+
+ if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+ break;
+
+ WRITE_ONCE(sqe->flags, 0);
+ qp->orq_get++;
+ }
+ /*
+ * Flush an in-progress WQE if present
+ */
+ if (wqe->wr_status != SIW_WR_IDLE) {
+ siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n",
+ tx_type(wqe), wqe->wr_status);
+
+ siw_wqe_put_mem(wqe, tx_type(wqe));
+
+ if (tx_type(wqe) != SIW_OP_READ_RESPONSE &&
+ ((tx_type(wqe) != SIW_OP_READ &&
+ tx_type(wqe) != SIW_OP_READ_LOCAL_INV) ||
+ wqe->wr_status == SIW_WR_QUEUED))
+ /*
+ * An in-progress Read Request is already in
+ * the ORQ
+ */
+ siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+ SIW_WC_WR_FLUSH_ERR);
+
+ wqe->wr_status = SIW_WR_IDLE;
+ }
+ /*
+ * Flush the Send Queue
+ */
+ while (qp->attrs.sq_size) {
+ sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+ if (!READ_ONCE(sqe->flags))
+ break;
+
+ async_event = 1;
+ if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+ /*
+ * Shall IB_EVENT_SQ_DRAINED be supressed if work
+ * completion fails?
+ */
+ break;
+
+ WRITE_ONCE(sqe->flags, 0);
+ qp->sq_get++;
+ }
+ if (async_event)
+ siw_qp_event(qp, IB_EVENT_SQ_DRAINED);
+}
+
+/*
+ * siw_rq_flush()
+ *
+ * Flush recv queue entries to CQ. Also
+ * takes care of pending active tagged and untagged
+ * inbound transfers, which have target memory
+ * referenced.
+ *
+ * Must be called with QP state write lock held.
+ * Therefore, RQ lock must not be taken.
+ */
+void siw_rq_flush(struct siw_qp *qp)
+{
+ struct siw_wqe *wqe = &qp->rx_untagged.wqe_active;
+
+ /*
+ * Flush an in-progress untagged operation if present
+ */
+ if (wqe->wr_status != SIW_WR_IDLE) {
+ siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n",
+ rx_type(wqe), wqe->wr_status);
+
+ siw_wqe_put_mem(wqe, rx_type(wqe));
+
+ if (rx_type(wqe) == SIW_OP_RECEIVE) {
+ siw_rqe_complete(qp, &wqe->rqe, wqe->bytes,
+ 0, SIW_WC_WR_FLUSH_ERR);
+ } else if (rx_type(wqe) != SIW_OP_READ &&
+ rx_type(wqe) != SIW_OP_READ_RESPONSE &&
+ rx_type(wqe) != SIW_OP_WRITE) {
+ siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR);
+ }
+ wqe->wr_status = SIW_WR_IDLE;
+ }
+ wqe = &qp->rx_tagged.wqe_active;
+
+ if (wqe->wr_status != SIW_WR_IDLE) {
+ siw_wqe_put_mem(wqe, rx_type(wqe));
+ wqe->wr_status = SIW_WR_IDLE;
+ }
+ /*
+ * Flush the Receive Queue
+ */
+ while (qp->attrs.rq_size) {
+ struct siw_rqe *rqe =
+ &qp->recvq[qp->rq_get % qp->attrs.rq_size];
+
+ if (!READ_ONCE(rqe->flags))
+ break;
+
+ if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+ break;
+
+ WRITE_ONCE(rqe->flags, 0);
+ qp->rq_get++;
+ }
+}
+
+int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
+{
+ int rv = xa_alloc(&sdev->qp_xa, &qp->base_qp.qp_num, qp, xa_limit_32b,
+ GFP_KERNEL);
+
+ if (!rv) {
+ kref_init(&qp->ref);
+ qp->sdev = sdev;
+ siw_dbg_qp(qp, "new QP\n");
+ }
+ return rv;
+}
+
+void siw_free_qp(struct kref *ref)
+{
+ struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
+ struct siw_device *sdev = qp->sdev;
+ unsigned long flags;
+
+ if (qp->cep)
+ siw_cep_put(qp->cep);
+
+ found = xa_erase(&sdev->qp_xa, qp_id(qp));
+ WARN_ON(found != qp);
+ spin_lock_irqsave(&sdev->lock, flags);
+ list_del(&qp->devq);
+ spin_unlock_irqrestore(&sdev->lock, flags);
+
+ vfree(qp->sendq);
+ vfree(qp->recvq);
+ vfree(qp->irq);
+ vfree(qp->orq);
+
+ siw_put_tx_cpu(qp->tx_cpu);
+ complete(&qp->qp_free);
+ atomic_dec(&sdev->num_qp);
+}
diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c
new file mode 100644
index 0000000000..58bbf738e4
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp_rx.c
@@ -0,0 +1,1476 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+/*
+ * siw_rx_umem()
+ *
+ * Receive data of @len into target referenced by @dest_addr.
+ *
+ * @srx: Receive Context
+ * @umem: siw representation of target memory
+ * @dest_addr: user virtual address
+ * @len: number of bytes to place
+ */
+static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
+ u64 dest_addr, int len)
+{
+ int copied = 0;
+
+ while (len) {
+ struct page *p;
+ int pg_off, bytes, rv;
+ void *dest;
+
+ p = siw_get_upage(umem, dest_addr);
+ if (unlikely(!p)) {
+ pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
+ __func__, qp_id(rx_qp(srx)),
+ (void *)(uintptr_t)dest_addr,
+ (void *)(uintptr_t)umem->fp_addr);
+ /* siw internal error */
+ srx->skb_copied += copied;
+ srx->skb_new -= copied;
+
+ return -EFAULT;
+ }
+ pg_off = dest_addr & ~PAGE_MASK;
+ bytes = min(len, (int)PAGE_SIZE - pg_off);
+
+ siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
+
+ dest = kmap_atomic(p);
+ rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
+ bytes);
+
+ if (unlikely(rv)) {
+ kunmap_atomic(dest);
+ srx->skb_copied += copied;
+ srx->skb_new -= copied;
+
+ pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
+ qp_id(rx_qp(srx)), __func__, len, p, rv);
+
+ return -EFAULT;
+ }
+ if (srx->mpa_crc_hd) {
+ if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
+ crypto_shash_update(srx->mpa_crc_hd,
+ (u8 *)(dest + pg_off), bytes);
+ kunmap_atomic(dest);
+ } else {
+ kunmap_atomic(dest);
+ /*
+ * Do CRC on original, not target buffer.
+ * Some user land applications may
+ * concurrently write the target buffer,
+ * which would yield a broken CRC.
+ * Walking the skb twice is very ineffcient.
+ * Folding the CRC into skb_copy_bits()
+ * would be much better, but is currently
+ * not supported.
+ */
+ siw_crc_skb(srx, bytes);
+ }
+ } else {
+ kunmap_atomic(dest);
+ }
+ srx->skb_offset += bytes;
+ copied += bytes;
+ len -= bytes;
+ dest_addr += bytes;
+ pg_off = 0;
+ }
+ srx->skb_copied += copied;
+ srx->skb_new -= copied;
+
+ return copied;
+}
+
+static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
+{
+ int rv;
+
+ siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
+
+ rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
+ if (unlikely(rv)) {
+ pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
+ qp_id(rx_qp(srx)), __func__, len, kva, rv);
+
+ return rv;
+ }
+ if (srx->mpa_crc_hd)
+ crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
+
+ srx->skb_offset += len;
+ srx->skb_copied += len;
+ srx->skb_new -= len;
+
+ return len;
+}
+
+static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
+ struct siw_mem *mem, u64 addr, int len)
+{
+ struct siw_pbl *pbl = mem->pbl;
+ u64 offset = addr - mem->va;
+ int copied = 0;
+
+ while (len) {
+ int bytes;
+ dma_addr_t buf_addr =
+ siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
+ if (!buf_addr)
+ break;
+
+ bytes = min(bytes, len);
+ if (siw_rx_kva(srx, ib_virt_dma_to_ptr(buf_addr), bytes) ==
+ bytes) {
+ copied += bytes;
+ offset += bytes;
+ len -= bytes;
+ } else {
+ break;
+ }
+ }
+ return copied;
+}
+
+/*
+ * siw_rresp_check_ntoh()
+ *
+ * Check incoming RRESP fragment header against expected
+ * header values and update expected values for potential next
+ * fragment.
+ *
+ * NOTE: This function must be called only if a RRESP DDP segment
+ * starts but not for fragmented consecutive pieces of an
+ * already started DDP segment.
+ */
+static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
+ struct siw_rx_fpdu *frx)
+{
+ struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
+ struct siw_wqe *wqe = &frx->wqe_active;
+ enum ddp_ecode ecode;
+
+ u32 sink_stag = be32_to_cpu(rresp->sink_stag);
+ u64 sink_to = be64_to_cpu(rresp->sink_to);
+
+ if (frx->first_ddp_seg) {
+ srx->ddp_stag = wqe->sqe.sge[0].lkey;
+ srx->ddp_to = wqe->sqe.sge[0].laddr;
+ frx->pbl_idx = 0;
+ }
+ /* Below checks extend beyond the semantics of DDP, and
+ * into RDMAP:
+ * We check if the read response matches exactly the
+ * read request which was send to the remote peer to
+ * trigger this read response. RFC5040/5041 do not
+ * always have a proper error code for the detected
+ * error cases. We choose 'base or bounds error' for
+ * cases where the inbound STag is valid, but offset
+ * or length do not match our response receive state.
+ */
+ if (unlikely(srx->ddp_stag != sink_stag)) {
+ pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
+ qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
+ ecode = DDP_ECODE_T_INVALID_STAG;
+ goto error;
+ }
+ if (unlikely(srx->ddp_to != sink_to)) {
+ pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
+ qp_id(rx_qp(srx)), (unsigned long long)sink_to,
+ (unsigned long long)srx->ddp_to);
+ ecode = DDP_ECODE_T_BASE_BOUNDS;
+ goto error;
+ }
+ if (unlikely(!frx->more_ddp_segs &&
+ (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
+ pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
+ qp_id(rx_qp(srx)),
+ wqe->processed + srx->fpdu_part_rem, wqe->bytes);
+ ecode = DDP_ECODE_T_BASE_BOUNDS;
+ goto error;
+ }
+ return 0;
+error:
+ siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_TAGGED_BUF, ecode, 0);
+ return -EINVAL;
+}
+
+/*
+ * siw_write_check_ntoh()
+ *
+ * Check incoming WRITE fragment header against expected
+ * header values and update expected values for potential next
+ * fragment
+ *
+ * NOTE: This function must be called only if a WRITE DDP segment
+ * starts but not for fragmented consecutive pieces of an
+ * already started DDP segment.
+ */
+static int siw_write_check_ntoh(struct siw_rx_stream *srx,
+ struct siw_rx_fpdu *frx)
+{
+ struct iwarp_rdma_write *write = &srx->hdr.rwrite;
+ enum ddp_ecode ecode;
+
+ u32 sink_stag = be32_to_cpu(write->sink_stag);
+ u64 sink_to = be64_to_cpu(write->sink_to);
+
+ if (frx->first_ddp_seg) {
+ srx->ddp_stag = sink_stag;
+ srx->ddp_to = sink_to;
+ frx->pbl_idx = 0;
+ } else {
+ if (unlikely(srx->ddp_stag != sink_stag)) {
+ pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
+ qp_id(rx_qp(srx)), sink_stag,
+ srx->ddp_stag);
+ ecode = DDP_ECODE_T_INVALID_STAG;
+ goto error;
+ }
+ if (unlikely(srx->ddp_to != sink_to)) {
+ pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
+ qp_id(rx_qp(srx)),
+ (unsigned long long)sink_to,
+ (unsigned long long)srx->ddp_to);
+ ecode = DDP_ECODE_T_BASE_BOUNDS;
+ goto error;
+ }
+ }
+ return 0;
+error:
+ siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_TAGGED_BUF, ecode, 0);
+ return -EINVAL;
+}
+
+/*
+ * siw_send_check_ntoh()
+ *
+ * Check incoming SEND fragment header against expected
+ * header values and update expected MSN if no next
+ * fragment expected
+ *
+ * NOTE: This function must be called only if a SEND DDP segment
+ * starts but not for fragmented consecutive pieces of an
+ * already started DDP segment.
+ */
+static int siw_send_check_ntoh(struct siw_rx_stream *srx,
+ struct siw_rx_fpdu *frx)
+{
+ struct iwarp_send_inv *send = &srx->hdr.send_inv;
+ struct siw_wqe *wqe = &frx->wqe_active;
+ enum ddp_ecode ecode;
+
+ u32 ddp_msn = be32_to_cpu(send->ddp_msn);
+ u32 ddp_mo = be32_to_cpu(send->ddp_mo);
+ u32 ddp_qn = be32_to_cpu(send->ddp_qn);
+
+ if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
+ pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
+ qp_id(rx_qp(srx)), ddp_qn);
+ ecode = DDP_ECODE_UT_INVALID_QN;
+ goto error;
+ }
+ if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
+ pr_warn("siw: [QP %u]: send msn: %u != %u\n",
+ qp_id(rx_qp(srx)), ddp_msn,
+ srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+ ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
+ goto error;
+ }
+ if (unlikely(ddp_mo != wqe->processed)) {
+ pr_warn("siw: [QP %u], send mo: %u != %u\n",
+ qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
+ ecode = DDP_ECODE_UT_INVALID_MO;
+ goto error;
+ }
+ if (frx->first_ddp_seg) {
+ /* initialize user memory write position */
+ frx->sge_idx = 0;
+ frx->sge_off = 0;
+ frx->pbl_idx = 0;
+
+ /* only valid for SEND_INV and SEND_SE_INV operations */
+ srx->inval_stag = be32_to_cpu(send->inval_stag);
+ }
+ if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
+ siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
+ wqe->bytes, wqe->processed, srx->fpdu_part_rem);
+ wqe->wc_status = SIW_WC_LOC_LEN_ERR;
+ ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
+ goto error;
+ }
+ return 0;
+error:
+ siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
+ return -EINVAL;
+}
+
+static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
+{
+ struct siw_rqe *rqe;
+ struct siw_srq *srq;
+ struct siw_wqe *wqe = NULL;
+ bool srq_event = false;
+ unsigned long flags;
+
+ srq = qp->srq;
+ if (srq) {
+ spin_lock_irqsave(&srq->lock, flags);
+ if (unlikely(!srq->num_rqe))
+ goto out;
+
+ rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
+ } else {
+ if (unlikely(!qp->recvq))
+ goto out;
+
+ rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
+ }
+ if (likely(rqe->flags == SIW_WQE_VALID)) {
+ int num_sge = rqe->num_sge;
+
+ if (likely(num_sge <= SIW_MAX_SGE)) {
+ int i = 0;
+
+ wqe = rx_wqe(&qp->rx_untagged);
+ rx_type(wqe) = SIW_OP_RECEIVE;
+ wqe->wr_status = SIW_WR_INPROGRESS;
+ wqe->bytes = 0;
+ wqe->processed = 0;
+
+ wqe->rqe.id = rqe->id;
+ wqe->rqe.num_sge = num_sge;
+
+ while (i < num_sge) {
+ wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
+ wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
+ wqe->rqe.sge[i].length = rqe->sge[i].length;
+ wqe->bytes += wqe->rqe.sge[i].length;
+ wqe->mem[i] = NULL;
+ i++;
+ }
+ /* can be re-used by appl */
+ smp_store_mb(rqe->flags, 0);
+ } else {
+ siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
+ if (srq)
+ spin_unlock_irqrestore(&srq->lock, flags);
+ return NULL;
+ }
+ if (!srq) {
+ qp->rq_get++;
+ } else {
+ if (srq->armed) {
+ /* Test SRQ limit */
+ u32 off = (srq->rq_get + srq->limit) %
+ srq->num_rqe;
+ struct siw_rqe *rqe2 = &srq->recvq[off];
+
+ if (!(rqe2->flags & SIW_WQE_VALID)) {
+ srq->armed = false;
+ srq_event = true;
+ }
+ }
+ srq->rq_get++;
+ }
+ }
+out:
+ if (srq) {
+ spin_unlock_irqrestore(&srq->lock, flags);
+ if (srq_event)
+ siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
+ }
+ return wqe;
+}
+
+/*
+ * siw_proc_send:
+ *
+ * Process one incoming SEND and place data into memory referenced by
+ * receive wqe.
+ *
+ * Function supports partially received sends (suspending/resuming
+ * current receive wqe processing)
+ *
+ * return value:
+ * 0: reached the end of a DDP segment
+ * -EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_send(struct siw_qp *qp)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ struct siw_rx_fpdu *frx = &qp->rx_untagged;
+ struct siw_wqe *wqe;
+ u32 data_bytes; /* all data bytes available */
+ u32 rcvd_bytes; /* sum of data bytes rcvd */
+ int rv = 0;
+
+ if (frx->first_ddp_seg) {
+ wqe = siw_rqe_get(qp);
+ if (unlikely(!wqe)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_UNTAGGED_BUF,
+ DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
+ return -ENOENT;
+ }
+ } else {
+ wqe = rx_wqe(frx);
+ }
+ if (srx->state == SIW_GET_DATA_START) {
+ rv = siw_send_check_ntoh(srx, frx);
+ if (unlikely(rv)) {
+ siw_qp_event(qp, IB_EVENT_QP_FATAL);
+ return rv;
+ }
+ if (!srx->fpdu_part_rem) /* zero length SEND */
+ return 0;
+ }
+ data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
+ rcvd_bytes = 0;
+
+ /* A zero length SEND will skip below loop */
+ while (data_bytes) {
+ struct ib_pd *pd;
+ struct siw_mem **mem, *mem_p;
+ struct siw_sge *sge;
+ u32 sge_bytes; /* data bytes avail for SGE */
+
+ sge = &wqe->rqe.sge[frx->sge_idx];
+
+ if (!sge->length) {
+ /* just skip empty sge's */
+ frx->sge_idx++;
+ frx->sge_off = 0;
+ frx->pbl_idx = 0;
+ continue;
+ }
+ sge_bytes = min(data_bytes, sge->length - frx->sge_off);
+ mem = &wqe->mem[frx->sge_idx];
+
+ /*
+ * check with QP's PD if no SRQ present, SRQ's PD otherwise
+ */
+ pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
+
+ rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
+ frx->sge_off, sge_bytes);
+ if (unlikely(rv)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_CATASTROPHIC,
+ DDP_ECODE_CATASTROPHIC, 0);
+
+ siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+ break;
+ }
+ mem_p = *mem;
+ if (mem_p->mem_obj == NULL)
+ rv = siw_rx_kva(srx,
+ ib_virt_dma_to_ptr(sge->laddr + frx->sge_off),
+ sge_bytes);
+ else if (!mem_p->is_pbl)
+ rv = siw_rx_umem(srx, mem_p->umem,
+ sge->laddr + frx->sge_off, sge_bytes);
+ else
+ rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
+ sge->laddr + frx->sge_off, sge_bytes);
+
+ if (unlikely(rv != sge_bytes)) {
+ wqe->processed += rcvd_bytes;
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_CATASTROPHIC,
+ DDP_ECODE_CATASTROPHIC, 0);
+ return -EINVAL;
+ }
+ frx->sge_off += rv;
+
+ if (frx->sge_off == sge->length) {
+ frx->sge_idx++;
+ frx->sge_off = 0;
+ frx->pbl_idx = 0;
+ }
+ data_bytes -= rv;
+ rcvd_bytes += rv;
+
+ srx->fpdu_part_rem -= rv;
+ srx->fpdu_part_rcvd += rv;
+ }
+ wqe->processed += rcvd_bytes;
+
+ if (!srx->fpdu_part_rem)
+ return 0;
+
+ return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_write:
+ *
+ * Place incoming WRITE after referencing and checking target buffer
+
+ * Function supports partially received WRITEs (suspending/resuming
+ * current receive processing)
+ *
+ * return value:
+ * 0: reached the end of a DDP segment
+ * -EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_write(struct siw_qp *qp)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ struct siw_rx_fpdu *frx = &qp->rx_tagged;
+ struct siw_mem *mem;
+ int bytes, rv;
+
+ if (srx->state == SIW_GET_DATA_START) {
+ if (!srx->fpdu_part_rem) /* zero length WRITE */
+ return 0;
+
+ rv = siw_write_check_ntoh(srx, frx);
+ if (unlikely(rv)) {
+ siw_qp_event(qp, IB_EVENT_QP_FATAL);
+ return rv;
+ }
+ }
+ bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+ if (frx->first_ddp_seg) {
+ struct siw_wqe *wqe = rx_wqe(frx);
+
+ rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
+ if (unlikely(!rx_mem(frx))) {
+ siw_dbg_qp(qp,
+ "sink stag not found/invalid, stag 0x%08x\n",
+ srx->ddp_stag);
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_TAGGED_BUF,
+ DDP_ECODE_T_INVALID_STAG, 0);
+ return -EINVAL;
+ }
+ wqe->rqe.num_sge = 1;
+ rx_type(wqe) = SIW_OP_WRITE;
+ wqe->wr_status = SIW_WR_INPROGRESS;
+ }
+ mem = rx_mem(frx);
+
+ /*
+ * Check if application re-registered memory with different
+ * key field of STag.
+ */
+ if (unlikely(mem->stag != srx->ddp_stag)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_TAGGED_BUF,
+ DDP_ECODE_T_INVALID_STAG, 0);
+ return -EINVAL;
+ }
+ rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
+ IB_ACCESS_REMOTE_WRITE, bytes);
+ if (unlikely(rv)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
+ 0);
+
+ siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+ return -EINVAL;
+ }
+
+ if (mem->mem_obj == NULL)
+ rv = siw_rx_kva(srx,
+ (void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
+ bytes);
+ else if (!mem->is_pbl)
+ rv = siw_rx_umem(srx, mem->umem,
+ srx->ddp_to + srx->fpdu_part_rcvd, bytes);
+ else
+ rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
+ srx->ddp_to + srx->fpdu_part_rcvd, bytes);
+
+ if (unlikely(rv != bytes)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_CATASTROPHIC,
+ DDP_ECODE_CATASTROPHIC, 0);
+ return -EINVAL;
+ }
+ srx->fpdu_part_rem -= rv;
+ srx->fpdu_part_rcvd += rv;
+
+ if (!srx->fpdu_part_rem) {
+ srx->ddp_to += srx->fpdu_part_rcvd;
+ return 0;
+ }
+ return -EAGAIN;
+}
+
+/*
+ * Inbound RREQ's cannot carry user data.
+ */
+int siw_proc_rreq(struct siw_qp *qp)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+
+ if (!srx->fpdu_part_rem)
+ return 0;
+
+ pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
+ be16_to_cpu(srx->hdr.ctrl.mpa_len));
+
+ return -EPROTO;
+}
+
+/*
+ * siw_init_rresp:
+ *
+ * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
+ * Put it at the tail of the IRQ, if there is another WQE currently in
+ * transmit processing. If not, make it the current WQE to be processed
+ * and schedule transmit processing.
+ *
+ * Can be called from softirq context and from process
+ * context (RREAD socket loopback case!)
+ *
+ * return value:
+ * 0: success,
+ * failure code otherwise
+ */
+
+static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+ struct siw_wqe *tx_work = tx_wqe(qp);
+ struct siw_sqe *resp;
+
+ uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
+ laddr = be64_to_cpu(srx->hdr.rreq.source_to);
+ uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
+ lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
+ rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
+ msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
+
+ int run_sq = 1, rv = 0;
+ unsigned long flags;
+
+ if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_UNTAGGED_BUF,
+ DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
+ return -EPROTO;
+ }
+ spin_lock_irqsave(&qp->sq_lock, flags);
+
+ if (unlikely(!qp->attrs.irq_size)) {
+ run_sq = 0;
+ goto error_irq;
+ }
+ if (tx_work->wr_status == SIW_WR_IDLE) {
+ /*
+ * immediately schedule READ response w/o
+ * consuming IRQ entry: IRQ must be empty.
+ */
+ tx_work->processed = 0;
+ tx_work->mem[0] = NULL;
+ tx_work->wr_status = SIW_WR_QUEUED;
+ resp = &tx_work->sqe;
+ } else {
+ resp = irq_alloc_free(qp);
+ run_sq = 0;
+ }
+ if (likely(resp)) {
+ resp->opcode = SIW_OP_READ_RESPONSE;
+
+ resp->sge[0].length = length;
+ resp->sge[0].laddr = laddr;
+ resp->sge[0].lkey = lkey;
+
+ /* Keep aside message sequence number for potential
+ * error reporting during Read Response generation.
+ */
+ resp->sge[1].length = msn;
+
+ resp->raddr = raddr;
+ resp->rkey = rkey;
+ resp->num_sge = length ? 1 : 0;
+
+ /* RRESP now valid as current TX wqe or placed into IRQ */
+ smp_store_mb(resp->flags, SIW_WQE_VALID);
+ } else {
+error_irq:
+ pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
+ qp_id(qp), qp->attrs.irq_size);
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_REMOTE_OPERATION,
+ RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
+ rv = -EPROTO;
+ }
+
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+ if (run_sq)
+ rv = siw_sq_start(qp);
+
+ return rv;
+}
+
+/*
+ * Only called at start of Read.Resonse processing.
+ * Transfer pending Read from tip of ORQ into currrent rx wqe,
+ * but keep ORQ entry valid until Read.Response processing done.
+ * No Queue locking needed.
+ */
+static int siw_orqe_start_rx(struct siw_qp *qp)
+{
+ struct siw_sqe *orqe;
+ struct siw_wqe *wqe = NULL;
+
+ if (unlikely(!qp->attrs.orq_size))
+ return -EPROTO;
+
+ /* make sure ORQ indices are current */
+ smp_mb();
+
+ orqe = orq_get_current(qp);
+ if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
+ /* RRESP is a TAGGED RDMAP operation */
+ wqe = rx_wqe(&qp->rx_tagged);
+ wqe->sqe.id = orqe->id;
+ wqe->sqe.opcode = orqe->opcode;
+ wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
+ wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
+ wqe->sqe.sge[0].length = orqe->sge[0].length;
+ wqe->sqe.flags = orqe->flags;
+ wqe->sqe.num_sge = 1;
+ wqe->bytes = orqe->sge[0].length;
+ wqe->processed = 0;
+ wqe->mem[0] = NULL;
+ /* make sure WQE is completely written before valid */
+ smp_wmb();
+ wqe->wr_status = SIW_WR_INPROGRESS;
+
+ return 0;
+ }
+ return -EPROTO;
+}
+
+/*
+ * siw_proc_rresp:
+ *
+ * Place incoming RRESP data into memory referenced by RREQ WQE
+ * which is at the tip of the ORQ
+ *
+ * Function supports partially received RRESP's (suspending/resuming
+ * current receive processing)
+ */
+int siw_proc_rresp(struct siw_qp *qp)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ struct siw_rx_fpdu *frx = &qp->rx_tagged;
+ struct siw_wqe *wqe = rx_wqe(frx);
+ struct siw_mem **mem, *mem_p;
+ struct siw_sge *sge;
+ int bytes, rv;
+
+ if (frx->first_ddp_seg) {
+ if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+ pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
+ qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
+ rv = -EPROTO;
+ goto error_term;
+ }
+ /*
+ * fetch pending RREQ from orq
+ */
+ rv = siw_orqe_start_rx(qp);
+ if (rv) {
+ pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
+ qp_id(qp), qp->attrs.orq_size);
+ goto error_term;
+ }
+ rv = siw_rresp_check_ntoh(srx, frx);
+ if (unlikely(rv)) {
+ siw_qp_event(qp, IB_EVENT_QP_FATAL);
+ return rv;
+ }
+ } else {
+ if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
+ pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
+ qp_id(qp), wqe->wr_status);
+ rv = -EPROTO;
+ goto error_term;
+ }
+ }
+ if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
+ return 0;
+
+ sge = wqe->sqe.sge; /* there is only one */
+ mem = &wqe->mem[0];
+
+ if (!(*mem)) {
+ /*
+ * check target memory which resolves memory on first fragment
+ */
+ rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
+ wqe->bytes);
+ if (unlikely(rv)) {
+ siw_dbg_qp(qp, "target mem check: %d\n", rv);
+ wqe->wc_status = SIW_WC_LOC_PROT_ERR;
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_TAGGED_BUF,
+ siw_tagged_error(-rv), 0);
+
+ siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+ return -EINVAL;
+ }
+ }
+ mem_p = *mem;
+
+ bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+ if (mem_p->mem_obj == NULL)
+ rv = siw_rx_kva(srx,
+ ib_virt_dma_to_ptr(sge->laddr + wqe->processed),
+ bytes);
+ else if (!mem_p->is_pbl)
+ rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
+ bytes);
+ else
+ rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
+ sge->laddr + wqe->processed, bytes);
+ if (rv != bytes) {
+ wqe->wc_status = SIW_WC_GENERAL_ERR;
+ rv = -EINVAL;
+ goto error_term;
+ }
+ srx->fpdu_part_rem -= rv;
+ srx->fpdu_part_rcvd += rv;
+ wqe->processed += rv;
+
+ if (!srx->fpdu_part_rem) {
+ srx->ddp_to += srx->fpdu_part_rcvd;
+ return 0;
+ }
+ return -EAGAIN;
+
+error_term:
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
+ DDP_ECODE_CATASTROPHIC, 0);
+ return rv;
+}
+
+int siw_proc_terminate(struct siw_qp *qp)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ struct sk_buff *skb = srx->skb;
+ struct iwarp_terminate *term = &srx->hdr.terminate;
+ union iwarp_hdr term_info;
+ u8 *infop = (u8 *)&term_info;
+ enum rdma_opcode op;
+ u16 to_copy = sizeof(struct iwarp_ctrl);
+
+ pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
+ __rdmap_term_layer(term), __rdmap_term_etype(term),
+ __rdmap_term_ecode(term));
+
+ if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
+ be32_to_cpu(term->ddp_msn) !=
+ qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
+ be32_to_cpu(term->ddp_mo) != 0) {
+ pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
+ be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
+ be32_to_cpu(term->ddp_mo));
+ return -ECONNRESET;
+ }
+ /*
+ * Receive remaining pieces of TERM if indicated
+ */
+ if (!term->flag_m)
+ return -ECONNRESET;
+
+ /* Do not take the effort to reassemble a network fragmented
+ * TERM message
+ */
+ if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
+ return -ECONNRESET;
+
+ memset(infop, 0, sizeof(term_info));
+
+ skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
+
+ op = __rdmap_get_opcode(&term_info.ctrl);
+ if (op >= RDMAP_TERMINATE)
+ goto out;
+
+ infop += to_copy;
+ srx->skb_offset += to_copy;
+ srx->skb_new -= to_copy;
+ srx->skb_copied += to_copy;
+ srx->fpdu_part_rcvd += to_copy;
+ srx->fpdu_part_rem -= to_copy;
+
+ to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
+
+ /* Again, no network fragmented TERM's */
+ if (to_copy + MPA_CRC_SIZE > srx->skb_new)
+ return -ECONNRESET;
+
+ skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
+
+ if (term->flag_r) {
+ siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
+ op, be16_to_cpu(term_info.ctrl.mpa_len),
+ term->flag_m ? "valid" : "invalid");
+ } else if (term->flag_d) {
+ siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
+ op, be16_to_cpu(term_info.ctrl.mpa_len),
+ term->flag_m ? "valid" : "invalid");
+ }
+out:
+ srx->skb_new -= to_copy;
+ srx->skb_offset += to_copy;
+ srx->skb_copied += to_copy;
+ srx->fpdu_part_rcvd += to_copy;
+ srx->fpdu_part_rem -= to_copy;
+
+ return -ECONNRESET;
+}
+
+static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+ struct sk_buff *skb = srx->skb;
+ int avail = min(srx->skb_new, srx->fpdu_part_rem);
+ u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
+ __wsum crc_in, crc_own = 0;
+
+ siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
+ srx->fpdu_part_rem, srx->skb_new, srx->pad);
+
+ skb_copy_bits(skb, srx->skb_offset, tbuf, avail);
+
+ srx->skb_new -= avail;
+ srx->skb_offset += avail;
+ srx->skb_copied += avail;
+ srx->fpdu_part_rem -= avail;
+
+ if (srx->fpdu_part_rem)
+ return -EAGAIN;
+
+ if (!srx->mpa_crc_hd)
+ return 0;
+
+ if (srx->pad)
+ crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
+ /*
+ * CRC32 is computed, transmitted and received directly in NBO,
+ * so there's never a reason to convert byte order.
+ */
+ crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
+ crc_in = (__force __wsum)srx->trailer.crc;
+
+ if (unlikely(crc_in != crc_own)) {
+ pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
+ crc_in, crc_own, qp->rx_stream.rdmap_op);
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+ LLP_ETYPE_MPA,
+ LLP_ECODE_RECEIVED_CRC, 0);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
+
+static int siw_get_hdr(struct siw_rx_stream *srx)
+{
+ struct sk_buff *skb = srx->skb;
+ struct siw_qp *qp = rx_qp(srx);
+ struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
+ struct siw_rx_fpdu *frx;
+ u8 opcode;
+ int bytes;
+
+ if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
+ /*
+ * copy a mimimum sized (tagged) DDP frame control part
+ */
+ bytes = min_t(int, srx->skb_new,
+ MIN_DDP_HDR - srx->fpdu_part_rcvd);
+
+ skb_copy_bits(skb, srx->skb_offset,
+ (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+ srx->fpdu_part_rcvd += bytes;
+
+ srx->skb_new -= bytes;
+ srx->skb_offset += bytes;
+ srx->skb_copied += bytes;
+
+ if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
+ return -EAGAIN;
+
+ if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
+ enum ddp_etype etype;
+ enum ddp_ecode ecode;
+
+ pr_warn("siw: received ddp version unsupported %d\n",
+ __ddp_get_version(c_hdr));
+
+ if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
+ etype = DDP_ETYPE_TAGGED_BUF;
+ ecode = DDP_ECODE_T_VERSION;
+ } else {
+ etype = DDP_ETYPE_UNTAGGED_BUF;
+ ecode = DDP_ECODE_UT_VERSION;
+ }
+ siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+ etype, ecode, 0);
+ return -EINVAL;
+ }
+ if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
+ pr_warn("siw: received rdmap version unsupported %d\n",
+ __rdmap_get_version(c_hdr));
+
+ siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_REMOTE_OPERATION,
+ RDMAP_ECODE_VERSION, 0);
+ return -EINVAL;
+ }
+ opcode = __rdmap_get_opcode(c_hdr);
+
+ if (opcode > RDMAP_TERMINATE) {
+ pr_warn("siw: received unknown packet type %u\n",
+ opcode);
+
+ siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_REMOTE_OPERATION,
+ RDMAP_ECODE_OPCODE, 0);
+ return -EINVAL;
+ }
+ siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
+ } else {
+ opcode = __rdmap_get_opcode(c_hdr);
+ }
+ set_rx_fpdu_context(qp, opcode);
+ frx = qp->rx_fpdu;
+
+ /*
+ * Figure out len of current hdr: variable length of
+ * iwarp hdr may force us to copy hdr information in
+ * two steps. Only tagged DDP messages are already
+ * completely received.
+ */
+ if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
+ int hdrlen = iwarp_pktinfo[opcode].hdr_len;
+
+ bytes = min_t(int, hdrlen - MIN_DDP_HDR, srx->skb_new);
+
+ skb_copy_bits(skb, srx->skb_offset,
+ (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+ srx->fpdu_part_rcvd += bytes;
+
+ srx->skb_new -= bytes;
+ srx->skb_offset += bytes;
+ srx->skb_copied += bytes;
+
+ if (srx->fpdu_part_rcvd < hdrlen)
+ return -EAGAIN;
+ }
+
+ /*
+ * DDP/RDMAP header receive completed. Check if the current
+ * DDP segment starts a new RDMAP message or continues a previously
+ * started RDMAP message.
+ *
+ * Alternating reception of DDP segments (or FPDUs) from incomplete
+ * tagged and untagged RDMAP messages is supported, as long as
+ * the current tagged or untagged message gets eventually completed
+ * w/o intersection from another message of the same type
+ * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
+ * but not by a READ RESPONSE etc.
+ */
+ if (srx->mpa_crc_hd) {
+ /*
+ * Restart CRC computation
+ */
+ crypto_shash_init(srx->mpa_crc_hd);
+ crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
+ srx->fpdu_part_rcvd);
+ }
+ if (frx->more_ddp_segs) {
+ frx->first_ddp_seg = 0;
+ if (frx->prev_rdmap_op != opcode) {
+ pr_warn("siw: packet intersection: %u : %u\n",
+ frx->prev_rdmap_op, opcode);
+ /*
+ * The last inbound RDMA operation of same type
+ * (tagged or untagged) is left unfinished.
+ * To complete it in error, make it the current
+ * operation again, even with the header already
+ * overwritten. For error handling, only the opcode
+ * and current rx context are relevant.
+ */
+ set_rx_fpdu_context(qp, frx->prev_rdmap_op);
+ __rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
+ return -EPROTO;
+ }
+ } else {
+ frx->prev_rdmap_op = opcode;
+ frx->first_ddp_seg = 1;
+ }
+ frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
+
+ return 0;
+}
+
+static int siw_check_tx_fence(struct siw_qp *qp)
+{
+ struct siw_wqe *tx_waiting = tx_wqe(qp);
+ struct siw_sqe *rreq;
+ int resume_tx = 0, rv = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->orq_lock, flags);
+
+ /* free current orq entry */
+ rreq = orq_get_current(qp);
+ WRITE_ONCE(rreq->flags, 0);
+
+ qp->orq_get++;
+
+ if (qp->tx_ctx.orq_fence) {
+ if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
+ pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
+ qp_id(qp), tx_waiting->wr_status);
+ rv = -EPROTO;
+ goto out;
+ }
+ /* resume SQ processing, if possible */
+ if (tx_waiting->sqe.opcode == SIW_OP_READ ||
+ tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+
+ /* SQ processing was stopped because of a full ORQ */
+ rreq = orq_get_free(qp);
+ if (unlikely(!rreq)) {
+ pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
+ rv = -EPROTO;
+ goto out;
+ }
+ siw_read_to_orq(rreq, &tx_waiting->sqe);
+
+ qp->orq_put++;
+ qp->tx_ctx.orq_fence = 0;
+ resume_tx = 1;
+
+ } else if (siw_orq_empty(qp)) {
+ /*
+ * SQ processing was stopped by fenced work request.
+ * Resume since all previous Read's are now completed.
+ */
+ qp->tx_ctx.orq_fence = 0;
+ resume_tx = 1;
+ }
+ }
+out:
+ spin_unlock_irqrestore(&qp->orq_lock, flags);
+
+ if (resume_tx)
+ rv = siw_sq_start(qp);
+
+ return rv;
+}
+
+/*
+ * siw_rdmap_complete()
+ *
+ * Complete processing of an RDMA message after receiving all
+ * DDP segmens or ABort processing after encountering error case.
+ *
+ * o SENDs + RRESPs will need for completion,
+ * o RREQs need for READ RESPONSE initialization
+ * o WRITEs need memory dereferencing
+ *
+ * TODO: Failed WRITEs need local error to be surfaced.
+ */
+static int siw_rdmap_complete(struct siw_qp *qp, int error)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
+ enum siw_wc_status wc_status = wqe->wc_status;
+ u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
+ int rv = 0;
+
+ switch (opcode) {
+ case RDMAP_SEND_SE:
+ case RDMAP_SEND_SE_INVAL:
+ wqe->rqe.flags |= SIW_WQE_SOLICITED;
+ fallthrough;
+
+ case RDMAP_SEND:
+ case RDMAP_SEND_INVAL:
+ if (wqe->wr_status == SIW_WR_IDLE)
+ break;
+
+ srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+ if (error != 0 && wc_status == SIW_WC_SUCCESS)
+ wc_status = SIW_WC_GENERAL_ERR;
+ /*
+ * Handle STag invalidation request
+ */
+ if (wc_status == SIW_WC_SUCCESS &&
+ (opcode == RDMAP_SEND_INVAL ||
+ opcode == RDMAP_SEND_SE_INVAL)) {
+ rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
+ if (rv) {
+ siw_init_terminate(
+ qp, TERM_ERROR_LAYER_RDMAP,
+ rv == -EACCES ?
+ RDMAP_ETYPE_REMOTE_PROTECTION :
+ RDMAP_ETYPE_REMOTE_OPERATION,
+ RDMAP_ECODE_CANNOT_INVALIDATE, 0);
+
+ wc_status = SIW_WC_REM_INV_REQ_ERR;
+ }
+ rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+ rv ? 0 : srx->inval_stag,
+ wc_status);
+ } else {
+ rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+ 0, wc_status);
+ }
+ siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
+ break;
+
+ case RDMAP_RDMA_READ_RESP:
+ if (wqe->wr_status == SIW_WR_IDLE)
+ break;
+
+ if (error != 0) {
+ if ((srx->state == SIW_GET_HDR &&
+ qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
+ /* possible RREQ in ORQ left untouched */
+ break;
+
+ if (wc_status == SIW_WC_SUCCESS)
+ wc_status = SIW_WC_GENERAL_ERR;
+ } else if (rdma_is_kernel_res(&qp->base_qp.res) &&
+ rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
+ /*
+ * Handle any STag invalidation request
+ */
+ rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
+ if (rv) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_CATASTROPHIC,
+ RDMAP_ECODE_UNSPECIFIED, 0);
+
+ if (wc_status == SIW_WC_SUCCESS) {
+ wc_status = SIW_WC_GENERAL_ERR;
+ error = rv;
+ }
+ }
+ }
+ /*
+ * All errors turn the wqe into signalled.
+ */
+ if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
+ rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
+ wc_status);
+ siw_wqe_put_mem(wqe, SIW_OP_READ);
+
+ if (!error) {
+ rv = siw_check_tx_fence(qp);
+ } else {
+ /* Disable current ORQ element */
+ if (qp->attrs.orq_size)
+ WRITE_ONCE(orq_get_current(qp)->flags, 0);
+ }
+ break;
+
+ case RDMAP_RDMA_READ_REQ:
+ if (!error) {
+ rv = siw_init_rresp(qp, srx);
+ srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+ }
+ break;
+
+ case RDMAP_RDMA_WRITE:
+ if (wqe->wr_status == SIW_WR_IDLE)
+ break;
+
+ /*
+ * Free References from memory object if
+ * attached to receive context (inbound WRITE).
+ * While a zero-length WRITE is allowed,
+ * no memory reference got created.
+ */
+ if (rx_mem(&qp->rx_tagged)) {
+ siw_mem_put(rx_mem(&qp->rx_tagged));
+ rx_mem(&qp->rx_tagged) = NULL;
+ }
+ break;
+
+ default:
+ break;
+ }
+ wqe->wr_status = SIW_WR_IDLE;
+
+ return rv;
+}
+
+/*
+ * siw_tcp_rx_data()
+ *
+ * Main routine to consume inbound TCP payload
+ *
+ * @rd_desc: read descriptor
+ * @skb: socket buffer
+ * @off: offset in skb
+ * @len: skb->len - offset : payload in skb
+ */
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int off, size_t len)
+{
+ struct siw_qp *qp = rd_desc->arg.data;
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ int rv;
+
+ srx->skb = skb;
+ srx->skb_new = skb->len - off;
+ srx->skb_offset = off;
+ srx->skb_copied = 0;
+
+ siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
+
+ while (srx->skb_new) {
+ int run_completion = 1;
+
+ if (unlikely(srx->rx_suspend)) {
+ /* Do not process any more data */
+ srx->skb_copied += srx->skb_new;
+ break;
+ }
+ switch (srx->state) {
+ case SIW_GET_HDR:
+ rv = siw_get_hdr(srx);
+ if (!rv) {
+ srx->fpdu_part_rem =
+ be16_to_cpu(srx->hdr.ctrl.mpa_len) -
+ srx->fpdu_part_rcvd + MPA_HDR_SIZE;
+
+ if (srx->fpdu_part_rem)
+ srx->pad = -srx->fpdu_part_rem & 0x3;
+ else
+ srx->pad = 0;
+
+ srx->state = SIW_GET_DATA_START;
+ srx->fpdu_part_rcvd = 0;
+ }
+ break;
+
+ case SIW_GET_DATA_MORE:
+ /*
+ * Another data fragment of the same DDP segment.
+ * Setting first_ddp_seg = 0 avoids repeating
+ * initializations that shall occur only once per
+ * DDP segment.
+ */
+ qp->rx_fpdu->first_ddp_seg = 0;
+ fallthrough;
+
+ case SIW_GET_DATA_START:
+ /*
+ * Headers will be checked by the opcode-specific
+ * data receive function below.
+ */
+ rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
+ if (!rv) {
+ int mpa_len =
+ be16_to_cpu(srx->hdr.ctrl.mpa_len)
+ + MPA_HDR_SIZE;
+
+ srx->fpdu_part_rem = (-mpa_len & 0x3)
+ + MPA_CRC_SIZE;
+ srx->fpdu_part_rcvd = 0;
+ srx->state = SIW_GET_TRAILER;
+ } else {
+ if (unlikely(rv == -ECONNRESET))
+ run_completion = 0;
+ else
+ srx->state = SIW_GET_DATA_MORE;
+ }
+ break;
+
+ case SIW_GET_TRAILER:
+ /*
+ * read CRC + any padding
+ */
+ rv = siw_get_trailer(qp, srx);
+ if (likely(!rv)) {
+ /*
+ * FPDU completed.
+ * complete RDMAP message if last fragment
+ */
+ srx->state = SIW_GET_HDR;
+ srx->fpdu_part_rcvd = 0;
+
+ if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
+ DDP_FLAG_LAST))
+ /* more frags */
+ break;
+
+ rv = siw_rdmap_complete(qp, 0);
+ run_completion = 0;
+ }
+ break;
+
+ default:
+ pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
+ rv = -EPROTO;
+ run_completion = 0;
+ }
+ if (unlikely(rv != 0 && rv != -EAGAIN)) {
+ if ((srx->state > SIW_GET_HDR ||
+ qp->rx_fpdu->more_ddp_segs) && run_completion)
+ siw_rdmap_complete(qp, rv);
+
+ siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
+ srx->state);
+
+ siw_qp_cm_drop(qp, 1);
+
+ break;
+ }
+ if (rv) {
+ siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
+ srx->state, srx->fpdu_part_rem);
+ break;
+ }
+ }
+ return srx->skb_copied;
+}
diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
new file mode 100644
index 0000000000..60b6a41359
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -0,0 +1,1314 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+#define MAX_HDR_INLINE \
+ (((uint32_t)(sizeof(struct siw_rreq_pkt) - \
+ sizeof(struct iwarp_send))) & 0xF8)
+
+static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx)
+{
+ struct siw_pbl *pbl = mem->pbl;
+ u64 offset = addr - mem->va;
+ dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx);
+
+ if (paddr)
+ return ib_virt_dma_to_page(paddr);
+
+ return NULL;
+}
+
+/*
+ * Copy short payload at provided destination payload address
+ */
+static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr)
+{
+ struct siw_wqe *wqe = &c_tx->wqe_active;
+ struct siw_sge *sge = &wqe->sqe.sge[0];
+ u32 bytes = sge->length;
+
+ if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1)
+ return MAX_HDR_INLINE + 1;
+
+ if (!bytes)
+ return 0;
+
+ if (tx_flags(wqe) & SIW_WQE_INLINE) {
+ memcpy(paddr, &wqe->sqe.sge[1], bytes);
+ } else {
+ struct siw_mem *mem = wqe->mem[0];
+
+ if (!mem->mem_obj) {
+ /* Kernel client using kva */
+ memcpy(paddr, ib_virt_dma_to_ptr(sge->laddr), bytes);
+ } else if (c_tx->in_syscall) {
+ if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr),
+ bytes))
+ return -EFAULT;
+ } else {
+ unsigned int off = sge->laddr & ~PAGE_MASK;
+ struct page *p;
+ char *buffer;
+ int pbl_idx = 0;
+
+ if (!mem->is_pbl)
+ p = siw_get_upage(mem->umem, sge->laddr);
+ else
+ p = siw_get_pblpage(mem, sge->laddr, &pbl_idx);
+
+ if (unlikely(!p))
+ return -EFAULT;
+
+ buffer = kmap_local_page(p);
+
+ if (likely(PAGE_SIZE - off >= bytes)) {
+ memcpy(paddr, buffer + off, bytes);
+ } else {
+ unsigned long part = bytes - (PAGE_SIZE - off);
+
+ memcpy(paddr, buffer + off, part);
+ kunmap_local(buffer);
+
+ if (!mem->is_pbl)
+ p = siw_get_upage(mem->umem,
+ sge->laddr + part);
+ else
+ p = siw_get_pblpage(mem,
+ sge->laddr + part,
+ &pbl_idx);
+ if (unlikely(!p))
+ return -EFAULT;
+
+ buffer = kmap_local_page(p);
+ memcpy(paddr + part, buffer, bytes - part);
+ }
+ kunmap_local(buffer);
+ }
+ }
+ return (int)bytes;
+}
+
+#define PKT_FRAGMENTED 1
+#define PKT_COMPLETE 0
+
+/*
+ * siw_qp_prepare_tx()
+ *
+ * Prepare tx state for sending out one fpdu. Builds complete pkt
+ * if no user data or only immediate data are present.
+ *
+ * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
+ */
+static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
+{
+ struct siw_wqe *wqe = &c_tx->wqe_active;
+ char *crc = NULL;
+ int data = 0;
+
+ switch (tx_type(wqe)) {
+ case SIW_OP_READ:
+ case SIW_OP_READ_LOCAL_INV:
+ memcpy(&c_tx->pkt.ctrl,
+ &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+ sizeof(struct iwarp_ctrl));
+
+ c_tx->pkt.rreq.rsvd = 0;
+ c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+ c_tx->pkt.rreq.ddp_msn =
+ htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
+ c_tx->pkt.rreq.ddp_mo = 0;
+ c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey);
+ c_tx->pkt.rreq.sink_to =
+ cpu_to_be64(wqe->sqe.sge[0].laddr);
+ c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey);
+ c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr);
+ c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length);
+
+ c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
+ crc = (char *)&c_tx->pkt.rreq_pkt.crc;
+ break;
+
+ case SIW_OP_SEND:
+ if (tx_flags(wqe) & SIW_WQE_SOLICITED)
+ memcpy(&c_tx->pkt.ctrl,
+ &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
+ sizeof(struct iwarp_ctrl));
+ else
+ memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl,
+ sizeof(struct iwarp_ctrl));
+
+ c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
+ c_tx->pkt.send.ddp_msn =
+ htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+ c_tx->pkt.send.ddp_mo = 0;
+
+ c_tx->pkt.send_inv.inval_stag = 0;
+
+ c_tx->ctrl_len = sizeof(struct iwarp_send);
+
+ crc = (char *)&c_tx->pkt.send_pkt.crc;
+ data = siw_try_1seg(c_tx, crc);
+ break;
+
+ case SIW_OP_SEND_REMOTE_INV:
+ if (tx_flags(wqe) & SIW_WQE_SOLICITED)
+ memcpy(&c_tx->pkt.ctrl,
+ &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl,
+ sizeof(struct iwarp_ctrl));
+ else
+ memcpy(&c_tx->pkt.ctrl,
+ &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl,
+ sizeof(struct iwarp_ctrl));
+
+ c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
+ c_tx->pkt.send.ddp_msn =
+ htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+ c_tx->pkt.send.ddp_mo = 0;
+
+ c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey);
+
+ c_tx->ctrl_len = sizeof(struct iwarp_send_inv);
+
+ crc = (char *)&c_tx->pkt.send_pkt.crc;
+ data = siw_try_1seg(c_tx, crc);
+ break;
+
+ case SIW_OP_WRITE:
+ memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
+ sizeof(struct iwarp_ctrl));
+
+ c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey);
+ c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr);
+ c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);
+
+ crc = (char *)&c_tx->pkt.write_pkt.crc;
+ data = siw_try_1seg(c_tx, crc);
+ break;
+
+ case SIW_OP_READ_RESPONSE:
+ memcpy(&c_tx->pkt.ctrl,
+ &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
+ sizeof(struct iwarp_ctrl));
+
+ /* NBO */
+ c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey);
+ c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr);
+
+ c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);
+
+ crc = (char *)&c_tx->pkt.write_pkt.crc;
+ data = siw_try_1seg(c_tx, crc);
+ break;
+
+ default:
+ siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe));
+ return -EOPNOTSUPP;
+ }
+ if (unlikely(data < 0))
+ return data;
+
+ c_tx->ctrl_sent = 0;
+
+ if (data <= MAX_HDR_INLINE) {
+ if (data) {
+ wqe->processed = data;
+
+ c_tx->pkt.ctrl.mpa_len =
+ htons(c_tx->ctrl_len + data - MPA_HDR_SIZE);
+
+ /* Add pad, if needed */
+ data += -(int)data & 0x3;
+ /* advance CRC location after payload */
+ crc += data;
+ c_tx->ctrl_len += data;
+
+ if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
+ c_tx->pkt.c_untagged.ddp_mo = 0;
+ else
+ c_tx->pkt.c_tagged.ddp_to =
+ cpu_to_be64(wqe->sqe.raddr);
+ }
+
+ *(u32 *)crc = 0;
+ /*
+ * Do complete CRC if enabled and short packet
+ */
+ if (c_tx->mpa_crc_hd) {
+ crypto_shash_init(c_tx->mpa_crc_hd);
+ if (crypto_shash_update(c_tx->mpa_crc_hd,
+ (u8 *)&c_tx->pkt,
+ c_tx->ctrl_len))
+ return -EINVAL;
+ crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc);
+ }
+ c_tx->ctrl_len += MPA_CRC_SIZE;
+
+ return PKT_COMPLETE;
+ }
+ c_tx->ctrl_len += MPA_CRC_SIZE;
+ c_tx->sge_idx = 0;
+ c_tx->sge_off = 0;
+ c_tx->pbl_idx = 0;
+
+ /*
+ * Allow direct sending out of user buffer if WR is non signalled
+ * and payload is over threshold.
+ * Per RDMA verbs, the application should not change the send buffer
+ * until the work completed. In iWarp, work completion is only
+ * local delivery to TCP. TCP may reuse the buffer for
+ * retransmission. Changing unsent data also breaks the CRC,
+ * if applied.
+ */
+ if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH &&
+ !(tx_flags(wqe) & SIW_WQE_SIGNALLED))
+ c_tx->use_sendpage = 1;
+ else
+ c_tx->use_sendpage = 0;
+
+ return PKT_FRAGMENTED;
+}
+
+/*
+ * Send out one complete control type FPDU, or header of FPDU carrying
+ * data. Used for fixed sized packets like Read.Requests or zero length
+ * SENDs, WRITEs, READ.Responses, or header only.
+ */
+static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
+ int flags)
+{
+ struct msghdr msg = { .msg_flags = flags };
+ struct kvec iov = { .iov_base =
+ (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
+ .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent };
+
+ int rv = kernel_sendmsg(s, &msg, &iov, 1,
+ c_tx->ctrl_len - c_tx->ctrl_sent);
+
+ if (rv >= 0) {
+ c_tx->ctrl_sent += rv;
+
+ if (c_tx->ctrl_sent == c_tx->ctrl_len)
+ rv = 0;
+ else
+ rv = -EAGAIN;
+ }
+ return rv;
+}
+
+/*
+ * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES.
+ *
+ * Using sendpage to push page by page appears to be less efficient
+ * than using sendmsg, even if data are copied.
+ *
+ * A general performance limitation might be the extra four bytes
+ * trailer checksum segment to be pushed after user data.
+ */
+static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
+ size_t size)
+{
+ struct bio_vec bvec;
+ struct msghdr msg = {
+ .msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SPLICE_PAGES),
+ };
+ struct sock *sk = s->sk;
+ int i = 0, rv = 0, sent = 0;
+
+ while (size) {
+ size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
+
+ if (size + offset <= PAGE_SIZE)
+ msg.msg_flags &= ~MSG_MORE;
+
+ tcp_rate_check_app_limited(sk);
+ bvec_set_page(&bvec, page[i], bytes, offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
+
+try_page_again:
+ lock_sock(sk);
+ rv = tcp_sendmsg_locked(sk, &msg, size);
+ release_sock(sk);
+
+ if (rv > 0) {
+ size -= rv;
+ sent += rv;
+ if (rv != bytes) {
+ offset += rv;
+ bytes -= rv;
+ goto try_page_again;
+ }
+ offset = 0;
+ } else {
+ if (rv == -EAGAIN || rv == 0)
+ break;
+ return rv;
+ }
+ i++;
+ }
+ return sent;
+}
+
+/*
+ * siw_0copy_tx()
+ *
+ * Pushes list of pages to TCP socket. If pages from multiple
+ * SGE's, all referenced pages of each SGE are pushed in one
+ * shot.
+ */
+static int siw_0copy_tx(struct socket *s, struct page **page,
+ struct siw_sge *sge, unsigned int offset,
+ unsigned int size)
+{
+ int i = 0, sent = 0, rv;
+ int sge_bytes = min(sge->length - offset, size);
+
+ offset = (sge->laddr + offset) & ~PAGE_MASK;
+
+ while (sent != size) {
+ rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);
+ if (rv >= 0) {
+ sent += rv;
+ if (size == sent || sge_bytes > rv)
+ break;
+
+ i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
+ sge++;
+ sge_bytes = min(sge->length, size - sent);
+ offset = sge->laddr & ~PAGE_MASK;
+ } else {
+ sent = rv;
+ break;
+ }
+ }
+ return sent;
+}
+
+#define MAX_TRAILER (MPA_CRC_SIZE + 4)
+
+static void siw_unmap_pages(struct kvec *iov, unsigned long kmap_mask, int len)
+{
+ int i;
+
+ /*
+ * Work backwards through the array to honor the kmap_local_page()
+ * ordering requirements.
+ */
+ for (i = (len-1); i >= 0; i--) {
+ if (kmap_mask & BIT(i)) {
+ unsigned long addr = (unsigned long)iov[i].iov_base;
+
+ kunmap_local((void *)(addr & PAGE_MASK));
+ }
+ }
+}
+
+/*
+ * siw_tx_hdt() tries to push a complete packet to TCP where all
+ * packet fragments are referenced by the elements of one iovec.
+ * For the data portion, each involved page must be referenced by
+ * one extra element. All sge's data can be non-aligned to page
+ * boundaries. Two more elements are referencing iWARP header
+ * and trailer:
+ * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1) + HDR + TRL
+ */
+#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2))
+
+/*
+ * Write out iov referencing hdr, data and trailer of current FPDU.
+ * Update transmit state dependent on write return status
+ */
+static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
+{
+ struct siw_wqe *wqe = &c_tx->wqe_active;
+ struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx];
+ struct kvec iov[MAX_ARRAY];
+ struct page *page_array[MAX_ARRAY];
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+
+ int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv;
+ unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0,
+ sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx,
+ pbl_idx = c_tx->pbl_idx;
+ unsigned long kmap_mask = 0L;
+
+ if (c_tx->state == SIW_SEND_HDR) {
+ if (c_tx->use_sendpage) {
+ rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE);
+ if (rv)
+ goto done;
+
+ c_tx->state = SIW_SEND_DATA;
+ } else {
+ iov[0].iov_base =
+ (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
+ iov[0].iov_len = hdr_len =
+ c_tx->ctrl_len - c_tx->ctrl_sent;
+ seg = 1;
+ }
+ }
+
+ wqe->processed += data_len;
+
+ while (data_len) { /* walk the list of SGE's */
+ unsigned int sge_len = min(sge->length - sge_off, data_len);
+ unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK;
+ struct siw_mem *mem;
+
+ if (!(tx_flags(wqe) & SIW_WQE_INLINE)) {
+ mem = wqe->mem[sge_idx];
+ is_kva = mem->mem_obj == NULL ? 1 : 0;
+ } else {
+ is_kva = 1;
+ }
+ if (is_kva && !c_tx->use_sendpage) {
+ /*
+ * tx from kernel virtual address: either inline data
+ * or memory region with assigned kernel buffer
+ */
+ iov[seg].iov_base =
+ ib_virt_dma_to_ptr(sge->laddr + sge_off);
+ iov[seg].iov_len = sge_len;
+
+ if (do_crc)
+ crypto_shash_update(c_tx->mpa_crc_hd,
+ iov[seg].iov_base,
+ sge_len);
+ sge_off += sge_len;
+ data_len -= sge_len;
+ seg++;
+ goto sge_done;
+ }
+
+ while (sge_len) {
+ size_t plen = min((int)PAGE_SIZE - fp_off, sge_len);
+ void *kaddr;
+
+ if (!is_kva) {
+ struct page *p;
+
+ if (mem->is_pbl)
+ p = siw_get_pblpage(
+ mem, sge->laddr + sge_off,
+ &pbl_idx);
+ else
+ p = siw_get_upage(mem->umem,
+ sge->laddr + sge_off);
+ if (unlikely(!p)) {
+ siw_unmap_pages(iov, kmap_mask, seg);
+ wqe->processed -= c_tx->bytes_unsent;
+ rv = -EFAULT;
+ goto done_crc;
+ }
+ page_array[seg] = p;
+
+ if (!c_tx->use_sendpage) {
+ void *kaddr = kmap_local_page(p);
+
+ /* Remember for later kunmap() */
+ kmap_mask |= BIT(seg);
+ iov[seg].iov_base = kaddr + fp_off;
+ iov[seg].iov_len = plen;
+
+ if (do_crc)
+ crypto_shash_update(
+ c_tx->mpa_crc_hd,
+ iov[seg].iov_base,
+ plen);
+ } else if (do_crc) {
+ kaddr = kmap_local_page(p);
+ crypto_shash_update(c_tx->mpa_crc_hd,
+ kaddr + fp_off,
+ plen);
+ kunmap_local(kaddr);
+ }
+ } else {
+ /*
+ * Cast to an uintptr_t to preserve all 64 bits
+ * in sge->laddr.
+ */
+ u64 va = sge->laddr + sge_off;
+
+ page_array[seg] = ib_virt_dma_to_page(va);
+ if (do_crc)
+ crypto_shash_update(
+ c_tx->mpa_crc_hd,
+ ib_virt_dma_to_ptr(va),
+ plen);
+ }
+
+ sge_len -= plen;
+ sge_off += plen;
+ data_len -= plen;
+ fp_off = 0;
+
+ if (++seg >= (int)MAX_ARRAY) {
+ siw_dbg_qp(tx_qp(c_tx), "to many fragments\n");
+ siw_unmap_pages(iov, kmap_mask, seg-1);
+ wqe->processed -= c_tx->bytes_unsent;
+ rv = -EMSGSIZE;
+ goto done_crc;
+ }
+ }
+sge_done:
+ /* Update SGE variables at end of SGE */
+ if (sge_off == sge->length &&
+ (data_len != 0 || wqe->processed < wqe->bytes)) {
+ sge_idx++;
+ sge++;
+ sge_off = 0;
+ }
+ }
+ /* trailer */
+ if (likely(c_tx->state != SIW_SEND_TRAILER)) {
+ iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
+ iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
+ } else {
+ iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
+ iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
+ }
+
+ if (c_tx->pad) {
+ *(u32 *)c_tx->trailer.pad = 0;
+ if (do_crc)
+ crypto_shash_update(c_tx->mpa_crc_hd,
+ (u8 *)&c_tx->trailer.crc - c_tx->pad,
+ c_tx->pad);
+ }
+ if (!c_tx->mpa_crc_hd)
+ c_tx->trailer.crc = 0;
+ else if (do_crc)
+ crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);
+
+ data_len = c_tx->bytes_unsent;
+
+ if (c_tx->use_sendpage) {
+ rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx],
+ c_tx->sge_off, data_len);
+ if (rv == data_len) {
+ rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
+ if (rv > 0)
+ rv += data_len;
+ else
+ rv = data_len;
+ }
+ } else {
+ rv = kernel_sendmsg(s, &msg, iov, seg + 1,
+ hdr_len + data_len + trl_len);
+ siw_unmap_pages(iov, kmap_mask, seg);
+ }
+ if (rv < (int)hdr_len) {
+ /* Not even complete hdr pushed or negative rv */
+ wqe->processed -= data_len;
+ if (rv >= 0) {
+ c_tx->ctrl_sent += rv;
+ rv = -EAGAIN;
+ }
+ goto done_crc;
+ }
+ rv -= hdr_len;
+
+ if (rv >= (int)data_len) {
+ /* all user data pushed to TCP or no data to push */
+ if (data_len > 0 && wqe->processed < wqe->bytes) {
+ /* Save the current state for next tx */
+ c_tx->sge_idx = sge_idx;
+ c_tx->sge_off = sge_off;
+ c_tx->pbl_idx = pbl_idx;
+ }
+ rv -= data_len;
+
+ if (rv == trl_len) /* all pushed */
+ rv = 0;
+ else {
+ c_tx->state = SIW_SEND_TRAILER;
+ c_tx->ctrl_len = MAX_TRAILER;
+ c_tx->ctrl_sent = rv + 4 - c_tx->pad;
+ c_tx->bytes_unsent = 0;
+ rv = -EAGAIN;
+ }
+
+ } else if (data_len > 0) {
+ /* Maybe some user data pushed to TCP */
+ c_tx->state = SIW_SEND_DATA;
+ wqe->processed -= data_len - rv;
+
+ if (rv) {
+ /*
+ * Some bytes out. Recompute tx state based
+ * on old state and bytes pushed
+ */
+ unsigned int sge_unsent;
+
+ c_tx->bytes_unsent -= rv;
+ sge = &wqe->sqe.sge[c_tx->sge_idx];
+ sge_unsent = sge->length - c_tx->sge_off;
+
+ while (sge_unsent <= rv) {
+ rv -= sge_unsent;
+ c_tx->sge_idx++;
+ c_tx->sge_off = 0;
+ sge++;
+ sge_unsent = sge->length;
+ }
+ c_tx->sge_off += rv;
+ }
+ rv = -EAGAIN;
+ }
+done_crc:
+ c_tx->do_crc = 0;
+done:
+ return rv;
+}
+
+static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx,
+ struct socket *s)
+{
+ struct tcp_sock *tp = tcp_sk(s->sk);
+
+ if (tp->gso_segs) {
+ if (c_tx->gso_seg_limit == 0)
+ c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs;
+ else
+ c_tx->tcp_seglen =
+ tp->mss_cache *
+ min_t(u16, c_tx->gso_seg_limit, tp->gso_segs);
+ } else {
+ c_tx->tcp_seglen = tp->mss_cache;
+ }
+ /* Loopback may give odd numbers */
+ c_tx->tcp_seglen &= 0xfffffff8;
+}
+
+/*
+ * siw_prepare_fpdu()
+ *
+ * Prepares transmit context to send out one FPDU if FPDU will contain
+ * user data and user data are not immediate data.
+ * Computes maximum FPDU length to fill up TCP MSS if possible.
+ *
+ * @qp: QP from which to transmit
+ * @wqe: Current WQE causing transmission
+ *
+ * TODO: Take into account real available sendspace on socket
+ * to avoid header misalignment due to send pausing within
+ * fpdu transmission
+ */
+static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+ struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+ int data_len;
+
+ c_tx->ctrl_len =
+ iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len;
+ c_tx->ctrl_sent = 0;
+
+ /*
+ * Update target buffer offset if any
+ */
+ if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
+ /* Untagged message */
+ c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed);
+ else /* Tagged message */
+ c_tx->pkt.c_tagged.ddp_to =
+ cpu_to_be64(wqe->sqe.raddr + wqe->processed);
+
+ data_len = wqe->bytes - wqe->processed;
+ if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) {
+ /* Trim DDP payload to fit into current TCP segment */
+ data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE);
+ c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST;
+ c_tx->pad = 0;
+ } else {
+ c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST;
+ c_tx->pad = -data_len & 0x3;
+ }
+ c_tx->bytes_unsent = data_len;
+
+ c_tx->pkt.ctrl.mpa_len =
+ htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE);
+
+ /*
+ * Init MPA CRC computation
+ */
+ if (c_tx->mpa_crc_hd) {
+ crypto_shash_init(c_tx->mpa_crc_hd);
+ crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt,
+ c_tx->ctrl_len);
+ c_tx->do_crc = 1;
+ }
+}
+
+/*
+ * siw_check_sgl_tx()
+ *
+ * Check permissions for a list of SGE's (SGL).
+ * A successful check will have all memory referenced
+ * for transmission resolved and assigned to the WQE.
+ *
+ * @pd: Protection Domain SGL should belong to
+ * @wqe: WQE to be checked
+ * @perms: requested access permissions
+ *
+ */
+
+static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe,
+ enum ib_access_flags perms)
+{
+ struct siw_sge *sge = &wqe->sqe.sge[0];
+ int i, len, num_sge = wqe->sqe.num_sge;
+
+ if (unlikely(num_sge > SIW_MAX_SGE))
+ return -EINVAL;
+
+ for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) {
+ /*
+ * rdma verbs: do not check stag for a zero length sge
+ */
+ if (sge->length) {
+ int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0,
+ sge->length);
+
+ if (unlikely(rv != E_ACCESS_OK))
+ return rv;
+ }
+ len += sge->length;
+ }
+ return len;
+}
+
+/*
+ * siw_qp_sq_proc_tx()
+ *
+ * Process one WQE which needs transmission on the wire.
+ */
+static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+ struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+ struct socket *s = qp->attrs.sk;
+ int rv = 0, burst_len = qp->tx_ctx.burst;
+ enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM;
+
+ if (unlikely(wqe->wr_status == SIW_WR_IDLE))
+ return 0;
+
+ if (!burst_len)
+ burst_len = SQ_USER_MAXBURST;
+
+ if (wqe->wr_status == SIW_WR_QUEUED) {
+ if (!(wqe->sqe.flags & SIW_WQE_INLINE)) {
+ if (tx_type(wqe) == SIW_OP_READ_RESPONSE)
+ wqe->sqe.num_sge = 1;
+
+ if (tx_type(wqe) != SIW_OP_READ &&
+ tx_type(wqe) != SIW_OP_READ_LOCAL_INV) {
+ /*
+ * Reference memory to be tx'd w/o checking
+ * access for LOCAL_READ permission, since
+ * not defined in RDMA core.
+ */
+ rv = siw_check_sgl_tx(qp->pd, wqe, 0);
+ if (rv < 0) {
+ if (tx_type(wqe) ==
+ SIW_OP_READ_RESPONSE)
+ ecode = siw_rdmap_error(-rv);
+ rv = -EINVAL;
+ goto tx_error;
+ }
+ wqe->bytes = rv;
+ } else {
+ wqe->bytes = 0;
+ }
+ } else {
+ wqe->bytes = wqe->sqe.sge[0].length;
+ if (!rdma_is_kernel_res(&qp->base_qp.res)) {
+ if (wqe->bytes > SIW_MAX_INLINE) {
+ rv = -EINVAL;
+ goto tx_error;
+ }
+ wqe->sqe.sge[0].laddr =
+ (u64)(uintptr_t)&wqe->sqe.sge[1];
+ }
+ }
+ wqe->wr_status = SIW_WR_INPROGRESS;
+ wqe->processed = 0;
+
+ siw_update_tcpseg(c_tx, s);
+
+ rv = siw_qp_prepare_tx(c_tx);
+ if (rv == PKT_FRAGMENTED) {
+ c_tx->state = SIW_SEND_HDR;
+ siw_prepare_fpdu(qp, wqe);
+ } else if (rv == PKT_COMPLETE) {
+ c_tx->state = SIW_SEND_SHORT_FPDU;
+ } else {
+ goto tx_error;
+ }
+ }
+
+next_segment:
+ siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n",
+ tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed,
+ wqe->sqe.id);
+
+ if (--burst_len == 0) {
+ rv = -EINPROGRESS;
+ goto tx_done;
+ }
+ if (c_tx->state == SIW_SEND_SHORT_FPDU) {
+ enum siw_opcode tx_type = tx_type(wqe);
+ unsigned int msg_flags;
+
+ if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1)
+ /*
+ * End current TCP segment, if SQ runs empty,
+ * or siw_tcp_nagle is not set, or we bail out
+ * soon due to no burst credit left.
+ */
+ msg_flags = MSG_DONTWAIT;
+ else
+ msg_flags = MSG_DONTWAIT | MSG_MORE;
+
+ rv = siw_tx_ctrl(c_tx, s, msg_flags);
+
+ if (!rv && tx_type != SIW_OP_READ &&
+ tx_type != SIW_OP_READ_LOCAL_INV)
+ wqe->processed = wqe->bytes;
+
+ goto tx_done;
+
+ } else {
+ rv = siw_tx_hdt(c_tx, s);
+ }
+ if (!rv) {
+ /*
+ * One segment sent. Processing completed if last
+ * segment, Do next segment otherwise.
+ */
+ if (unlikely(c_tx->tx_suspend)) {
+ /*
+ * Verbs, 6.4.: Try stopping sending after a full
+ * DDP segment if the connection goes down
+ * (== peer halfclose)
+ */
+ rv = -ECONNABORTED;
+ goto tx_done;
+ }
+ if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) {
+ siw_dbg_qp(qp, "WQE completed\n");
+ goto tx_done;
+ }
+ c_tx->state = SIW_SEND_HDR;
+
+ siw_update_tcpseg(c_tx, s);
+
+ siw_prepare_fpdu(qp, wqe);
+ goto next_segment;
+ }
+tx_done:
+ qp->tx_ctx.burst = burst_len;
+ return rv;
+
+tx_error:
+ if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM)
+ siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1);
+ else
+ siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_CATASTROPHIC,
+ RDMAP_ECODE_UNSPECIFIED, 1);
+ return rv;
+}
+
+static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe)
+{
+ struct ib_mr *base_mr = (struct ib_mr *)(uintptr_t)sqe->base_mr;
+ struct siw_device *sdev = to_siw_dev(pd->device);
+ struct siw_mem *mem;
+ int rv = 0;
+
+ siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey);
+
+ if (unlikely(!base_mr)) {
+ pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
+ return -EINVAL;
+ }
+
+ if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) {
+ pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey);
+ return -EINVAL;
+ }
+
+ mem = siw_mem_id2obj(sdev, sqe->rkey >> 8);
+ if (unlikely(!mem)) {
+ pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
+ return -EINVAL;
+ }
+
+ if (unlikely(mem->pd != pd)) {
+ pr_warn("siw: fastreg: PD mismatch\n");
+ rv = -EINVAL;
+ goto out;
+ }
+ if (unlikely(mem->stag_valid)) {
+ pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey);
+ rv = -EINVAL;
+ goto out;
+ }
+ /* Refresh STag since user may have changed key part */
+ mem->stag = sqe->rkey;
+ mem->perms = sqe->access;
+
+ siw_dbg_mem(mem, "STag 0x%08x now valid\n", sqe->rkey);
+ mem->va = base_mr->iova;
+ mem->stag_valid = 1;
+out:
+ siw_mem_put(mem);
+ return rv;
+}
+
+static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+ int rv;
+
+ switch (tx_type(wqe)) {
+ case SIW_OP_REG_MR:
+ rv = siw_fastreg_mr(qp->pd, &wqe->sqe);
+ break;
+
+ case SIW_OP_INVAL_STAG:
+ rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey);
+ break;
+
+ default:
+ rv = -EINVAL;
+ }
+ return rv;
+}
+
+/*
+ * siw_qp_sq_process()
+ *
+ * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
+ * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
+ * MPA FPDUs, each containing a DDP segment.
+ *
+ * SQ processing may occur in user context as a result of posting
+ * new WQE's or from siw_sq_work_handler() context. Processing in
+ * user context is limited to non-kernel verbs users.
+ *
+ * SQ processing may get paused anytime, possibly in the middle of a WR
+ * or FPDU, if insufficient send space is available. SQ processing
+ * gets resumed from siw_sq_work_handler(), if send space becomes
+ * available again.
+ *
+ * Must be called with the QP state read-locked.
+ *
+ * Note:
+ * An outbound RREQ can be satisfied by the corresponding RRESP
+ * _before_ it gets assigned to the ORQ. This happens regularly
+ * in RDMA READ via loopback case. Since both outbound RREQ and
+ * inbound RRESP can be handled by the same CPU, locking the ORQ
+ * is dead-lock prone and thus not an option. With that, the
+ * RREQ gets assigned to the ORQ _before_ being sent - see
+ * siw_activate_tx() - and pulled back in case of send failure.
+ */
+int siw_qp_sq_process(struct siw_qp *qp)
+{
+ struct siw_wqe *wqe = tx_wqe(qp);
+ enum siw_opcode tx_type;
+ unsigned long flags;
+ int rv = 0;
+
+ siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe));
+
+next_wqe:
+ /*
+ * Stop QP processing if SQ state changed
+ */
+ if (unlikely(qp->tx_ctx.tx_suspend)) {
+ siw_dbg_qp(qp, "tx suspended\n");
+ goto done;
+ }
+ tx_type = tx_type(wqe);
+
+ if (tx_type <= SIW_OP_READ_RESPONSE)
+ rv = siw_qp_sq_proc_tx(qp, wqe);
+ else
+ rv = siw_qp_sq_proc_local(qp, wqe);
+
+ if (!rv) {
+ /*
+ * WQE processing done
+ */
+ switch (tx_type) {
+ case SIW_OP_SEND:
+ case SIW_OP_SEND_REMOTE_INV:
+ case SIW_OP_WRITE:
+ siw_wqe_put_mem(wqe, tx_type);
+ fallthrough;
+
+ case SIW_OP_INVAL_STAG:
+ case SIW_OP_REG_MR:
+ if (tx_flags(wqe) & SIW_WQE_SIGNALLED)
+ siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+ SIW_WC_SUCCESS);
+ break;
+
+ case SIW_OP_READ:
+ case SIW_OP_READ_LOCAL_INV:
+ /*
+ * already enqueued to ORQ queue
+ */
+ break;
+
+ case SIW_OP_READ_RESPONSE:
+ siw_wqe_put_mem(wqe, tx_type);
+ break;
+
+ default:
+ WARN(1, "undefined WQE type %d\n", tx_type);
+ rv = -EINVAL;
+ goto done;
+ }
+
+ spin_lock_irqsave(&qp->sq_lock, flags);
+ wqe->wr_status = SIW_WR_IDLE;
+ rv = siw_activate_tx(qp);
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+ if (rv <= 0)
+ goto done;
+
+ goto next_wqe;
+
+ } else if (rv == -EAGAIN) {
+ siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n",
+ qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
+ qp->tx_ctx.bytes_unsent);
+ rv = 0;
+ goto done;
+ } else if (rv == -EINPROGRESS) {
+ rv = siw_sq_start(qp);
+ goto done;
+ } else {
+ /*
+ * WQE processing failed.
+ * Verbs 8.3.2:
+ * o It turns any WQE into a signalled WQE.
+ * o Local catastrophic error must be surfaced
+ * o QP must be moved into Terminate state: done by code
+ * doing socket state change processing
+ *
+ * o TODO: Termination message must be sent.
+ * o TODO: Implement more precise work completion errors,
+ * see enum ib_wc_status in ib_verbs.h
+ */
+ siw_dbg_qp(qp, "wqe type %d processing failed: %d\n",
+ tx_type(wqe), rv);
+
+ spin_lock_irqsave(&qp->sq_lock, flags);
+ /*
+ * RREQ may have already been completed by inbound RRESP!
+ */
+ if ((tx_type == SIW_OP_READ ||
+ tx_type == SIW_OP_READ_LOCAL_INV) && qp->attrs.orq_size) {
+ /* Cleanup pending entry in ORQ */
+ qp->orq_put--;
+ qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0;
+ }
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+ /*
+ * immediately suspends further TX processing
+ */
+ if (!qp->tx_ctx.tx_suspend)
+ siw_qp_cm_drop(qp, 0);
+
+ switch (tx_type) {
+ case SIW_OP_SEND:
+ case SIW_OP_SEND_REMOTE_INV:
+ case SIW_OP_SEND_WITH_IMM:
+ case SIW_OP_WRITE:
+ case SIW_OP_READ:
+ case SIW_OP_READ_LOCAL_INV:
+ siw_wqe_put_mem(wqe, tx_type);
+ fallthrough;
+
+ case SIW_OP_INVAL_STAG:
+ case SIW_OP_REG_MR:
+ siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+ SIW_WC_LOC_QP_OP_ERR);
+
+ siw_qp_event(qp, IB_EVENT_QP_FATAL);
+
+ break;
+
+ case SIW_OP_READ_RESPONSE:
+ siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv);
+
+ siw_qp_event(qp, IB_EVENT_QP_REQ_ERR);
+
+ siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE);
+
+ break;
+
+ default:
+ WARN(1, "undefined WQE type %d\n", tx_type);
+ rv = -EINVAL;
+ }
+ wqe->wr_status = SIW_WR_IDLE;
+ }
+done:
+ return rv;
+}
+
+static void siw_sq_resume(struct siw_qp *qp)
+{
+ if (down_read_trylock(&qp->state_lock)) {
+ if (likely(qp->attrs.state == SIW_QP_STATE_RTS &&
+ !qp->tx_ctx.tx_suspend)) {
+ int rv = siw_qp_sq_process(qp);
+
+ up_read(&qp->state_lock);
+
+ if (unlikely(rv < 0)) {
+ siw_dbg_qp(qp, "SQ task failed: err %d\n", rv);
+
+ if (!qp->tx_ctx.tx_suspend)
+ siw_qp_cm_drop(qp, 0);
+ }
+ } else {
+ up_read(&qp->state_lock);
+ }
+ } else {
+ siw_dbg_qp(qp, "Resume SQ while QP locked\n");
+ }
+ siw_qp_put(qp);
+}
+
+struct tx_task_t {
+ struct llist_head active;
+ wait_queue_head_t waiting;
+};
+
+static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g);
+
+int siw_create_tx_threads(void)
+{
+ int cpu, assigned = 0;
+
+ for_each_online_cpu(cpu) {
+ struct tx_task_t *tx_task;
+
+ /* Skip HT cores */
+ if (cpu % cpumask_weight(topology_sibling_cpumask(cpu)))
+ continue;
+
+ tx_task = &per_cpu(siw_tx_task_g, cpu);
+ init_llist_head(&tx_task->active);
+ init_waitqueue_head(&tx_task->waiting);
+
+ siw_tx_thread[cpu] =
+ kthread_run_on_cpu(siw_run_sq,
+ (unsigned long *)(long)cpu,
+ cpu, "siw_tx/%u");
+ if (IS_ERR(siw_tx_thread[cpu])) {
+ siw_tx_thread[cpu] = NULL;
+ continue;
+ }
+ assigned++;
+ }
+ return assigned;
+}
+
+void siw_stop_tx_threads(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (siw_tx_thread[cpu]) {
+ kthread_stop(siw_tx_thread[cpu]);
+ wake_up(&per_cpu(siw_tx_task_g, cpu).waiting);
+ siw_tx_thread[cpu] = NULL;
+ }
+ }
+}
+
+int siw_run_sq(void *data)
+{
+ const int nr_cpu = (unsigned int)(long)data;
+ struct llist_node *active;
+ struct siw_qp *qp;
+ struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu);
+
+ while (1) {
+ struct llist_node *fifo_list = NULL;
+
+ wait_event_interruptible(tx_task->waiting,
+ !llist_empty(&tx_task->active) ||
+ kthread_should_stop());
+
+ if (kthread_should_stop())
+ break;
+
+ active = llist_del_all(&tx_task->active);
+ /*
+ * llist_del_all returns a list with newest entry first.
+ * Re-order list for fairness among QP's.
+ */
+ fifo_list = llist_reverse_order(active);
+ while (fifo_list) {
+ qp = container_of(fifo_list, struct siw_qp, tx_list);
+ fifo_list = llist_next(fifo_list);
+ qp->tx_list.next = NULL;
+
+ siw_sq_resume(qp);
+ }
+ }
+ active = llist_del_all(&tx_task->active);
+ if (active) {
+ llist_for_each_entry(qp, active, tx_list) {
+ qp->tx_list.next = NULL;
+ siw_sq_resume(qp);
+ }
+ }
+ return 0;
+}
+
+int siw_sq_start(struct siw_qp *qp)
+{
+ if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
+ return 0;
+
+ if (unlikely(!cpu_online(qp->tx_cpu))) {
+ siw_put_tx_cpu(qp->tx_cpu);
+ qp->tx_cpu = siw_get_tx_cpu(qp->sdev);
+ if (qp->tx_cpu < 0) {
+ pr_warn("siw: no tx cpu available\n");
+
+ return -EIO;
+ }
+ }
+ siw_qp_get(qp);
+
+ llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active);
+
+ wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting);
+
+ return 0;
+}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
new file mode 100644
index 0000000000..fdbef3254e
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -0,0 +1,1889 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/xarray.h>
+#include <net/addrconf.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = SIW_QP_STATE_IDLE,
+ [IB_QPS_INIT] = SIW_QP_STATE_IDLE,
+ [IB_QPS_RTR] = SIW_QP_STATE_RTR,
+ [IB_QPS_RTS] = SIW_QP_STATE_RTS,
+ [IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
+ [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
+ [IB_QPS_ERR] = SIW_QP_STATE_ERROR
+};
+
+static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
+ [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
+ [IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE",
+ [IB_QPS_ERR] = "ERR"
+};
+
+void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
+{
+ struct siw_user_mmap_entry *entry = to_siw_mmap_entry(rdma_entry);
+
+ kfree(entry);
+}
+
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+ struct siw_ucontext *uctx = to_siw_ctx(ctx);
+ size_t size = vma->vm_end - vma->vm_start;
+ struct rdma_user_mmap_entry *rdma_entry;
+ struct siw_user_mmap_entry *entry;
+ int rv = -EINVAL;
+
+ /*
+ * Must be page aligned
+ */
+ if (vma->vm_start & (PAGE_SIZE - 1)) {
+ pr_warn("siw: mmap not page aligned\n");
+ return -EINVAL;
+ }
+ rdma_entry = rdma_user_mmap_entry_get(&uctx->base_ucontext, vma);
+ if (!rdma_entry) {
+ siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %#zx\n",
+ vma->vm_pgoff, size);
+ return -EINVAL;
+ }
+ entry = to_siw_mmap_entry(rdma_entry);
+
+ rv = remap_vmalloc_range(vma, entry->address, 0);
+ if (rv) {
+ pr_warn("remap_vmalloc_range failed: %lu, %zu\n", vma->vm_pgoff,
+ size);
+ goto out;
+ }
+out:
+ rdma_user_mmap_entry_put(rdma_entry);
+
+ return rv;
+}
+
+int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
+{
+ struct siw_device *sdev = to_siw_dev(base_ctx->device);
+ struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
+ struct siw_uresp_alloc_ctx uresp = {};
+ int rv;
+
+ if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ ctx->sdev = sdev;
+
+ uresp.dev_id = sdev->vendor_part_id;
+
+ if (udata->outlen < sizeof(uresp)) {
+ rv = -EINVAL;
+ goto err_out;
+ }
+ rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+ if (rv)
+ goto err_out;
+
+ siw_dbg(base_ctx->device, "success. now %d context(s)\n",
+ atomic_read(&sdev->num_ctx));
+
+ return 0;
+
+err_out:
+ atomic_dec(&sdev->num_ctx);
+ siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
+ atomic_read(&sdev->num_ctx));
+
+ return rv;
+}
+
+void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
+{
+ struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
+
+ atomic_dec(&uctx->sdev->num_ctx);
+}
+
+int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
+ struct ib_udata *udata)
+{
+ struct siw_device *sdev = to_siw_dev(base_dev);
+
+ if (udata->inlen || udata->outlen)
+ return -EINVAL;
+
+ memset(attr, 0, sizeof(*attr));
+
+ /* Revisit atomic caps if RFC 7306 gets supported */
+ attr->atomic_cap = 0;
+ attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
+ attr->kernel_cap_flags = IBK_ALLOW_USER_UNREG;
+ attr->max_cq = sdev->attrs.max_cq;
+ attr->max_cqe = sdev->attrs.max_cqe;
+ attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
+ attr->max_mr = sdev->attrs.max_mr;
+ attr->max_mw = sdev->attrs.max_mw;
+ attr->max_mr_size = ~0ull;
+ attr->max_pd = sdev->attrs.max_pd;
+ attr->max_qp = sdev->attrs.max_qp;
+ attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
+ attr->max_qp_rd_atom = sdev->attrs.max_ord;
+ attr->max_qp_wr = sdev->attrs.max_qp_wr;
+ attr->max_recv_sge = sdev->attrs.max_sge;
+ attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
+ attr->max_send_sge = sdev->attrs.max_sge;
+ attr->max_sge_rd = sdev->attrs.max_sge_rd;
+ attr->max_srq = sdev->attrs.max_srq;
+ attr->max_srq_sge = sdev->attrs.max_srq_sge;
+ attr->max_srq_wr = sdev->attrs.max_srq_wr;
+ attr->page_size_cap = PAGE_SIZE;
+ attr->vendor_id = SIW_VENDOR_ID;
+ attr->vendor_part_id = sdev->vendor_part_id;
+
+ addrconf_addr_eui48((u8 *)&attr->sys_image_guid,
+ sdev->raw_gid);
+
+ return 0;
+}
+
+int siw_query_port(struct ib_device *base_dev, u32 port,
+ struct ib_port_attr *attr)
+{
+ struct siw_device *sdev = to_siw_dev(base_dev);
+ int rv;
+
+ memset(attr, 0, sizeof(*attr));
+
+ rv = ib_get_eth_speed(base_dev, port, &attr->active_speed,
+ &attr->active_width);
+ attr->gid_tbl_len = 1;
+ attr->max_msg_sz = -1;
+ attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+ attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+ attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
+ IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
+ attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
+ attr->state = sdev->state;
+ /*
+ * All zero
+ *
+ * attr->lid = 0;
+ * attr->bad_pkey_cntr = 0;
+ * attr->qkey_viol_cntr = 0;
+ * attr->sm_lid = 0;
+ * attr->lmc = 0;
+ * attr->max_vl_num = 0;
+ * attr->sm_sl = 0;
+ * attr->subnet_timeout = 0;
+ * attr->init_type_repy = 0;
+ */
+ return rv;
+}
+
+int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
+ struct ib_port_immutable *port_immutable)
+{
+ struct ib_port_attr attr;
+ int rv = siw_query_port(base_dev, port, &attr);
+
+ if (rv)
+ return rv;
+
+ port_immutable->gid_tbl_len = attr.gid_tbl_len;
+ port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+
+ return 0;
+}
+
+int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
+ union ib_gid *gid)
+{
+ struct siw_device *sdev = to_siw_dev(base_dev);
+
+ /* subnet_prefix == interface_id == 0; */
+ memset(gid, 0, sizeof(*gid));
+ memcpy(gid->raw, sdev->raw_gid, ETH_ALEN);
+
+ return 0;
+}
+
+int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+ struct siw_device *sdev = to_siw_dev(pd->device);
+
+ if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
+ atomic_dec(&sdev->num_pd);
+ return -ENOMEM;
+ }
+ siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd));
+
+ return 0;
+}
+
+int siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+ struct siw_device *sdev = to_siw_dev(pd->device);
+
+ siw_dbg_pd(pd, "free PD\n");
+ atomic_dec(&sdev->num_pd);
+ return 0;
+}
+
+void siw_qp_get_ref(struct ib_qp *base_qp)
+{
+ siw_qp_get(to_siw_qp(base_qp));
+}
+
+void siw_qp_put_ref(struct ib_qp *base_qp)
+{
+ siw_qp_put(to_siw_qp(base_qp));
+}
+
+static struct rdma_user_mmap_entry *
+siw_mmap_entry_insert(struct siw_ucontext *uctx,
+ void *address, size_t length,
+ u64 *offset)
+{
+ struct siw_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ int rv;
+
+ *offset = SIW_INVAL_UOBJ_KEY;
+ if (!entry)
+ return NULL;
+
+ entry->address = address;
+
+ rv = rdma_user_mmap_entry_insert(&uctx->base_ucontext,
+ &entry->rdma_entry,
+ length);
+ if (rv) {
+ kfree(entry);
+ return NULL;
+ }
+
+ *offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
+
+ return &entry->rdma_entry;
+}
+
+/*
+ * siw_create_qp()
+ *
+ * Create QP of requested size on given device.
+ *
+ * @qp: Queue pait
+ * @attrs: Initial QP attributes.
+ * @udata: used to provide QP ID, SQ and RQ size back to user.
+ */
+
+int siw_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs,
+ struct ib_udata *udata)
+{
+ struct ib_pd *pd = ibqp->pd;
+ struct siw_qp *qp = to_siw_qp(ibqp);
+ struct ib_device *base_dev = pd->device;
+ struct siw_device *sdev = to_siw_dev(base_dev);
+ struct siw_ucontext *uctx =
+ rdma_udata_to_drv_context(udata, struct siw_ucontext,
+ base_ucontext);
+ unsigned long flags;
+ int num_sqe, num_rqe, rv = 0;
+ size_t length;
+
+ siw_dbg(base_dev, "create new QP\n");
+
+ if (attrs->create_flags)
+ return -EOPNOTSUPP;
+
+ if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
+ siw_dbg(base_dev, "too many QP's\n");
+ rv = -ENOMEM;
+ goto err_atomic;
+ }
+ if (attrs->qp_type != IB_QPT_RC) {
+ siw_dbg(base_dev, "only RC QP's supported\n");
+ rv = -EOPNOTSUPP;
+ goto err_atomic;
+ }
+ if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
+ (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
+ (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
+ (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
+ siw_dbg(base_dev, "QP size error\n");
+ rv = -EINVAL;
+ goto err_atomic;
+ }
+ if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
+ siw_dbg(base_dev, "max inline send: %d > %d\n",
+ attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
+ rv = -EINVAL;
+ goto err_atomic;
+ }
+ /*
+ * NOTE: we allow for zero element SQ and RQ WQE's SGL's
+ * but not for a QP unable to hold any WQE (SQ + RQ)
+ */
+ if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
+ siw_dbg(base_dev, "QP must have send or receive queue\n");
+ rv = -EINVAL;
+ goto err_atomic;
+ }
+
+ if (!attrs->send_cq || (!attrs->recv_cq && !attrs->srq)) {
+ siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
+ rv = -EINVAL;
+ goto err_atomic;
+ }
+
+ init_rwsem(&qp->state_lock);
+ spin_lock_init(&qp->sq_lock);
+ spin_lock_init(&qp->rq_lock);
+ spin_lock_init(&qp->orq_lock);
+
+ rv = siw_qp_add(sdev, qp);
+ if (rv)
+ goto err_atomic;
+
+ num_sqe = attrs->cap.max_send_wr;
+ num_rqe = attrs->cap.max_recv_wr;
+
+ /* All queue indices are derived from modulo operations
+ * on a free running 'get' (consumer) and 'put' (producer)
+ * unsigned counter. Having queue sizes at power of two
+ * avoids handling counter wrap around.
+ */
+ if (num_sqe)
+ num_sqe = roundup_pow_of_two(num_sqe);
+ else {
+ /* Zero sized SQ is not supported */
+ rv = -EINVAL;
+ goto err_out_xa;
+ }
+ if (num_rqe)
+ num_rqe = roundup_pow_of_two(num_rqe);
+
+ if (udata)
+ qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
+ else
+ qp->sendq = vcalloc(num_sqe, sizeof(struct siw_sqe));
+
+ if (qp->sendq == NULL) {
+ rv = -ENOMEM;
+ goto err_out_xa;
+ }
+ if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
+ if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
+ qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
+ else {
+ rv = -EINVAL;
+ goto err_out_xa;
+ }
+ }
+ qp->pd = pd;
+ qp->scq = to_siw_cq(attrs->send_cq);
+ qp->rcq = to_siw_cq(attrs->recv_cq);
+
+ if (attrs->srq) {
+ /*
+ * SRQ support.
+ * Verbs 6.3.7: ignore RQ size, if SRQ present
+ * Verbs 6.3.5: do not check PD of SRQ against PD of QP
+ */
+ qp->srq = to_siw_srq(attrs->srq);
+ qp->attrs.rq_size = 0;
+ siw_dbg(base_dev, "QP [%u]: SRQ attached\n",
+ qp->base_qp.qp_num);
+ } else if (num_rqe) {
+ if (udata)
+ qp->recvq =
+ vmalloc_user(num_rqe * sizeof(struct siw_rqe));
+ else
+ qp->recvq = vcalloc(num_rqe, sizeof(struct siw_rqe));
+
+ if (qp->recvq == NULL) {
+ rv = -ENOMEM;
+ goto err_out_xa;
+ }
+ qp->attrs.rq_size = num_rqe;
+ }
+ qp->attrs.sq_size = num_sqe;
+ qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
+ qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
+
+ /* Make those two tunables fixed for now. */
+ qp->tx_ctx.gso_seg_limit = 1;
+ qp->tx_ctx.zcopy_tx = zcopy_tx;
+
+ qp->attrs.state = SIW_QP_STATE_IDLE;
+
+ if (udata) {
+ struct siw_uresp_create_qp uresp = {};
+
+ uresp.num_sqe = num_sqe;
+ uresp.num_rqe = num_rqe;
+ uresp.qp_id = qp_id(qp);
+
+ if (qp->sendq) {
+ length = num_sqe * sizeof(struct siw_sqe);
+ qp->sq_entry =
+ siw_mmap_entry_insert(uctx, qp->sendq,
+ length, &uresp.sq_key);
+ if (!qp->sq_entry) {
+ rv = -ENOMEM;
+ goto err_out_xa;
+ }
+ }
+
+ if (qp->recvq) {
+ length = num_rqe * sizeof(struct siw_rqe);
+ qp->rq_entry =
+ siw_mmap_entry_insert(uctx, qp->recvq,
+ length, &uresp.rq_key);
+ if (!qp->rq_entry) {
+ uresp.sq_key = SIW_INVAL_UOBJ_KEY;
+ rv = -ENOMEM;
+ goto err_out_xa;
+ }
+ }
+
+ if (udata->outlen < sizeof(uresp)) {
+ rv = -EINVAL;
+ goto err_out_xa;
+ }
+ rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+ if (rv)
+ goto err_out_xa;
+ }
+ qp->tx_cpu = siw_get_tx_cpu(sdev);
+ if (qp->tx_cpu < 0) {
+ rv = -EINVAL;
+ goto err_out_xa;
+ }
+ INIT_LIST_HEAD(&qp->devq);
+ spin_lock_irqsave(&sdev->lock, flags);
+ list_add_tail(&qp->devq, &sdev->qp_list);
+ spin_unlock_irqrestore(&sdev->lock, flags);
+
+ init_completion(&qp->qp_free);
+
+ return 0;
+
+err_out_xa:
+ xa_erase(&sdev->qp_xa, qp_id(qp));
+ if (uctx) {
+ rdma_user_mmap_entry_remove(qp->sq_entry);
+ rdma_user_mmap_entry_remove(qp->rq_entry);
+ }
+ vfree(qp->sendq);
+ vfree(qp->recvq);
+
+err_atomic:
+ atomic_dec(&sdev->num_qp);
+ return rv;
+}
+
+/*
+ * Minimum siw_query_qp() verb interface.
+ *
+ * @qp_attr_mask is not used but all available information is provided
+ */
+int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+ struct siw_qp *qp;
+ struct siw_device *sdev;
+
+ if (base_qp && qp_attr && qp_init_attr) {
+ qp = to_siw_qp(base_qp);
+ sdev = to_siw_dev(base_qp->device);
+ } else {
+ return -EINVAL;
+ }
+ qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
+ qp_attr->cap.max_send_wr = qp->attrs.sq_size;
+ qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
+ qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
+ qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
+ qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+ qp_attr->max_rd_atomic = qp->attrs.irq_size;
+ qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
+
+ qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ;
+
+ qp_init_attr->qp_type = base_qp->qp_type;
+ qp_init_attr->send_cq = base_qp->send_cq;
+ qp_init_attr->recv_cq = base_qp->recv_cq;
+ qp_init_attr->srq = base_qp->srq;
+
+ qp_init_attr->cap = qp_attr->cap;
+
+ return 0;
+}
+
+int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct siw_qp_attrs new_attrs;
+ enum siw_qp_attr_mask siw_attr_mask = 0;
+ struct siw_qp *qp = to_siw_qp(base_qp);
+ int rv = 0;
+
+ if (!attr_mask)
+ return 0;
+
+ if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
+ return -EOPNOTSUPP;
+
+ memset(&new_attrs, 0, sizeof(new_attrs));
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS) {
+ siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;
+
+ if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
+ new_attrs.flags |= SIW_RDMA_READ_ENABLED;
+ if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
+ new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+ if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
+ new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
+ }
+ if (attr_mask & IB_QP_STATE) {
+ siw_dbg_qp(qp, "desired IB QP state: %s\n",
+ ib_qp_state_to_string[attr->qp_state]);
+
+ new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
+
+ if (new_attrs.state > SIW_QP_STATE_RTS)
+ qp->tx_ctx.tx_suspend = 1;
+
+ siw_attr_mask |= SIW_QP_ATTR_STATE;
+ }
+ if (!siw_attr_mask)
+ goto out;
+
+ down_write(&qp->state_lock);
+
+ rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
+
+ up_write(&qp->state_lock);
+out:
+ return rv;
+}
+
+int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
+{
+ struct siw_qp *qp = to_siw_qp(base_qp);
+ struct siw_ucontext *uctx =
+ rdma_udata_to_drv_context(udata, struct siw_ucontext,
+ base_ucontext);
+ struct siw_qp_attrs qp_attrs;
+
+ siw_dbg_qp(qp, "state %d\n", qp->attrs.state);
+
+ /*
+ * Mark QP as in process of destruction to prevent from
+ * any async callbacks to RDMA core
+ */
+ qp->attrs.flags |= SIW_QP_IN_DESTROY;
+ qp->rx_stream.rx_suspend = 1;
+
+ if (uctx) {
+ rdma_user_mmap_entry_remove(qp->sq_entry);
+ rdma_user_mmap_entry_remove(qp->rq_entry);
+ }
+
+ down_write(&qp->state_lock);
+
+ qp_attrs.state = SIW_QP_STATE_ERROR;
+ siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
+
+ if (qp->cep) {
+ siw_cep_put(qp->cep);
+ qp->cep = NULL;
+ }
+ up_write(&qp->state_lock);
+
+ kfree(qp->tx_ctx.mpa_crc_hd);
+ kfree(qp->rx_stream.mpa_crc_hd);
+
+ qp->scq = qp->rcq = NULL;
+
+ siw_qp_put(qp);
+ wait_for_completion(&qp->qp_free);
+
+ return 0;
+}
+
+/*
+ * siw_copy_inline_sgl()
+ *
+ * Prepare sgl of inlined data for sending. For userland callers
+ * function checks if given buffer addresses and len's are within
+ * process context bounds.
+ * Data from all provided sge's are copied together into the wqe,
+ * referenced by a single sge.
+ */
+static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
+ struct siw_sqe *sqe)
+{
+ struct ib_sge *core_sge = core_wr->sg_list;
+ void *kbuf = &sqe->sge[1];
+ int num_sge = core_wr->num_sge, bytes = 0;
+
+ sqe->sge[0].laddr = (uintptr_t)kbuf;
+ sqe->sge[0].lkey = 0;
+
+ while (num_sge--) {
+ if (!core_sge->length) {
+ core_sge++;
+ continue;
+ }
+ bytes += core_sge->length;
+ if (bytes > SIW_MAX_INLINE) {
+ bytes = -EINVAL;
+ break;
+ }
+ memcpy(kbuf, ib_virt_dma_to_ptr(core_sge->addr),
+ core_sge->length);
+
+ kbuf += core_sge->length;
+ core_sge++;
+ }
+ sqe->sge[0].length = max(bytes, 0);
+ sqe->num_sge = bytes > 0 ? 1 : 0;
+
+ return bytes;
+}
+
+/* Complete SQ WR's without processing */
+static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr)
+{
+ int rv = 0;
+
+ while (wr) {
+ struct siw_sqe sqe = {};
+
+ switch (wr->opcode) {
+ case IB_WR_RDMA_WRITE:
+ sqe.opcode = SIW_OP_WRITE;
+ break;
+ case IB_WR_RDMA_READ:
+ sqe.opcode = SIW_OP_READ;
+ break;
+ case IB_WR_RDMA_READ_WITH_INV:
+ sqe.opcode = SIW_OP_READ_LOCAL_INV;
+ break;
+ case IB_WR_SEND:
+ sqe.opcode = SIW_OP_SEND;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ sqe.opcode = SIW_OP_SEND_WITH_IMM;
+ break;
+ case IB_WR_SEND_WITH_INV:
+ sqe.opcode = SIW_OP_SEND_REMOTE_INV;
+ break;
+ case IB_WR_LOCAL_INV:
+ sqe.opcode = SIW_OP_INVAL_STAG;
+ break;
+ case IB_WR_REG_MR:
+ sqe.opcode = SIW_OP_REG_MR;
+ break;
+ default:
+ rv = -EINVAL;
+ break;
+ }
+ if (!rv) {
+ sqe.id = wr->wr_id;
+ rv = siw_sqe_complete(qp, &sqe, 0,
+ SIW_WC_WR_FLUSH_ERR);
+ }
+ if (rv) {
+ if (bad_wr)
+ *bad_wr = wr;
+ break;
+ }
+ wr = wr->next;
+ }
+ return rv;
+}
+
+/* Complete RQ WR's without processing */
+static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ struct siw_rqe rqe = {};
+ int rv = 0;
+
+ while (wr) {
+ rqe.id = wr->wr_id;
+ rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR);
+ if (rv) {
+ if (bad_wr)
+ *bad_wr = wr;
+ break;
+ }
+ wr = wr->next;
+ }
+ return rv;
+}
+
+/*
+ * siw_post_send()
+ *
+ * Post a list of S-WR's to a SQ.
+ *
+ * @base_qp: Base QP contained in siw QP
+ * @wr: Null terminated list of user WR's
+ * @bad_wr: Points to failing WR in case of synchronous failure.
+ */
+int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr)
+{
+ struct siw_qp *qp = to_siw_qp(base_qp);
+ struct siw_wqe *wqe = tx_wqe(qp);
+
+ unsigned long flags;
+ int rv = 0;
+
+ if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) {
+ siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
+ *bad_wr = wr;
+ return -EINVAL;
+ }
+
+ /*
+ * Try to acquire QP state lock. Must be non-blocking
+ * to accommodate kernel clients needs.
+ */
+ if (!down_read_trylock(&qp->state_lock)) {
+ if (qp->attrs.state == SIW_QP_STATE_ERROR) {
+ /*
+ * ERROR state is final, so we can be sure
+ * this state will not change as long as the QP
+ * exists.
+ *
+ * This handles an ib_drain_sq() call with
+ * a concurrent request to set the QP state
+ * to ERROR.
+ */
+ rv = siw_sq_flush_wr(qp, wr, bad_wr);
+ } else {
+ siw_dbg_qp(qp, "QP locked, state %d\n",
+ qp->attrs.state);
+ *bad_wr = wr;
+ rv = -ENOTCONN;
+ }
+ return rv;
+ }
+ if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
+ if (qp->attrs.state == SIW_QP_STATE_ERROR) {
+ /*
+ * Immediately flush this WR to CQ, if QP
+ * is in ERROR state. SQ is guaranteed to
+ * be empty, so WR complets in-order.
+ *
+ * Typically triggered by ib_drain_sq().
+ */
+ rv = siw_sq_flush_wr(qp, wr, bad_wr);
+ } else {
+ siw_dbg_qp(qp, "QP out of state %d\n",
+ qp->attrs.state);
+ *bad_wr = wr;
+ rv = -ENOTCONN;
+ }
+ up_read(&qp->state_lock);
+ return rv;
+ }
+ spin_lock_irqsave(&qp->sq_lock, flags);
+
+ while (wr) {
+ u32 idx = qp->sq_put % qp->attrs.sq_size;
+ struct siw_sqe *sqe = &qp->sendq[idx];
+
+ if (sqe->flags) {
+ siw_dbg_qp(qp, "sq full\n");
+ rv = -ENOMEM;
+ break;
+ }
+ if (wr->num_sge > qp->attrs.sq_max_sges) {
+ siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
+ rv = -EINVAL;
+ break;
+ }
+ sqe->id = wr->wr_id;
+
+ if ((wr->send_flags & IB_SEND_SIGNALED) ||
+ (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
+ sqe->flags |= SIW_WQE_SIGNALLED;
+
+ if (wr->send_flags & IB_SEND_FENCE)
+ sqe->flags |= SIW_WQE_READ_FENCE;
+
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_INV:
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ sqe->flags |= SIW_WQE_SOLICITED;
+
+ if (!(wr->send_flags & IB_SEND_INLINE)) {
+ siw_copy_sgl(wr->sg_list, sqe->sge,
+ wr->num_sge);
+ sqe->num_sge = wr->num_sge;
+ } else {
+ rv = siw_copy_inline_sgl(wr, sqe);
+ if (rv <= 0) {
+ rv = -EINVAL;
+ break;
+ }
+ sqe->flags |= SIW_WQE_INLINE;
+ sqe->num_sge = 1;
+ }
+ if (wr->opcode == IB_WR_SEND)
+ sqe->opcode = SIW_OP_SEND;
+ else {
+ sqe->opcode = SIW_OP_SEND_REMOTE_INV;
+ sqe->rkey = wr->ex.invalidate_rkey;
+ }
+ break;
+
+ case IB_WR_RDMA_READ_WITH_INV:
+ case IB_WR_RDMA_READ:
+ /*
+ * iWarp restricts RREAD sink to SGL containing
+ * 1 SGE only. we could relax to SGL with multiple
+ * elements referring the SAME ltag or even sending
+ * a private per-rreq tag referring to a checked
+ * local sgl with MULTIPLE ltag's.
+ */
+ if (unlikely(wr->num_sge != 1)) {
+ rv = -EINVAL;
+ break;
+ }
+ siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
+ /*
+ * NOTE: zero length RREAD is allowed!
+ */
+ sqe->raddr = rdma_wr(wr)->remote_addr;
+ sqe->rkey = rdma_wr(wr)->rkey;
+ sqe->num_sge = 1;
+
+ if (wr->opcode == IB_WR_RDMA_READ)
+ sqe->opcode = SIW_OP_READ;
+ else
+ sqe->opcode = SIW_OP_READ_LOCAL_INV;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ if (!(wr->send_flags & IB_SEND_INLINE)) {
+ siw_copy_sgl(wr->sg_list, &sqe->sge[0],
+ wr->num_sge);
+ sqe->num_sge = wr->num_sge;
+ } else {
+ rv = siw_copy_inline_sgl(wr, sqe);
+ if (unlikely(rv < 0)) {
+ rv = -EINVAL;
+ break;
+ }
+ sqe->flags |= SIW_WQE_INLINE;
+ sqe->num_sge = 1;
+ }
+ sqe->raddr = rdma_wr(wr)->remote_addr;
+ sqe->rkey = rdma_wr(wr)->rkey;
+ sqe->opcode = SIW_OP_WRITE;
+ break;
+
+ case IB_WR_REG_MR:
+ sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
+ sqe->rkey = reg_wr(wr)->key;
+ sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
+ sqe->opcode = SIW_OP_REG_MR;
+ break;
+
+ case IB_WR_LOCAL_INV:
+ sqe->rkey = wr->ex.invalidate_rkey;
+ sqe->opcode = SIW_OP_INVAL_STAG;
+ break;
+
+ default:
+ siw_dbg_qp(qp, "ib wr type %d unsupported\n",
+ wr->opcode);
+ rv = -EINVAL;
+ break;
+ }
+ siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
+ sqe->opcode, sqe->flags,
+ (void *)(uintptr_t)sqe->id);
+
+ if (unlikely(rv < 0))
+ break;
+
+ /* make SQE only valid after completely written */
+ smp_wmb();
+ sqe->flags |= SIW_WQE_VALID;
+
+ qp->sq_put++;
+ wr = wr->next;
+ }
+
+ /*
+ * Send directly if SQ processing is not in progress.
+ * Eventual immediate errors (rv < 0) do not affect the involved
+ * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
+ * processing, if new work is already pending. But rv must be passed
+ * to caller.
+ */
+ if (wqe->wr_status != SIW_WR_IDLE) {
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+ goto skip_direct_sending;
+ }
+ rv = siw_activate_tx(qp);
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+ if (rv <= 0)
+ goto skip_direct_sending;
+
+ if (rdma_is_kernel_res(&qp->base_qp.res)) {
+ rv = siw_sq_start(qp);
+ } else {
+ qp->tx_ctx.in_syscall = 1;
+
+ if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
+ siw_qp_cm_drop(qp, 0);
+
+ qp->tx_ctx.in_syscall = 0;
+ }
+skip_direct_sending:
+
+ up_read(&qp->state_lock);
+
+ if (rv >= 0)
+ return 0;
+ /*
+ * Immediate error
+ */
+ siw_dbg_qp(qp, "error %d\n", rv);
+
+ *bad_wr = wr;
+ return rv;
+}
+
+/*
+ * siw_post_receive()
+ *
+ * Post a list of R-WR's to a RQ.
+ *
+ * @base_qp: Base QP contained in siw QP
+ * @wr: Null terminated list of user WR's
+ * @bad_wr: Points to failing WR in case of synchronous failure.
+ */
+int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ struct siw_qp *qp = to_siw_qp(base_qp);
+ unsigned long flags;
+ int rv = 0;
+
+ if (qp->srq || qp->attrs.rq_size == 0) {
+ *bad_wr = wr;
+ return -EINVAL;
+ }
+ if (!rdma_is_kernel_res(&qp->base_qp.res)) {
+ siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
+ *bad_wr = wr;
+ return -EINVAL;
+ }
+
+ /*
+ * Try to acquire QP state lock. Must be non-blocking
+ * to accommodate kernel clients needs.
+ */
+ if (!down_read_trylock(&qp->state_lock)) {
+ if (qp->attrs.state == SIW_QP_STATE_ERROR) {
+ /*
+ * ERROR state is final, so we can be sure
+ * this state will not change as long as the QP
+ * exists.
+ *
+ * This handles an ib_drain_rq() call with
+ * a concurrent request to set the QP state
+ * to ERROR.
+ */
+ rv = siw_rq_flush_wr(qp, wr, bad_wr);
+ } else {
+ siw_dbg_qp(qp, "QP locked, state %d\n",
+ qp->attrs.state);
+ *bad_wr = wr;
+ rv = -ENOTCONN;
+ }
+ return rv;
+ }
+ if (qp->attrs.state > SIW_QP_STATE_RTS) {
+ if (qp->attrs.state == SIW_QP_STATE_ERROR) {
+ /*
+ * Immediately flush this WR to CQ, if QP
+ * is in ERROR state. RQ is guaranteed to
+ * be empty, so WR complets in-order.
+ *
+ * Typically triggered by ib_drain_rq().
+ */
+ rv = siw_rq_flush_wr(qp, wr, bad_wr);
+ } else {
+ siw_dbg_qp(qp, "QP out of state %d\n",
+ qp->attrs.state);
+ *bad_wr = wr;
+ rv = -ENOTCONN;
+ }
+ up_read(&qp->state_lock);
+ return rv;
+ }
+ /*
+ * Serialize potentially multiple producers.
+ * Not needed for single threaded consumer side.
+ */
+ spin_lock_irqsave(&qp->rq_lock, flags);
+
+ while (wr) {
+ u32 idx = qp->rq_put % qp->attrs.rq_size;
+ struct siw_rqe *rqe = &qp->recvq[idx];
+
+ if (rqe->flags) {
+ siw_dbg_qp(qp, "RQ full\n");
+ rv = -ENOMEM;
+ break;
+ }
+ if (wr->num_sge > qp->attrs.rq_max_sges) {
+ siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
+ rv = -EINVAL;
+ break;
+ }
+ rqe->id = wr->wr_id;
+ rqe->num_sge = wr->num_sge;
+ siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
+
+ /* make sure RQE is completely written before valid */
+ smp_wmb();
+
+ rqe->flags = SIW_WQE_VALID;
+
+ qp->rq_put++;
+ wr = wr->next;
+ }
+ spin_unlock_irqrestore(&qp->rq_lock, flags);
+
+ up_read(&qp->state_lock);
+
+ if (rv < 0) {
+ siw_dbg_qp(qp, "error %d\n", rv);
+ *bad_wr = wr;
+ }
+ return rv > 0 ? 0 : rv;
+}
+
+int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
+{
+ struct siw_cq *cq = to_siw_cq(base_cq);
+ struct siw_device *sdev = to_siw_dev(base_cq->device);
+ struct siw_ucontext *ctx =
+ rdma_udata_to_drv_context(udata, struct siw_ucontext,
+ base_ucontext);
+
+ siw_dbg_cq(cq, "free CQ resources\n");
+
+ siw_cq_flush(cq);
+
+ if (ctx)
+ rdma_user_mmap_entry_remove(cq->cq_entry);
+
+ atomic_dec(&sdev->num_cq);
+
+ vfree(cq->queue);
+ return 0;
+}
+
+/*
+ * siw_create_cq()
+ *
+ * Populate CQ of requested size
+ *
+ * @base_cq: CQ as allocated by RDMA midlayer
+ * @attr: Initial CQ attributes
+ * @udata: relates to user context
+ */
+
+int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata)
+{
+ struct siw_device *sdev = to_siw_dev(base_cq->device);
+ struct siw_cq *cq = to_siw_cq(base_cq);
+ int rv, size = attr->cqe;
+
+ if (attr->flags)
+ return -EOPNOTSUPP;
+
+ if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
+ siw_dbg(base_cq->device, "too many CQ's\n");
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ if (size < 1 || size > sdev->attrs.max_cqe) {
+ siw_dbg(base_cq->device, "CQ size error: %d\n", size);
+ rv = -EINVAL;
+ goto err_out;
+ }
+ size = roundup_pow_of_two(size);
+ cq->base_cq.cqe = size;
+ cq->num_cqe = size;
+
+ if (udata)
+ cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
+ sizeof(struct siw_cq_ctrl));
+ else
+ cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
+ sizeof(struct siw_cq_ctrl));
+
+ if (cq->queue == NULL) {
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ get_random_bytes(&cq->id, 4);
+ siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
+
+ spin_lock_init(&cq->lock);
+
+ cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];
+
+ if (udata) {
+ struct siw_uresp_create_cq uresp = {};
+ struct siw_ucontext *ctx =
+ rdma_udata_to_drv_context(udata, struct siw_ucontext,
+ base_ucontext);
+ size_t length = size * sizeof(struct siw_cqe) +
+ sizeof(struct siw_cq_ctrl);
+
+ cq->cq_entry =
+ siw_mmap_entry_insert(ctx, cq->queue,
+ length, &uresp.cq_key);
+ if (!cq->cq_entry) {
+ rv = -ENOMEM;
+ goto err_out;
+ }
+
+ uresp.cq_id = cq->id;
+ uresp.num_cqe = size;
+
+ if (udata->outlen < sizeof(uresp)) {
+ rv = -EINVAL;
+ goto err_out;
+ }
+ rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+ if (rv)
+ goto err_out;
+ }
+ return 0;
+
+err_out:
+ siw_dbg(base_cq->device, "CQ creation failed: %d", rv);
+
+ if (cq->queue) {
+ struct siw_ucontext *ctx =
+ rdma_udata_to_drv_context(udata, struct siw_ucontext,
+ base_ucontext);
+ if (ctx)
+ rdma_user_mmap_entry_remove(cq->cq_entry);
+ vfree(cq->queue);
+ }
+ atomic_dec(&sdev->num_cq);
+
+ return rv;
+}
+
+/*
+ * siw_poll_cq()
+ *
+ * Reap CQ entries if available and copy work completion status into
+ * array of WC's provided by caller. Returns number of reaped CQE's.
+ *
+ * @base_cq: Base CQ contained in siw CQ.
+ * @num_cqe: Maximum number of CQE's to reap.
+ * @wc: Array of work completions to be filled by siw.
+ */
+int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
+{
+ struct siw_cq *cq = to_siw_cq(base_cq);
+ int i;
+
+ for (i = 0; i < num_cqe; i++) {
+ if (!siw_reap_cqe(cq, wc))
+ break;
+ wc++;
+ }
+ return i;
+}
+
+/*
+ * siw_req_notify_cq()
+ *
+ * Request notification for new CQE's added to that CQ.
+ * Defined flags:
+ * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
+ * event if a WQE with notification flag set enters the CQ
+ * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
+ * event if a WQE enters the CQ.
+ * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
+ * number of not reaped CQE's regardless of its notification
+ * type and current or new CQ notification settings.
+ *
+ * @base_cq: Base CQ contained in siw CQ.
+ * @flags: Requested notification flags.
+ */
+int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
+{
+ struct siw_cq *cq = to_siw_cq(base_cq);
+
+ siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
+
+ if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+ /*
+ * Enable CQ event for next solicited completion.
+ * and make it visible to all associated producers.
+ */
+ smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
+ else
+ /*
+ * Enable CQ event for any signalled completion.
+ * and make it visible to all associated producers.
+ */
+ smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);
+
+ if (flags & IB_CQ_REPORT_MISSED_EVENTS)
+ return cq->cq_put - cq->cq_get;
+
+ return 0;
+}
+
+/*
+ * siw_dereg_mr()
+ *
+ * Release Memory Region.
+ *
+ * @base_mr: Base MR contained in siw MR.
+ * @udata: points to user context, unused.
+ */
+int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
+{
+ struct siw_mr *mr = to_siw_mr(base_mr);
+ struct siw_device *sdev = to_siw_dev(base_mr->device);
+
+ siw_dbg_mem(mr->mem, "deregister MR\n");
+
+ atomic_dec(&sdev->num_mr);
+
+ siw_mr_drop_mem(mr);
+ kfree_rcu(mr, rcu);
+
+ return 0;
+}
+
+/*
+ * siw_reg_user_mr()
+ *
+ * Register Memory Region.
+ *
+ * @pd: Protection Domain
+ * @start: starting address of MR (virtual address)
+ * @len: len of MR
+ * @rnic_va: not used by siw
+ * @rights: MR access rights
+ * @udata: user buffer to communicate STag and Key.
+ */
+struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
+ u64 rnic_va, int rights, struct ib_udata *udata)
+{
+ struct siw_mr *mr = NULL;
+ struct siw_umem *umem = NULL;
+ struct siw_ureq_reg_mr ureq;
+ struct siw_device *sdev = to_siw_dev(pd->device);
+
+ unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
+ int rv;
+
+ siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
+ (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
+ (unsigned long long)len);
+
+ if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+ siw_dbg_pd(pd, "too many mr's\n");
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ if (!len) {
+ rv = -EINVAL;
+ goto err_out;
+ }
+ if (mem_limit != RLIM_INFINITY) {
+ unsigned long num_pages =
+ (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
+ mem_limit >>= PAGE_SHIFT;
+
+ if (num_pages > mem_limit - current->mm->locked_vm) {
+ siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
+ num_pages, mem_limit,
+ current->mm->locked_vm);
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ }
+ umem = siw_umem_get(start, len, ib_access_writable(rights));
+ if (IS_ERR(umem)) {
+ rv = PTR_ERR(umem);
+ siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
+ umem = NULL;
+ goto err_out;
+ }
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
+ if (rv)
+ goto err_out;
+
+ if (udata) {
+ struct siw_uresp_reg_mr uresp = {};
+ struct siw_mem *mem = mr->mem;
+
+ if (udata->inlen < sizeof(ureq)) {
+ rv = -EINVAL;
+ goto err_out;
+ }
+ rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
+ if (rv)
+ goto err_out;
+
+ mr->base_mr.lkey |= ureq.stag_key;
+ mr->base_mr.rkey |= ureq.stag_key;
+ mem->stag |= ureq.stag_key;
+ uresp.stag = mem->stag;
+
+ if (udata->outlen < sizeof(uresp)) {
+ rv = -EINVAL;
+ goto err_out;
+ }
+ rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+ if (rv)
+ goto err_out;
+ }
+ mr->mem->stag_valid = 1;
+
+ return &mr->base_mr;
+
+err_out:
+ atomic_dec(&sdev->num_mr);
+ if (mr) {
+ if (mr->mem)
+ siw_mr_drop_mem(mr);
+ kfree_rcu(mr, rcu);
+ } else {
+ if (umem)
+ siw_umem_release(umem, false);
+ }
+ return ERR_PTR(rv);
+}
+
+struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+ u32 max_sge)
+{
+ struct siw_device *sdev = to_siw_dev(pd->device);
+ struct siw_mr *mr = NULL;
+ struct siw_pbl *pbl = NULL;
+ int rv;
+
+ if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+ siw_dbg_pd(pd, "too many mr's\n");
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ if (mr_type != IB_MR_TYPE_MEM_REG) {
+ siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
+ rv = -EOPNOTSUPP;
+ goto err_out;
+ }
+ if (max_sge > SIW_MAX_SGE_PBL) {
+ siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ pbl = siw_pbl_alloc(max_sge);
+ if (IS_ERR(pbl)) {
+ rv = PTR_ERR(pbl);
+ siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
+ pbl = NULL;
+ goto err_out;
+ }
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
+ if (rv)
+ goto err_out;
+
+ mr->mem->is_pbl = 1;
+
+ siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
+
+ return &mr->base_mr;
+
+err_out:
+ atomic_dec(&sdev->num_mr);
+
+ if (!mr) {
+ kfree(pbl);
+ } else {
+ if (mr->mem)
+ siw_mr_drop_mem(mr);
+ kfree_rcu(mr, rcu);
+ }
+ siw_dbg_pd(pd, "failed: %d\n", rv);
+
+ return ERR_PTR(rv);
+}
+
+/* Just used to count number of pages being mapped */
+static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
+{
+ return 0;
+}
+
+int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
+ unsigned int *sg_off)
+{
+ struct scatterlist *slp;
+ struct siw_mr *mr = to_siw_mr(base_mr);
+ struct siw_mem *mem = mr->mem;
+ struct siw_pbl *pbl = mem->pbl;
+ struct siw_pble *pble;
+ unsigned long pbl_size;
+ int i, rv;
+
+ if (!pbl) {
+ siw_dbg_mem(mem, "no PBL allocated\n");
+ return -EINVAL;
+ }
+ pble = pbl->pbe;
+
+ if (pbl->max_buf < num_sle) {
+ siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
+ num_sle, pbl->max_buf);
+ return -ENOMEM;
+ }
+ for_each_sg(sl, slp, num_sle, i) {
+ if (sg_dma_len(slp) == 0) {
+ siw_dbg_mem(mem, "empty SGE\n");
+ return -EINVAL;
+ }
+ if (i == 0) {
+ pble->addr = sg_dma_address(slp);
+ pble->size = sg_dma_len(slp);
+ pble->pbl_off = 0;
+ pbl_size = pble->size;
+ pbl->num_buf = 1;
+ } else {
+ /* Merge PBL entries if adjacent */
+ if (pble->addr + pble->size == sg_dma_address(slp)) {
+ pble->size += sg_dma_len(slp);
+ } else {
+ pble++;
+ pbl->num_buf++;
+ pble->addr = sg_dma_address(slp);
+ pble->size = sg_dma_len(slp);
+ pble->pbl_off = pbl_size;
+ }
+ pbl_size += sg_dma_len(slp);
+ }
+ siw_dbg_mem(mem,
+ "sge[%d], size %u, addr 0x%p, total %lu\n",
+ i, pble->size, ib_virt_dma_to_ptr(pble->addr),
+ pbl_size);
+ }
+ rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
+ if (rv > 0) {
+ mem->len = base_mr->length;
+ mem->va = base_mr->iova;
+ siw_dbg_mem(mem,
+ "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
+ mem->len, (void *)(uintptr_t)mem->va, num_sle,
+ pbl->num_buf);
+ }
+ return rv;
+}
+
+/*
+ * siw_get_dma_mr()
+ *
+ * Create a (empty) DMA memory region, where no umem is attached.
+ */
+struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
+{
+ struct siw_device *sdev = to_siw_dev(pd->device);
+ struct siw_mr *mr = NULL;
+ int rv;
+
+ if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+ siw_dbg_pd(pd, "too many mr's\n");
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
+ if (rv)
+ goto err_out;
+
+ mr->mem->stag_valid = 1;
+
+ siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
+
+ return &mr->base_mr;
+
+err_out:
+ if (rv)
+ kfree(mr);
+
+ atomic_dec(&sdev->num_mr);
+
+ return ERR_PTR(rv);
+}
+
+/*
+ * siw_create_srq()
+ *
+ * Create Shared Receive Queue of attributes @init_attrs
+ * within protection domain given by @pd.
+ *
+ * @base_srq: Base SRQ contained in siw SRQ.
+ * @init_attrs: SRQ init attributes.
+ * @udata: points to user context
+ */
+int siw_create_srq(struct ib_srq *base_srq,
+ struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
+{
+ struct siw_srq *srq = to_siw_srq(base_srq);
+ struct ib_srq_attr *attrs = &init_attrs->attr;
+ struct siw_device *sdev = to_siw_dev(base_srq->device);
+ struct siw_ucontext *ctx =
+ rdma_udata_to_drv_context(udata, struct siw_ucontext,
+ base_ucontext);
+ int rv;
+
+ if (init_attrs->srq_type != IB_SRQT_BASIC)
+ return -EOPNOTSUPP;
+
+ if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
+ siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
+ attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
+ rv = -EINVAL;
+ goto err_out;
+ }
+ srq->max_sge = attrs->max_sge;
+ srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
+ srq->limit = attrs->srq_limit;
+ if (srq->limit)
+ srq->armed = true;
+
+ srq->is_kernel_res = !udata;
+
+ if (udata)
+ srq->recvq =
+ vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
+ else
+ srq->recvq = vcalloc(srq->num_rqe, sizeof(struct siw_rqe));
+
+ if (srq->recvq == NULL) {
+ rv = -ENOMEM;
+ goto err_out;
+ }
+ if (udata) {
+ struct siw_uresp_create_srq uresp = {};
+ size_t length = srq->num_rqe * sizeof(struct siw_rqe);
+
+ srq->srq_entry =
+ siw_mmap_entry_insert(ctx, srq->recvq,
+ length, &uresp.srq_key);
+ if (!srq->srq_entry) {
+ rv = -ENOMEM;
+ goto err_out;
+ }
+
+ uresp.num_rqe = srq->num_rqe;
+
+ if (udata->outlen < sizeof(uresp)) {
+ rv = -EINVAL;
+ goto err_out;
+ }
+ rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+ if (rv)
+ goto err_out;
+ }
+ spin_lock_init(&srq->lock);
+
+ siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");
+
+ return 0;
+
+err_out:
+ if (srq->recvq) {
+ if (ctx)
+ rdma_user_mmap_entry_remove(srq->srq_entry);
+ vfree(srq->recvq);
+ }
+ atomic_dec(&sdev->num_srq);
+
+ return rv;
+}
+
+/*
+ * siw_modify_srq()
+ *
+ * Modify SRQ. The caller may resize SRQ and/or set/reset notification
+ * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
+ *
+ * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
+ * parameter. siw_modify_srq() does not check the attrs->max_sge param.
+ */
+int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct siw_srq *srq = to_siw_srq(base_srq);
+ unsigned long flags;
+ int rv = 0;
+
+ spin_lock_irqsave(&srq->lock, flags);
+
+ if (attr_mask & IB_SRQ_MAX_WR) {
+ /* resize request not yet supported */
+ rv = -EOPNOTSUPP;
+ goto out;
+ }
+ if (attr_mask & IB_SRQ_LIMIT) {
+ if (attrs->srq_limit) {
+ if (unlikely(attrs->srq_limit > srq->num_rqe)) {
+ rv = -EINVAL;
+ goto out;
+ }
+ srq->armed = true;
+ } else {
+ srq->armed = false;
+ }
+ srq->limit = attrs->srq_limit;
+ }
+out:
+ spin_unlock_irqrestore(&srq->lock, flags);
+
+ return rv;
+}
+
+/*
+ * siw_query_srq()
+ *
+ * Query SRQ attributes.
+ */
+int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
+{
+ struct siw_srq *srq = to_siw_srq(base_srq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&srq->lock, flags);
+
+ attrs->max_wr = srq->num_rqe;
+ attrs->max_sge = srq->max_sge;
+ attrs->srq_limit = srq->limit;
+
+ spin_unlock_irqrestore(&srq->lock, flags);
+
+ return 0;
+}
+
+/*
+ * siw_destroy_srq()
+ *
+ * Destroy SRQ.
+ * It is assumed that the SRQ is not referenced by any
+ * QP anymore - the code trusts the RDMA core environment to keep track
+ * of QP references.
+ */
+int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
+{
+ struct siw_srq *srq = to_siw_srq(base_srq);
+ struct siw_device *sdev = to_siw_dev(base_srq->device);
+ struct siw_ucontext *ctx =
+ rdma_udata_to_drv_context(udata, struct siw_ucontext,
+ base_ucontext);
+
+ if (ctx)
+ rdma_user_mmap_entry_remove(srq->srq_entry);
+ vfree(srq->recvq);
+ atomic_dec(&sdev->num_srq);
+ return 0;
+}
+
+/*
+ * siw_post_srq_recv()
+ *
+ * Post a list of receive queue elements to SRQ.
+ * NOTE: The function does not check or lock a certain SRQ state
+ * during the post operation. The code simply trusts the
+ * RDMA core environment.
+ *
+ * @base_srq: Base SRQ contained in siw SRQ
+ * @wr: List of R-WR's
+ * @bad_wr: Updated to failing WR if posting fails.
+ */
+int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ struct siw_srq *srq = to_siw_srq(base_srq);
+ unsigned long flags;
+ int rv = 0;
+
+ if (unlikely(!srq->is_kernel_res)) {
+ siw_dbg_pd(base_srq->pd,
+ "[SRQ]: no kernel post_recv for mapped srq\n");
+ rv = -EINVAL;
+ goto out;
+ }
+ /*
+ * Serialize potentially multiple producers.
+ * Also needed to serialize potentially multiple
+ * consumers.
+ */
+ spin_lock_irqsave(&srq->lock, flags);
+
+ while (wr) {
+ u32 idx = srq->rq_put % srq->num_rqe;
+ struct siw_rqe *rqe = &srq->recvq[idx];
+
+ if (rqe->flags) {
+ siw_dbg_pd(base_srq->pd, "SRQ full\n");
+ rv = -ENOMEM;
+ break;
+ }
+ if (unlikely(wr->num_sge > srq->max_sge)) {
+ siw_dbg_pd(base_srq->pd,
+ "[SRQ]: too many sge's: %d\n", wr->num_sge);
+ rv = -EINVAL;
+ break;
+ }
+ rqe->id = wr->wr_id;
+ rqe->num_sge = wr->num_sge;
+ siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
+
+ /* Make sure S-RQE is completely written before valid */
+ smp_wmb();
+
+ rqe->flags = SIW_WQE_VALID;
+
+ srq->rq_put++;
+ wr = wr->next;
+ }
+ spin_unlock_irqrestore(&srq->lock, flags);
+out:
+ if (unlikely(rv < 0)) {
+ siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
+ *bad_wr = wr;
+ }
+ return rv;
+}
+
+void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
+{
+ struct ib_event event;
+ struct ib_qp *base_qp = &qp->base_qp;
+
+ /*
+ * Do not report asynchronous errors on QP which gets
+ * destroyed via verbs interface (siw_destroy_qp())
+ */
+ if (qp->attrs.flags & SIW_QP_IN_DESTROY)
+ return;
+
+ event.event = etype;
+ event.device = base_qp->device;
+ event.element.qp = base_qp;
+
+ if (base_qp->event_handler) {
+ siw_dbg_qp(qp, "reporting event %d\n", etype);
+ base_qp->event_handler(&event, base_qp->qp_context);
+ }
+}
+
+void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
+{
+ struct ib_event event;
+ struct ib_cq *base_cq = &cq->base_cq;
+
+ event.event = etype;
+ event.device = base_cq->device;
+ event.element.cq = base_cq;
+
+ if (base_cq->event_handler) {
+ siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
+ base_cq->event_handler(&event, base_cq->cq_context);
+ }
+}
+
+void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
+{
+ struct ib_event event;
+ struct ib_srq *base_srq = &srq->base_srq;
+
+ event.event = etype;
+ event.device = base_srq->device;
+ event.element.srq = base_srq;
+
+ if (base_srq->event_handler) {
+ siw_dbg_pd(srq->base_srq.pd,
+ "reporting SRQ event %d\n", etype);
+ base_srq->event_handler(&event, base_srq->srq_context);
+ }
+}
+
+void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype)
+{
+ struct ib_event event;
+
+ event.event = etype;
+ event.device = &sdev->base_dev;
+ event.element.port_num = port;
+
+ siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
+
+ ib_dispatch_event(&event);
+}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h
new file mode 100644
index 0000000000..09964234f8
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_verbs.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_VERBS_H
+#define _SIW_VERBS_H
+
+#include <linux/errno.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+
+/*
+ * siw_copy_sgl()
+ *
+ * Copy SGL from RDMA core representation to local
+ * representation.
+ */
+static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge,
+ int num_sge)
+{
+ while (num_sge--) {
+ siw_sge->laddr = sge->addr;
+ siw_sge->length = sge->length;
+ siw_sge->lkey = sge->lkey;
+
+ siw_sge++;
+ sge++;
+ }
+}
+
+int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata);
+void siw_dealloc_ucontext(struct ib_ucontext *base_ctx);
+int siw_query_port(struct ib_device *base_dev, u32 port,
+ struct ib_port_attr *attr);
+int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
+ struct ib_port_immutable *port_immutable);
+int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
+ struct ib_udata *udata);
+int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata);
+int siw_query_port(struct ib_device *base_dev, u32 port,
+ struct ib_port_attr *attr);
+int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
+ union ib_gid *gid);
+int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
+int siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
+int siw_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attr,
+ struct ib_udata *udata);
+int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata);
+int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr);
+int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr);
+int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata);
+int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc);
+int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags);
+struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len,
+ u64 rnic_va, int rights, struct ib_udata *udata);
+struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type,
+ u32 max_sge);
+struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights);
+int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
+ unsigned int *sg_off);
+int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata);
+int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *attr,
+ struct ib_udata *udata);
+int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask mask, struct ib_udata *udata);
+int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr);
+int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata);
+int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr);
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma);
+void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry);
+void siw_qp_event(struct siw_qp *qp, enum ib_event_type type);
+void siw_cq_event(struct siw_cq *cq, enum ib_event_type type);
+void siw_srq_event(struct siw_srq *srq, enum ib_event_type type);
+void siw_port_event(struct siw_device *dev, u32 port, enum ib_event_type type);
+
+#endif