summaryrefslogtreecommitdiffstats
path: root/src/msg/async/rdma
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
commit483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
treee5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/msg/async/rdma
parentInitial commit. (diff)
downloadceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/msg/async/rdma')
-rw-r--r--src/msg/async/rdma/Infiniband.cc1234
-rw-r--r--src/msg/async/rdma/Infiniband.h529
-rw-r--r--src/msg/async/rdma/RDMAConnectedSocketImpl.cc743
-rw-r--r--src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc183
-rw-r--r--src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc107
-rw-r--r--src/msg/async/rdma/RDMAServerSocketImpl.cc127
-rw-r--r--src/msg/async/rdma/RDMAStack.cc610
-rw-r--r--src/msg/async/rdma/RDMAStack.h348
8 files changed, 3881 insertions, 0 deletions
diff --git a/src/msg/async/rdma/Infiniband.cc b/src/msg/async/rdma/Infiniband.cc
new file mode 100644
index 00000000..34299975
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.cc
@@ -0,0 +1,1234 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Infiniband.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "RDMAStack.h"
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "Infiniband "
+
+// Scatter/gather entries per work request, passed to create_shared_receive_queue().
+static constexpr uint32_t MAX_SHARED_RX_SGE_COUNT = 1;
+// Value used for max_inline_data when a queue pair is created.
+static constexpr uint32_t MAX_INLINE_DATA = 0;
+// Size (terminating NUL included) of the textual "lid:qpn:psn:peer_qpn:gid"
+// message handled by recv_msg()/send_msg().
+static constexpr uint32_t TCP_MSG_LEN = sizeof("0000:00000000:00000000:00000000:00000000000000000000000000000000");
+// Depth requested for every completion queue built by create_comp_queue().
+static constexpr uint32_t CQ_DEPTH = 30000;
+
+/**
+ * Capture the attributes, LID and GID of physical port `ipn` of the device
+ * context `ictxt`.
+ *
+ * With HAVE_IBV_EXP the GID table is searched for the entry that matches
+ * ms_async_rdma_local_gid and ms_async_rdma_roce_ver; if the configured GID
+ * cannot be parsed, GID index 0 is used. Without HAVE_IBV_EXP, GID index 0
+ * is always used. Any failing verbs query aborts the process.
+ */
+Port::Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn): ctxt(ictxt), port_num(ipn), port_attr(new ibv_port_attr), gid_idx(0)
+{
+#ifdef HAVE_IBV_EXP
+  union ibv_gid cgid;
+  struct ibv_exp_gid_attr gid_attr;
+  bool malformed = false;
+
+  ldout(cct,1) << __func__ << " using experimental verbs for gid" << dendl;
+#endif
+  // ibv_query_port() reports failure with a non-zero return value (the errno
+  // value), so any non-zero result must be treated as an error, not only -1.
+  int r = ibv_query_port(ctxt, port_num, port_attr);
+  if (r != 0) {
+    lderr(cct) << __func__ << " query port failed " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+
+  lid = port_attr->lid;
+
+#ifdef HAVE_IBV_EXP
+  // search for requested GID in GIDs table
+  ldout(cct, 1) << __func__ << " looking for local GID " << (cct->_conf->ms_async_rdma_local_gid)
+                << " of type " << (cct->_conf->ms_async_rdma_roce_ver) << dendl;
+  r = sscanf(cct->_conf->ms_async_rdma_local_gid.c_str(),
+             "%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx"
+             ":%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx",
+             &cgid.raw[ 0], &cgid.raw[ 1],
+             &cgid.raw[ 2], &cgid.raw[ 3],
+             &cgid.raw[ 4], &cgid.raw[ 5],
+             &cgid.raw[ 6], &cgid.raw[ 7],
+             &cgid.raw[ 8], &cgid.raw[ 9],
+             &cgid.raw[10], &cgid.raw[11],
+             &cgid.raw[12], &cgid.raw[13],
+             &cgid.raw[14], &cgid.raw[15]);
+
+  if (r != 16) {
+    ldout(cct, 1) << __func__ << " malformed or no GID supplied, using GID index 0" << dendl;
+    malformed = true;
+  }
+
+  gid_attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE;
+
+  for (gid_idx = 0; gid_idx < port_attr->gid_tbl_len; gid_idx++) {
+    r = ibv_query_gid(ctxt, port_num, gid_idx, &gid);
+    if (r) {
+      // port_num is a uint8_t: widen it so the log shows a number, not a raw character
+      lderr(cct) << __func__ << " query gid of port " << static_cast<int>(port_num) << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    r = ibv_exp_query_gid_attr(ctxt, port_num, gid_idx, &gid_attr);
+    if (r) {
+      lderr(cct) << __func__ << " query gid attributes of port " << static_cast<int>(port_num) << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+
+    if (malformed) break; // stay with gid_idx=0
+    if ( (gid_attr.type == cct->_conf->ms_async_rdma_roce_ver) &&
+         (memcmp(&gid, &cgid, 16) == 0) ) {
+      ldout(cct, 1) << __func__ << " found at index " << gid_idx << dendl;
+      break;
+    }
+  }
+
+  // the loop ran off the end of the table without a match
+  if (gid_idx == port_attr->gid_tbl_len) {
+    lderr(cct) << __func__ << " Requested local GID was not found in GID table" << dendl;
+    ceph_abort();
+  }
+#else
+  r = ibv_query_gid(ctxt, port_num, 0, &gid);
+  if (r) {
+    lderr(cct) << __func__ << " query gid failed " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+#endif
+}
+
+
+/**
+ * Wrap one RDMA device.
+ *
+ * When ms_async_rdma_cm is set the supplied context `dc` is adopted,
+ * otherwise the device is opened with ibv_open_device(). The device
+ * attributes are then queried. Every failure aborts the process.
+ */
+Device::Device(CephContext *cct, ibv_device* d, struct ibv_context *dc)
+  : device(d), device_attr(new ibv_device_attr), active_port(nullptr)
+{
+  if (device == NULL) {
+    lderr(cct) << __func__ << " device == NULL" << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  name = ibv_get_device_name(device);
+  if (cct->_conf->ms_async_rdma_cm) {
+    ctxt = dc;
+  } else {
+    ctxt = ibv_open_device(device);
+  }
+  if (ctxt == NULL) {
+    lderr(cct) << __func__ << " open rdma device failed. " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  // ibv_query_device() returns 0 on success and a non-zero errno value on
+  // failure; checking only for -1 would let real failures go unnoticed.
+  int r = ibv_query_device(ctxt, device_attr);
+  if (r != 0) {
+    lderr(cct) << __func__ << " failed to query rdma device. " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+}
+
+/**
+ * Probe the physical ports of this device in order and keep the one whose
+ * 1-based number equals `port_num`, provided it is in IBV_PORT_ACTIVE state.
+ * Asserts if no such port exists.
+ */
+void Device::binding_port(CephContext *cct, int port_num) {
+  port_cnt = device_attr->phys_port_cnt;
+  for (uint8_t idx = 0; idx < port_cnt; ++idx) {
+    Port *probe = new Port(cct, ctxt, idx+1);
+    const bool wanted = (idx + 1 == port_num) &&
+                        (probe->get_port_attr()->state == IBV_PORT_ACTIVE);
+    if (!wanted) {
+      ldout(cct, 10) << __func__ << " port " << idx+1 << " is not what we want. state: " << probe->get_port_attr()->state << ")"<< dendl;
+      delete probe;
+      continue;
+    }
+    // keep the matching port; it is now owned through active_port
+    active_port = probe;
+    ldout(cct, 1) << __func__ << " found active port " << idx+1 << dendl;
+    break;
+  }
+  if (active_port == nullptr) {
+    lderr(cct) << __func__ << " port not found" << dendl;
+    ceph_assert(active_port);
+  }
+}
+
+
+/**
+ * Record the parameters of a queue pair; the verbs object itself is created
+ * later by init(). Only RC, UD and RAW_PACKET types are accepted, any other
+ * type aborts. The initial PSN is a random 24-bit value.
+ */
+Infiniband::QueuePair::QueuePair(
+    CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+    int port, ibv_srq *srq,
+    Infiniband::CompletionQueue* txcq, Infiniband::CompletionQueue* rxcq,
+    uint32_t tx_queue_len, uint32_t rx_queue_len, struct rdma_cm_id *cid, uint32_t q_key)
+  : cct(c), infiniband(infiniband),
+    type(type),
+    ctxt(infiniband.device->ctxt),
+    ib_physical_port(port),
+    pd(infiniband.pd->pd),
+    srq(srq),
+    qp(NULL),
+    cm_id(cid),
+    txcq(txcq),
+    rxcq(rxcq),
+    initial_psn(0),
+    max_send_wr(tx_queue_len),
+    max_recv_wr(rx_queue_len),
+    q_key(q_key),
+    dead(false)
+{
+  initial_psn = lrand48() & 0xffffff;
+
+  const bool supported = (type == IBV_QPT_RC) ||
+                         (type == IBV_QPT_UD) ||
+                         (type == IBV_QPT_RAW_PACKET);
+  if (!supported) {
+    lderr(cct) << __func__ << " invalid queue pair type" << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  // pd was already taken from infiniband.pd->pd in the initializer list
+}
+
+/**
+ * Create the underlying verbs queue pair.
+ *
+ * Without ms_async_rdma_cm the QP is created with ibv_create_qp() and moved
+ * from RESET to INIT. With ms_async_rdma_cm it is created through
+ * rdma_create_qp() on cm_id and no state transition is done here.
+ *
+ * \return
+ *      0 on success, -1 on failure. On failure `qp` is left NULL so that no
+ *      later code can touch or destroy a queue pair that no longer exists.
+ */
+int Infiniband::QueuePair::init()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  ibv_qp_init_attr qpia;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&qpia, 0, sizeof(qpia));
+  qpia.send_cq = txcq->get_cq();
+  qpia.recv_cq = rxcq->get_cq();
+  if (srq) {
+    qpia.srq = srq;                      // use the same shared receive queue
+  } else {
+    qpia.cap.max_recv_wr = max_recv_wr;
+    qpia.cap.max_recv_sge = 1;
+  }
+  qpia.cap.max_send_wr  = max_send_wr;   // max outstanding send requests
+  qpia.cap.max_send_sge = 1;             // max send scatter-gather elements
+  qpia.cap.max_inline_data = MAX_INLINE_DATA;  // max bytes of immediate data on send q
+  qpia.qp_type = type;                   // RC, UC, UD, or XRC
+  qpia.sq_sig_all = 0;                   // only generate CQEs on requested WQEs
+
+  if (!cct->_conf->ms_async_rdma_cm) {
+    qp = ibv_create_qp(pd, &qpia);
+    if (qp == NULL) {
+      lderr(cct) << __func__ << " failed to create queue pair" << cpp_strerror(errno) << dendl;
+      if (errno == ENOMEM) {
+        lderr(cct) << __func__ << " try reducing ms_async_rdma_receive_queue_length, "
+                      " ms_async_rdma_send_buffers or"
+                      " ms_async_rdma_buffer_size" << dendl;
+      }
+      return -1;
+    }
+  } else {
+    ceph_assert(cm_id->verbs == pd->context);
+    if (rdma_create_qp(cm_id, pd, &qpia)) {
+      lderr(cct) << __func__ << " failed to create queue pair with rdmacm library"
+                 << cpp_strerror(errno) << dendl;
+      return -1;
+    }
+    qp = cm_id->qp;
+  }
+  ldout(cct, 20) << __func__ << " successfully create queue pair: "
+                 << "qp=" << qp << dendl;
+
+  // with rdma_cm no explicit state transition is performed here
+  if (cct->_conf->ms_async_rdma_cm)
+    return 0;
+
+  // move from RESET to INIT state
+  ibv_qp_attr qpa;
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state   = IBV_QPS_INIT;
+  qpa.pkey_index = 0;
+  qpa.port_num   = (uint8_t)(ib_physical_port);
+  qpa.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
+  qpa.qkey       = q_key;
+
+  int mask = IBV_QP_STATE | IBV_QP_PORT;
+  switch (type) {
+    case IBV_QPT_RC:
+      mask |= IBV_QP_ACCESS_FLAGS;
+      mask |= IBV_QP_PKEY_INDEX;
+      break;
+    case IBV_QPT_UD:
+      mask |= IBV_QP_QKEY;
+      mask |= IBV_QP_PKEY_INDEX;
+      break;
+    case IBV_QPT_RAW_PACKET:
+      break;
+    default:
+      ceph_abort();
+  }
+
+  int ret = ibv_modify_qp(qp, &qpa, mask);
+  if (ret) {
+    // log first: ibv_destroy_qp() below may overwrite errno
+    lderr(cct) << __func__ << " failed to transition to INIT state: "
+               << cpp_strerror(errno) << dendl;
+    ibv_destroy_qp(qp);
+    // never leave a pointer to the destroyed queue pair behind
+    qp = NULL;
+    return -1;
+  }
+  ldout(cct, 20) << __func__ << " successfully change queue pair to INIT:"
+                 << " qp=" << qp << dendl;
+  return 0;
+}
+
+/**
+ * Change RC QueuePair into the ERROR state. It is necessary to modify
+ * the Queue Pair into the Error state and poll all of the relevant
+ * Work Completions prior to destroying a Queue Pair, since destroying a
+ * Queue Pair does not guarantee that its Work Completions are removed
+ * from the CQ upon destruction. Even if the Work Completions are already
+ * in the CQ, it might not be possible to retrieve them. If the Queue Pair
+ * is associated with an SRQ, it is recommended to wait for the affiliated
+ * event IBV_EVENT_QP_LAST_WQE_REACHED.
+ *
+ * Calling this on a queue pair that is already dead is a no-op.
+ *
+ * \return
+ *      -errno if the QueuePair can't switch to ERROR
+ *      0 for success.
+ */
+int Infiniband::QueuePair::to_dead()
+{
+  if (dead)
+    return 0;
+  ibv_qp_attr qpa;
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_ERR;
+
+  int mask = IBV_QP_STATE;
+  int ret = ibv_modify_qp(qp, &qpa, mask);
+  if (ret) {
+    // Capture the error code before logging: the logging path may change
+    // errno. ibv_modify_qp() returns the errno value itself; fall back to
+    // errno for a provider that returns a negative value instead.
+    const int err = (ret > 0) ? ret : errno;
+    lderr(cct) << __func__ << " failed to transition to ERROR state: "
+               << cpp_strerror(err) << dendl;
+    return -err;
+  }
+  dead = true;
+  return ret;
+}
+
+/**
+ * Query the destination QP number of this queue pair.
+ *
+ * \param[out] rqp
+ *      Receives the destination QP number; may be NULL.
+ * \return
+ *      0 on success, -1 if the query fails.
+ */
+int Infiniband::QueuePair::get_remote_qp_number(uint32_t *rqp) const
+{
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  if (ibv_query_qp(qp, &attr, IBV_QP_DEST_QPN, &init_attr) != 0) {
+    lderr(cct) << __func__ << " failed to query qp: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (rqp != nullptr) {
+    *rqp = attr.dest_qp_num;
+  }
+  return 0;
+}
+
+/**
+ * Get the remote infiniband address for this QueuePair, as set in #plumb().
+ * LIDs are "local IDs" in infiniband terminology. They are short, locally
+ * routable addresses.
+ *
+ * \param[out] lid
+ *      Receives the destination LID; may be NULL.
+ * \return
+ *      0 on success, -1 if the query fails.
+ */
+int Infiniband::QueuePair::get_remote_lid(uint16_t *lid) const
+{
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  if (ibv_query_qp(qp, &attr, IBV_QP_AV, &init_attr) != 0) {
+    lderr(cct) << __func__ << " failed to query qp: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (lid != nullptr) {
+    *lid = attr.ah_attr.dlid;
+  }
+  return 0;
+}
+
+/**
+ * Get the state of a QueuePair.
+ *
+ * \return
+ *      The qp_state reported by ibv_query_qp(), or -1 if the query fails.
+ */
+int Infiniband::QueuePair::get_state() const
+{
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  const int rc = ibv_query_qp(qp, &attr, IBV_QP_STATE, &init_attr);
+  if (rc == 0) {
+    return attr.qp_state;
+  }
+  lderr(cct) << __func__ << " failed to get state: "
+             << cpp_strerror(errno) << dendl;
+  return -1;
+}
+
+/**
+ * Return true if the queue pair is in an error state, false otherwise.
+ * A failed query is also reported as an error state.
+ */
+bool Infiniband::QueuePair::is_error() const
+{
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  // a mask of -1 requests every attribute
+  const int rc = ibv_query_qp(qp, &attr, -1, &init_attr);
+  if (rc != 0) {
+    lderr(cct) << __func__ << " failed to get state: "
+               << cpp_strerror(errno) << dendl;
+    return true;
+  }
+  // NOTE(review): this tests cur_qp_state while get_state() reads qp_state — confirm which is intended
+  return attr.cur_qp_state == IBV_QPS_ERR;
+}
+
+
+/**
+ * Build an empty completion channel wrapper. The verbs channel is created
+ * by init(); no completion queue is bound and no events are pending.
+ */
+Infiniband::CompletionChannel::CompletionChannel(CephContext *c, Infiniband &ib)
+  : cct(c),
+    infiniband(ib),
+    channel(NULL),
+    cq(NULL),
+    cq_events_that_need_ack(0)
+{}
+
+/**
+ * Destroy the verbs completion channel if one was created; asserts that the
+ * destruction succeeds.
+ */
+Infiniband::CompletionChannel::~CompletionChannel()
+{
+  if (!channel) {
+    return;
+  }
+  const int rc = ibv_destroy_comp_channel(channel);
+  if (rc < 0) {
+    lderr(cct) << __func__ << " failed to destroy cc: " << cpp_strerror(errno) << dendl;
+  }
+  ceph_assert(rc == 0);
+}
+
+/**
+ * Create the verbs completion channel and switch its file descriptor to
+ * non-blocking mode.
+ *
+ * \return
+ *      0 on success, -1 on failure. On failure `channel` is NULL, so the
+ *      destructor does not try to destroy the channel a second time.
+ */
+int Infiniband::CompletionChannel::init()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  channel = ibv_create_comp_channel(infiniband.device->ctxt);
+  if (!channel) {
+    lderr(cct) << __func__ << " failed to create receive completion channel: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  int rc = NetHandler(cct).set_nonblock(channel->fd);
+  if (rc < 0) {
+    ibv_destroy_comp_channel(channel);
+    // reset the member: the destructor destroys any non-NULL channel
+    channel = NULL;
+    return -1;
+  }
+  return 0;
+}
+
+/**
+ * Acknowledge every CQ event accumulated so far on the bound completion
+ * queue and reset the pending counter.
+ */
+void Infiniband::CompletionChannel::ack_events()
+{
+  ibv_ack_cq_events(cq, cq_events_that_need_ack);
+  cq_events_that_need_ack = 0;
+}
+
+/**
+ * Fetch one completion event from the channel.
+ *
+ * Events are acknowledged in batches: once MAX_ACK_EVENT events have been
+ * collected they are acked on the CQ reported by the event.
+ *
+ * \return
+ *      true if an event was retrieved, false otherwise (EAGAIN/EINTR are
+ *      not logged, any other errno is).
+ */
+bool Infiniband::CompletionChannel::get_cq_event()
+{
+  ibv_cq *event_cq = NULL;   // CQ reported by the event; distinct from the bound member
+  void *event_ctx;
+  const int rc = ibv_get_cq_event(channel, &event_cq, &event_ctx);
+  if (rc != 0) {
+    if (errno != EAGAIN && errno != EINTR) {
+      lderr(cct) << __func__ << " failed to retrieve CQ event: "
+                 << cpp_strerror(errno) << dendl;
+    }
+    return false;
+  }
+
+  /* accumulate the number of cq events that need to
+   * be acked, and periodically ack them
+   */
+  ++cq_events_that_need_ack;
+  if (cq_events_that_need_ack == MAX_ACK_EVENT) {
+    ldout(cct, 20) << __func__ << " ack aq events." << dendl;
+    ibv_ack_cq_events(event_cq, MAX_ACK_EVENT);
+    cq_events_that_need_ack = 0;
+  }
+
+  return true;
+}
+
+
+/**
+ * Destroy the verbs completion queue if one was created; asserts that the
+ * destruction succeeds.
+ */
+Infiniband::CompletionQueue::~CompletionQueue()
+{
+  if (!cq) {
+    return;
+  }
+  const int rc = ibv_destroy_cq(cq);
+  if (rc < 0) {
+    lderr(cct) << __func__ << " failed to destroy cq: " << cpp_strerror(errno) << dendl;
+  }
+  ceph_assert(rc == 0);
+}
+
+/**
+ * Create the verbs completion queue on the owning channel, request
+ * notification for it and bind it to the channel.
+ *
+ * \return
+ *      0 on success, -1 on failure (the CQ is destroyed again if arming the
+ *      notification fails).
+ */
+int Infiniband::CompletionQueue::init()
+{
+  cq = ibv_create_cq(infiniband.device->ctxt, queue_depth, this, channel->get_channel(), 0);
+  if (cq == nullptr) {
+    lderr(cct) << __func__ << " failed to create receive completion queue: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  const int armed = ibv_req_notify_cq(cq, 0);
+  if (armed != 0) {
+    lderr(cct) << __func__ << " ibv_req_notify_cq failed: " << cpp_strerror(errno) << dendl;
+    ibv_destroy_cq(cq);
+    cq = nullptr;
+    return -1;
+  }
+
+  channel->bind_cq(cq);
+  ldout(cct, 20) << __func__ << " successfully create cq=" << cq << dendl;
+  return 0;
+}
+
+/**
+ * Request a notification for the next completion on this CQ.
+ *
+ * \param solicite_only
+ *      Currently ignored: ibv_req_notify_cq() is always called with 0 as
+ *      its second argument.
+ * \return
+ *      The value returned by ibv_req_notify_cq().
+ */
+int Infiniband::CompletionQueue::rearm_notify(bool solicite_only)
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  const int rc = ibv_req_notify_cq(cq, 0);
+  if (rc < 0) {
+    lderr(cct) << __func__ << " failed to notify cq: " << cpp_strerror(errno) << dendl;
+  }
+  return rc;
+}
+
+/**
+ * Poll up to `num_entries` work completions into `ret_wc_array`.
+ *
+ * \return
+ *      The number of completions returned by ibv_poll_cq(), or -1 if it
+ *      reports an error.
+ */
+int Infiniband::CompletionQueue::poll_cq(int num_entries, ibv_wc *ret_wc_array) {
+  const int polled = ibv_poll_cq(cq, num_entries, ret_wc_array);
+  if (polled >= 0) {
+    return polled;
+  }
+  lderr(cct) << __func__ << " poll_completion_queue occur met error: "
+             << cpp_strerror(errno) << dendl;
+  return -1;
+}
+
+
+/**
+ * Allocate a protection domain on the context of `device`; aborts if the
+ * allocation fails.
+ */
+Infiniband::ProtectionDomain::ProtectionDomain(CephContext *cct, Device *device)
+  : pd(ibv_alloc_pd(device->ctxt))
+{
+  if (pd != NULL) {
+    return;
+  }
+  lderr(cct) << __func__ << " failed to allocate infiniband protection domain: " << cpp_strerror(errno) << dendl;
+  ceph_abort();
+}
+
+/** Release the protection domain; the result of ibv_dealloc_pd() is not checked. */
+Infiniband::ProtectionDomain::~ProtectionDomain()
+{
+  ibv_dealloc_pd(pd);
+}
+
+
+/**
+ * Describe one buffer of `len` bytes at `b`, registered in memory region `m`.
+ * The chunk starts empty: both the cursor (offset) and the read limit
+ * (bound) are zero.
+ */
+Infiniband::MemoryManager::Chunk::Chunk(ibv_mr* m, uint32_t len, char* b)
+  : mr(m), bytes(len), offset(0), buffer(b)
+{
+  // bound was previously left uninitialised, so get_bound()/read()/over()
+  // on a fresh chunk read an indeterminate value
+  bound = 0;
+}
+
+/** Nothing to release here: the destructor body is empty. */
+Infiniband::MemoryManager::Chunk::~Chunk() {}
+
+/** Move the read/write cursor to byte position `o`. */
+void Infiniband::MemoryManager::Chunk::set_offset(uint32_t o)
+{
+  this->offset = o;
+}
+
+/** Current position of the read/write cursor. */
+uint32_t Infiniband::MemoryManager::Chunk::get_offset()
+{
+  return this->offset;
+}
+
+/** Set the read limit (the position read() will not go past) to `b`. */
+void Infiniband::MemoryManager::Chunk::set_bound(uint32_t b)
+{
+  this->bound = b;
+}
+
+/** Rewind the cursor to the start and make the first `b` bytes readable. */
+void Infiniband::MemoryManager::Chunk::prepare_read(uint32_t b)
+{
+  bound = b;
+  offset = 0;
+}
+
+/** Current read limit. */
+uint32_t Infiniband::MemoryManager::Chunk::get_bound()
+{
+  return this->bound;
+}
+
+/**
+ * Copy up to `len` bytes from the cursor position into `buf`.
+ *
+ * If fewer than `len` bytes remain before the bound, all remaining bytes
+ * are copied and the chunk is reset (offset and bound become 0).
+ *
+ * \return
+ *      The number of bytes copied.
+ */
+uint32_t Infiniband::MemoryManager::Chunk::read(char* buf, uint32_t len)
+{
+  const uint32_t available = bound - offset;
+  if (len <= available) {
+    memcpy(buf, buffer+offset, len);
+    offset += len;
+    return len;
+  }
+  // the request drains the chunk: hand out what is left and reset it
+  memcpy(buf, buffer+offset, available);
+  offset = 0;
+  bound = 0;
+  return available;
+}
+
+/**
+ * Copy up to `len` bytes from `buf` into the chunk at the cursor position.
+ *
+ * If the free space is smaller than `len`, only the free space is filled
+ * and the cursor ends at the chunk capacity.
+ *
+ * \return
+ *      The number of bytes copied.
+ */
+uint32_t Infiniband::MemoryManager::Chunk::write(char* buf, uint32_t len)
+{
+  const uint32_t room = bytes - offset;
+  if (len <= room) {
+    memcpy(buffer+offset, buf, len);
+    offset += len;
+    return len;
+  }
+  // not everything fits: fill the chunk up to its capacity
+  memcpy(buffer+offset, buf, room);
+  offset = bytes;
+  return room;
+}
+
+/** True when the cursor has reached the chunk capacity. */
+bool Infiniband::MemoryManager::Chunk::full()
+{
+  return bytes == offset;
+}
+
+/** True when the cursor has reached the read limit. */
+bool Infiniband::MemoryManager::Chunk::over()
+{
+  return offset == bound;
+}
+
+/** Reset the chunk: cursor and read limit both return to 0. */
+void Infiniband::MemoryManager::Chunk::clear()
+{
+  bound = 0;
+  offset = 0;
+}
+
+/**
+ * Create an empty cluster of buffers of `s` bytes each, owned by manager `m`.
+ * Memory is allocated later by fill().
+ */
+Infiniband::MemoryManager::Cluster::Cluster(MemoryManager& m, uint32_t s)
+  : manager(m),
+    buffer_size(s),
+    lock("cluster_lock")
+{}
+
+/**
+ * Deregister the memory region shared by all chunks, destroy the chunk
+ * descriptors and free both the descriptor array and the data area.
+ */
+Infiniband::MemoryManager::Cluster::~Cluster()
+{
+  // every chunk carries the same mr, so the first one is used to deregister it
+  const int rc = ibv_dereg_mr(chunk_base->mr);
+  ceph_assert(rc == 0);
+
+  // descriptors were built with placement new, so destroy them explicitly
+  Chunk *cursor = chunk_base;
+  Chunk *const last = chunk_base + num_chunk;
+  while (cursor != last) {
+    cursor->~Chunk();
+    ++cursor;
+  }
+
+  ::free(chunk_base);
+  manager.free(base);
+}
+
+/**
+ * Allocate and register the memory of the cluster.
+ *
+ * One contiguous data area of `num` * buffer_size bytes is allocated and
+ * registered as a single memory region; a descriptor array of `num` chunks
+ * is built over it and every chunk is put on the free list. Must be called
+ * only once (asserts that no data area exists yet). Allocation or
+ * registration failures trip an assert.
+ *
+ * \param num
+ *      Number of chunks to create.
+ * \return
+ *      Always 0.
+ */
+int Infiniband::MemoryManager::Cluster::fill(uint32_t num)
+{
+  ceph_assert(!base);
+  num_chunk = num;
+  // compute in size_t: buffer_size * num can exceed 32 bits
+  const size_t bytes = static_cast<size_t>(buffer_size) * num;
+
+  base = (char*)manager.malloc(bytes);
+  // validate the allocation before doing pointer arithmetic on it
+  ceph_assert(base);
+  end = base + bytes;
+  chunk_base = static_cast<Chunk*>(::malloc(sizeof(Chunk) * num));
+  // memset() on a failed allocation would dereference NULL
+  ceph_assert(chunk_base);
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(static_cast<void*>(chunk_base), 0, sizeof(Chunk) * num);
+  free_chunks.reserve(num);
+  ibv_mr* m = ibv_reg_mr(manager.pd->pd, base, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+  ceph_assert(m);
+  Chunk* chunk = chunk_base;
+  for (size_t offset = 0; offset < bytes; offset += buffer_size){
+    // construct each descriptor in place over its slice of the data area
+    new(chunk) Chunk(m, buffer_size, base+offset);
+    free_chunks.push_back(chunk);
+    chunk++;
+  }
+  return 0;
+}
+
+/** Reset every chunk in `ck` and return it to the free list, under the cluster lock. */
+void Infiniband::MemoryManager::Cluster::take_back(std::vector<Chunk*> &ck)
+{
+  Mutex::Locker guard(lock);
+  for (Chunk *returned : ck) {
+    returned->clear();
+    free_chunks.push_back(returned);
+  }
+}
+
+/**
+ * Hand out free chunks able to hold `bytes` bytes.
+ *
+ * `bytes` == 0 means "give me everything": all free chunks are appended to
+ * `chunks`. Otherwise ceil(bytes / buffer_size) chunks are taken from the
+ * back of the free list, or fewer if the list holds less.
+ *
+ * \return
+ *      The number of chunks appended to `chunks` (0 if none are free).
+ */
+int Infiniband::MemoryManager::Cluster::get_buffers(std::vector<Chunk*> &chunks, size_t bytes)
+{
+  // chunks needed to cover `bytes`, rounded up
+  uint32_t wanted = bytes / buffer_size + 1;
+  if (bytes % buffer_size == 0) {
+    --wanted;
+  }
+
+  Mutex::Locker guard(lock);
+  if (free_chunks.empty()) {
+    return 0;
+  }
+
+  if (bytes == 0) {
+    const int everything = free_chunks.size();
+    chunks.insert(chunks.end(), free_chunks.begin(), free_chunks.end());
+    free_chunks.clear();
+    return everything;
+  }
+
+  // never hand out more than what is available
+  if (free_chunks.size() < wanted) {
+    wanted = free_chunks.size();
+  }
+  for (uint32_t taken = 0; taken < wanted; ++taken) {
+    chunks.push_back(free_chunks.back());
+    free_chunks.pop_back();
+  }
+  return static_cast<int>(wanted);
+}
+
+/**
+ * Tell whether `nbufs` more receive buffers may be allocated without
+ * exceeding ms_async_rdma_receive_buffers. A limit <= 0 means unlimited.
+ */
+bool Infiniband::MemoryManager::MemPoolContext::can_alloc(unsigned nbufs)
+{
+  const auto &limit = manager->cct->_conf->ms_async_rdma_receive_buffers;
+
+  /* unlimited */
+  if (limit <= 0) {
+    return true;
+  }
+
+  const bool fits = !(n_bufs_allocated + nbufs > (unsigned)limit);
+  if (!fits) {
+    lderr(manager->cct) << __func__ << " WARNING: OUT OF RX BUFFERS: allocated: " <<
+        n_bufs_allocated << " requested: " << nbufs <<
+        " limit: " << limit << dendl;
+  }
+  return fits;
+}
+
+/**
+ * Attach (or detach, with nullptr) the perf counters; when attaching, the
+ * rx-buffer total is set to the number of buffers allocated so far.
+ */
+void Infiniband::MemoryManager::MemPoolContext::set_stat_logger(PerfCounters *logger) {
+  perf_logger = logger;
+  if (!perf_logger) {
+    return;
+  }
+  perf_logger->set(l_msgr_rdma_rx_bufs_total, n_bufs_allocated);
+}
+
+/**
+ * Account for `nbufs` buffers being added (positive) or released (zero or
+ * negative) and mirror the change into the perf counters if any are attached.
+ */
+void Infiniband::MemoryManager::MemPoolContext::update_stats(int nbufs)
+{
+  n_bufs_allocated += nbufs;
+
+  if (perf_logger == nullptr) {
+    return;
+  }
+
+  if (nbufs <= 0) {
+    perf_logger->dec(l_msgr_rdma_rx_bufs_total, -nbufs);
+    return;
+  }
+  perf_logger->inc(l_msgr_rdma_rx_bufs_total, nbufs);
+}
+
+/**
+ * Allocate from the boost pool with the pool-allocator lock held.
+ * PoolAllocator::g_ctx is set to this pool's context for the duration of the
+ * call so that PoolAllocator::malloc() can find it, then cleared again.
+ */
+void *Infiniband::MemoryManager::mem_pool::slow_malloc()
+{
+  Mutex::Locker guard(PoolAllocator::lock);
+
+  PoolAllocator::g_ctx = ctx;
+  // this will trigger pool expansion via PoolAllocator::malloc()
+  void *const result = boost::pool<PoolAllocator>::malloc();
+  PoolAllocator::g_ctx = nullptr;
+
+  return result;
+}
+
+// Storage for PoolAllocator's static members: the context of the pool that
+// is currently allocating, and the lock taken by slow_malloc() and free().
+Infiniband::MemoryManager::MemPoolContext *Infiniband::MemoryManager::PoolAllocator::g_ctx = nullptr;
+Mutex Infiniband::MemoryManager::PoolAllocator::lock("pool-alloc-lock");
+
+/**
+ * Allocate and register a block of `bytes` bytes of receive buffers.
+ *
+ * A mem_info header is allocated in front of the block, the block is
+ * registered as one memory region and each Chunk in it is initialised.
+ * The lock is taken by mem_pool::slow_malloc(), which also sets g_ctx.
+ *
+ * \return
+ *      Pointer to the first chunk, or NULL if the buffer limit would be
+ *      exceeded, the allocation fails or the registration fails.
+ */
+char *Infiniband::MemoryManager::PoolAllocator::malloc(const size_type bytes)
+{
+  ceph_assert(g_ctx);
+  MemoryManager *const manager = g_ctx->manager;
+  CephContext *const cct = manager->cct;
+
+  // every buffer is a Chunk descriptor followed by its data
+  const size_t rx_buf_size = sizeof(Chunk) + cct->_conf->ms_async_rdma_buffer_size;
+  const unsigned nbufs = bytes/rx_buf_size;
+
+  if (!g_ctx->can_alloc(nbufs)) {
+    return NULL;
+  }
+
+  mem_info *const m = static_cast<mem_info *>(manager->malloc(bytes + sizeof(*m)));
+  if (m == NULL) {
+    lderr(cct) << __func__ << " failed to allocate " <<
+        bytes << " + " << sizeof(*m) << " bytes of memory for " << nbufs << dendl;
+    return NULL;
+  }
+
+  m->mr = ibv_reg_mr(manager->pd->pd, m->chunks, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+  if (m->mr == NULL) {
+    lderr(cct) << __func__ << " failed to register " <<
+        bytes << " + " << sizeof(*m) << " bytes of memory for " << nbufs << dendl;
+    manager->free(m);
+    return NULL;
+  }
+
+  m->nbufs = nbufs;
+  // save this chunk context
+  m->ctx = g_ctx;
+
+  // note that the memory can be allocated before perf logger is set
+  g_ctx->update_stats(nbufs);
+
+  /* initialize chunks: walk the block in rx_buf_size strides */
+  char *cursor = reinterpret_cast<char *>(m->chunks);
+  for (unsigned i = 0; i < nbufs; ++i, cursor += rx_buf_size) {
+    Chunk *const ch = reinterpret_cast<Chunk *>(cursor);
+    ch->lkey = m->mr->lkey;
+    ch->bytes = cct->_conf->ms_async_rdma_buffer_size;
+    ch->offset = 0;
+    ch->buffer = ch->data; // TODO: refactor tx and remove buffer
+  }
+
+  return reinterpret_cast<char *>(m->chunks);
+}
+
+
+/**
+ * Release a block previously returned by PoolAllocator::malloc(): update the
+ * buffer statistics, deregister the memory region and free the memory
+ * (including the mem_info header located just before `block`).
+ */
+void Infiniband::MemoryManager::PoolAllocator::free(char * const block)
+{
+  Mutex::Locker guard(lock);
+
+  // the mem_info header sits immediately in front of the chunk area
+  mem_info *const info = reinterpret_cast<mem_info *>(block) - 1;
+  info->ctx->update_stats(-info->nbufs);
+  ibv_dereg_mr(info->mr);
+  info->ctx->manager->free(info);
+}
+
+/**
+ * Set up the memory manager and its receive-buffer pool.
+ *
+ * Each pool element is a Chunk descriptor plus ms_async_rdma_buffer_size
+ * bytes. The initial pool size is 2 * ms_async_rdma_receive_queue_len,
+ * capped at ms_async_rdma_receive_buffers when that limit is positive.
+ */
+Infiniband::MemoryManager::MemoryManager(CephContext *c, Device *d, ProtectionDomain *p)
+  : cct(c), device(d), pd(p),
+    rxbuf_pool_ctx(this),
+    rxbuf_pool(
+      &rxbuf_pool_ctx,
+      sizeof(Chunk) + c->_conf->ms_async_rdma_buffer_size,
+      c->_conf->ms_async_rdma_receive_buffers > 0
+        // if possible make initial pool size 2 * receive_queue_len
+        // that way there will be no pool expansion upon receive of the
+        // first packet.
+        ? (c->_conf->ms_async_rdma_receive_buffers < 2 * c->_conf->ms_async_rdma_receive_queue_len
+             ? c->_conf->ms_async_rdma_receive_buffers
+             : 2 * c->_conf->ms_async_rdma_receive_queue_len)
+        // rx pool is infinite, we can set any initial size that we want
+        : 2 * c->_conf->ms_async_rdma_receive_queue_len)
+{}
+
+/** Destroy the send cluster, if one was created by create_tx_pool(). */
+Infiniband::MemoryManager::~MemoryManager()
+{
+  // deleting a null pointer is a no-op, so no explicit check is needed
+  delete send;
+}
+
+/**
+ * Allocate `size` bytes, preferably backed by huge pages.
+ *
+ * The region is over-allocated by HUGE_PAGE_SIZE; its first bytes store the
+ * mapped size (or 0 when the fallback std::malloc() was used) so that
+ * huge_pages_free() knows how to release it.
+ *
+ * \return
+ *      Pointer HUGE_PAGE_SIZE bytes past the start of the region, or NULL
+ *      if both mmap() and std::malloc() fail.
+ */
+void* Infiniband::MemoryManager::huge_pages_malloc(size_t size)
+{
+  const size_t real_size = ALIGN_TO_PAGE_SIZE(size + HUGE_PAGE_SIZE);
+  size_t recorded_size = real_size;
+
+  void *region = mmap(NULL, real_size, PROT_READ | PROT_WRITE,MAP_PRIVATE | MAP_ANONYMOUS |MAP_POPULATE | MAP_HUGETLB,-1, 0);
+  if (region == MAP_FAILED) {
+    // no huge pages available: fall back to the heap and record size 0
+    region = std::malloc(real_size);
+    if (region == NULL) {
+      return NULL;
+    }
+    recorded_size = 0;
+  }
+
+  *static_cast<size_t *>(region) = recorded_size;
+  return static_cast<char *>(region) + HUGE_PAGE_SIZE;
+}
+
+/**
+ * Release memory obtained from huge_pages_malloc(). The size stored in
+ * front of `ptr` selects munmap() (non-zero) or std::free() (zero).
+ * NULL is ignored.
+ */
+void Infiniband::MemoryManager::huge_pages_free(void *ptr)
+{
+  if (ptr == NULL) {
+    return;
+  }
+
+  // step back to the real start of the region, where the size is stored
+  char *const region = static_cast<char *>(ptr) - HUGE_PAGE_SIZE;
+  const size_t recorded_size = *reinterpret_cast<size_t *>(region);
+  ceph_assert(recorded_size % HUGE_PAGE_SIZE == 0);
+
+  if (recorded_size == 0) {
+    std::free(region);
+    return;
+  }
+  munmap(region, recorded_size);
+}
+
+
+/**
+ * Allocate `size` bytes through huge_pages_malloc() when
+ * ms_async_rdma_enable_hugepage is set, through std::malloc() otherwise.
+ */
+void* Infiniband::MemoryManager::malloc(size_t size)
+{
+  if (!cct->_conf->ms_async_rdma_enable_hugepage) {
+    return std::malloc(size);
+  }
+  return huge_pages_malloc(size);
+}
+
+/**
+ * Release memory obtained from MemoryManager::malloc(), using the routine
+ * that matches the current ms_async_rdma_enable_hugepage setting.
+ */
+void Infiniband::MemoryManager::free(void *ptr)
+{
+  if (!cct->_conf->ms_async_rdma_enable_hugepage) {
+    std::free(ptr);
+    return;
+  }
+  huge_pages_free(ptr);
+}
+
+/**
+ * Build the send-buffer cluster: `tx_num` buffers of `size` bytes each.
+ * Requires a device and a protection domain.
+ */
+void Infiniband::MemoryManager::create_tx_pool(uint32_t size, uint32_t tx_num)
+{
+  ceph_assert(device);
+  ceph_assert(pd);
+
+  Cluster *const tx_cluster = new Cluster(*this, size);
+  send = tx_cluster;
+  tx_cluster->fill(tx_num);
+}
+
+/** Give the send chunks in `chunks` back to the send cluster. */
+void Infiniband::MemoryManager::return_tx(std::vector<Chunk*> &chunks)
+{
+  Cluster *const tx_cluster = send;
+  tx_cluster->take_back(chunks);
+}
+
+/**
+ * Obtain send chunks covering `bytes` bytes from the send cluster.
+ *
+ * \return
+ *      The number of chunks appended to `c`.
+ */
+int Infiniband::MemoryManager::get_send_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  const int granted = send->get_buffers(c, bytes);
+  return granted;
+}
+
+// Set once verify_prereq() has completed, so it runs only once per process.
+static std::atomic<bool> init_prereq = {false};
+
+/**
+ * Check and establish the process-wide prerequisites for RDMA:
+ *  - call ibv_fork_init() (aborts on failure);
+ *  - export RDMAV_HUGEPAGES_SAFE=1 when ms_async_rdma_enable_hugepage is set
+ *    (aborts on failure);
+ *  - warn when the memlock limit is not unlimited.
+ */
+void Infiniband::verify_prereq(CephContext *cct) {
+
+  //On RDMA MUST be called before fork
+  int rc = ibv_fork_init();
+  if (rc) {
+    lderr(cct) << __func__ << " failed to call ibv_fork_init(). On RDMA must be called before fork. Application aborts." << dendl;
+    ceph_abort();
+  }
+
+  ldout(cct, 20) << __func__ << " ms_async_rdma_enable_hugepage value is: " << cct->_conf->ms_async_rdma_enable_hugepage <<  dendl;
+  if (cct->_conf->ms_async_rdma_enable_hugepage){
+    rc =  setenv("RDMAV_HUGEPAGES_SAFE","1",1);
+    // check the result first: if setenv() failed, getenv() below may return
+    // NULL, which must not be streamed into the log
+    if (rc) {
+      lderr(cct) << __func__ << " failed to export RDMAV_HUGEPAGES_SAFE. On RDMA must be exported before using huge pages. Application aborts." << dendl;
+      ceph_abort();
+    }
+    ldout(cct, 0) << __func__ << " RDMAV_HUGEPAGES_SAFE is set as: " << getenv("RDMAV_HUGEPAGES_SAFE") <<  dendl;
+  }
+
+  //Check ulimit
+  struct rlimit limit;
+  if (getrlimit(RLIMIT_MEMLOCK, &limit) != 0) {
+    // do not inspect `limit` when the query failed: its contents are indeterminate
+    lderr(cct) << __func__ << " failed to query RLIMIT_MEMLOCK: " << cpp_strerror(errno) << dendl;
+  } else if (limit.rlim_cur != RLIM_INFINITY || limit.rlim_max != RLIM_INFINITY) {
+    lderr(cct) << __func__ << "!!! WARNING !!! For RDMA to work properly user memlock (ulimit -l) must be big enough to allow large amount of registered memory."
+                              " We recommend setting this parameter to infinity" << dendl;
+  }
+  init_prereq = true;
+}
+
+/**
+ * Record the configured device name and port number and make sure the
+ * process-wide prerequisites have been verified. The device itself is
+ * opened later by init().
+ */
+Infiniband::Infiniband(CephContext *cct)
+  : cct(cct),
+    lock("IB lock"),
+    device_name(cct->_conf->ms_async_rdma_device_name),
+    port_num( cct->_conf->ms_async_rdma_port_num)
+{
+  const bool prereq_done = init_prereq;
+  if (!prereq_done) {
+    verify_prereq(cct);
+  }
+  ldout(cct, 20) << __func__ << " constructing Infiniband..." << dendl;
+}
+
+/**
+ * One-time initialisation, done under the instance lock; later calls return
+ * immediately.
+ *
+ * Looks up the configured device, binds the configured port, allocates the
+ * protection domain, derives the receive/send queue lengths from the device
+ * limits and the configuration, creates the memory manager with its send
+ * pool and, when SRQ support is enabled, creates the shared receive queue
+ * and posts receive buffers to it. Misconfiguration or a failed SRQ
+ * creation aborts the process.
+ */
+void Infiniband::init()
+{
+  Mutex::Locker l(lock);
+
+  if (initialized)
+    return;
+
+  device_list = new DeviceList(cct);
+  initialized = true;
+
+  device = device_list->get_device(device_name.c_str());
+  ceph_assert(device);
+  device->binding_port(cct, port_num);
+  ib_physical_port = device->active_port->get_port_num();
+  pd = new ProtectionDomain(cct, device);
+  ceph_assert(NetHandler(cct).set_nonblock(device->ctxt->async_fd) == 0);
+
+  support_srq = cct->_conf->ms_async_rdma_support_srq;
+  if (support_srq)
+    rx_queue_len = device->device_attr->max_srq_wr;
+  else
+    rx_queue_len = device->device_attr->max_qp_wr;
+  // ">=": a request equal to the device maximum is honoured as-is and must
+  // not be reported as "too big"
+  if (rx_queue_len >= cct->_conf->ms_async_rdma_receive_queue_len) {
+    rx_queue_len = cct->_conf->ms_async_rdma_receive_queue_len;
+    ldout(cct, 1) << __func__ << " receive queue length is " << rx_queue_len << " receive buffers" << dendl;
+  } else {
+    ldout(cct, 0) << __func__ << " requested receive queue length " <<
+                  cct->_conf->ms_async_rdma_receive_queue_len <<
+                  " is too big. Setting " << rx_queue_len << dendl;
+  }
+
+  // check for the misconfiguration
+  if (cct->_conf->ms_async_rdma_receive_buffers > 0 &&
+      rx_queue_len > (unsigned)cct->_conf->ms_async_rdma_receive_buffers) {
+    lderr(cct) << __func__ << " rdma_receive_queue_len (" <<
+                  rx_queue_len << ") > ms_async_rdma_receive_buffers(" <<
+                  cct->_conf->ms_async_rdma_receive_buffers << ")." << dendl;
+    ceph_abort();
+  }
+
+  tx_queue_len = device->device_attr->max_qp_wr;
+  if (tx_queue_len >= cct->_conf->ms_async_rdma_send_buffers) {
+    tx_queue_len = cct->_conf->ms_async_rdma_send_buffers;
+    ldout(cct, 1) << __func__ << " assigning: " << tx_queue_len << " send buffers" << dendl;
+  } else {
+    ldout(cct, 0) << __func__ << " using the max allowed send buffers: " << tx_queue_len << dendl;
+  }
+
+  ldout(cct, 1) << __func__ << " device allow " << device->device_attr->max_cqe
+                << " completion entries" << dendl;
+
+  memory_manager = new MemoryManager(cct, device, pd);
+  memory_manager->create_tx_pool(cct->_conf->ms_async_rdma_buffer_size, tx_queue_len);
+
+  if (support_srq) {
+    srq = create_shared_receive_queue(rx_queue_len, MAX_SHARED_RX_SGE_COUNT);
+    // create_shared_receive_queue() returns NULL on error; posting to a NULL
+    // SRQ would crash without any diagnostic
+    if (srq == NULL) {
+      lderr(cct) << __func__ << " failed to create shared receive queue: "
+                 << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    post_chunks_to_rq(rx_queue_len, NULL); //add to srq
+  }
+}
+
+/**
+ * Tear down what init() created: the shared receive queue (if SRQ support
+ * is on), the memory manager and the protection domain. Nothing is done if
+ * init() never ran.
+ */
+Infiniband::~Infiniband()
+{
+  if (initialized) {
+    if (support_srq) {
+      ibv_destroy_srq(srq);
+    }
+    delete memory_manager;
+    delete pd;
+  }
+}
+
+/**
+ * Create a shared receive queue. This basically wraps the verbs call.
+ *
+ * \param[in] max_wr
+ *      The max number of outstanding work requests in the SRQ.
+ * \param[in] max_sge
+ *      The max number of scatter elements per WR.
+ * \return
+ *      A valid ibv_srq pointer, or NULL on error.
+ */
+ibv_srq* Infiniband::create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge)
+{
+  ibv_srq_init_attr init_attr;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&init_attr, 0, sizeof(init_attr));
+  init_attr.attr.max_sge = max_sge;
+  init_attr.attr.max_wr = max_wr;
+  init_attr.srq_context = device->ctxt;
+
+  ibv_srq *const created = ibv_create_srq(pd->pd, &init_attr);
+  return created;
+}
+
+/**
+ * Obtain send buffers covering `bytes` bytes from the memory manager.
+ *
+ * \return
+ *      The number of chunks appended to `c`.
+ */
+int Infiniband::get_tx_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  const int granted = memory_manager->get_send_buffers(c, bytes);
+  return granted;
+}
+
+/**
+ * Create a new QueuePair. This factory should be used in preference to
+ * the QueuePair constructor directly, since this lets derivatives of
+ * Infiniband, e.g. MockInfiniband (if it existed),
+ * return mocked out QueuePair derivatives.
+ *
+ * \return
+ *      QueuePair on success or NULL if init fails
+ * See QueuePair::QueuePair for parameter documentation.
+ */
+Infiniband::QueuePair* Infiniband::create_queue_pair(CephContext *cct, CompletionQueue *tx,
+    CompletionQueue* rx, ibv_qp_type type, struct rdma_cm_id *cm_id)
+{
+  Infiniband::QueuePair *candidate = new QueuePair(
+      cct, *this, type, ib_physical_port, srq, tx, rx, tx_queue_len, rx_queue_len, cm_id);
+  if (candidate->init() == 0) {
+    return candidate;
+  }
+  // initialisation failed: do not hand out a half-built object
+  delete candidate;
+  return NULL;
+}
+
+/**
+ * Post up to `num` receive buffers to the shared receive queue (when SRQ
+ * support is enabled) or to the receive queue of `qp`.
+ *
+ * Buffers are taken from the memory manager's rx pool. If the pool runs dry
+ * the buffers obtained so far are still posted.
+ *
+ * \param num
+ *      Number of buffers to post; values <= 0 post nothing.
+ * \param qp
+ *      Target queue pair; only used (and required) without SRQ support.
+ * \return
+ *      The number of buffers actually posted. Asserts if the verbs post
+ *      call fails.
+ */
+int Infiniband::post_chunks_to_rq(int num, ibv_qp *qp)
+{
+  // nothing requested: posting an empty, uninitialised request list would be
+  // undefined behaviour
+  if (num <= 0)
+    return 0;
+
+  // heap storage instead of variable-length arrays: `num` follows the
+  // configured queue length and VLAs are not standard C++
+  std::vector<ibv_sge> isge(num);
+  std::vector<ibv_recv_wr> rx_work_request(num);
+  int ret, i = 0;
+  Chunk *chunk;
+
+  while (i < num) {
+    chunk = get_memory_manager()->get_rx_buffer();
+    if (chunk == NULL) {
+      lderr(cct) << __func__ << " WARNING: out of memory. Requested " << num <<
+                    " rx buffers. Got " << i << dendl;
+      if (i == 0)
+        return 0;
+      // if we got some buffers post them and hope for the best
+      rx_work_request[i-1].next = 0;
+      break;
+    }
+
+    isge[i].addr = reinterpret_cast<uint64_t>(chunk->data);
+    isge[i].length = chunk->bytes;
+    isge[i].lkey = chunk->lkey;
+
+    memset(&rx_work_request[i], 0, sizeof(rx_work_request[i]));
+    rx_work_request[i].wr_id = reinterpret_cast<uint64_t>(chunk);// stash descriptor ptr
+    // chain the requests; the last one terminates the list
+    if (i == num - 1) {
+      rx_work_request[i].next = 0;
+    } else {
+      rx_work_request[i].next = &rx_work_request[i+1];
+    }
+    rx_work_request[i].sg_list = &isge[i];
+    rx_work_request[i].num_sge = 1;
+    i++;
+  }
+  ibv_recv_wr *badworkrequest;
+  if (support_srq) {
+    ret = ibv_post_srq_recv(srq, &rx_work_request[0], &badworkrequest);
+    ceph_assert(ret == 0);
+  } else {
+    ceph_assert(qp);
+    ret = ibv_post_recv(qp, &rx_work_request[0], &badworkrequest);
+    ceph_assert(ret == 0);
+  }
+  return i;
+}
+
+/**
+ * Create and initialise a completion channel.
+ *
+ * \return
+ *      The new channel, or NULL if its initialisation fails.
+ */
+Infiniband::CompletionChannel* Infiniband::create_comp_channel(CephContext *c)
+{
+  Infiniband::CompletionChannel *candidate = new Infiniband::CompletionChannel(c, *this);
+  if (candidate->init() == 0) {
+    return candidate;
+  }
+  delete candidate;
+  return NULL;
+}
+
+/**
+ * Create and initialise a completion queue of CQ_DEPTH entries on the
+ * completion channel `cc`.
+ *
+ * \return
+ *      The new queue, or NULL if its initialisation fails.
+ */
+Infiniband::CompletionQueue* Infiniband::create_comp_queue(
+    CephContext *cct, CompletionChannel *cc)
+{
+  Infiniband::CompletionQueue *candidate = new Infiniband::CompletionQueue(
+      cct, *this, CQ_DEPTH, cc);
+  if (candidate->init() == 0) {
+    return candidate;
+  }
+  delete candidate;
+  return NULL;
+}
+
+ // Read one fixed-size handshake message from `sd` and decode it into `im`.
+ //
+ // Returns the number of bytes read (sizeof(msg)) when a complete, well-formed
+ // message was decoded, 0 when the peer sent a zero-length disconnect, and a
+ // negative errno-style value on error (read failure, wrong length, malformed
+ // payload, or an injected socket failure).
+ int Infiniband::recv_msg(CephContext *cct, int sd, IBSYNMsg& im)
+ {
+   char msg[TCP_MSG_LEN];
+   char gid[33] = {0};
+   ssize_t r = ::read(sd, &msg, sizeof(msg));
+   // Capture errno before anything else gets a chance to overwrite it.
+   const int read_errno = errno;
+   // Drop incoming qpt
+   if (cct->_conf->ms_inject_socket_failures && sd >= 0) {
+     if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+       ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+       return -EINVAL;
+     }
+   }
+   if (r < 0) {
+     r = -read_errno;
+     lderr(cct) << __func__ << " got error " << r << ": "
+                << cpp_strerror(r) << dendl;
+   } else if (r == 0) { // valid disconnect message of length 0
+     ldout(cct, 10) << __func__ << " got disconnect message " << dendl;
+   } else if ((size_t)r != sizeof(msg)) { // invalid message
+     ldout(cct, 1) << __func__ << " got bad length (" << r << ") " << dendl;
+     r = -EINVAL;
+   } else { // candidate message: the bytes are peer-controlled, validate them
+     // Guarantee termination so sscanf() cannot scan past the buffer.
+     msg[sizeof(msg) - 1] = '\0';
+
+     // Parse into aligned locals: IBSYNMsg is packed, so pointers to its
+     // members must not be handed to sscanf().
+     unsigned short lid = 0;
+     unsigned int qpn = 0, psn = 0, peer_qpn = 0;
+     // "%32s" bounds the gid text to the 32 hex digits gid[33] can hold.
+     const int fields = sscanf(msg, "%hx:%x:%x:%x:%32s", &lid, &qpn, &psn, &peer_qpn, gid);
+     if (fields != 5 || strlen(gid) != 32) {
+       ldout(cct, 1) << __func__ << " got malformed handshake message" << dendl;
+       return -EINVAL;
+     }
+
+     // Decode the gid into an aligned temporary, then copy it into the
+     // packed message.
+     union ibv_gid parsed_gid;
+     wire_gid_to_gid(gid, &parsed_gid);
+
+     im.lid = lid;
+     im.qpn = qpn;
+     im.psn = psn;
+     im.peer_qpn = peer_qpn;
+     im.gid = parsed_gid;
+     ldout(cct, 5) << __func__ << " recevd: " << im.lid << ", " << im.qpn << ", " << im.psn << ", " << im.peer_qpn << ", " << gid << dendl;
+   }
+   return r;
+ }
+
+ // Encode `im` as a fixed-size text handshake message and write it to `sd`.
+ //
+ // A write that fails with EINTR/EAGAIN is retried up to three times.
+ // Returns 0 when the whole message was written, -EINVAL when a socket
+ // failure is injected, -errno when write() fails, and -EIO when write()
+ // accepted only part of the message.
+ int Infiniband::send_msg(CephContext *cct, int sd, IBSYNMsg& im)
+ {
+   char msg[TCP_MSG_LEN];
+   char gid[33];
+
+   // IBSYNMsg is packed; copy the gid to an aligned temporary before handing
+   // a pointer to the encoder.
+   union ibv_gid local_gid = im.gid;
+   gid_to_wire_gid(&local_gid, gid);
+   snprintf(msg, sizeof(msg), "%04x:%08x:%08x:%08x:%s", im.lid, im.qpn, im.psn, im.peer_qpn, gid);
+
+   for (int retry = 0; ; ++retry) {
+     ldout(cct, 10) << __func__ << " sending: " << im.lid << ", " << im.qpn << ", " << im.psn
+                    << ", " << im.peer_qpn << ", " << gid << dendl;
+     const ssize_t r = ::write(sd, msg, sizeof(msg));
+     // Capture errno before logging or other calls can overwrite it.
+     const int write_errno = errno;
+     // Drop incoming qpt
+     if (cct->_conf->ms_inject_socket_failures && sd >= 0) {
+       if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+         ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+         return -EINVAL;
+       }
+     }
+
+     if ((size_t)r == sizeof(msg))
+       return 0;
+
+     // FIXME need to handle EAGAIN instead of retry
+     if (r < 0 && (write_errno == EINTR || write_errno == EAGAIN) && retry < 3)
+       continue;
+
+     if (r < 0) {
+       lderr(cct) << __func__ << " send returned error " << write_errno << ": "
+                  << cpp_strerror(write_errno) << dendl;
+       return -write_errno;
+     }
+     // Short write: errno is not meaningful here, so report a definite
+     // error instead of whatever value errno happened to hold (possibly 0).
+     lderr(cct) << __func__ << " send got bad length (" << r << ") " << cpp_strerror(EIO) << dendl;
+     return -EIO;
+   }
+ }
+
+ // Decode the 32-hex-digit wire form of a GID (`wgid`) into `gid`.
+ // The text is consumed as four groups of 8 hex digits; each group is
+ // byte-swapped with ntohl() and stored into the next 4 bytes of gid->raw.
+ // `wgid` must provide at least 32 readable characters.
+ void Infiniband::wire_gid_to_gid(const char *wgid, union ibv_gid *gid)
+ {
+   char tmp[9];
+   tmp[8] = 0;
+
+   for (int i = 0; i < 4; ++i) {
+     // Start from a defined value in case the group is not valid hex.
+     uint32_t v32 = 0;
+     memcpy(tmp, wgid + i * 8, 8);
+     sscanf(tmp, "%x", &v32);
+     // Store bytewise: callers may pass the gid embedded in the packed
+     // IBSYNMsg, which is not 4-byte aligned.
+     const uint32_t word = ntohl(v32);
+     memcpy(&gid->raw[i * 4], &word, sizeof(word));
+   }
+ }
+
+ // Encode `gid` as 32 hex digits plus a terminating NUL into `wgid`.
+ // Each 4-byte group of gid->raw is byte-swapped with htonl() and printed as
+ // 8 hex digits, so `wgid` must have room for at least 33 characters.
+ void Infiniband::gid_to_wire_gid(const union ibv_gid *gid, char wgid[])
+ {
+   for (int i = 0; i < 4; ++i) {
+     // Load bytewise: callers may pass the gid embedded in the packed
+     // IBSYNMsg, which is not 4-byte aligned.
+     uint32_t word;
+     memcpy(&word, gid->raw + i * 4, sizeof(word));
+     sprintf(&wgid[i * 8], "%08x", htonl(word));
+   }
+ }
+
+ // Destroy the underlying verbs queue pair, if one was created.
+ Infiniband::QueuePair::~QueuePair()
+ {
+ if (qp) {
+ ldout(cct, 20) << __func__ << " destroy qp=" << qp << dendl;
+ // The destroy call itself lives inside the assertion: a non-zero return
+ // from ibv_destroy_qp() trips the assert.
+ ceph_assert(!ibv_destroy_qp(qp));
+ }
+ }
+
+/**
+ * Given a string representation of the `status' field from Verbs
+ * struct `ibv_wc'.
+ *
+ * \param[in] status
+ * The integer status obtained in ibv_wc.status.
+ * \return
+ * A string corresponding to the given status.
+ */
+const char* Infiniband::wc_status_to_string(int status)
+{
+ static const char *lookup[] = {
+ "SUCCESS",
+ "LOC_LEN_ERR",
+ "LOC_QP_OP_ERR",
+ "LOC_EEC_OP_ERR",
+ "LOC_PROT_ERR",
+ "WR_FLUSH_ERR",
+ "MW_BIND_ERR",
+ "BAD_RESP_ERR",
+ "LOC_ACCESS_ERR",
+ "REM_INV_REQ_ERR",
+ "REM_ACCESS_ERR",
+ "REM_OP_ERR",
+ "RETRY_EXC_ERR",
+ "RNR_RETRY_EXC_ERR",
+ "LOC_RDD_VIOL_ERR",
+ "REM_INV_RD_REQ_ERR",
+ "REM_ABORT_ERR",
+ "INV_EECN_ERR",
+ "INV_EEC_STATE_ERR",
+ "FATAL_ERR",
+ "RESP_TIMEOUT_ERR",
+ "GENERAL_ERR"
+ };
+
+ if (status < IBV_WC_SUCCESS || status > IBV_WC_GENERAL_ERR)
+ return "<status out of range!>";
+ return lookup[status];
+}
+
+ // Map an ibv_qp_state value to its enumerator name; any value without an
+ // entry below yields " out of range.".
+ const char* Infiniband::qp_state_string(int status) {
+   const char *name = " out of range.";
+   switch (status) {
+     case IBV_QPS_RESET: name = "IBV_QPS_RESET"; break;
+     case IBV_QPS_INIT:  name = "IBV_QPS_INIT";  break;
+     case IBV_QPS_RTR:   name = "IBV_QPS_RTR";   break;
+     case IBV_QPS_RTS:   name = "IBV_QPS_RTS";   break;
+     case IBV_QPS_SQD:   name = "IBV_QPS_SQD";   break;
+     case IBV_QPS_SQE:   name = "IBV_QPS_SQE";   break;
+     case IBV_QPS_ERR:   name = "IBV_QPS_ERR";   break;
+     default: break;
+   }
+   return name;
+ }
diff --git a/src/msg/async/rdma/Infiniband.h b/src/msg/async/rdma/Infiniband.h
new file mode 100644
index 00000000..2889cdfc
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.h
@@ -0,0 +1,529 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INFINIBAND_H
+#define CEPH_INFINIBAND_H
+
+#include <boost/pool/pool.hpp>
+// need this because boost messes with ceph log/assert definitions
+#include "include/ceph_assert.h"
+
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include <atomic>
+#include <string>
+#include <vector>
+
+#include "include/int_types.h"
+#include "include/page.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Mutex.h"
+#include "common/perf_counters.h"
+#include "msg/msg_types.h"
+#include "msg/async/net_handler.h"
+
+#define HUGE_PAGE_SIZE (2 * 1024 * 1024)
+#define ALIGN_TO_PAGE_SIZE(x) \
+ (((x) + HUGE_PAGE_SIZE -1) / HUGE_PAGE_SIZE * HUGE_PAGE_SIZE)
+
+ // Handshake message exchanged over the TCP side channel (see
+ // Infiniband::send_msg / recv_msg). The struct is packed, so members after
+ // `lid` are not naturally aligned.
+ struct IBSYNMsg {
+ uint16_t lid; // sender's LID
+ uint32_t qpn; // sender's queue pair number
+ uint32_t psn; // sender's initial packet sequence number
+ uint32_t peer_qpn; // queue pair number of the peer as known to the sender; 0 in the first message
+ union ibv_gid gid; // sender's GID
+ } __attribute__((packed));
+
+class RDMAStack;
+class CephContext;
+
+ // Holds the attributes of one port of an RDMA device (port number, port
+ // attributes, LID, GID and GID index) and exposes them through getters.
+ // The constructor is defined out of line.
+ class Port {
+ struct ibv_context* ctxt;
+ int port_num;
+ struct ibv_port_attr* port_attr;
+ uint16_t lid;
+ int gid_idx = 0;
+ union ibv_gid gid;
+
+ public:
+ // ictxt: device context; ipn: port number.
+ explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
+ uint16_t get_lid() { return lid; }
+ ibv_gid get_gid() { return gid; }
+ int get_port_num() { return port_num; }
+ ibv_port_attr* get_port_attr() { return port_attr; }
+ int get_gid_idx() { return gid_idx; }
+ };
+
+
+ // Wraps one ibv_device together with its verbs context and the port that
+ // has been bound as active. The LID/GID getters forward to `active_port`
+ // and therefore require it to be set.
+ class Device {
+ ibv_device *device;
+ const char* name;
+ uint8_t port_cnt = 0;
+ public:
+ explicit Device(CephContext *c, ibv_device* d, struct ibv_context *dc);
+ // Only when an active port exists: delete it and close the device context.
+ // The close call lives inside the assertion and must return 0.
+ ~Device() {
+ if (active_port) {
+ delete active_port;
+ ceph_assert(ibv_close_device(ctxt) == 0);
+ }
+ }
+ const char* get_name() { return name;}
+ uint16_t get_lid() { return active_port->get_lid(); }
+ ibv_gid get_gid() { return active_port->get_gid(); }
+ int get_gid_idx() { return active_port->get_gid_idx(); }
+ void binding_port(CephContext *c, int port_num);
+ struct ibv_context *ctxt;
+ ibv_device_attr *device_attr;
+ Port* active_port;
+ };
+
+
+ // Owns the list of RDMA devices visible to this process.
+ //
+ // The constructor queries both the verbs device list and the rdma_cm
+ // context list, and wraps entry i of each in a Device. The destructor
+ // releases the Device objects and both lists.
+ class DeviceList {
+   struct ibv_device ** device_list;
+   struct ibv_context ** device_context_list;
+   int num;
+   Device** devices;
+  public:
+   explicit DeviceList(CephContext *cct)
+     : device_list(nullptr), device_context_list(nullptr), num(0), devices(nullptr) {
+     device_list = ibv_get_device_list(&num);
+     if (device_list == NULL || num == 0) {
+       lderr(cct) << __func__ << " failed to get rdma device list. " << cpp_strerror(errno) << dendl;
+       ceph_abort();
+     }
+
+     // Use a separate counter so the rdma_cm query cannot silently change
+     // the bound used to index device_list.
+     int ctx_num = 0;
+     device_context_list = rdma_get_devices(&ctx_num);
+     if (device_context_list == NULL) {
+       // No contexts available: Device receives a NULL context for each entry.
+       ctx_num = 0;
+     } else if (ctx_num < num) {
+       // Never index past the end of the shorter list.
+       num = ctx_num;
+     }
+
+     devices = new Device*[num];
+     for (int i = 0; i < num; ++i) {
+       struct ibv_context *dc = (i < ctx_num) ? device_context_list[i] : NULL;
+       devices[i] = new Device(cct, device_list[i], dc);
+     }
+   }
+   ~DeviceList() {
+     for (int i = 0; i < num; ++i) {
+       delete devices[i];
+     }
+     delete []devices;
+     ibv_free_device_list(device_list);
+     // The array returned by rdma_get_devices() must be released as well.
+     if (device_context_list)
+       rdma_free_devices(device_context_list);
+   }
+
+   // The class owns raw resources; copying would double-free them.
+   DeviceList(const DeviceList&) = delete;
+   DeviceList& operator=(const DeviceList&) = delete;
+
+   // Return the device whose name equals `device_name`; an empty name selects
+   // the first device. Returns NULL when nothing matches.
+   Device* get_device(const char* device_name) {
+     ceph_assert(devices);
+     const bool any_device = !strlen(device_name);
+     for (int i = 0; i < num; ++i) {
+       if (any_device || !strcmp(device_name, devices[i]->get_name())) {
+         return devices[i];
+       }
+     }
+     return NULL;
+   }
+ };
+
+ // stat counters
+ // Counter ids in the dispatcher range, bracketed by the _first/_last
+ // sentinels. Several of them are incremented through
+ // dispatcher->perf_logger elsewhere in this code.
+ enum {
+ l_msgr_rdma_dispatcher_first = 94000,
+
+ l_msgr_rdma_polling,
+ l_msgr_rdma_inflight_tx_chunks,
+ l_msgr_rdma_rx_bufs_in_use,
+ l_msgr_rdma_rx_bufs_total,
+
+ l_msgr_rdma_tx_total_wc,
+ l_msgr_rdma_tx_total_wc_errors,
+ l_msgr_rdma_tx_wc_retry_errors,
+ l_msgr_rdma_tx_wc_wr_flush_errors,
+
+ l_msgr_rdma_rx_total_wc,
+ l_msgr_rdma_rx_total_wc_errors,
+ l_msgr_rdma_rx_fin,
+
+ l_msgr_rdma_handshake_errors,
+
+ l_msgr_rdma_total_async_events,
+ l_msgr_rdma_async_last_wqe_events,
+
+ l_msgr_rdma_created_queue_pair,
+ l_msgr_rdma_active_queue_pair,
+
+ l_msgr_rdma_dispatcher_last,
+ };
+
+ // Second range of counter ids, bracketed by the _first/_last sentinels.
+ // The rx byte/chunk counters are incremented through worker->perf_logger
+ // elsewhere in this code.
+ enum {
+ l_msgr_rdma_first = 95000,
+
+ l_msgr_rdma_tx_no_mem,
+ l_msgr_rdma_tx_parital_mem,
+ l_msgr_rdma_tx_failed,
+
+ l_msgr_rdma_tx_chunks,
+ l_msgr_rdma_tx_bytes,
+ l_msgr_rdma_rx_chunks,
+ l_msgr_rdma_rx_bytes,
+ l_msgr_rdma_pending_sent_conns,
+
+ l_msgr_rdma_last,
+ };
+
+class RDMADispatcher;
+
+class Infiniband {
+ public:
+ // Wrapper around a verbs protection domain handle created for `device`.
+ // Constructor and destructor are defined out of line.
+ class ProtectionDomain {
+ public:
+ explicit ProtectionDomain(CephContext *cct, Device *device);
+ ~ProtectionDomain();
+
+ // The handle itself; the pointer cannot be reseated after construction.
+ ibv_pd* const pd;
+ };
+
+
+ class MemoryManager {
+ public:
+ // Descriptor for one registered buffer. The descriptor carries the
+ // memory region, its lkey, the buffer size and a read/write cursor
+ // (`offset`/`bound`). Member functions are defined out of line.
+ class Chunk {
+ public:
+ Chunk(ibv_mr* m, uint32_t len, char* b);
+ ~Chunk();
+
+ void set_offset(uint32_t o);
+ uint32_t get_offset();
+ void set_bound(uint32_t b);
+ void prepare_read(uint32_t b);
+ uint32_t get_bound();
+ uint32_t read(char* buf, uint32_t len);
+ uint32_t write(char* buf, uint32_t len);
+ bool full();
+ bool over();
+ void clear();
+
+ public:
+ ibv_mr* mr;
+ uint32_t lkey = 0;
+ uint32_t bytes;
+ uint32_t bound = 0;
+ uint32_t offset;
+ char* buffer; // TODO: remove buffer/refactor TX
+ // Zero-length trailing array: payload bytes that directly follow the
+ // descriptor (used as the scatter/gather address for rx buffers).
+ char data[0];
+ };
+
+ // A pool of equally sized buffers spanning [base, end) with a parallel
+ // array of Chunk descriptors starting at chunk_base. Used by
+ // MemoryManager for send buffers.
+ class Cluster {
+ public:
+ Cluster(MemoryManager& m, uint32_t s);
+ ~Cluster();
+
+ int fill(uint32_t num);
+ void take_back(std::vector<Chunk*> &ck);
+ int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
+ // Map a pointer inside the buffer area to its descriptor: the buffer
+ // index is the byte distance from `base` divided by `buffer_size`.
+ // No range check is performed; see is_my_buffer().
+ Chunk *get_chunk_by_buffer(const char *c) {
+ uint32_t idx = (c - base) / buffer_size;
+ Chunk *chunk = chunk_base + idx;
+ return chunk;
+ }
+ // True when `c` points inside [base, end).
+ bool is_my_buffer(const char *c) const {
+ return c >= base && c < end;
+ }
+
+ MemoryManager& manager;
+ uint32_t buffer_size;
+ uint32_t num_chunk = 0;
+ Mutex lock;
+ std::vector<Chunk*> free_chunks;
+ char *base = nullptr;
+ char *end = nullptr;
+ Chunk* chunk_base = nullptr;
+ };
+
+ // Book-keeping attached to a memory pool: the owning manager, the
+ // number of buffers allocated so far and an optional stats logger.
+ class MemPoolContext {
+ PerfCounters *perf_logger;
+
+ public:
+ MemoryManager *manager;
+ unsigned n_bufs_allocated;
+ explicit MemPoolContext(MemoryManager *m) :
+ perf_logger(nullptr),
+ manager(m),
+ n_bufs_allocated(0) {}
+ // true if it is possible to alloc
+ // more memory for the pool
+ bool can_alloc(unsigned nbufs);
+ void update_stats(int val);
+ void set_stat_logger(PerfCounters *logger);
+ };
+
+ // Allocator policy handed to boost::pool (see mem_pool): it supplies
+ // the size/difference typedefs and static malloc/free that boost::pool
+ // expects. The functions are defined out of line.
+ class PoolAllocator {
+ // Header stored with each allocated block; `chunks` is a zero-length
+ // trailing array, so chunk descriptors follow the header in memory.
+ struct mem_info {
+ ibv_mr *mr;
+ MemPoolContext *ctx;
+ unsigned nbufs;
+ Chunk chunks[0];
+ };
+ public:
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ static char * malloc(const size_type bytes);
+ static void free(char * const block);
+
+ // Static state shared by all users of the allocator.
+ // NOTE(review): presumably g_ctx carries the current pool context into
+ // the static malloc() and `lock` guards it - confirm in the .cc file.
+ static MemPoolContext *g_ctx;
+ static Mutex lock;
+ };
+
+ /**
+ * modify boost pool so that it is possible to
+ * have a thread safe 'context' when allocating/freeing
+ * the memory. It is needed to allow a different pool
+ * configurations and bookkeeping per CephContext and
+ * also to be able to use same allocator to deal with
+ * RX and TX pool.
+ * TODO: use boost pool to allocate TX chunks too
+ */
+ class mem_pool : public boost::pool<PoolAllocator> {
+ private:
+ MemPoolContext *ctx;
+ // Out-of-line path taken when the free list is empty.
+ void *slow_malloc();
+
+ public:
+ // The three size arguments are forwarded unchanged to boost::pool.
+ explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size,
+ const size_type nnext_size = 32,
+ const size_type nmax_size = 0) :
+ pool(nrequested_size, nnext_size, nmax_size),
+ ctx(ctx) { }
+
+ // Fast path: hand out an element from the pool's free store when one
+ // is available; otherwise defer to slow_malloc().
+ void *malloc() {
+ if (!store().empty())
+ return (store().malloc)();
+ // need to alloc more memory...
+ // slow path code
+ return slow_malloc();
+ }
+ };
+
+ MemoryManager(CephContext *c, Device *d, ProtectionDomain *p);
+ ~MemoryManager();
+
+ void* malloc(size_t size);
+ void free(void *ptr);
+
+ // Send-side helpers: thin forwards to the `send` Cluster, which must
+ // have been created (see create_tx_pool) before they are used.
+ void create_tx_pool(uint32_t size, uint32_t tx_num);
+ void return_tx(std::vector<Chunk*> &chunks);
+ int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
+ bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
+ Chunk *get_tx_chunk_by_buffer(const char *c) {
+ return send->get_chunk_by_buffer(c);
+ }
+ uint32_t get_tx_buffer_size() const {
+ return send->buffer_size;
+ }
+
+ // Receive-side helpers: rx chunks come from and return to rxbuf_pool.
+ // get_rx_buffer() passes through whatever the pool's malloc() returns.
+ Chunk *get_rx_buffer() {
+ return reinterpret_cast<Chunk *>(rxbuf_pool.malloc());
+ }
+
+ void release_rx_buffer(Chunk *chunk) {
+ rxbuf_pool.free(chunk);
+ }
+
+ void set_rx_stat_logger(PerfCounters *logger) {
+ rxbuf_pool_ctx.set_stat_logger(logger);
+ }
+
+ CephContext *cct;
+ private:
+ // TODO: Cluster -> TxPool txbuf_pool
+ // chunk layout fix
+ //
+ Cluster* send = nullptr;// SEND
+ Device *device;
+ ProtectionDomain *pd;
+ MemPoolContext rxbuf_pool_ctx;
+ mem_pool rxbuf_pool;
+
+
+ void* huge_pages_malloc(size_t size);
+ void huge_pages_free(void *ptr);
+ };
+
+ private:
+ // Per-instance configuration and resources. Apart from the in-class
+ // defaults below, these are filled in by code outside this header.
+ uint32_t tx_queue_len = 0;
+ uint32_t rx_queue_len = 0;
+ uint32_t max_sge = 0;
+ uint8_t ib_physical_port = 0;
+ MemoryManager* memory_manager = nullptr;
+ ibv_srq* srq = nullptr; // shared receive work queue
+ Device *device = NULL;
+ ProtectionDomain *pd = NULL;
+ DeviceList *device_list = nullptr;
+ // Conversion between ibv_gid and its 32-hex-digit text form used in the
+ // handshake message.
+ void wire_gid_to_gid(const char *wgid, union ibv_gid *gid);
+ void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]);
+ CephContext *cct;
+ Mutex lock;
+ bool initialized = false;
+ // Held by reference: the referenced string must outlive this object.
+ const std::string &device_name;
+ uint8_t port_num;
+ // When true, receive buffers are posted to `srq` instead of a per-QP
+ // receive queue (see post_chunks_to_rq).
+ bool support_srq = false;
+
+ public:
+ explicit Infiniband(CephContext *c);
+ ~Infiniband();
+ void init();
+ static void verify_prereq(CephContext *cct);
+
+ // Wraps a verbs completion event channel and the completion queue bound
+ // to it. init() must be called before use; the non-inline members are
+ // defined out of line.
+ class CompletionChannel {
+ static const uint32_t MAX_ACK_EVENT = 5000;
+ CephContext *cct;
+ Infiniband& infiniband;
+ ibv_comp_channel *channel;
+ ibv_cq *cq;
+ uint32_t cq_events_that_need_ack;
+
+ public:
+ CompletionChannel(CephContext *c, Infiniband &ib);
+ ~CompletionChannel();
+ int init();
+ bool get_cq_event();
+ // File descriptor of the underlying channel.
+ int get_fd() { return channel->fd; }
+ ibv_comp_channel* get_channel() { return channel; }
+ // Remember which completion queue this channel serves.
+ void bind_cq(ibv_cq *c) { cq = c; }
+ void ack_events();
+ };
+
+ // this class encapsulates the creation, use, and destruction of an RC
+ // completion queue.
+ //
+ // You need to call init and it will create a cq and associate to comp channel
+ class CompletionQueue {
+ public:
+ // qd: requested queue depth; cc: completion channel to associate with.
+ // The verbs cq handle stays NULL until init() is called.
+ CompletionQueue(CephContext *c, Infiniband &ib,
+ const uint32_t qd, CompletionChannel *cc)
+ : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
+ ~CompletionQueue();
+ int init();
+ int poll_cq(int num_entries, ibv_wc *ret_wc_array);
+
+ ibv_cq* get_cq() const { return cq; }
+ int rearm_notify(bool solicited_only=true);
+ CompletionChannel* get_cc() const { return channel; }
+ private:
+ CephContext *cct;
+ Infiniband& infiniband; // Infiniband to which this CQ belongs
+ CompletionChannel *channel;
+ ibv_cq *cq;
+ uint32_t queue_depth;
+ };
+
+ // this class encapsulates the creation, use, and destruction of an RC
+ // queue pair.
+ //
+ // you need call init and it will create a qp and bring it to the INIT state.
+ // after obtaining the lid, qpn, and psn of a remote queue pair, one
+ // must call plumb() to bring the queue pair to the RTS state.
+ class QueuePair {
+ public:
+ // Stores the construction parameters; the verbs queue pair itself is
+ // created by init(), as described above.
+ QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+ int ib_physical_port, ibv_srq *srq,
+ Infiniband::CompletionQueue* txcq,
+ Infiniband::CompletionQueue* rxcq,
+ uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0);
+ // Destroys the verbs queue pair if one exists.
+ ~QueuePair();
+
+ // Returns non-zero on failure (Infiniband::create_queue_pair treats a
+ // non-zero result as an error).
+ int init();
+
+ /**
+ * Get the initial packet sequence number for this QueuePair.
+ * This is randomly generated on creation. It should not be confused
+ * with the remote side's PSN, which is set in #plumb().
+ */
+ uint32_t get_initial_psn() const { return initial_psn; };
+ /**
+ * Get the local queue pair number for this QueuePair.
+ * QPNs are analogous to UDP/TCP port numbers.
+ */
+ uint32_t get_local_qp_number() const { return qp->qp_num; };
+ /**
+ * Get the remote queue pair number for this QueuePair, as set in #plumb().
+ * QPNs are analogous to UDP/TCP port numbers.
+ */
+ int get_remote_qp_number(uint32_t *rqp) const;
+ /**
+ * Get the remote infiniband address for this QueuePair, as set in #plumb().
+ * LIDs are "local IDs" in infiniband terminology. They are short, locally
+ * routable addresses.
+ */
+ int get_remote_lid(uint16_t *lid) const;
+ /**
+ * Get the state of a QueuePair.
+ */
+ int get_state() const;
+ /**
+ * Return true if the queue pair is in an error state, false otherwise.
+ */
+ bool is_error() const;
+ // Accounting of in-flight send work requests (atomic counter below).
+ void add_tx_wr(uint32_t amt) { tx_wr_inflight += amt; }
+ void dec_tx_wr(uint32_t amt) { tx_wr_inflight -= amt; }
+ uint32_t get_tx_wr() const { return tx_wr_inflight; }
+ ibv_qp* get_qp() const { return qp; }
+ Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
+ Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
+ int to_dead();
+ bool is_dead() const { return dead; }
+
+ private:
+ CephContext *cct;
+ Infiniband& infiniband; // Infiniband to which this QP belongs
+ ibv_qp_type type; // QP type (IBV_QPT_RC, etc.)
+ ibv_context* ctxt; // device context of the HCA to use
+ int ib_physical_port;
+ ibv_pd* pd; // protection domain
+ ibv_srq* srq; // shared receive queue
+ ibv_qp* qp; // infiniband verbs QP handle
+ struct rdma_cm_id *cm_id;
+ Infiniband::CompletionQueue* txcq;
+ Infiniband::CompletionQueue* rxcq;
+ uint32_t initial_psn; // initial packet sequence number
+ uint32_t max_send_wr;
+ uint32_t max_recv_wr;
+ uint32_t q_key;
+ bool dead;
+ std::atomic<uint32_t> tx_wr_inflight = {0}; // counter for inflight Tx WQEs
+ };
+
+ public:
+ typedef MemoryManager::Cluster Cluster;
+ typedef MemoryManager::Chunk Chunk;
+ // Returns a new, initialised queue pair, or NULL when initialisation fails.
+ QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*,
+ ibv_qp_type type, struct rdma_cm_id *cm_id);
+ ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
+ // post rx buffers to srq, return number of buffers actually posted
+ // (when SRQ is not in use the buffers go to `qp`, which must then be non-NULL)
+ int post_chunks_to_rq(int num, ibv_qp *qp=NULL);
+ // Hand an rx chunk back to the memory manager's rx pool.
+ void post_chunk_to_pool(Chunk* chunk) {
+ get_memory_manager()->release_rx_buffer(chunk);
+ }
+ int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
+ // Both factories return NULL when the new object's init() fails.
+ CompletionChannel *create_comp_channel(CephContext *c);
+ CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);
+ uint8_t get_ib_physical_port() { return ib_physical_port; }
+ // Handshake message exchange over socket `sd`. send_msg returns 0 on
+ // success and a negative value on failure; recv_msg returns the byte
+ // count read, 0 on disconnect, or a negative value on failure.
+ int send_msg(CephContext *cct, int sd, IBSYNMsg& msg);
+ int recv_msg(CephContext *cct, int sd, IBSYNMsg& msg);
+ uint16_t get_lid() { return device->get_lid(); }
+ ibv_gid get_gid() { return device->get_gid(); }
+ MemoryManager* get_memory_manager() { return memory_manager; }
+ Device* get_device() { return device; }
+ int get_async_fd() { return device->ctxt->async_fd; }
+ bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);}
+ Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
+ static const char* wc_status_to_string(int status);
+ static const char* qp_state_string(int status);
+ uint32_t get_rx_queue_len() const { return rx_queue_len; }
+};
+
+#endif
diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
new file mode 100644
index 00000000..89be7428
--- /dev/null
+++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
@@ -0,0 +1,743 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "RDMAStack.h"
+
+ // Event callback that forwards to
+ // RDMAConnectedSocketImpl::handle_connection_established() until close()
+ // is called; afterwards events are silently ignored.
+ class C_handle_connection_established : public EventCallback {
+   RDMAConnectedSocketImpl *csi;
+   bool active = true;
+  public:
+   // explicit, matching C_handle_connection_read: a socket pointer must not
+   // convert implicitly into a callback object.
+   explicit C_handle_connection_established(RDMAConnectedSocketImpl *w) : csi(w) {}
+   void do_request(uint64_t fd) final {
+     if (active)
+       csi->handle_connection_established();
+   }
+   // Disarm the callback.
+   void close() {
+     active = false;
+   }
+ };
+
+ // Event callback that forwards to RDMAConnectedSocketImpl::handle_connection()
+ // until close() is called; afterwards events are silently ignored.
+ class C_handle_connection_read : public EventCallback {
+   RDMAConnectedSocketImpl *csi;
+   bool active = true;
+  public:
+   explicit C_handle_connection_read(RDMAConnectedSocketImpl *w): csi(w) {}
+   void do_request(uint64_t fd) final {
+     if (!active)
+       return;
+     csi->handle_connection();
+   }
+   // Disarm the callback.
+   void close() { active = false; }
+ };
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAConnectedSocketImpl "
+
+ // Construct a connected-socket object. Unless rdma_cm is enabled
+ // (ms_async_rdma_cm), this also creates the queue pair, fills in the local
+ // handshake message, creates the notification eventfd and registers the
+ // queue pair with the dispatcher.
+ RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+                                                  RDMAWorker *w)
+   : cct(cct), connected(0), error(0), infiniband(ib),
+     dispatcher(s), worker(w), lock("RDMAConnectedSocketImpl::lock"),
+     is_server(false), read_handler(new C_handle_connection_read(this)),
+     established_handler(new C_handle_connection_established(this)),
+     active(false), pending(false)
+ {
+   if (!cct->_conf->ms_async_rdma_cm) {
+     qp = infiniband->create_queue_pair(cct, s->get_tx_cq(), s->get_rx_cq(), IBV_QPT_RC, NULL);
+     // create_queue_pair() returns NULL when the queue pair cannot be
+     // initialised; fail loudly instead of dereferencing a null pointer.
+     if (!qp) {
+       lderr(cct) << __func__ << " queue pair create failed" << dendl;
+       ceph_abort();
+     }
+     my_msg.qpn = qp->get_local_qp_number();
+     my_msg.psn = qp->get_initial_psn();
+     my_msg.lid = infiniband->get_lid();
+     my_msg.peer_qpn = 0;
+     my_msg.gid = infiniband->get_gid();
+     notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+     if (notify_fd < 0) {
+       // Without the eventfd, notifications cannot be delivered; make the
+       // failure visible in the log.
+       lderr(cct) << __func__ << " failed to create notify eventfd: "
+                  << cpp_strerror(errno) << dendl;
+     }
+     dispatcher->register_qp(qp, this);
+     dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+     dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+   }
+ }
+
+ // Tear the socket down: run cleanup(), detach from the worker and
+ // dispatcher, return every still-held rx chunk to the pool, then close the
+ // notification and TCP descriptors.
+ RDMAConnectedSocketImpl::~RDMAConnectedSocketImpl()
+ {
+ ldout(cct, 20) << __func__ << " destruct." << dendl;
+ cleanup();
+ worker->remove_pending_conn(this);
+ dispatcher->erase_qpn(my_msg.qpn);
+
+ // Completions that were queued but never consumed still reference rx
+ // chunks through wr_id; give those chunks back.
+ for (unsigned i=0; i < wc.size(); ++i) {
+ dispatcher->post_chunk_to_pool(reinterpret_cast<Chunk*>(wc[i].wr_id));
+ }
+ // Same for chunks that were received but not fully read.
+ for (unsigned i=0; i < buffers.size(); ++i) {
+ dispatcher->post_chunk_to_pool(buffers[i]);
+ }
+
+ Mutex::Locker l(lock);
+ if (notify_fd >= 0)
+ ::close(notify_fd);
+ if (tcp_fd >= 0)
+ ::close(tcp_fd);
+ error = ECONNRESET;
+ }
+
+ // Queue a batch of work completions for this socket (under `lock`) and
+ // signal the reader through notify(). An empty queue adopts the incoming
+ // vector wholesale; otherwise the new entries are appended.
+ void RDMAConnectedSocketImpl::pass_wc(std::vector<ibv_wc> &&v)
+ {
+   Mutex::Locker l(lock);
+   if (!wc.empty()) {
+     wc.insert(wc.end(), v.begin(), v.end());
+   } else {
+     wc = std::move(v);
+   }
+   notify();
+ }
+
+ // Hand all queued work completions to the caller by swapping them into `w`
+ // (under `lock`). When nothing is queued, `w` is left untouched.
+ void RDMAConnectedSocketImpl::get_wc(std::vector<ibv_wc> &w)
+ {
+   Mutex::Locker l(lock);
+   if (!wc.empty())
+     w.swap(wc);
+ }
+
+ // Bring the queue pair to a usable state using the peer's handshake data
+ // (peer_msg): first transition to RTR, then to RTS. On the client side the
+ // socket is additionally marked connected and pending data is submitted.
+ // Returns 0 on success and -1 when either transition fails.
+ int RDMAConnectedSocketImpl::activate()
+ {
+   ibv_qp_attr qpa;
+   int r;
+
+   // now connect up the qps and switch to RTR
+   memset(&qpa, 0, sizeof(qpa));
+   qpa.qp_state = IBV_QPS_RTR;
+   qpa.path_mtu = IBV_MTU_1024;
+   qpa.dest_qp_num = peer_msg.qpn;
+   qpa.rq_psn = peer_msg.psn;
+   qpa.max_dest_rd_atomic = 1;
+   qpa.min_rnr_timer = 12;
+   //qpa.ah_attr.is_global = 0;
+   qpa.ah_attr.is_global = 1;
+   qpa.ah_attr.grh.hop_limit = 6;
+   qpa.ah_attr.grh.dgid = peer_msg.gid;
+
+   qpa.ah_attr.grh.sgid_index = infiniband->get_device()->get_gid_idx();
+
+   qpa.ah_attr.dlid = peer_msg.lid;
+   qpa.ah_attr.sl = cct->_conf->ms_async_rdma_sl;
+   qpa.ah_attr.grh.traffic_class = cct->_conf->ms_async_rdma_dscp;
+   qpa.ah_attr.src_path_bits = 0;
+   qpa.ah_attr.port_num = (uint8_t)(infiniband->get_ib_physical_port());
+
+   ldout(cct, 20) << __func__ << " Choosing gid_index " << (int)qpa.ah_attr.grh.sgid_index << ", sl " << (int)qpa.ah_attr.sl << dendl;
+
+   r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE |
+       IBV_QP_AV |
+       IBV_QP_PATH_MTU |
+       IBV_QP_DEST_QPN |
+       IBV_QP_RQ_PSN |
+       IBV_QP_MIN_RNR_TIMER |
+       IBV_QP_MAX_DEST_RD_ATOMIC);
+   if (r) {
+     // ibv_modify_qp() reports the failure reason through its return value,
+     // so log that rather than errno.
+     lderr(cct) << __func__ << " failed to transition to RTR state: "
+                << cpp_strerror(r) << dendl;
+     return -1;
+   }
+
+   ldout(cct, 20) << __func__ << " transition to RTR state successfully." << dendl;
+
+   // now move to RTS
+   qpa.qp_state = IBV_QPS_RTS;
+
+   // How long to wait before retrying if packet lost or server dead.
+   // Supposedly the timeout is 4.096us*2^timeout. However, the actual
+   // timeout appears to be 4.096us*2^(timeout+1), so the setting
+   // below creates a 135ms timeout.
+   qpa.timeout = 14;
+
+   // How many times to retry after timeouts before giving up.
+   qpa.retry_cnt = 7;
+
+   // How many times to retry after RNR (receiver not ready) condition
+   // before giving up. Occurs when the remote side has not yet posted
+   // a receive request.
+   qpa.rnr_retry = 7; // 7 is infinite retry.
+   qpa.sq_psn = my_msg.psn;
+   qpa.max_rd_atomic = 1;
+
+   r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE |
+       IBV_QP_TIMEOUT |
+       IBV_QP_RETRY_CNT |
+       IBV_QP_RNR_RETRY |
+       IBV_QP_SQ_PSN |
+       IBV_QP_MAX_QP_RD_ATOMIC);
+   if (r) {
+     lderr(cct) << __func__ << " failed to transition to RTS state: "
+                << cpp_strerror(r) << dendl;
+     return -1;
+   }
+
+   // the queue pair should be ready to use once the client has finished
+   // setting up their end.
+   ldout(cct, 20) << __func__ << " transition to RTS state successfully." << dendl;
+   ldout(cct, 20) << __func__ << " QueuePair: " << qp << " with qp:" << qp->get_qp() << dendl;
+
+   if (!is_server) {
+     connected = 1; //indicate successfully
+     ldout(cct, 20) << __func__ << " handle fake send, wake it up. QP: " << my_msg.qpn << dendl;
+     submit(false);
+   }
+   active = true;
+
+   return 0;
+ }
+
+ // Open the TCP side channel to `peer_addr` that carries the RDMA handshake.
+ // In non-blocking mode the handshake continues from the event loop once the
+ // socket becomes ready; otherwise the first handshake message is sent
+ // before returning. Returns 0 on success and a negative value on failure.
+ int RDMAConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+   ldout(cct, 20) << __func__ << " nonblock:" << opts.nonblock << ", nodelay:"
+                  << opts.nodelay << ", rbuf_size: " << opts.rcbuf_size << dendl;
+   NetHandler net(cct);
+
+   // we construct a socket to transport ib sync message
+   // but we shouldn't block in tcp connecting
+   if (opts.nonblock) {
+     tcp_fd = net.nonblock_connect(peer_addr, opts.connect_bind_addr);
+   } else {
+     tcp_fd = net.connect(peer_addr, opts.connect_bind_addr);
+   }
+
+   if (tcp_fd < 0) {
+     // Never report a failed connect as 0: fall back to the negative return
+     // value if errno happens to be clear.
+     const int err = errno;
+     return err ? -err : tcp_fd;
+   }
+
+   int r = net.set_socket_options(tcp_fd, opts.nodelay, opts.rcbuf_size);
+   if (r < 0) {
+     // Save errno before ::close() can overwrite it.
+     const int err = errno;
+     ::close(tcp_fd);
+     tcp_fd = -1;
+     return err ? -err : r;
+   }
+
+   ldout(cct, 20) << __func__ << " tcp_fd: " << tcp_fd << dendl;
+   net.set_priority(tcp_fd, opts.priority, peer_addr.get_family());
+   r = 0;
+   if (opts.nonblock) {
+     worker->center.create_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE , established_handler);
+   } else {
+     r = handle_connection_established(false);
+   }
+   return r;
+ }
+
+ // Called once the TCP side channel is usable: sends the first handshake
+ // message to the server and arms the read handler for the reply.
+ // When `need_set_fault` is true, fault() is invoked on every failure path.
+ // Returns 0 on success, -1 if the socket is already connected, or the
+ // negative result of send_msg().
+ int RDMAConnectedSocketImpl::handle_connection_established(bool need_set_fault) {
+   ldout(cct, 20) << __func__ << " start " << dendl;
+   // delete read event
+   worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+
+   // Common failure exit: optionally flag the fault, then pass the code on.
+   auto bail = [this, need_set_fault](int code) {
+     if (need_set_fault) {
+       fault();
+     }
+     return code;
+   };
+
+   if (connected == 1) {
+     ldout(cct, 1) << __func__ << " warnning: logic failed " << dendl;
+     return bail(-1);
+   }
+
+   // send handshake msg to server
+   my_msg.peer_qpn = 0;
+   const int rc = infiniband->send_msg(cct, tcp_fd, my_msg);
+   if (rc < 0) {
+     ldout(cct, 1) << __func__ << " send handshake msg failed." << rc << dendl;
+     return bail(rc);
+   }
+
+   worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+   ldout(cct, 20) << __func__ << " finish " << dendl;
+   return 0;
+ }
+
+ // Read handler for the TCP side channel: receives one handshake message
+ // from the peer and advances the handshake.
+ //  - client: activates the queue pair (if not yet connected) and replies
+ //    with its own message carrying the server's qpn;
+ //  - server, message with peer_qpn == 0 (syn): activates the queue pair
+ //    and replies;
+ //  - server, message with peer_qpn != 0 (ack): marks the socket connected
+ //    and submits pending data.
+ // Receive/send failures are counted as handshake errors and raise fault().
+ void RDMAConnectedSocketImpl::handle_connection() {
+ ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " tcp_fd: " << tcp_fd << " notify_fd: " << notify_fd << dendl;
+ int r = infiniband->recv_msg(cct, tcp_fd, peer_msg);
+ // r == 0 (disconnect) and negative values other than -EAGAIN are treated
+ // as handshake failures; -EAGAIN just returns to wait for more data.
+ if (r <= 0) {
+ if (r != -EAGAIN) {
+ dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+ ldout(cct, 1) << __func__ << " recv handshake msg failed." << dendl;
+ fault();
+ }
+ return;
+ }
+
+ // A handshake message on an already connected socket is unexpected.
+ if (1 == connected) {
+ ldout(cct, 1) << __func__ << " warnning: logic failed: read len: " << r << dendl;
+ fault();
+ return;
+ }
+
+ if (!is_server) {// syn + ack from server
+ my_msg.peer_qpn = peer_msg.qpn;
+ ldout(cct, 20) << __func__ << " peer msg : < " << peer_msg.qpn << ", " << peer_msg.psn
+ << ", " << peer_msg.lid << ", " << peer_msg.peer_qpn << "> " << dendl;
+ if (!connected) {
+ r = activate();
+ // activation failure is treated as fatal
+ ceph_assert(!r);
+ }
+ notify();
+ r = infiniband->send_msg(cct, tcp_fd, my_msg);
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " send client ack failed." << dendl;
+ dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+ fault();
+ }
+ } else {
+ if (peer_msg.peer_qpn == 0) {// syn from client
+ if (active) {
+ ldout(cct, 10) << __func__ << " server is already active." << dendl;
+ return ;
+ }
+ r = activate();
+ // activation failure is treated as fatal
+ ceph_assert(!r);
+ r = infiniband->send_msg(cct, tcp_fd, my_msg);
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " server ack failed." << dendl;
+ dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+ fault();
+ return ;
+ }
+ } else { // ack from client
+ connected = 1;
+ ldout(cct, 10) << __func__ << " handshake of rdma is done. server connected: " << connected << dendl;
+ //cleanup();
+ submit(false);
+ notify();
+ }
+ }
+ }
+
+ // Copy up to `len` received bytes into `buf`.
+ //
+ // Data is taken first from chunks left over from earlier calls (`buffers`)
+ // and then from newly delivered work completions. Chunks that cannot be
+ // consumed completely are kept in `buffers` for the next call.
+ //
+ // Returns the number of bytes copied when that is > 0. Otherwise returns
+ // -EAGAIN when the socket is not active/connected or no data is available,
+ // or -error when an error (e.g. ECONNRESET after a remote close) is set.
+ ssize_t RDMAConnectedSocketImpl::read(char* buf, size_t len)
+ {
+ // Drain the notification eventfd; the value read is only logged.
+ uint64_t i = 0;
+ int r = ::read(notify_fd, &i, sizeof(i));
+ ldout(cct, 20) << __func__ << " notify_fd : " << i << " in " << my_msg.qpn << " r = " << r << dendl;
+
+ if (!active) {
+ ldout(cct, 1) << __func__ << " when ib not active. len: " << len << dendl;
+ return -EAGAIN;
+ }
+
+ if (0 == connected) {
+ ldout(cct, 1) << __func__ << " when ib not connected. len: " << len <<dendl;
+ return -EAGAIN;
+ }
+ // NOTE: from here on the local `read` shadows ::read.
+ ssize_t read = 0;
+ if (!buffers.empty())
+ read = read_buffers(buf,len);
+
+ std::vector<ibv_wc> cqe;
+ get_wc(cqe);
+ if (cqe.empty()) {
+ // Leftover data remains: re-arm the notification so the caller comes back.
+ if (!buffers.empty()) {
+ notify();
+ }
+ if (read > 0) {
+ return read;
+ }
+ if (error) {
+ return -error;
+ } else {
+ return -EAGAIN;
+ }
+ }
+
+ ldout(cct, 20) << __func__ << " poll queue got " << cqe.size() << " responses. QP: " << my_msg.qpn << dendl;
+ for (size_t i = 0; i < cqe.size(); ++i) {
+ ibv_wc* response = &cqe[i];
+ ceph_assert(response->status == IBV_WC_SUCCESS);
+ // wr_id carries the Chunk pointer stashed when the buffer was posted.
+ Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+ ldout(cct, 25) << __func__ << " chunk length: " << response->byte_len << " bytes." << chunk << dendl;
+ chunk->prepare_read(response->byte_len);
+ worker->perf_logger->inc(l_msgr_rdma_rx_bytes, response->byte_len);
+ // A zero-length completion is the peer's close notification.
+ if (response->byte_len == 0) {
+ dispatcher->perf_logger->inc(l_msgr_rdma_rx_fin);
+ if (connected) {
+ error = ECONNRESET;
+ ldout(cct, 20) << __func__ << " got remote close msg..." << dendl;
+ }
+ dispatcher->post_chunk_to_pool(chunk);
+ } else {
+ if (read == (ssize_t)len) {
+ // Caller's buffer is already full: keep the whole chunk for later.
+ buffers.push_back(chunk);
+ ldout(cct, 25) << __func__ << " buffers add a chunk: " << response->byte_len << dendl;
+ } else if (read + response->byte_len > (ssize_t)len) {
+ // Chunk only partially fits: copy what fits and keep the remainder.
+ read += chunk->read(buf+read, (ssize_t)len-read);
+ buffers.push_back(chunk);
+ ldout(cct, 25) << __func__ << " buffers add a chunk: " << chunk->get_offset() << ":" << chunk->get_bound() << dendl;
+ } else {
+ // Chunk fits entirely: copy it and return the buffer to the pool.
+ read += chunk->read(buf+read, response->byte_len);
+ dispatcher->post_chunk_to_pool(chunk);
+ update_post_backlog();
+ }
+ }
+ }
+
+ worker->perf_logger->inc(l_msgr_rdma_rx_chunks, cqe.size());
+ if (is_server && connected == 0) {
+ ldout(cct, 20) << __func__ << " we do not need last handshake, QP: " << my_msg.qpn << " peer QP: " << peer_msg.qpn << dendl;
+ connected = 1; //if so, we don't need the last handshake
+ cleanup();
+ submit(false);
+ }
+
+ // Leftover data remains: re-arm the notification so the caller comes back.
+ if (!buffers.empty()) {
+ notify();
+ }
+
+ if (read == 0 && error)
+ return -error;
+ return read == 0 ? -EAGAIN : read;
+ }
+
+ // Copy up to `len` bytes into `buf` from the chunks kept in `buffers`,
+ // in order. Chunks that have been read completely are returned to the pool
+ // and removed from `buffers`; a partially read chunk stays at the front.
+ // Returns the number of bytes copied.
+ ssize_t RDMAConnectedSocketImpl::read_buffers(char* buf, size_t len)
+ {
+ size_t read = 0, tmp = 0;
+ auto c = buffers.begin();
+ for (; c != buffers.end() ; ++c) {
+ tmp = (*c)->read(buf+read, len-read);
+ read += tmp;
+ ldout(cct, 25) << __func__ << " this iter read: " << tmp << " bytes." << " offset: " << (*c)->get_offset() << " ,bound: " << (*c)->get_bound() << ". Chunk:" << *c << dendl;
+ if ((*c)->over()) {
+ dispatcher->post_chunk_to_pool(*c);
+ update_post_backlog();
+ ldout(cct, 25) << __func__ << " one chunk over." << dendl;
+ }
+ if (read == len) {
+ break;
+ }
+ }
+
+ // After a `break`, `c` still points at the last chunk touched; include it
+ // in the erased range only if it was fully consumed (and thus already
+ // returned to the pool above).
+ if (c != buffers.end() && (*c)->over())
+ ++c;
+ buffers.erase(buffers.begin(), c);
+ ldout(cct, 25) << __func__ << " got " << read << " bytes, buffers size: " << buffers.size() << dendl;
+ return read;
+ }
+
+ // Take one received chunk without copying and queue the remaining
+ // completions in `buffers`.
+ //
+ // Returns -error if the socket has failed, -EAGAIN when no data is available,
+ // otherwise the `bound` of the chunk that was taken.
+ //
+ // NOTE(review): `data` is never filled in and the taken chunk is never
+ // released (see the FIXMEs below) -- this path looks unfinished; confirm
+ // before relying on it.
+ ssize_t RDMAConnectedSocketImpl::zero_copy_read(bufferptr &data)
+ {
+   if (error)
+     return -error;
+   ssize_t size = 0;
+   bool loaded = false;
+
+   auto iter = buffers.begin();
+   if (iter != buffers.end()) {
+     Chunk *queued = *iter;
+     // FIXME need to handle release
+     // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband);
+     buffers.erase(iter);  // iter is invalid from here on and must not be touched
+     loaded = true;
+     size = queued->bound;
+   }
+
+   std::vector<ibv_wc> cqe;
+   get_wc(cqe);
+   if (cqe.empty())
+     return size == 0 ? -EAGAIN : size;
+
+   ldout(cct, 20) << __func__ << " pool completion queue got " << cqe.size() << " responses."<< dendl;
+
+   for (size_t i = 0; i < cqe.size(); ++i) {
+     // Read the completions that get_wc() actually returned (the original
+     // indexed an uninitialised local array here).
+     ibv_wc *response = &cqe[i];
+     Chunk *chunk = reinterpret_cast<Chunk*>(response->wr_id);
+     chunk->prepare_read(response->byte_len);
+     if (!loaded && i == 0) {
+       // FIXME need to handle release
+       // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband);
+       size = chunk->bound;
+       continue;
+     }
+     buffers.push_back(chunk);
+   }
+
+   if (size == 0)
+     return -EAGAIN;
+   return size;
+ }
+
+ // Queue the contents of `bl` for transmission and try to submit them.
+ //
+ // Returns 0 for an empty list, -EPIPE / -error when the socket has failed,
+ // a negative value from submit() other than -EAGAIN, or otherwise the number
+ // of bytes taken from `bl` (which is emptied into pending_bl).
+ ssize_t RDMAConnectedSocketImpl::send(bufferlist &bl, bool more)
+ {
+   if (error) {
+     if (active)
+       return -error;
+     return -EPIPE;
+   }
+
+   const size_t bytes = bl.length();
+   if (bytes == 0)
+     return 0;
+
+   {
+     Mutex::Locker l(lock);
+     pending_bl.claim_append(bl);
+     if (!connected) {
+       // Not connected yet: the data stays queued in pending_bl, the caller
+       // is told it was accepted.
+       ldout(cct, 20) << __func__ << " fake send to upper, QP: " << my_msg.qpn << dendl;
+       return bytes;
+     }
+   }
+
+   ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << dendl;
+   const ssize_t r = submit(more);
+   const bool hard_failure = (r < 0) && (r != -EAGAIN);
+   if (hard_failure)
+     return r;
+   return bytes;
+ }
+
+ // Move as much of pending_bl as possible into tx chunks and post them.
+ //
+ // Buffers that infiniband->is_tx_buffer() recognises are sent through their
+ // existing tx chunk; every other run of buffers is copied into chunks obtained
+ // from worker->get_reged_mem(). Data that could not be placed remains in
+ // pending_bl.
+ //
+ // `more` is not read anywhere in this function.
+ //
+ // Returns -error if the socket has failed, 0 when nothing was pending or
+ // everything was posted, -EAGAIN when no chunk could be filled or data is
+ // still left in pending_bl, or the negative result of post_work_request().
+ ssize_t RDMAConnectedSocketImpl::submit(bool more)
+ {
+ if (error)
+ return -error;
+ Mutex::Locker l(lock);
+ size_t bytes = pending_bl.length();
+ ldout(cct, 20) << __func__ << " we need " << bytes << " bytes. iov size: "
+ << pending_bl.buffers().size() << dendl;
+ if (!bytes)
+ return 0;
+
+ // Copies the buffers in [start, end) into chunks appended to tx_buffers.
+ // `start` is advanced past every buffer that was copied completely.
+ // Returns the number of bytes copied: 0 if no chunk could be reserved, and
+ // less than `bytes` if the reserved chunks filled up first.
+ auto fill_tx_via_copy = [this](std::vector<Chunk*> &tx_buffers,
+ unsigned bytes,
+ auto& start,
+ const auto& end) -> unsigned {
+ ceph_assert(start != end);
+ // index of the first chunk that get_reged_mem() is about to append
+ auto chunk_idx = tx_buffers.size();
+ int ret = worker->get_reged_mem(this, tx_buffers, bytes);
+ if (ret == 0) {
+ ldout(cct, 1) << __func__ << " no enough buffers in worker " << worker << dendl;
+ worker->perf_logger->inc(l_msgr_rdma_tx_no_mem);
+ return 0;
+ }
+
+ unsigned total_copied = 0;
+ Chunk *current_chunk = tx_buffers[chunk_idx];
+ while (start != end) {
+ const uintptr_t addr = reinterpret_cast<uintptr_t>(start->c_str());
+ unsigned copied = 0;
+ while (copied < start->length()) {
+ uint32_t r = current_chunk->write((char*)addr+copied, start->length() - copied);
+ copied += r;
+ total_copied += r;
+ bytes -= r;
+ if (current_chunk->full()){
+ // out of chunks: stop here, `start` still refers to the current buffer
+ if (++chunk_idx == tx_buffers.size())
+ return total_copied;
+ current_chunk = tx_buffers[chunk_idx];
+ }
+ }
+ ++start;
+ }
+ ceph_assert(bytes == 0);
+ return total_copied;
+ };
+
+ std::vector<Chunk*> tx_buffers;
+ // `it` scans pending_bl; `copy_it` marks the first buffer not yet placed in
+ // a tx chunk; need_reserve_bytes is the total length of [copy_it, it).
+ auto it = std::cbegin(pending_bl.buffers());
+ auto copy_it = it;
+ unsigned total = 0;
+ unsigned need_reserve_bytes = 0;
+ while (it != pending_bl.buffers().end()) {
+ if (infiniband->is_tx_buffer(it->raw_c_str())) {
+ // flush the run of ordinary buffers seen so far, to keep the byte order
+ if (need_reserve_bytes) {
+ unsigned copied = fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it);
+ total += copied;
+ // short copy: send what was placed and leave the rest pending
+ if (copied < need_reserve_bytes)
+ goto sending;
+ need_reserve_bytes = 0;
+ }
+ ceph_assert(copy_it == it);
+ tx_buffers.push_back(infiniband->get_tx_chunk_by_buffer(it->raw_c_str()));
+ total += it->length();
+ ++copy_it;
+ } else {
+ need_reserve_bytes += it->length();
+ }
+ ++it;
+ }
+ if (need_reserve_bytes)
+ total += fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it);
+
+ sending:
+ if (total == 0)
+ return -EAGAIN;
+ ceph_assert(total <= pending_bl.length());
+ bufferlist swapped;
+ if (total < pending_bl.length()) {
+ worker->perf_logger->inc(l_msgr_rdma_tx_parital_mem);
+ // keep only the bytes past `total` in pending_bl
+ pending_bl.splice(total, pending_bl.length()-total, &swapped);
+ pending_bl.swap(swapped);
+ } else {
+ pending_bl.clear();
+ }
+
+ ldout(cct, 20) << __func__ << " left bytes: " << pending_bl.length() << " in buffers "
+ << pending_bl.buffers().size() << " tx chunks " << tx_buffers.size() << dendl;
+
+ int r = post_work_request(tx_buffers);
+ if (r < 0)
+ return r;
+
+ // NOTE: `bytes` is the pending length sampled on entry, not the amount posted.
+ ldout(cct, 20) << __func__ << " finished sending " << bytes << " bytes." << dendl;
+ return pending_bl.length() ? -EAGAIN : 0;
+ }
+
+ // Post one signaled IBV_WR_SEND work request per chunk in tx_buffers, chained
+ // into a single ibv_post_send() call.
+ //
+ // Returns 0 on success (or when tx_buffers is empty) and a negative errno
+ // value when ibv_post_send() fails.
+ int RDMAConnectedSocketImpl::post_work_request(std::vector<Chunk*> &tx_buffers)
+ {
+   if (tx_buffers.empty())
+     return 0;  // nothing to post; also keeps tx_buffers[0] below valid
+
+   ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " " << tx_buffers[0] << dendl;
+
+   // Heap-backed and value-initialised (all zero) instead of variable-length
+   // stack arrays, which are a compiler extension in C++. Neither vector is
+   // resized afterwards, so pointers into them stay valid.
+   const size_t count = tx_buffers.size();
+   std::vector<ibv_sge> isge(count);
+   std::vector<ibv_send_wr> iswr(count);
+   ibv_send_wr *pre_wr = nullptr;
+   uint32_t num = 0;
+
+   for (size_t idx = 0; idx < count; ++idx) {
+     Chunk *chunk = tx_buffers[idx];
+     isge[idx].addr = reinterpret_cast<uint64_t>(chunk->buffer);
+     isge[idx].length = chunk->get_offset();
+     isge[idx].lkey = chunk->mr->lkey;
+     ldout(cct, 25) << __func__ << " sending buffer: " << chunk << " length: " << isge[idx].length << dendl;
+
+     // wr_id carries the chunk pointer
+     iswr[idx].wr_id = reinterpret_cast<uint64_t>(chunk);
+     iswr[idx].next = nullptr;
+     iswr[idx].sg_list = &isge[idx];
+     iswr[idx].num_sge = 1;
+     iswr[idx].opcode = IBV_WR_SEND;
+     iswr[idx].send_flags = IBV_SEND_SIGNALED;
+     /*if (isge[current_sge].length < infiniband->max_inline_data) {
+       iswr[current_swr].send_flags = IBV_SEND_INLINE;
+       ldout(cct, 20) << __func__ << " send_inline." << dendl;
+     }*/
+
+     ++num;
+     worker->perf_logger->inc(l_msgr_rdma_tx_bytes, isge[idx].length);
+     if (pre_wr)
+       pre_wr->next = &iswr[idx];
+     pre_wr = &iswr[idx];
+   }
+
+   ibv_send_wr *bad_tx_work_request = nullptr;
+   // ibv_post_send() reports the failure reason through its return value;
+   // errno is only a fallback for providers that return a non-positive code.
+   const int rc = ibv_post_send(qp->get_qp(), iswr.data(), &bad_tx_work_request);
+   if (rc) {
+     int err = rc > 0 ? rc : errno;
+     if (err <= 0)
+       err = EIO;  // never report success for a failed post
+     ldout(cct, 1) << __func__ << " failed to send data"
+                   << " (most probably should be peer not ready): "
+                   << cpp_strerror(err) << dendl;
+     worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+     return -err;
+   }
+   qp->add_tx_wr(num);
+   worker->perf_logger->inc(l_msgr_rdma_tx_chunks, tx_buffers.size());
+   ldout(cct, 20) << __func__ << " qp state is " << Infiniband::qp_state_string(qp->get_state()) << dendl;
+   return 0;
+ }
+
+ // Post a signaled send with no scatter/gather entries. The work request id
+ // is the QueuePair pointer rather than a Chunk. A failed post is logged and
+ // counted, but not reported to the caller.
+ void RDMAConnectedSocketImpl::fin() {
+   ibv_send_wr wr;
+   // FIPS zeroization audit 20191115: this memset is not security related.
+   memset(&wr, 0, sizeof(wr));
+
+   wr.wr_id = reinterpret_cast<uint64_t>(qp);
+   wr.num_sge = 0;
+   wr.opcode = IBV_WR_SEND;
+   wr.send_flags = IBV_SEND_SIGNALED;
+   ibv_send_wr* bad_tx_work_request = nullptr;
+   // ibv_post_send() returns the error code itself; errno is only a fallback
+   // for providers that return a non-positive value.
+   const int rc = ibv_post_send(qp->get_qp(), &wr, &bad_tx_work_request);
+   if (rc) {
+     const int err = rc > 0 ? rc : errno;
+     ldout(cct, 1) << __func__ << " failed to send message="
+                   << " ibv_post_send failed(most probably should be peer not ready): "
+                   << cpp_strerror(err) << dendl;
+     worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+     return ;
+   }
+   qp->add_tx_wr(1);
+ }
+
+ // Tear down the handshake handlers: close and delete the read handler
+ // (removing the file event registered for tcp_fd) and the established
+ // handler. Handlers that are already null are left alone.
+ void RDMAConnectedSocketImpl::cleanup() {
+   if (read_handler && tcp_fd >= 0) {
+     auto *reader = static_cast<C_handle_connection_read*>(read_handler);
+     reader->close();
+     // the file event is removed through the worker's event center
+     auto unregister_fd = [this]() {
+       worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+     };
+     worker->center.submit_to(worker->center.get_id(), unregister_fd, false);
+     delete read_handler;
+     read_handler = nullptr;
+   }
+
+   if (established_handler) {
+     auto *established = static_cast<C_handle_connection_established*>(established_handler);
+     established->close();
+     delete established_handler;
+     established_handler = nullptr;
+   }
+ }
+
+ // Signal notify_fd by adding 1 to it; asserts that the full 8 bytes were
+ // written.
+ void RDMAConnectedSocketImpl::notify()
+ {
+   // notify_fd is an event fd (man eventfd): the value written has to be a
+   // 64-bit integer.
+   const uint64_t increment = 1;
+   const ssize_t written = write(notify_fd, &increment, sizeof(increment));
+
+   ceph_assert(written == static_cast<ssize_t>(sizeof(increment)));
+ }
+
+ // Send a fin (unless the socket has already failed), then mark the socket
+ // as reset and inactive.
+ void RDMAConnectedSocketImpl::shutdown()
+ {
+   const bool healthy = (error == 0);
+   if (healthy) {
+     fin();
+   }
+   error = ECONNRESET;
+   active = false;
+ }
+
+ // Same sequence as shutdown(): fin if the socket has not failed yet, then
+ // record ECONNRESET and clear the active flag.
+ void RDMAConnectedSocketImpl::close()
+ {
+   const bool healthy = (error == 0);
+   if (healthy) {
+     fin();
+   }
+   error = ECONNRESET;
+   active = false;
+ }
+
+ // Put the socket into the failed state (ECONNRESET), set connected to 1 and
+ // signal notify_fd.
+ void RDMAConnectedSocketImpl::fault()
+ {
+   ldout(cct, 1) << __func__ << " tcp fd " << tcp_fd << dendl;
+   // Disabled in the original: killing the queue pair here.
+   /*if (qp) {
+     qp->to_dead();
+     qp = NULL;
+   }*/
+   error = ECONNRESET;
+   connected = 1;
+
+   notify();
+ }
+
+ // Adopt an accepted TCP descriptor as the handshake channel, mark this
+ // socket as the server side and register read_handler for the descriptor
+ // through the worker's event center.
+ void RDMAConnectedSocketImpl::set_accept_fd(int sd)
+ {
+   is_server = true;
+   tcp_fd = sd;
+
+   auto watch_fd = [this]() {
+     worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+   };
+   worker->center.submit_to(worker->center.get_id(), watch_fd, true);
+ }
+
+ // Ask infiniband to post `num` chunks to this QP and add the difference
+ // between `num` and its return value to post_backlog.
+ void RDMAConnectedSocketImpl::post_chunks_to_rq(int num)
+ {
+   const int posted = infiniband->post_chunks_to_rq(num, qp->get_qp());
+   post_backlog += num - posted;
+ }
+
+ // Retry posting the chunks recorded in post_backlog.
+ //
+ // RDMAConnectedSocketImpl::post_chunks_to_rq() grows the backlog by
+ // `num - <return value>`, i.e. the return value of the underlying
+ // post_chunks_to_rq() is treated as the number of chunks actually posted.
+ // The backlog therefore has to shrink by exactly that number. The previous
+ // expression `post_backlog -= post_backlog - posted` left the backlog equal
+ // to the number posted: a fully successful retry kept the whole backlog and
+ // a completely failed retry cleared it.
+ void RDMAConnectedSocketImpl::update_post_backlog()
+ {
+   if (post_backlog)
+     post_backlog -= dispatcher->post_chunks_to_rq(post_backlog, qp->get_qp());
+ }
diff --git a/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
new file mode 100644
index 00000000..432c2d2b
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
@@ -0,0 +1,183 @@
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPConnectedSocketImpl "
+
+#define TIMEOUT_MS 3000
+#define RETRY_COUNT 7
+
+ // Build an iWARP connected socket.
+ //
+ // info != nullptr: server side. The cm id, event channel and remote QP number
+ // are taken from `info`, a queue pair is allocated, and cm_con_handler is
+ // registered for the channel's fd through the worker's event center.
+ // info == nullptr: client side. A fresh event channel and cm id are created;
+ // the rest happens in try_connect() / handle_cm_connection().
+ //
+ // NOTE(review): the results of rdma_create_event_channel() and
+ // rdma_create_id() are not checked here.
+ RDMAIWARPConnectedSocketImpl::RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+ RDMAWorker *w, RDMACMInfo *info)
+ : RDMAConnectedSocketImpl(cct, ib, s, w), cm_con_handler(new C_handle_cm_connection(this))
+ {
+ status = IDLE;
+ notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+ if (info) {
+ is_server = true;
+ cm_id = info->cm_id;
+ cm_channel = info->cm_channel;
+ status = RDMA_ID_CREATED;
+ remote_qpn = info->qp_num;
+ if (alloc_resource()) {
+ // no queue pair: mark the socket closed so the destructor does not wait
+ close_notify();
+ return;
+ }
+ worker->center.submit_to(worker->center.get_id(), [this]() {
+ worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+ status = CHANNEL_FD_CREATED;
+ }, false);
+ // NOTE(review): this store and the one inside the lambda above both write
+ // `status`; which one wins depends on when submit_to() runs the lambda.
+ status = RESOURCE_ALLOCATED;
+ local_qpn = qp->get_local_qp_number();
+ my_msg.qpn = local_qpn;
+ } else {
+ is_server = false;
+ cm_channel = rdma_create_event_channel();
+ rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP);
+ status = RDMA_ID_CREATED;
+ ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+ }
+ }
+
+ // Blocks until close_notify() has set `closed`, then destroys the cm id and
+ // its event channel if they were created.
+ RDMAIWARPConnectedSocketImpl::~RDMAIWARPConnectedSocketImpl() {
+ ldout(cct, 20) << __func__ << " destruct." << dendl;
+ std::unique_lock l(close_mtx);
+ // wait for close_notify(); the predicate guards against spurious wakeups
+ close_condition.wait(l, [&] { return closed; });
+ if (status >= RDMA_ID_CREATED) {
+ rdma_destroy_id(cm_id);
+ rdma_destroy_event_channel(cm_channel);
+ }
+ }
+
+ // Start an outgoing connection: register cm_con_handler for the cm channel
+ // fd and kick off address resolution for peer_addr.
+ // Returns 0 if the resolution request was issued, -1 otherwise. `opts` is
+ // not used.
+ int RDMAIWARPConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+   worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+   status = CHANNEL_FD_CREATED;
+
+   struct sockaddr *dst = const_cast<struct sockaddr*>(peer_addr.get_sockaddr());
+   const int rc = rdma_resolve_addr(cm_id, NULL, dst, TIMEOUT_MS);
+   if (rc == 0)
+     return 0;
+
+   lderr(cct) << __func__ << " failed to resolve addr" << dendl;
+   return -1;
+ }
+
+ // Mark the socket reset and inactive, disconnect the cm id if the
+ // connection had been established, then run close_notify().
+ void RDMAIWARPConnectedSocketImpl::close() {
+   error = ECONNRESET;
+   active = false;
+
+   const bool was_established = (status >= CONNECTED);
+   if (was_established)
+     rdma_disconnect(cm_id);
+
+   close_notify();
+ }
+
+ // Mark the socket reset and inactive. Unlike close(), this neither
+ // disconnects the cm id nor calls close_notify().
+ void RDMAIWARPConnectedSocketImpl::shutdown() {
+ error = ECONNRESET;
+ active = false;
+ }
+
+ // Fetch one event from the cm channel and advance the connection state:
+ // address resolved -> resolve route; route resolved -> allocate the queue
+ // pair and connect; established -> activate (client side only); error and
+ // reject events -> mark the connection refused; disconnected -> close.
+ // Each event that was fetched is acknowledged before returning.
+ void RDMAIWARPConnectedSocketImpl::handle_cm_connection() {
+   struct rdma_cm_event *event = nullptr;
+   // Without an event there is nothing to dispatch or acknowledge; bail out
+   // instead of dereferencing an unset pointer.
+   if (rdma_get_cm_event(cm_channel, &event)) {
+     lderr(cct) << __func__ << " failed to get cm event (errno=" << errno << ")" << dendl;
+     return;
+   }
+   ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(event->event)
+                  << " (cm id: " << cm_id << ")" << dendl;
+   struct rdma_conn_param cm_params;
+   switch (event->event) {
+     case RDMA_CM_EVENT_ADDR_RESOLVED:
+       status = ADDR_RESOLVED;
+       if (rdma_resolve_route(cm_id, TIMEOUT_MS)) {
+         lderr(cct) << __func__ << " failed to resolve rdma addr" << dendl;
+         // record the failure like the other failing branches do, so the
+         // wake-up below is not a no-op for the upper layer
+         connected = -ECONNREFUSED;
+         notify();
+       }
+       break;
+
+     case RDMA_CM_EVENT_ROUTE_RESOLVED:
+       status = ROUTE_RESOLVED;
+       if (alloc_resource()) {
+         lderr(cct) << __func__ << " failed to alloc resource while resolving the route" << dendl;
+         connected = -ECONNREFUSED;
+         notify();
+         break;
+       }
+       local_qpn = qp->get_local_qp_number();
+       my_msg.qpn = local_qpn;
+
+       // FIPS zeroization audit 20191115: this memset is not security related.
+       memset(&cm_params, 0, sizeof(cm_params));
+       cm_params.retry_count = RETRY_COUNT;
+       cm_params.qp_num = local_qpn;
+       if (rdma_connect(cm_id, &cm_params)) {
+         lderr(cct) << __func__ << " failed to connect remote rdma port" << dendl;
+         connected = -ECONNREFUSED;
+         notify();
+       }
+       break;
+
+     case RDMA_CM_EVENT_ESTABLISHED:
+       ldout(cct, 20) << __func__ << " qp_num=" << cm_id->qp->qp_num << dendl;
+       status = CONNECTED;
+       if (!is_server) {
+         remote_qpn = event->param.conn.qp_num;
+         activate();
+         notify();
+       }
+       break;
+
+     case RDMA_CM_EVENT_ADDR_ERROR:
+     case RDMA_CM_EVENT_ROUTE_ERROR:
+     case RDMA_CM_EVENT_CONNECT_ERROR:
+     case RDMA_CM_EVENT_UNREACHABLE:
+     case RDMA_CM_EVENT_REJECTED:
+       lderr(cct) << __func__ << " rdma connection rejected" << dendl;
+       connected = -ECONNREFUSED;
+       notify();
+       break;
+
+     case RDMA_CM_EVENT_DISCONNECTED:
+       status = DISCONNECTED;
+       close_notify();
+       if (!error) {
+         error = ECONNRESET;
+         notify();
+       }
+       break;
+
+     case RDMA_CM_EVENT_DEVICE_REMOVAL:
+       break;
+
+     default:
+       ceph_abort_msg("unhandled event");
+       break;
+   }
+   rdma_ack_cm_event(event);
+ }
+
+ // Flag the socket as active and connected.
+ void RDMAIWARPConnectedSocketImpl::activate() {
+ ldout(cct, 30) << __func__ << dendl;
+ active = true;
+ connected = 1;
+ }
+
+ // Create the RC queue pair bound to cm_id, post receive chunks to it when
+ // ms_async_rdma_support_srq is off, and register it with the dispatcher.
+ // Returns 0 on success, -1 if the queue pair could not be created.
+ int RDMAIWARPConnectedSocketImpl::alloc_resource() {
+   ldout(cct, 30) << __func__ << dendl;
+
+   qp = infiniband->create_queue_pair(cct, dispatcher->get_tx_cq(),
+                                      dispatcher->get_rx_cq(), IBV_QPT_RC, cm_id);
+   if (!qp)
+     return -1;
+
+   if (!cct->_conf->ms_async_rdma_support_srq) {
+     const auto rx_queue_len = infiniband->get_rx_queue_len();
+     dispatcher->post_chunks_to_rq(rx_queue_len, qp->get_qp());
+   }
+
+   dispatcher->register_qp(qp, this);
+   auto *counters = dispatcher->perf_logger;
+   counters->inc(l_msgr_rdma_created_queue_pair);
+   counters->inc(l_msgr_rdma_active_queue_pair);
+   return 0;
+ }
+
+ // Remove the cm channel's file event (if one was registered) and, the first
+ // time through, set `closed` and wake everyone waiting on close_condition.
+ void RDMAIWARPConnectedSocketImpl::close_notify() {
+   ldout(cct, 30) << __func__ << dendl;
+
+   const bool fd_registered = (status >= CHANNEL_FD_CREATED);
+   if (fd_registered)
+     worker->center.delete_file_event(cm_channel->fd, EVENT_READABLE);
+
+   std::unique_lock l(close_mtx);
+   if (closed)
+     return;
+   closed = true;
+   close_condition.notify_all();
+ }
diff --git a/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
new file mode 100644
index 00000000..210eaf00
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
@@ -0,0 +1,107 @@
+#include <poll.h>
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPServerSocketImpl "
+
+ // Forwards every argument to RDMAServerSocketImpl; the iWARP variant adds no
+ // construction-time work of its own.
+ RDMAIWARPServerSocketImpl::RDMAIWARPServerSocketImpl(
+ CephContext *cct, Infiniband* i,
+ RDMADispatcher *s, RDMAWorker *w, entity_addr_t& a, unsigned addr_slot)
+ : RDMAServerSocketImpl(cct, i, s, w, a, addr_slot)
+ {
+ }
+
+ // Create the listening cm id on a fresh event channel, bind it to `sa` and
+ // start listening. On success server_setup_socket holds the channel's fd.
+ //
+ // Returns 0 on success or a negative errno value; on failure everything that
+ // was created is destroyed again and server_setup_socket is -1. `opt` is not
+ // used.
+ int RDMAIWARPServerSocketImpl::listen(entity_addr_t &sa,
+                                       const SocketOptions &opt)
+ {
+   ldout(cct, 20) << __func__ << " bind to rdma point" << dendl;
+   int rc = 0;
+
+   // Both creation calls can fail; stop before a null channel or an unset
+   // cm id is handed to the calls below.
+   cm_channel = rdma_create_event_channel();
+   if (!cm_channel) {
+     rc = -errno;
+     lderr(cct) << __func__ << " failed to create rdma event channel: "
+                << cpp_strerror(-rc) << dendl;
+     server_setup_socket = -1;
+     return rc;
+   }
+   if (rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP)) {
+     rc = -errno;
+     lderr(cct) << __func__ << " failed to create rdma cm id: "
+                << cpp_strerror(-rc) << dendl;
+     rdma_destroy_event_channel(cm_channel);
+     server_setup_socket = -1;
+     return rc;
+   }
+   ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+
+   rc = rdma_bind_addr(cm_id, const_cast<struct sockaddr*>(sa.get_sockaddr()));
+   if (rc < 0) {
+     rc = -errno;
+     ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                    << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+     goto err;
+   }
+   rc = rdma_listen(cm_id, 128);
+   if (rc < 0) {
+     rc = -errno;
+     ldout(cct, 10) << __func__ << " unable to listen to " << sa.get_sockaddr()
+                    << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+     goto err;
+   }
+   server_setup_socket = cm_channel->fd;
+   ldout(cct, 20) << __func__ << " fd of cm_channel is " << server_setup_socket << dendl;
+   return 0;
+
+ err:
+   server_setup_socket = -1;
+   rdma_destroy_id(cm_id);
+   rdma_destroy_event_channel(cm_channel);
+   return rc;
+ }
+
+ // Accept one pending connection request from the listening cm channel.
+ //
+ // sock: receives the new connected socket on success.
+ // out:  receives the peer address on success.
+ // w:    worker that will own the new connection.
+ // `opt` is not used.
+ //
+ // Returns 0 on success, -EAGAIN when no event is pending or rdma_accept()
+ // fails, or a negative errno value when the event cannot be fetched.
+ int RDMAIWARPServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt,
+                                       entity_addr_t *out, Worker *w)
+ {
+   ldout(cct, 15) << __func__ << dendl;
+
+   ceph_assert(sock);
+   struct pollfd pfd = {
+     .fd = cm_channel->fd,
+     .events = POLLIN,
+   };
+   int ret = poll(&pfd, 1, 0);
+   ceph_assert(ret >= 0);
+   if (!ret)
+     return -EAGAIN;
+
+   struct rdma_cm_event *cm_event = nullptr;
+   // Do not touch cm_event unless an event was really delivered.
+   if (rdma_get_cm_event(cm_channel, &cm_event)) {
+     const int r = -errno;
+     ldout(cct, 1) << __func__ << " failed to get cm event: " << cpp_strerror(-r) << dendl;
+     return r;
+   }
+   ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(cm_event->event) << dendl;
+
+   // Move the new id onto an event channel of its own.
+   struct rdma_cm_id *event_cm_id = cm_event->id;
+   struct rdma_event_channel *event_channel = rdma_create_event_channel();
+
+   rdma_migrate_id(event_cm_id, event_channel);
+
+   struct rdma_cm_id *new_cm_id = event_cm_id;
+   struct rdma_conn_param *remote_conn_param = &cm_event->param.conn;
+   struct rdma_conn_param local_conn_param;
+
+   RDMACMInfo info(new_cm_id, event_channel, remote_conn_param->qp_num);
+   RDMAIWARPConnectedSocketImpl* server =
+     new RDMAIWARPConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w), &info);
+
+   // FIPS zeroization audit 20191115: this memset is not security related.
+   memset(&local_conn_param, 0, sizeof(local_conn_param));
+   local_conn_param.qp_num = server->get_local_qpn();
+
+   if (rdma_accept(new_cm_id, &local_conn_param)) {
+     // Every event that was fetched has to be acknowledged, on the failure
+     // path too; it was previously dropped here.
+     rdma_ack_cm_event(cm_event);
+     // NOTE(review): `server` is still leaked on this path. Its destructor
+     // waits for close_notify(), so releasing it needs a proper close on the
+     // owning worker -- confirm the intended teardown before adding a delete.
+     return -EAGAIN;
+   }
+   server->activate();
+   ldout(cct, 20) << __func__ << " accepted a new QP" << dendl;
+
+   rdma_ack_cm_event(cm_event);
+
+   std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+   *sock = ConnectedSocket(std::move(csi));
+   struct sockaddr *addr = &new_cm_id->route.addr.dst_addr;
+   out->set_sockaddr(addr);
+
+   return 0;
+ }
+
+ // Destroy the listening cm id and its event channel, provided listen() had
+ // set them up (server_setup_socket >= 0).
+ void RDMAIWARPServerSocketImpl::abort_accept()
+ {
+   if (server_setup_socket < 0)
+     return;
+
+   rdma_destroy_id(cm_id);
+   rdma_destroy_event_channel(cm_channel);
+ }
diff --git a/src/msg/async/rdma/RDMAServerSocketImpl.cc b/src/msg/async/rdma/RDMAServerSocketImpl.cc
new file mode 100644
index 00000000..98402cfd
--- /dev/null
+++ b/src/msg/async/rdma/RDMAServerSocketImpl.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAServerSocketImpl "
+
+ // Store the collaborators; no socket is created until listen() is called
+ // (server_setup_socket starts out as -1).
+ RDMAServerSocketImpl::RDMAServerSocketImpl(
+ CephContext *cct, Infiniband* i, RDMADispatcher *s, RDMAWorker *w,
+ entity_addr_t& a, unsigned slot)
+ : ServerSocketImpl(a.get_type(), slot),
+ cct(cct), net(cct), server_setup_socket(-1), infiniband(i),
+ dispatcher(s), worker(w), sa(a)
+ {
+ }
+
+ // Create the TCP socket used for the handshake, make it non-blocking, apply
+ // the socket options, then bind it to `sa` and listen on it.
+ //
+ // Returns 0 on success. On failure the socket is closed,
+ // server_setup_socket is reset to -1 and a negative error code is returned.
+ int RDMAServerSocketImpl::listen(entity_addr_t &sa, const SocketOptions &opt)
+ {
+   server_setup_socket = net.create_socket(sa.get_family(), true);
+   if (server_setup_socket < 0) {
+     const int create_err = -errno;
+     lderr(cct) << __func__ << " failed to create server socket: "
+                << cpp_strerror(errno) << dendl;
+     return create_err;
+   }
+
+   // shared failure path: give up the socket and hand back the error code
+   auto abandon = [this](int code) {
+     ::close(server_setup_socket);
+     server_setup_socket = -1;
+     return code;
+   };
+
+   int rc = net.set_nonblock(server_setup_socket);
+   if (rc < 0)
+     return abandon(rc);
+
+   rc = net.set_socket_options(server_setup_socket, opt.nodelay, opt.rcbuf_size);
+   if (rc < 0)
+     return abandon(rc);
+
+   rc = ::bind(server_setup_socket, sa.get_sockaddr(), sa.get_sockaddr_len());
+   if (rc < 0) {
+     rc = -errno;
+     ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                    << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+     return abandon(rc);
+   }
+
+   rc = ::listen(server_setup_socket, cct->_conf->ms_tcp_listen_backlog);
+   if (rc < 0) {
+     rc = -errno;
+     lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(errno) << dendl;
+     return abandon(rc);
+   }
+
+   ldout(cct, 20) << __func__ << " bind to " << sa.get_sockaddr() << " on port " << sa.get_port() << dendl;
+   return 0;
+ }
+
+ // Accept one TCP connection on the handshake socket and wrap it in a new
+ // RDMAConnectedSocketImpl owned by worker `w`.
+ //
+ // sock: receives the connected socket on success.
+ // out:  receives the peer address; must not be NULL.
+ //
+ // Returns 0 on success or a negative error code.
+ int RDMAServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w)
+ {
+   ldout(cct, 15) << __func__ << dendl;
+
+   ceph_assert(sock);
+
+   sockaddr_storage ss;
+   socklen_t slen = sizeof(ss);
+   int sd = accept_cloexec(server_setup_socket, (sockaddr*)&ss, &slen);
+   if (sd < 0) {
+     return -errno;
+   }
+
+   // The helpers' return value is the error code (listen() in this file
+   // propagates it the same way). errno is not reliable once ::close() has
+   // run and could even be 0, which would be reported as success.
+   int r = net.set_nonblock(sd);
+   if (r < 0) {
+     ::close(sd);
+     return r;
+   }
+
+   r = net.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+   if (r < 0) {
+     ::close(sd);
+     return r;
+   }
+
+   ceph_assert(NULL != out); //out should not be NULL in accept connection
+
+   out->set_type(addr_type);
+   out->set_sockaddr((sockaddr*)&ss);
+   net.set_priority(sd, opt.priority, out->get_family());
+
+   RDMAConnectedSocketImpl* server;
+   //Worker* w = dispatcher->get_stack()->get_worker();
+   server = new RDMAConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w));
+   server->set_accept_fd(sd);
+   ldout(cct, 20) << __func__ << " accepted a new QP, tcp_fd: " << sd << dendl;
+   std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+   *sock = ConnectedSocket(std::move(csi));
+
+   return 0;
+ }
+
+ // Close the handshake socket if listen() had opened one.
+ void RDMAServerSocketImpl::abort_accept()
+ {
+   const bool listening = (server_setup_socket >= 0);
+   if (!listening)
+     return;
+   ::close(server_setup_socket);
+ }
diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc
new file mode 100644
index 00000000..f63a8e7d
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.cc
@@ -0,0 +1,610 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <poll.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "include/str_list.h"
+#include "include/compat.h"
+#include "common/Cycles.h"
+#include "common/deleter.h"
+#include "common/Tub.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "RDMAStack "
+
+ // Stop the polling thread, verify that no connection or dead queue pair is
+ // left behind, then free the async event handler.
+ RDMADispatcher::~RDMADispatcher()
+ {
+ ldout(cct, 20) << __func__ << " destructing rdma dispatcher" << dendl;
+ polling_stop();
+
+ // everything registered with the dispatcher must already be gone
+ ceph_assert(qp_conns.empty());
+ ceph_assert(num_qp_conn == 0);
+ ceph_assert(dead_queue_pairs.empty());
+ ceph_assert(num_dead_queue_pair == 0);
+
+ delete async_handler;
+ }
+
+ // Set up the dispatcher: create the async event handler and both mutexes,
+ // then build and register the "AsyncMessenger::RDMADispatcher" perf
+ // counters. The polling thread is not started here (see polling_start()).
+ RDMADispatcher::RDMADispatcher(CephContext* c, RDMAStack* s)
+   : cct(c), async_handler(new C_handle_cq_async(this)), lock("RDMADispatcher::lock"),
+   w_lock("RDMADispatcher::for worker pending list"), stack(s)
+ {
+   PerfCountersBuilder plb(cct, "AsyncMessenger::RDMADispatcher", l_msgr_rdma_dispatcher_first, l_msgr_rdma_dispatcher_last);
+
+   plb.add_u64_counter(l_msgr_rdma_polling, "polling", "Whether dispatcher thread is polling");
+   plb.add_u64_counter(l_msgr_rdma_inflight_tx_chunks, "inflight_tx_chunks", "The number of inflight tx chunks");
+   plb.add_u64_counter(l_msgr_rdma_rx_bufs_in_use, "rx_bufs_in_use", "The number of rx buffers that are holding data and being processed");
+   plb.add_u64_counter(l_msgr_rdma_rx_bufs_total, "rx_bufs_total", "The total number of rx buffers");
+
+   plb.add_u64_counter(l_msgr_rdma_tx_total_wc, "tx_total_wc", "The number of tx work completions");
+   plb.add_u64_counter(l_msgr_rdma_tx_total_wc_errors, "tx_total_wc_errors", "The number of tx errors");
+   plb.add_u64_counter(l_msgr_rdma_tx_wc_retry_errors, "tx_retry_errors", "The number of tx retry errors");
+   plb.add_u64_counter(l_msgr_rdma_tx_wc_wr_flush_errors, "tx_wr_flush_errors", "The number of tx work request flush errors");
+
+   plb.add_u64_counter(l_msgr_rdma_rx_total_wc, "rx_total_wc", "The number of total rx work completion");
+   plb.add_u64_counter(l_msgr_rdma_rx_total_wc_errors, "rx_total_wc_errors", "The number of total rx error work completion");
+   plb.add_u64_counter(l_msgr_rdma_rx_fin, "rx_fin", "The number of rx finish work request");
+
+   plb.add_u64_counter(l_msgr_rdma_total_async_events, "total_async_events", "The number of async events");
+   plb.add_u64_counter(l_msgr_rdma_async_last_wqe_events, "async_last_wqe_events", "The number of last wqe events");
+
+   plb.add_u64_counter(l_msgr_rdma_handshake_errors, "handshake_errors", "The number of handshake errors");
+
+   // The two descriptions below used to be attached to the wrong counter.
+   plb.add_u64_counter(l_msgr_rdma_created_queue_pair, "created_queue_pair", "Created queue pair number");
+   plb.add_u64_counter(l_msgr_rdma_active_queue_pair, "active_queue_pair", "Active queue pair number");
+
+   perf_logger = plb.create_perf_counters();
+   cct->get_perfcounters_collection()->add(perf_logger);
+   Cycles::init();
+ }
+
+ // Create the tx/rx completion channels and queues and launch the polling
+ // thread. Does nothing if the thread is already running.
+ void RDMADispatcher::polling_start()
+ {
+   // listen/connect can reach this from different worker threads
+   Mutex::Locker l(lock);
+
+   if (t.joinable()) {
+     // dispatcher thread already running
+     return;
+   }
+
+   auto &&ib = get_stack()->get_infiniband();
+   ib.get_memory_manager()->set_rx_stat_logger(perf_logger);
+
+   tx_cc = ib.create_comp_channel(cct);
+   ceph_assert(tx_cc);
+   rx_cc = ib.create_comp_channel(cct);
+   ceph_assert(rx_cc);
+   tx_cq = ib.create_comp_queue(cct, tx_cc);
+   ceph_assert(tx_cq);
+   rx_cq = ib.create_comp_queue(cct, rx_cc);
+   ceph_assert(rx_cq);
+
+   t = std::thread(&RDMADispatcher::polling, this);
+   ceph_pthread_setname(t.native_handle(), "rdma-polling");
+ }
+
+ // Ask the polling thread to finish, wait for it, then release the
+ // completion channels and queues. If the thread was never started only the
+ // `done` flag is set.
+ void RDMADispatcher::polling_stop()
+ {
+   {
+     Mutex::Locker l(lock);
+     done = true;
+   }
+
+   if (t.joinable()) {
+     t.join();
+
+     tx_cc->ack_events();
+     rx_cc->ack_events();
+     delete tx_cq;
+     delete rx_cq;
+     delete tx_cc;
+     delete rx_cc;
+   }
+ }
+
+ // Drain the device's asynchronous event queue. Returns once
+ // ibv_get_async_event() reports failure (EAGAIN is the quiet case).
+ //
+ // IBV_EVENT_QP_LAST_WQE_REACHED: the connection owning the QP is faulted and,
+ // unless ms_async_rdma_cm is set, its QP number is erased. Every other event
+ // type is only logged. Each event that was fetched is acknowledged.
+ void RDMADispatcher::handle_async_event()
+ {
+ ldout(cct, 30) << __func__ << dendl;
+ while (1) {
+ ibv_async_event async_event;
+ if (ibv_get_async_event(get_stack()->get_infiniband().get_device()->ctxt, &async_event)) {
+ if (errno != EAGAIN)
+ lderr(cct) << __func__ << " ibv_get_async_event failed. (errno=" << errno
+ << " " << cpp_strerror(errno) << ")" << dendl;
+ return;
+ }
+ perf_logger->inc(l_msgr_rdma_total_async_events);
+ // FIXME: Currently we must ensure no other factor make QP in ERROR state,
+ // otherwise this qp can't be deleted in current cleanup flow.
+ if (async_event.event_type == IBV_EVENT_QP_LAST_WQE_REACHED) {
+ perf_logger->inc(l_msgr_rdma_async_last_wqe_events);
+ uint64_t qpn = async_event.element.qp->qp_num;
+ ldout(cct, 10) << __func__ << " event associated qp=" << async_event.element.qp
+ << " evt: " << ibv_event_type_str(async_event.event_type) << dendl;
+ // `lock` is held until the end of this if-block, i.e. it is released
+ // before the event is acknowledged below
+ Mutex::Locker l(lock);
+ RDMAConnectedSocketImpl *conn = get_conn_lockless(qpn);
+ if (!conn) {
+ ldout(cct, 1) << __func__ << " missing qp_num=" << qpn << " discard event" << dendl;
+ } else {
+ ldout(cct, 1) << __func__ << " it's not forwardly stopped by us, reenable=" << conn << dendl;
+ conn->fault();
+ if (!cct->_conf->ms_async_rdma_cm)
+ erase_qpn_lockless(qpn);
+ }
+ } else {
+ ldout(cct, 1) << __func__ << " ibv_get_async_event: dev=" << get_stack()->get_infiniband().get_device()->ctxt
+ << " evt: " << ibv_event_type_str(async_event.event_type)
+ << dendl;
+ }
+ ibv_ack_async_event(&async_event);
+ }
+ }
+
+ // Hand `chunk` back to the Infiniband pool while holding `lock`, and
+ // decrement the rx_bufs_in_use counter.
+ void RDMADispatcher::post_chunk_to_pool(Chunk* chunk)
+ {
+   Mutex::Locker l(lock);
+   auto &&ib = get_stack()->get_infiniband();
+   ib.post_chunk_to_pool(chunk);
+   perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+ }
+
+ // Locked wrapper around Infiniband::post_chunks_to_rq(); returns whatever
+ // the wrapped call returns.
+ int RDMADispatcher::post_chunks_to_rq(int num, ibv_qp *qp)
+ {
+   Mutex::Locker l(lock);
+   const int result = get_stack()->get_infiniband().post_chunks_to_rq(num, qp);
+   return result;
+ }
+
+ // Body of the dispatcher thread (started by polling_start()).
+ //
+ // Each iteration polls the tx and the rx completion queue. Successful rx
+ // completions are grouped per connection and handed over with pass_wc();
+ // failed ones return their chunk to the pool. When both queues are empty,
+ // dead queue pairs without outstanding tx work requests are deleted, and once
+ // nothing has happened for ms_async_rdma_polling_us the thread rearms the CQ
+ // notifications and blocks in poll() on the two completion channels.
+ //
+ // The loop ends when `done` is set and neither connections nor dead queue
+ // pairs remain.
+ void RDMADispatcher::polling()
+ {
+ static int MAX_COMPLETIONS = 32;
+ ibv_wc wc[MAX_COMPLETIONS];
+
+ // rx completions of the current round, grouped by owning connection
+ std::map<RDMAConnectedSocketImpl*, std::vector<ibv_wc> > polled;
+ std::vector<ibv_wc> tx_cqe;
+ ldout(cct, 20) << __func__ << " going to poll tx cq: " << tx_cq << " rx cq: " << rx_cq << dendl;
+ RDMAConnectedSocketImpl *conn = nullptr;
+ uint64_t last_inactive = Cycles::rdtsc();
+ bool rearmed = false;
+ int r = 0;
+
+ while (true) {
+ int tx_ret = tx_cq->poll_cq(MAX_COMPLETIONS, wc);
+ if (tx_ret > 0) {
+ ldout(cct, 20) << __func__ << " tx completion queue got " << tx_ret
+ << " responses."<< dendl;
+ handle_tx_event(wc, tx_ret);
+ }
+
+ // wc is reused: the tx completions were consumed by handle_tx_event()
+ int rx_ret = rx_cq->poll_cq(MAX_COMPLETIONS, wc);
+ if (rx_ret > 0) {
+ ldout(cct, 20) << __func__ << " rx completion queue got " << rx_ret
+ << " responses."<< dendl;
+ perf_logger->inc(l_msgr_rdma_rx_total_wc, rx_ret);
+ perf_logger->inc(l_msgr_rdma_rx_bufs_in_use, rx_ret);
+
+ Mutex::Locker l(lock);//make sure connected socket alive when pass wc
+
+ for (int i = 0; i < rx_ret; ++i) {
+ ibv_wc* response = &wc[i];
+ Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+
+ if (response->status == IBV_WC_SUCCESS) {
+ ceph_assert(wc[i].opcode == IBV_WC_RECV);
+ conn = get_conn_lockless(response->qp_num);
+ if (!conn) {
+ // no live connection for this QP: give the chunk straight back
+ ldout(cct, 1) << __func__ << " csi with qpn " << response->qp_num << " may be dead. chunk " << chunk << " will be back ? " << r << dendl;
+ get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+ perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+ } else {
+ // post one replacement chunk for the one that was just consumed
+ conn->post_chunks_to_rq(1);
+ polled[conn].push_back(*response);
+ }
+ } else {
+ perf_logger->inc(l_msgr_rdma_rx_total_wc_errors);
+ ldout(cct, 1) << __func__ << " work request returned error for buffer(" << chunk
+ << ") status(" << response->status << ":"
+ << get_stack()->get_infiniband().wc_status_to_string(response->status) << ")" << dendl;
+ // a flush error does not fault the connection
+ if (response->status != IBV_WC_WR_FLUSH_ERR) {
+ conn = get_conn_lockless(response->qp_num);
+ if (conn && conn->is_connected())
+ conn->fault();
+ }
+ get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+ perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+ }
+ }
+ for (auto &&i : polled)
+ i.first->pass_wc(std::move(i.second));
+ polled.clear();
+ }
+
+ if (!tx_ret && !rx_ret) {
+ // NOTE: Has TX just transitioned to idle? We should do it when idle!
+ // It's now safe to delete queue pairs (see comment by declaration
+ // for dead_queue_pairs).
+ // Additionally, don't delete qp while outstanding_buffers isn't empty,
+ // because we need to check qp's state before sending
+ perf_logger->set(l_msgr_rdma_inflight_tx_chunks, inflight);
+ if (num_dead_queue_pair) {
+ Mutex::Locker l(lock); // FIXME reuse dead qp because creating one qp costs 1 ms
+ auto it = dead_queue_pairs.begin();
+ while (it != dead_queue_pairs.end()) {
+ auto i = *it;
+ // Bypass QPs that do not collect all Tx completions yet.
+ if (i->get_tx_wr()) {
+ ldout(cct, 20) << __func__ << " bypass qp=" << i << " tx_wr=" << i->get_tx_wr() << dendl;
+ ++it;
+ } else {
+ ldout(cct, 10) << __func__ << " finally delete qp=" << i << dendl;
+ delete i;
+ it = dead_queue_pairs.erase(it);
+ perf_logger->dec(l_msgr_rdma_active_queue_pair);
+ --num_dead_queue_pair;
+ }
+ }
+ }
+ // the only exit from the thread loop
+ if (!num_qp_conn && done && dead_queue_pairs.empty())
+ break;
+
+ uint64_t now = Cycles::rdtsc();
+ if (Cycles::to_microseconds(now - last_inactive) > cct->_conf->ms_async_rdma_polling_us) {
+ handle_async_event();
+ if (!rearmed) {
+ // Clean up cq events after rearm notify ensure no new incoming event
+ // arrived between polling and rearm
+ tx_cq->rearm_notify();
+ rx_cq->rearm_notify();
+ rearmed = true;
+ continue;
+ }
+
+ struct pollfd channel_poll[2];
+ channel_poll[0].fd = tx_cc->get_fd();
+ channel_poll[0].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+ channel_poll[0].revents = 0;
+ channel_poll[1].fd = rx_cc->get_fd();
+ channel_poll[1].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+ channel_poll[1].revents = 0;
+ r = 0;
+ perf_logger->set(l_msgr_rdma_polling, 0);
+ // sleep in 100 ms slices so that `done` is re-checked regularly
+ while (!done && r == 0) {
+ r = TEMP_FAILURE_RETRY(poll(channel_poll, 2, 100));
+ if (r < 0) {
+ r = -errno;
+ lderr(cct) << __func__ << " poll failed " << r << dendl;
+ ceph_abort();
+ }
+ }
+ if (r > 0 && tx_cc->get_cq_event())
+ ldout(cct, 20) << __func__ << " got tx cq event." << dendl;
+ if (r > 0 && rx_cc->get_cq_event())
+ ldout(cct, 20) << __func__ << " got rx cq event." << dendl;
+ last_inactive = Cycles::rdtsc();
+ perf_logger->set(l_msgr_rdma_polling, 1);
+ rearmed = false;
+ }
+ }
+ }
+ }
+
+// Wake at most one worker that is queued waiting for tx buffers.
+void RDMADispatcher::notify_pending_workers() {
+  // Atomic pre-check: when nobody is waiting we never touch w_lock.
+  if (!num_pending_workers)
+    return;
+
+  RDMAWorker *waiter = nullptr;
+  {
+    Mutex::Locker guard(w_lock);
+    // The list may have been drained since the pre-check above.
+    if (!pending_workers.empty()) {
+      waiter = pending_workers.front();
+      pending_workers.pop_front();
+      --num_pending_workers;
+    }
+  }
+
+  // Notify outside of w_lock.
+  if (waiter != nullptr)
+    waiter->notify_worker();
+}
+
+// Record a queue pair and the socket that owns it, keyed by the local QP number.
+void RDMADispatcher::register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi)
+{
+  Mutex::Locker guard(lock);
+  const auto qpn = qp->get_local_qp_number();
+  // A QP number must not be registered twice.
+  ceph_assert(qp_conns.count(qpn) == 0);
+  qp_conns[qpn] = std::make_pair(qp, csi);
+  ++num_qp_conn;
+}
+
+// Look up the socket registered for QP number `qp` without taking `lock`.
+// Returns nullptr when the number is unknown or its queue pair is already dead.
+RDMAConnectedSocketImpl* RDMADispatcher::get_conn_lockless(uint32_t qp)
+{
+  auto entry = qp_conns.find(qp);
+  const bool usable =
+    entry != qp_conns.end() && !entry->second.first->is_dead();
+  return usable ? entry->second.second : nullptr;
+}
+
+// Find the QueuePair with local QP number `qp`, searching the registered
+// connections first and the dead queue pairs second. Takes `lock`.
+// Returns nullptr when no queue pair matches.
+Infiniband::QueuePair* RDMADispatcher::get_qp(uint32_t qp)
+{
+  Mutex::Locker guard(lock);
+
+  QueuePair *found = nullptr;
+  auto live = qp_conns.find(qp);
+  if (live != qp_conns.end()) {
+    // Still registered in qp_conns.
+    found = live->second.first;
+  } else {
+    // Not registered any more; it may be waiting for deletion.
+    for (auto dead = dead_queue_pairs.begin(); dead != dead_queue_pairs.end(); ++dead) {
+      if ((*dead)->get_local_qp_number() == qp) {
+        found = *dead;
+        break;
+      }
+    }
+  }
+  return found;
+}
+
+// Move the queue pair registered under `qpn` from qp_conns to
+// dead_queue_pairs. Does not take `lock`; unknown numbers are ignored.
+void RDMADispatcher::erase_qpn_lockless(uint32_t qpn)
+{
+  auto entry = qp_conns.find(qpn);
+  if (entry != qp_conns.end()) {
+    QueuePair *retired = entry->second.first;
+    ++num_dead_queue_pair;
+    dead_queue_pairs.push_back(retired);
+    qp_conns.erase(entry);
+    --num_qp_conn;
+  }
+}
+
+// Same as erase_qpn_lockless(), but holds `lock` for the duration of the call.
+void RDMADispatcher::erase_qpn(uint32_t qpn)
+{
+  Mutex::Locker guard(lock);
+  erase_qpn_lockless(qpn);
+}
+
+// Process `n` send-side work completions from `cqe`.
+//
+// For each completion this:
+//  - decrements the tx work-request count of the owning QueuePair (if found),
+//  - accounts and logs failed completions, faulting the connection for any
+//    error other than "retry exceeded" and "WR flushed",
+//  - collects the completed tx Chunk so it can be returned to the pool.
+// After the loop the total-completions counter is bumped by `n` and the
+// collected chunks are released through post_tx_buffer().
+void RDMADispatcher::handle_tx_event(ibv_wc *cqe, int n)
+{
+ std::vector<Chunk*> tx_chunks;
+
+ for (int i = 0; i < n; ++i) {
+ ibv_wc* response = &cqe[i];
+ // wr_id is treated as a Chunk pointer here; for a 'fin' message it is
+ // actually a QueuePair pointer (see the check further down).
+ Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+ ldout(cct, 25) << __func__ << " QP: " << response->qp_num
+ << " len: " << response->byte_len << " , addr:" << chunk
+ << " " << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl;
+
+ // get_qp() takes `lock` internally and also searches dead_queue_pairs, so
+ // completions of already-retired QPs are still counted.
+ QueuePair *qp = get_qp(response->qp_num);
+ if (qp)
+ qp->dec_tx_wr(1);
+
+ if (response->status != IBV_WC_SUCCESS) {
+ perf_logger->inc(l_msgr_rdma_tx_total_wc_errors);
+ if (response->status == IBV_WC_RETRY_EXC_ERR) {
+ ldout(cct, 1) << __func__ << " connection between server and client not working. Disconnect this now" << dendl;
+ perf_logger->inc(l_msgr_rdma_tx_wc_retry_errors);
+ } else if (response->status == IBV_WC_WR_FLUSH_ERR) {
+ ldout(cct, 1) << __func__ << " Work Request Flushed Error: this connection's qp="
+ << response->qp_num << " should be down while this WR=" << response->wr_id
+ << " still in flight." << dendl;
+ perf_logger->inc(l_msgr_rdma_tx_wc_wr_flush_errors);
+ } else {
+ // Any other error: log it and fault the connection if it is still
+ // registered and connected.
+ ldout(cct, 1) << __func__ << " send work request returned error for buffer("
+ << response->wr_id << ") status(" << response->status << "): "
+ << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl;
+ Mutex::Locker l(lock);//make sure connected socket alive when pass wc
+ RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+
+ if (conn && conn->is_connected()) {
+ ldout(cct, 25) << __func__ << " qp state is : " << conn->get_qp_state() << dendl;
+ conn->fault();
+ } else {
+ ldout(cct, 1) << __func__ << " missing qp_num=" << response->qp_num << " discard event" << dendl;
+ }
+ }
+ }
+
+ //TX completion may come either from regular send message or from 'fin' message.
+ //In the case of 'fin' wr_id points to the QueuePair.
+ // NOTE(review): chunk->buffer is read before we know wr_id really is a
+ // Chunk; for a 'fin' completion this reads through a QueuePair pointer.
+ // Presumably harmless in practice, but confirm before touching this code.
+ if (get_stack()->get_infiniband().get_memory_manager()->is_tx_buffer(chunk->buffer)) {
+ tx_chunks.push_back(chunk);
+ } else if (reinterpret_cast<QueuePair*>(response->wr_id)->get_local_qp_number() == response->qp_num ) {
+ ldout(cct, 1) << __func__ << " sending of the disconnect msg completed" << dendl;
+ } else {
+ // Neither a tx buffer nor the QP's own 'fin' marker: abort.
+ ldout(cct, 1) << __func__ << " not tx buffer, chunk " << chunk << dendl;
+ ceph_abort();
+ }
+ }
+
+ perf_logger->inc(l_msgr_rdma_tx_total_wc, n);
+ post_tx_buffer(tx_chunks);
+}
+
+/**
+ * Return completed tx Chunks to the memory manager's tx pool.
+ *
+ * Decrements the dispatcher-wide `inflight` counter by the number of chunks,
+ * hands the chunks back through MemoryManager::return_tx() and then calls
+ * notify_pending_workers() to wake a queued worker, if there is one.
+ * Does nothing when `chunks` is empty. The function returns no status.
+ *
+ * \param[in] chunks
+ * The Chunks to release.
+ */
+void RDMADispatcher::post_tx_buffer(std::vector<Chunk*> &chunks)
+{
+ if (chunks.empty())
+ return ;
+
+ inflight -= chunks.size();
+ get_stack()->get_infiniband().get_memory_manager()->return_tx(chunks);
+ ldout(cct, 30) << __func__ << " release " << chunks.size()
+ << " chunks, inflight " << inflight << dendl;
+ notify_pending_workers();
+}
+
+
+// Construct worker number `i`: allocate the tx event callback and register a
+// per-worker set of RDMA perf counters named "AsyncMessenger::RDMAWorker-<id>".
+// The owning stack is attached later through set_stack().
+RDMAWorker::RDMAWorker(CephContext *c, unsigned i)
+  : Worker(c, i), stack(nullptr),
+    tx_handler(new C_handle_cq_tx(this)), lock("RDMAWorker::lock")
+{
+  // initialize perf_logger
+  char name[128];
+  // Bounded formatting: never write past the end of `name`.
+  snprintf(name, sizeof(name), "AsyncMessenger::RDMAWorker-%u", id);
+  PerfCountersBuilder plb(cct, name, l_msgr_rdma_first, l_msgr_rdma_last);
+
+  plb.add_u64_counter(l_msgr_rdma_tx_no_mem, "tx_no_mem", "The count of no tx buffer");
+  plb.add_u64_counter(l_msgr_rdma_tx_parital_mem, "tx_parital_mem", "The count of parital tx buffer");
+  plb.add_u64_counter(l_msgr_rdma_tx_failed, "tx_failed_post", "The number of tx failed posted");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_chunks, "tx_chunks", "The number of tx chunks transmitted");
+  plb.add_u64_counter(l_msgr_rdma_tx_bytes, "tx_bytes", "The bytes of tx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_rx_chunks, "rx_chunks", "The number of rx chunks transmitted");
+  plb.add_u64_counter(l_msgr_rdma_rx_bytes, "rx_bytes", "The bytes of rx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_pending_sent_conns, "pending_sent_conns", "The count of pending sent conns");
+
+  // Publish the counters through the context-wide collection.
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+}
+
+// Free the tx event callback that the constructor allocated with `new`.
+RDMAWorker::~RDMAWorker()
+{
+ delete tx_handler;
+}
+
+// Cache a pointer to the stack's dispatcher; later calls are no-ops.
+void RDMAWorker::initialize()
+{
+  if (dispatcher)
+    return;
+  dispatcher = &stack->get_dispatcher();
+}
+
+// Create a listening RDMA server socket bound to `sa`.
+//
+// Initializes the Infiniband layer, starts dispatcher polling, builds the
+// iWARP or plain server socket depending on ms_async_rdma_type and calls its
+// listen(). On success `*sock` takes ownership of the implementation and 0 is
+// returned; on failure the implementation is destroyed and the negative
+// result of listen() is returned.
+int RDMAWorker::listen(entity_addr_t &sa, unsigned addr_slot,
+                       const SocketOptions &opt,ServerSocket *sock)
+{
+  get_stack()->get_infiniband().init();
+  dispatcher->polling_start();
+
+  std::unique_ptr<RDMAServerSocketImpl> impl;
+  if (cct->_conf->ms_async_rdma_type == "iwarp") {
+    impl.reset(new RDMAIWARPServerSocketImpl(
+      cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this,
+      sa, addr_slot));
+  } else {
+    impl.reset(new RDMAServerSocketImpl(
+      cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this,
+      sa, addr_slot));
+  }
+
+  const int rc = impl->listen(sa, opt);
+  if (rc < 0)
+    return rc;  // impl is destroyed on the way out
+
+  *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(impl.release()));
+  return 0;
+}
+
+// Create an RDMA connected socket and start connecting it to `addr`.
+//
+// Initializes the Infiniband layer, starts dispatcher polling, builds the
+// iWARP or plain connected socket depending on ms_async_rdma_type and calls
+// try_connect(). On success `*socket` takes ownership and 0 is returned; on
+// failure the socket is destroyed and the negative result is returned.
+int RDMAWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+  get_stack()->get_infiniband().init();
+  dispatcher->polling_start();
+
+  std::unique_ptr<RDMAConnectedSocketImpl> csi;
+  if (cct->_conf->ms_async_rdma_type == "iwarp") {
+    csi.reset(new RDMAIWARPConnectedSocketImpl(
+      cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this));
+  } else {
+    csi.reset(new RDMAConnectedSocketImpl(
+      cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this));
+  }
+
+  const int rc = csi->try_connect(addr, opts);
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " try connecting failed." << dendl;
+    return rc;  // csi is destroyed on the way out
+  }
+
+  *socket = ConnectedSocket(std::move(csi));
+  return 0;
+}
+
+// Reserve registered tx chunks covering up to `bytes` bytes and append them
+// to `c`.
+//
+// Must run on this worker's event-center thread. The number of chunks
+// obtained is added to the dispatcher's `inflight` counter and returned.
+// When fewer bytes than requested were reserved and `o` is non-null, `o` is
+// queued on pending_sent_conns (once) and this worker is registered with the
+// dispatcher as pending.
+int RDMAWorker::get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes)
+{
+  ceph_assert(center.in_thread());
+
+  int r = get_stack()->get_infiniband().get_tx_buffers(c, bytes);
+  ceph_assert(r >= 0);
+
+  size_t got = get_stack()->get_infiniband().get_memory_manager()->get_tx_buffer_size() * r;
+  ldout(cct, 30) << __func__ << " need " << bytes << " bytes, reserve " << got << " registered bytes, inflight " << dispatcher->inflight << dendl;
+  stack->get_dispatcher().inflight += r;
+
+  const bool came_up_short = got < bytes;
+  if (came_up_short && o) {
+    if (!o->is_pending()) {
+      pending_sent_conns.push_back(o);
+      perf_logger->inc(l_msgr_rdma_pending_sent_conns, 1);
+      o->set_pending(1);
+    }
+    dispatcher->make_pending_worker(this);
+  }
+  return r;
+}
+
+
+// Retry submission for every connection queued on pending_sent_conns.
+//
+// A connection whose submit() returns -EAGAIN is put back at the end of the
+// queue, this worker is re-registered as pending and processing stops. Any
+// other negative result faults the connection. Connections that are not
+// requeued get their pending flag cleared. Once the queue is drained the
+// dispatcher is asked to notify a pending worker.
+void RDMAWorker::handle_pending_message()
+{
+  ldout(cct, 20) << __func__ << " pending conns " << pending_sent_conns.size() << dendl;
+  while (!pending_sent_conns.empty()) {
+    RDMAConnectedSocketImpl *conn = pending_sent_conns.front();
+    pending_sent_conns.pop_front();
+
+    const ssize_t r = conn->submit(false);
+    ldout(cct, 20) << __func__ << " sent pending bl socket=" << conn << " r=" << r << dendl;
+
+    if (r == -EAGAIN) {
+      // Submission could not make progress yet: requeue and try again later.
+      pending_sent_conns.push_back(conn);
+      dispatcher->make_pending_worker(this);
+      return;
+    }
+    if (r < 0)
+      conn->fault();
+
+    conn->set_pending(0);
+    perf_logger->dec(l_msgr_rdma_pending_sent_conns, 1);
+  }
+  dispatcher->notify_pending_workers();
+}
+
+// Build the RDMA network stack: construct the Infiniband wrapper and the
+// dispatcher, then point every worker of this stack back at it.
+RDMAStack::RDMAStack(CephContext *cct, const string &t)
+  : NetworkStack(cct, t), ib(cct), dispatcher(cct, this)
+{
+  ldout(cct, 20) << __func__ << " constructing RDMAStack..." << dendl;
+
+  unsigned num = get_num_worker();
+  for (unsigned i = 0; i < num; ++i) {
+    RDMAWorker* w = dynamic_cast<RDMAWorker*>(get_worker(i));
+    // dynamic_cast yields nullptr for a worker of another type; fail loudly
+    // instead of dereferencing a null pointer.
+    ceph_assert(w);
+    w->set_stack(this);
+  }
+  ldout(cct, 20) << " creating RDMAStack:" << this << " with dispatcher:" << &dispatcher << dendl;
+}
+
+// Remove the RDMAV_HUGEPAGES_SAFE environment variable again when hugepage
+// support is enabled in the configuration.
+RDMAStack::~RDMAStack()
+{
+  const bool hugepage_enabled = cct->_conf->ms_async_rdma_enable_hugepage;
+  if (hugepage_enabled)
+    unsetenv("RDMAV_HUGEPAGES_SAFE"); //remove env variable on destruction
+}
+
+// Start the thread for worker slot `i`, running `func`.
+//
+// The thread table only ever grows: shrinking it would destroy std::thread
+// objects of higher-numbered workers that are still joinable, which calls
+// std::terminate(). `func` is moved into the new thread rather than copied.
+void RDMAStack::spawn_worker(unsigned i, std::function<void ()> &&func)
+{
+  if (threads.size() <= i)
+    threads.resize(i + 1);
+  threads[i] = std::thread(std::move(func));
+}
+
+// Block until the thread of worker slot `i` has finished. The slot must exist
+// and hold a joinable thread.
+void RDMAStack::join_worker(unsigned i)
+{
+  ceph_assert(i < threads.size() && threads[i].joinable());
+  std::thread &worker_thread = threads[i];
+  worker_thread.join();
+}
diff --git a/src/msg/async/rdma/RDMAStack.h b/src/msg/async/rdma/RDMAStack.h
new file mode 100644
index 00000000..e4d34ee0
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.h
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_RDMASTACK_H
+#define CEPH_MSG_RDMASTACK_H
+
+#include <sys/eventfd.h>
+
+#include <list>
+#include <vector>
+#include <thread>
+
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "msg/async/Stack.h"
+#include "Infiniband.h"
+
+class RDMAConnectedSocketImpl;
+class RDMAServerSocketImpl;
+class RDMAStack;
+class RDMAWorker;
+
+// Central dispatcher for the RDMA stack: owns the tx/rx completion queues and
+// channels, maps QP numbers to their queue pair and connected socket, keeps
+// retired queue pairs until they can be deleted, and tracks workers that are
+// waiting for tx buffers.
+class RDMADispatcher {
+ typedef Infiniband::MemoryManager::Chunk Chunk;
+ typedef Infiniband::QueuePair QueuePair;
+
+ std::thread t;
+ CephContext *cct;
+ Infiniband::CompletionQueue* tx_cq = nullptr;
+ Infiniband::CompletionQueue* rx_cq = nullptr;
+ Infiniband::CompletionChannel *tx_cc = nullptr, *rx_cc = nullptr;
+ EventCallbackRef async_handler;
+ bool done = false;
+ // Atomic mirrors of dead_queue_pairs / qp_conns sizes, readable without `lock`.
+ std::atomic<uint64_t> num_dead_queue_pair = {0};
+ std::atomic<uint64_t> num_qp_conn = {0};
+ Mutex lock; // protect `qp_conns`, `dead_queue_pairs`
+ // qp_num -> (QueuePair, RDMAConnectedSocketImpl)
+ // The main usage of `qp_conns` is looking up connection by qp_num,
+ // so the lifecycle of element in `qp_conns` is the lifecycle of qp.
+ /**
+ * How a queue pair is retired:
+ * 1. Connection call mark_down
+ * 2. Move the Queue Pair into the Error state(QueuePair::to_dead)
+ * 3. Wait for the affiliated event IBV_EVENT_QP_LAST_WQE_REACHED(handle_async_event)
+ * 4. Wait for all of its tx completions to be collected (handle_tx_event
+ *    decrements the per-QP tx work-request count)
+ * 5. The QueuePair object is deleted from polling() once that count is zero
+ *    (presumably this is where ibv_destroy_qp() ends up being called)
+ */
+ ceph::unordered_map<uint32_t, std::pair<QueuePair*, RDMAConnectedSocketImpl*> > qp_conns;
+
+ /// if a queue pair is closed when transmit buffers are active
+ /// on it, the transmit buffers never get returned via tx_cq. To
+ /// work around this problem, don't delete queue pairs immediately. Instead,
+ /// save them in this vector and delete them at a safe time, when there are
+ /// no outstanding transmit buffers to be lost.
+ std::vector<QueuePair*> dead_queue_pairs;
+
+ std::atomic<uint64_t> num_pending_workers = {0};
+ Mutex w_lock; // protect pending workers
+ // fixme: lockfree
+ std::list<RDMAWorker*> pending_workers;
+ RDMAStack* stack;
+
+ // Event callback that forwards to RDMADispatcher::handle_async_event().
+ class C_handle_cq_async : public EventCallback {
+ RDMADispatcher *dispatcher;
+ public:
+ explicit C_handle_cq_async(RDMADispatcher *w): dispatcher(w) {}
+ void do_request(uint64_t fd) {
+ // worker->handle_tx_event();
+ dispatcher->handle_async_event();
+ }
+ };
+
+ public:
+ PerfCounters *perf_logger;
+
+ explicit RDMADispatcher(CephContext* c, RDMAStack* s);
+ virtual ~RDMADispatcher();
+ void handle_async_event();
+
+ void polling_start();
+ void polling_stop();
+ void polling();
+ void register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi);
+ // Queue `w` as waiting for tx buffers unless it is already queued.
+ void make_pending_worker(RDMAWorker* w) {
+ Mutex::Locker l(w_lock);
+ auto it = std::find(pending_workers.begin(), pending_workers.end(), w);
+ if (it != pending_workers.end())
+ return;
+ pending_workers.push_back(w);
+ ++num_pending_workers;
+ }
+ RDMAStack* get_stack() { return stack; }
+ // *_lockless variants do not take `lock` themselves.
+ RDMAConnectedSocketImpl* get_conn_lockless(uint32_t qp);
+ QueuePair* get_qp(uint32_t qp);
+ void erase_qpn_lockless(uint32_t qpn);
+ void erase_qpn(uint32_t qpn);
+ Infiniband::CompletionQueue* get_tx_cq() const { return tx_cq; }
+ Infiniband::CompletionQueue* get_rx_cq() const { return rx_cq; }
+ void notify_pending_workers();
+ void handle_tx_event(ibv_wc *cqe, int n);
+ void post_tx_buffer(std::vector<Chunk*> &chunks);
+
+ // Number of tx chunks currently handed out; increased by
+ // RDMAWorker::get_reged_mem() and decreased by post_tx_buffer().
+ std::atomic<uint64_t> inflight = {0};
+
+ void post_chunk_to_pool(Chunk* chunk);
+ int post_chunks_to_rq(int num, ibv_qp *qp=NULL);
+};
+
+// Worker of the RDMA stack. Creates RDMA server and connected sockets,
+// reserves registered tx memory for them and retries connections whose sends
+// had to wait for tx buffers.
+class RDMAWorker : public Worker {
+ typedef Infiniband::CompletionQueue CompletionQueue;
+ typedef Infiniband::CompletionChannel CompletionChannel;
+ typedef Infiniband::MemoryManager::Chunk Chunk;
+ typedef Infiniband::MemoryManager MemoryManager;
+ typedef std::vector<Chunk*>::iterator ChunkIter;
+ RDMAStack *stack;
+ // Callback dispatched by notify_worker(); runs handle_pending_message().
+ EventCallbackRef tx_handler;
+ // Connections waiting for tx buffers, retried by handle_pending_message().
+ std::list<RDMAConnectedSocketImpl*> pending_sent_conns;
+ // Set lazily by initialize().
+ RDMADispatcher* dispatcher = nullptr;
+ Mutex lock;
+
+ // Event callback that forwards to RDMAWorker::handle_pending_message().
+ class C_handle_cq_tx : public EventCallback {
+ RDMAWorker *worker;
+ public:
+ explicit C_handle_cq_tx(RDMAWorker *w): worker(w) {}
+ void do_request(uint64_t fd) {
+ worker->handle_pending_message();
+ }
+ };
+
+ public:
+ PerfCounters *perf_logger;
+ explicit RDMAWorker(CephContext *c, unsigned i);
+ virtual ~RDMAWorker();
+ virtual int listen(entity_addr_t &addr,
+ unsigned addr_slot,
+ const SocketOptions &opts, ServerSocket *) override;
+ virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+ virtual void initialize() override;
+ RDMAStack *get_stack() { return stack; }
+ int get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes);
+ // Drop `o` from the pending list; must be called on the worker's own thread.
+ void remove_pending_conn(RDMAConnectedSocketImpl *o) {
+ ceph_assert(center.in_thread());
+ pending_sent_conns.remove(o);
+ }
+ void handle_pending_message();
+ void set_stack(RDMAStack *s) { stack = s; }
+ // Hand tx_handler to this worker's event center via dispatch_event_external().
+ void notify_worker() {
+ center.dispatch_event_external(tx_handler);
+ }
+};
+
+// Plain bundle of an RDMA CM id, its event channel and a QP number.
+// The pointers are stored as given; this struct does not manage their lifetime.
+struct RDMACMInfo {
+ RDMACMInfo(rdma_cm_id *cid, rdma_event_channel *cm_channel_, uint32_t qp_num_)
+ : cm_id(cid), cm_channel(cm_channel_), qp_num(qp_num_) {}
+ rdma_cm_id *cm_id;
+ rdma_event_channel *cm_channel;
+ uint32_t qp_num;
+};
+
+// ConnectedSocketImpl backed by an Infiniband queue pair.
+// Method definitions live outside this header.
+class RDMAConnectedSocketImpl : public ConnectedSocketImpl {
+ public:
+ typedef Infiniband::MemoryManager::Chunk Chunk;
+ typedef Infiniband::CompletionChannel CompletionChannel;
+ typedef Infiniband::CompletionQueue CompletionQueue;
+
+ protected:
+ CephContext *cct;
+ Infiniband::QueuePair *qp;
+ IBSYNMsg peer_msg;
+ IBSYNMsg my_msg;
+ // Returned as-is by is_connected(); non-zero presumably means connected.
+ int connected;
+ int error;
+ Infiniband* infiniband;
+ RDMADispatcher* dispatcher;
+ RDMAWorker* worker;
+ std::vector<Chunk*> buffers;
+ // Descriptor reported by fd().
+ int notify_fd = -1;
+ bufferlist pending_bl;
+
+ Mutex lock;
+ std::vector<ibv_wc> wc;
+ bool is_server;
+ EventCallbackRef read_handler;
+ EventCallbackRef established_handler;
+ // Descriptor reported by socket_fd().
+ int tcp_fd = -1;
+ bool active;// qp is active ?
+ // True while the socket is queued on its worker's pending_sent_conns list
+ // (set and cleared by RDMAWorker through set_pending()).
+ bool pending;
+ int post_backlog = 0;
+
+ void notify();
+ ssize_t read_buffers(char* buf, size_t len);
+ int post_work_request(std::vector<Chunk*>&);
+
+ public:
+ RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+ RDMAWorker *w);
+ virtual ~RDMAConnectedSocketImpl();
+
+ // pass_wc() receives work completions from the dispatcher's polling loop.
+ void pass_wc(std::vector<ibv_wc> &&v);
+ void get_wc(std::vector<ibv_wc> &w);
+ virtual int is_connected() override { return connected; }
+
+ virtual ssize_t read(char* buf, size_t len) override;
+ virtual ssize_t zero_copy_read(bufferptr &data) override;
+ virtual ssize_t send(bufferlist &bl, bool more) override;
+ virtual void shutdown() override;
+ virtual void close() override;
+ virtual int fd() const override { return notify_fd; }
+ virtual int socket_fd() const override { return tcp_fd; }
+ void fault();
+ // Human-readable name of the queue pair's current state.
+ const char* get_qp_state() { return Infiniband::qp_state_string(qp->get_state()); }
+ ssize_t submit(bool more);
+ int activate();
+ void fin();
+ void handle_connection();
+ int handle_connection_established(bool need_set_fault = true);
+ void cleanup();
+ void set_accept_fd(int sd);
+ virtual int try_connect(const entity_addr_t&, const SocketOptions &opt);
+ bool is_pending() {return pending;}
+ void set_pending(bool val) {pending = val;}
+ void post_chunks_to_rq(int num);
+ void update_post_backlog();
+};
+
+// Connection-manager state stored in RDMAIWARPConnectedSocketImpl::status.
+// Values start at 1 and increase in declaration order. This is an unscoped
+// enum, so the enumerators (IDLE, CONNECTED, ERROR, ...) are visible in the
+// enclosing scope.
+enum RDMA_CM_STATUS {
+ IDLE = 1,
+ RDMA_ID_CREATED,
+ CHANNEL_FD_CREATED,
+ RESOURCE_ALLOCATED,
+ ADDR_RESOLVED,
+ ROUTE_RESOLVED,
+ CONNECTED,
+ DISCONNECTED,
+ ERROR
+};
+
+// iWARP flavour of RDMAConnectedSocketImpl that additionally holds an RDMA CM
+// id and event channel. Method definitions live outside this header.
+class RDMAIWARPConnectedSocketImpl : public RDMAConnectedSocketImpl {
+ public:
+ RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+ RDMAWorker *w, RDMACMInfo *info = nullptr);
+ ~RDMAIWARPConnectedSocketImpl();
+ virtual int try_connect(const entity_addr_t&, const SocketOptions &opt) override;
+ virtual void close() override;
+ virtual void shutdown() override;
+ virtual void handle_cm_connection();
+ uint32_t get_local_qpn() const { return local_qpn; }
+ // NOTE(review): hides the non-virtual `int activate()` of the base class
+ // (different return type, not an override).
+ void activate();
+ int alloc_resource();
+ void close_notify();
+
+ private:
+ rdma_cm_id *cm_id;
+ rdma_event_channel *cm_channel;
+ uint32_t local_qpn;
+ uint32_t remote_qpn;
+ EventCallbackRef cm_con_handler;
+ // NOTE(review): shadows the protected `is_server` member of
+ // RDMAConnectedSocketImpl; the two are distinct variables.
+ bool is_server;
+ std::mutex close_mtx;
+ std::condition_variable close_condition;
+ bool closed;
+ RDMA_CM_STATUS status;
+
+
+ // Event callback that forwards to handle_cm_connection().
+ class C_handle_cm_connection : public EventCallback {
+ RDMAIWARPConnectedSocketImpl *csi;
+ public:
+ C_handle_cm_connection(RDMAIWARPConnectedSocketImpl *w): csi(w) {}
+ void do_request(uint64_t fd) {
+ csi->handle_cm_connection();
+ }
+ };
+};
+
+// Listening side of an RDMA connection. fd() and get_fd() both report
+// `server_setup_socket`. Method definitions live outside this header.
+class RDMAServerSocketImpl : public ServerSocketImpl {
+ protected:
+ CephContext *cct;
+ NetHandler net;
+ int server_setup_socket;
+ Infiniband* infiniband;
+ RDMADispatcher *dispatcher;
+ RDMAWorker *worker;
+ entity_addr_t sa;
+
+ public:
+ RDMAServerSocketImpl(CephContext *cct, Infiniband* i, RDMADispatcher *s,
+ RDMAWorker *w, entity_addr_t& a, unsigned slot);
+
+ // Declared virtual here (not an override of ServerSocketImpl); overridden by
+ // RDMAIWARPServerSocketImpl.
+ virtual int listen(entity_addr_t &sa, const SocketOptions &opt);
+ virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+ virtual void abort_accept() override;
+ virtual int fd() const override { return server_setup_socket; }
+ int get_fd() { return server_setup_socket; }
+};
+
+// iWARP flavour of RDMAServerSocketImpl; overrides listen/accept/abort_accept
+// and holds its own RDMA CM id and event channel.
+class RDMAIWARPServerSocketImpl : public RDMAServerSocketImpl {
+ public:
+ RDMAIWARPServerSocketImpl(
+ CephContext *cct, Infiniband *i, RDMADispatcher *s, RDMAWorker *w,
+ entity_addr_t& addr, unsigned addr_slot);
+ virtual int listen(entity_addr_t &sa, const SocketOptions &opt) override;
+ virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+ virtual void abort_accept() override;
+ private:
+ rdma_cm_id *cm_id;
+ rdma_event_channel *cm_channel;
+};
+
+// NetworkStack implementation for RDMA. Owns the Infiniband wrapper, the
+// dispatcher and the worker threads.
+class RDMAStack : public NetworkStack {
+ // One slot per worker index; filled by spawn_worker(), joined by join_worker().
+ vector<std::thread> threads;
+ // NOTE(review): not initialized by the constructor's initializer list.
+ PerfCounters *perf_counter;
+ // `ib` is declared before `dispatcher`, so it is constructed first.
+ Infiniband ib;
+ RDMADispatcher dispatcher;
+
+ // Set by ready(), reported by is_ready().
+ std::atomic<bool> fork_finished = {false};
+
+ public:
+ explicit RDMAStack(CephContext *cct, const string &t);
+ virtual ~RDMAStack();
+ virtual bool support_zero_copy_read() const override { return false; }
+ virtual bool nonblock_connect_need_writable_event() const override { return false; }
+
+ virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override;
+ virtual void join_worker(unsigned i) override;
+ RDMADispatcher &get_dispatcher() { return dispatcher; }
+ Infiniband &get_infiniband() { return ib; }
+ virtual bool is_ready() override { return fork_finished.load(); };
+ virtual void ready() override { fork_finished = true; };
+};
+
+
+#endif