path: root/src/msg/async/rdma
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/msg/async/rdma
parentInitial commit. (diff)
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/msg/async/rdma')
-rw-r--r--src/msg/async/rdma/Infiniband.cc1321
-rw-r--r--src/msg/async/rdma/Infiniband.h591
-rw-r--r--src/msg/async/rdma/RDMAConnectedSocketImpl.cc607
-rw-r--r--src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc183
-rw-r--r--src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc119
-rw-r--r--src/msg/async/rdma/RDMAServerSocketImpl.cc135
-rw-r--r--src/msg/async/rdma/RDMAStack.cc812
-rw-r--r--src/msg/async/rdma/RDMAStack.h344
8 files changed, 4112 insertions, 0 deletions
diff --git a/src/msg/async/rdma/Infiniband.cc b/src/msg/async/rdma/Infiniband.cc
new file mode 100644
index 000000000..52323f948
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.cc
@@ -0,0 +1,1321 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Infiniband.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "RDMAStack.h"
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "Infiniband "
+
+static const uint32_t MAX_SHARED_RX_SGE_COUNT = 1;
+static const uint32_t MAX_INLINE_DATA = 0;
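+// Fixed-width TCP handshake message ("lid:qpn:psn:peer_qpn:gid", all hex)
+// exchanged while connecting; its length must match the format strings used
+// by QueuePair::send_cm_meta() and recv_cm_meta() below.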
+static const uint32_t TCP_MSG_LEN = sizeof("0000:00000000:00000000:00000000:00000000000000000000000000000000");
+static const uint32_t CQ_DEPTH = 30000;
+
+Port::Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn): ctxt(ictxt), port_num(ipn),
+ gid_idx(cct->_conf.get_val<int64_t>("ms_async_rdma_gid_idx"))
+{
+ int r = ibv_query_port(ctxt, port_num, &port_attr);
+ if (r == -1) {
+ lderr(cct) << __func__ << " query port failed " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ lid = port_attr.lid;
+ ceph_assert(gid_idx < port_attr.gid_tbl_len);
+#ifdef HAVE_IBV_EXP
+ union ibv_gid cgid;
+ struct ibv_exp_gid_attr gid_attr;
+ bool malformed = false;
+
+ ldout(cct,1) << __func__ << " using experimental verbs for gid" << dendl;
+
+
+ // search for requested GID in GIDs table
+ ldout(cct, 1) << __func__ << " looking for local GID " << (cct->_conf->ms_async_rdma_local_gid)
+ << " of type " << (cct->_conf->ms_async_rdma_roce_ver) << dendl;
+ r = sscanf(cct->_conf->ms_async_rdma_local_gid.c_str(),
+ "%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx"
+ ":%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx",
+ &cgid.raw[ 0], &cgid.raw[ 1],
+ &cgid.raw[ 2], &cgid.raw[ 3],
+ &cgid.raw[ 4], &cgid.raw[ 5],
+ &cgid.raw[ 6], &cgid.raw[ 7],
+ &cgid.raw[ 8], &cgid.raw[ 9],
+ &cgid.raw[10], &cgid.raw[11],
+ &cgid.raw[12], &cgid.raw[13],
+ &cgid.raw[14], &cgid.raw[15]);
+
+ if (r != 16) {
+ ldout(cct, 1) << __func__ << " malformed or no GID supplied, using GID index 0" << dendl;
+ malformed = true;
+ }
+
+ gid_attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE;
+
+ for (gid_idx = 0; gid_idx < port_attr.gid_tbl_len; gid_idx++) {
+ r = ibv_query_gid(ctxt, port_num, gid_idx, &gid);
+ if (r) {
+ lderr(cct) << __func__ << " query gid of port " << port_num << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+ r = ibv_exp_query_gid_attr(ctxt, port_num, gid_idx, &gid_attr);
+ if (r) {
+ lderr(cct) << __func__ << " query gid attributes of port " << port_num << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+
+ if (malformed) break; // stay with gid_idx=0
+ if ( (gid_attr.type == cct->_conf->ms_async_rdma_roce_ver) &&
+ (memcmp(&gid, &cgid, 16) == 0) ) {
+ ldout(cct, 1) << __func__ << " found at index " << gid_idx << dendl;
+ break;
+ }
+ }
+
+ if (gid_idx == port_attr.gid_tbl_len) {
+ lderr(cct) << __func__ << " Requested local GID was not found in GID table" << dendl;
+ ceph_abort();
+ }
+#else
+ r = ibv_query_gid(ctxt, port_num, gid_idx, &gid);
+ if (r) {
+ lderr(cct) << __func__ << " query gid failed " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+#endif
+}
+
+Device::Device(CephContext *cct, ibv_device* ib_dev): device(ib_dev), active_port(nullptr)
+{
+ ceph_assert(device);
+ ctxt = ibv_open_device(device);
+ ceph_assert(ctxt);
+
+ name = ibv_get_device_name(device);
+
+ int r = ibv_query_device(ctxt, &device_attr);
+ if (r) {
+ lderr(cct) << __func__ << " failed to query rdma device. " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+}
+
+Device::Device(CephContext *cct, struct ibv_context *ib_ctx): device(ib_ctx->device),
+ active_port(nullptr)
+{
+ ceph_assert(device);
+ ctxt = ib_ctx;
+ ceph_assert(ctxt);
+
+ name = ibv_get_device_name(device);
+
+ int r = ibv_query_device(ctxt, &device_attr);
+ if (r) {
+ lderr(cct) << __func__ << " failed to query rdma device. " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+}
+
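+// Scan the device's physical ports and select port_num if it is ACTIVE;
+// aborts if the requested port is not found in the ACTIVE state.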
+void Device::binding_port(CephContext *cct, int port_num) {
+ port_cnt = device_attr.phys_port_cnt;
+ for (uint8_t port_id = 1; port_id <= port_cnt; ++port_id) {
+ Port *port = new Port(cct, ctxt, port_id);
+ if (port_id == port_num && port->get_port_attr()->state == IBV_PORT_ACTIVE) {
+ active_port = port;
+ ldout(cct, 1) << __func__ << " found active port " << static_cast<int>(port_id) << dendl;
+ break;
+ } else {
+ ldout(cct, 10) << __func__ << " port " << port_id << " is not what we want. state: "
+ << ibv_port_state_str(port->get_port_attr()->state) << dendl;
+ delete port;
+ }
+ }
+ if (nullptr == active_port) {
+ lderr(cct) << __func__ << " port not found" << dendl;
+ ceph_assert(active_port);
+ }
+}
+
+
+Infiniband::QueuePair::QueuePair(
+ CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+ int port, ibv_srq *srq,
+ Infiniband::CompletionQueue* txcq, Infiniband::CompletionQueue* rxcq,
+ uint32_t tx_queue_len, uint32_t rx_queue_len, struct rdma_cm_id *cid, uint32_t q_key)
+: cct(c), infiniband(infiniband),
+ type(type),
+ ctxt(infiniband.device->ctxt),
+ ib_physical_port(port),
+ pd(infiniband.pd->pd),
+ srq(srq),
+ qp(NULL),
+ cm_id(cid), peer_cm_meta{0}, local_cm_meta{0},
+ txcq(txcq),
+ rxcq(rxcq),
+ initial_psn(lrand48() & PSN_MSK),
+ // One extra WR for beacon
+ max_send_wr(tx_queue_len + 1),
+ max_recv_wr(rx_queue_len),
+ q_key(q_key),
+ dead(false)
+{
+ if (type != IBV_QPT_RC && type != IBV_QPT_UD && type != IBV_QPT_RAW_PACKET) {
+ lderr(cct) << __func__ << " invalid queue pair type" << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+}
+
+int Infiniband::QueuePair::modify_qp_to_error(void)
+{
+ ibv_qp_attr qpa;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&qpa, 0, sizeof(qpa));
+ qpa.qp_state = IBV_QPS_ERR;
+ if (ibv_modify_qp(qp, &qpa, IBV_QP_STATE)) {
+ lderr(cct) << __func__ << " failed to transition to ERROR state: " << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+  ldout(cct, 20) << __func__ << " transitioned to ERROR state successfully." << dendl;
+ return 0;
+}
+
+int Infiniband::QueuePair::modify_qp_to_rts(void)
+{
+  // move from RTR state to RTS state
+ ibv_qp_attr qpa;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&qpa, 0, sizeof(qpa));
+ qpa.qp_state = IBV_QPS_RTS;
+ /*
+ * How long to wait before retrying if packet lost or server dead.
+ * Supposedly the timeout is 4.096us*2^timeout. However, the actual
+ * timeout appears to be 4.096us*2^(timeout+1), so the setting
+ * below creates a 135ms timeout.
+ */
+ qpa.timeout = 0x12;
+ // How many times to retry after timeouts before giving up.
+ qpa.retry_cnt = 7;
+ /*
+ * How many times to retry after RNR (receiver not ready) condition
+ * before giving up. Occurs when the remote side has not yet posted
+ * a receive request.
+ */
+ qpa.rnr_retry = 7; // 7 is infinite retry.
+ qpa.sq_psn = local_cm_meta.psn;
+ qpa.max_rd_atomic = 1;
+
+ int attr_mask = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC;
+ int r = ibv_modify_qp(qp, &qpa, attr_mask);
+ if (r) {
+ lderr(cct) << __func__ << " failed to transition to RTS state: " << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+  ldout(cct, 20) << __func__ << " transitioned to RTS state successfully." << dendl;
+ return 0;
+}
+
+int Infiniband::QueuePair::modify_qp_to_rtr(void)
+{
+ // move from INIT to RTR state
+ ibv_qp_attr qpa;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&qpa, 0, sizeof(qpa));
+ qpa.qp_state = IBV_QPS_RTR;
+ qpa.path_mtu = IBV_MTU_1024;
+ qpa.dest_qp_num = peer_cm_meta.local_qpn;
+ qpa.rq_psn = peer_cm_meta.psn;
+ qpa.max_dest_rd_atomic = 1;
+ qpa.min_rnr_timer = 0x12;
+ qpa.ah_attr.is_global = 1;
+ qpa.ah_attr.grh.hop_limit = 6;
+ qpa.ah_attr.grh.dgid = peer_cm_meta.gid;
+ qpa.ah_attr.grh.sgid_index = infiniband.get_device()->get_gid_idx();
+ qpa.ah_attr.grh.traffic_class = cct->_conf->ms_async_rdma_dscp;
+ //qpa.ah_attr.grh.flow_label = 0;
+
+ qpa.ah_attr.dlid = peer_cm_meta.lid;
+ qpa.ah_attr.sl = cct->_conf->ms_async_rdma_sl;
+ qpa.ah_attr.src_path_bits = 0;
+ qpa.ah_attr.port_num = (uint8_t)(ib_physical_port);
+
+ ldout(cct, 20) << __func__ << " Choosing gid_index " << (int)qpa.ah_attr.grh.sgid_index << ", sl " << (int)qpa.ah_attr.sl << dendl;
+
+ int attr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC;
+
+ int r = ibv_modify_qp(qp, &qpa, attr_mask);
+ if (r) {
+ lderr(cct) << __func__ << " failed to transition to RTR state: " << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+  ldout(cct, 20) << __func__ << " transitioned to RTR state successfully." << dendl;
+ return 0;
+}
+
+int Infiniband::QueuePair::modify_qp_to_init(void)
+{
+ // move from RESET to INIT state
+ ibv_qp_attr qpa;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&qpa, 0, sizeof(qpa));
+ qpa.qp_state = IBV_QPS_INIT;
+ qpa.pkey_index = 0;
+ qpa.port_num = (uint8_t)(ib_physical_port);
+ qpa.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
+ qpa.qkey = q_key;
+
+ int mask = IBV_QP_STATE | IBV_QP_PORT;
+ switch (type) {
+ case IBV_QPT_RC:
+ mask |= IBV_QP_ACCESS_FLAGS;
+ mask |= IBV_QP_PKEY_INDEX;
+ break;
+ case IBV_QPT_UD:
+ mask |= IBV_QP_QKEY;
+ mask |= IBV_QP_PKEY_INDEX;
+ break;
+ case IBV_QPT_RAW_PACKET:
+ break;
+ default:
+ ceph_abort();
+ }
+
+ if (ibv_modify_qp(qp, &qpa, mask)) {
+    lderr(cct) << __func__ << " failed to switch Queue Pair to INIT state, qp number: " << qp->qp_num
+	       << " Error: " << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+  ldout(cct, 20) << __func__ << " successfully switched Queue Pair to INIT state, qp number: " << qp->qp_num << dendl;
+ return 0;
+}
+
+int Infiniband::QueuePair::init()
+{
+ ldout(cct, 20) << __func__ << " started." << dendl;
+ ibv_qp_init_attr qpia;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&qpia, 0, sizeof(qpia));
+ qpia.send_cq = txcq->get_cq();
+ qpia.recv_cq = rxcq->get_cq();
+ if (srq) {
+ qpia.srq = srq; // use the same shared receive queue
+ } else {
+ qpia.cap.max_recv_wr = max_recv_wr;
+ qpia.cap.max_recv_sge = 1;
+ }
+ qpia.cap.max_send_wr = max_send_wr; // max outstanding send requests
+ qpia.cap.max_send_sge = 1; // max send scatter-gather elements
+ qpia.cap.max_inline_data = MAX_INLINE_DATA; // max bytes of immediate data on send q
+ qpia.qp_type = type; // RC, UC, UD, or XRC
+ qpia.sq_sig_all = 0; // only generate CQEs on requested WQEs
+
+ if (!cct->_conf->ms_async_rdma_cm) {
+ qp = ibv_create_qp(pd, &qpia);
+ if (qp == NULL) {
+ lderr(cct) << __func__ << " failed to create queue pair" << cpp_strerror(errno) << dendl;
+ if (errno == ENOMEM) {
+ lderr(cct) << __func__ << " try reducing ms_async_rdma_receive_queue_length, "
+ " ms_async_rdma_send_buffers or"
+ " ms_async_rdma_buffer_size" << dendl;
+ }
+ return -1;
+ }
+ if (modify_qp_to_init() != 0) {
+ ibv_destroy_qp(qp);
+ return -1;
+ }
+ } else {
+ ceph_assert(cm_id->verbs == pd->context);
+ if (rdma_create_qp(cm_id, pd, &qpia)) {
+ lderr(cct) << __func__ << " failed to create queue pair with rdmacm library"
+ << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+ qp = cm_id->qp;
+ }
+  ldout(cct, 20) << __func__ << " successfully created queue pair: "
+ << "qp=" << qp << dendl;
+ local_cm_meta.local_qpn = get_local_qp_number();
+ local_cm_meta.psn = get_initial_psn();
+ local_cm_meta.lid = infiniband.get_lid();
+ local_cm_meta.peer_qpn = 0;
+ local_cm_meta.gid = infiniband.get_gid();
+ if (!srq) {
+ int rq_wrs = infiniband.post_chunks_to_rq(max_recv_wr, this);
+ if (rq_wrs == 0) {
+      lderr(cct) << __func__ << " initialize no-SRQ Queue Pair, qp number: " << qp->qp_num
+                 << " fatal error: can't post RQ WR " << dendl;
+ return -1;
+ }
+    ldout(cct, 20) << __func__ << " initialize no-SRQ Queue Pair, qp number: "
+                   << qp->qp_num << " posted RQ WR " << rq_wrs << dendl;
+ }
+ return 0;
+}
+
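+// Convert the 32-hex-character wire form of a GID (as written by
+// gid_to_wire_gid()) back into the 16 raw bytes of cm_meta_data->gid,
+// eight hex characters (one 32-bit word) at a time.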
+void Infiniband::QueuePair::wire_gid_to_gid(const char *wgid, ib_cm_meta_t* cm_meta_data)
+{
+ char tmp[9];
+ uint32_t v32;
+ int i;
+
+ for (tmp[8] = 0, i = 0; i < 4; ++i) {
+ memcpy(tmp, wgid + i * 8, 8);
+ sscanf(tmp, "%x", &v32);
+ *(uint32_t *)(&cm_meta_data->gid.raw[i * 4]) = ntohl(v32);
+ }
+}
+
+void Infiniband::QueuePair::gid_to_wire_gid(const ib_cm_meta_t& cm_meta_data, char wgid[])
+{
+ for (int i = 0; i < 4; ++i)
+ sprintf(&wgid[i * 8], "%08x", htonl(*(uint32_t *)(cm_meta_data.gid.raw + i * 4)));
+}
+
+/*
+ * return value
+ *  > 0: a complete connection-management message was read (its length)
+ *    0: the peer closed the connection (orderly disconnect)
+ *  < 0: error
+ */
+int Infiniband::QueuePair::recv_cm_meta(CephContext *cct, int socket_fd)
+{
+ char msg[TCP_MSG_LEN];
+ char gid[33];
+ ssize_t r = ::read(socket_fd, &msg, sizeof(msg));
+  // optionally simulate a socket failure and drop the incoming CM message
+ if (cct->_conf->ms_inject_socket_failures && socket_fd >= 0) {
+ if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+ ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+ return -EINVAL;
+ }
+ }
+ if (r < 0) {
+ r = -errno;
+ lderr(cct) << __func__ << " got error " << r << ": "
+ << cpp_strerror(r) << dendl;
+ } else if (r == 0) { // valid disconnect message of length 0
+ ldout(cct, 10) << __func__ << " got disconnect message " << dendl;
+ } else if ((size_t)r != sizeof(msg)) { // invalid message
+ ldout(cct, 1) << __func__ << " got bad length (" << r << ") " << dendl;
+ r = -EINVAL;
+ } else { // valid message
+ sscanf(msg, "%hx:%x:%x:%x:%s", &(peer_cm_meta.lid), &(peer_cm_meta.local_qpn), &(peer_cm_meta.psn), &(peer_cm_meta.peer_qpn), gid);
+ wire_gid_to_gid(gid, &peer_cm_meta);
+    ldout(cct, 5) << __func__ << " received: " << peer_cm_meta.lid << ", " << peer_cm_meta.local_qpn
+ << ", " << peer_cm_meta.psn << ", " << peer_cm_meta.peer_qpn << ", " << gid << dendl;
+ }
+ return r;
+}
+
+int Infiniband::QueuePair::send_cm_meta(CephContext *cct, int socket_fd)
+{
+ int retry = 0;
+ ssize_t r;
+
+ char msg[TCP_MSG_LEN];
+ char gid[33];
+retry:
+ gid_to_wire_gid(local_cm_meta, gid);
+ sprintf(msg, "%04x:%08x:%08x:%08x:%s", local_cm_meta.lid, local_cm_meta.local_qpn, local_cm_meta.psn, local_cm_meta.peer_qpn, gid);
+ ldout(cct, 10) << __func__ << " sending: " << local_cm_meta.lid << ", " << local_cm_meta.local_qpn
+ << ", " << local_cm_meta.psn << ", " << local_cm_meta.peer_qpn << ", " << gid << dendl;
+ r = ::write(socket_fd, msg, sizeof(msg));
+  // optionally simulate a socket failure after sending the CM message
+ if (cct->_conf->ms_inject_socket_failures && socket_fd >= 0) {
+ if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+ ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+ return -EINVAL;
+ }
+ }
+
+ if ((size_t)r != sizeof(msg)) {
+ // FIXME need to handle EAGAIN instead of retry
+ if (r < 0 && (errno == EINTR || errno == EAGAIN) && retry < 3) {
+ retry++;
+ goto retry;
+ }
+ if (r < 0)
+ lderr(cct) << __func__ << " send returned error " << errno << ": "
+ << cpp_strerror(errno) << dendl;
+ else
+ lderr(cct) << __func__ << " send got bad length (" << r << ") " << cpp_strerror(errno) << dendl;
+ return -errno;
+ }
+ return 0;
+}
+
+/**
+ * Switch QP to ERROR state and then post a beacon to be able to drain all
+ * WCEs and then safely destroy QP. See RDMADispatcher::handle_tx_event()
+ * for details.
+ *
+ * \return
+ * -errno if the QueuePair can't switch to ERROR
+ * 0 for success.
+ */
+int Infiniband::QueuePair::to_dead()
+{
+ if (dead)
+ return 0;
+
+ if (modify_qp_to_error()) {
+ return -1;
+ }
+  ldout(cct, 20) << __func__ << " forced Queue Pair into error state, qp number: " << local_cm_meta.local_qpn
+ << " bound remote QueuePair, qp number: " << local_cm_meta.peer_qpn << dendl;
+
+ struct ibv_send_wr *bad_wr = nullptr, beacon;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&beacon, 0, sizeof(beacon));
+ beacon.wr_id = BEACON_WRID;
+ beacon.opcode = IBV_WR_SEND;
+ beacon.send_flags = IBV_SEND_SIGNALED;
+ if (ibv_post_send(qp, &beacon, &bad_wr)) {
+ lderr(cct) << __func__ << " failed to send a beacon: " << cpp_strerror(errno) << dendl;
+ return -errno;
+ }
+  ldout(cct, 20) << __func__ << " Queue Pair in error state, qp number: " << local_cm_meta.local_qpn << " beacon sent " << dendl;
+ dead = true;
+
+ return 0;
+}
+
+int Infiniband::QueuePair::get_remote_qp_number(uint32_t *rqp) const
+{
+ ibv_qp_attr qpa;
+ ibv_qp_init_attr qpia;
+
+ int r = ibv_query_qp(qp, &qpa, IBV_QP_DEST_QPN, &qpia);
+ if (r) {
+ lderr(cct) << __func__ << " failed to query qp: "
+ << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+
+ if (rqp)
+ *rqp = qpa.dest_qp_num;
+ return 0;
+}
+
+/**
+ * Get the remote infiniband address for this QueuePair, as set in #plumb().
+ * LIDs are "local IDs" in infiniband terminology. They are short, locally
+ * routable addresses.
+ */
+int Infiniband::QueuePair::get_remote_lid(uint16_t *lid) const
+{
+ ibv_qp_attr qpa;
+ ibv_qp_init_attr qpia;
+
+ int r = ibv_query_qp(qp, &qpa, IBV_QP_AV, &qpia);
+ if (r) {
+ lderr(cct) << __func__ << " failed to query qp: "
+ << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+
+ if (lid)
+ *lid = qpa.ah_attr.dlid;
+ return 0;
+}
+
+/**
+ * Get the state of a QueuePair.
+ */
+int Infiniband::QueuePair::get_state() const
+{
+ ibv_qp_attr qpa;
+ ibv_qp_init_attr qpia;
+
+ int r = ibv_query_qp(qp, &qpa, IBV_QP_STATE, &qpia);
+ if (r) {
+ lderr(cct) << __func__ << " failed to get state: "
+ << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+ return qpa.qp_state;
+}
+
+Infiniband::CompletionChannel::CompletionChannel(CephContext *c, Infiniband &ib)
+ : cct(c), infiniband(ib), channel(NULL), cq(NULL), cq_events_that_need_ack(0)
+{
+}
+
+Infiniband::CompletionChannel::~CompletionChannel()
+{
+ if (channel) {
+ int r = ibv_destroy_comp_channel(channel);
+ if (r < 0)
+ lderr(cct) << __func__ << " failed to destroy cc: " << cpp_strerror(errno) << dendl;
+ ceph_assert(r == 0);
+ }
+}
+
+int Infiniband::CompletionChannel::init()
+{
+ ldout(cct, 20) << __func__ << " started." << dendl;
+ channel = ibv_create_comp_channel(infiniband.device->ctxt);
+ if (!channel) {
+ lderr(cct) << __func__ << " failed to create receive completion channel: "
+ << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+ int rc = ceph::NetHandler(cct).set_nonblock(channel->fd);
+ if (rc < 0) {
+ ibv_destroy_comp_channel(channel);
+ return -1;
+ }
+ return 0;
+}
+
+void Infiniband::CompletionChannel::ack_events()
+{
+ ibv_ack_cq_events(cq, cq_events_that_need_ack);
+ cq_events_that_need_ack = 0;
+}
+
+bool Infiniband::CompletionChannel::get_cq_event()
+{
+ ibv_cq *cq = NULL;
+ void *ev_ctx;
+ if (ibv_get_cq_event(channel, &cq, &ev_ctx)) {
+ if (errno != EAGAIN && errno != EINTR)
+ lderr(cct) << __func__ << " failed to retrieve CQ event: "
+ << cpp_strerror(errno) << dendl;
+ return false;
+ }
+
+  /* accumulate the number of cq events that need to
+   * be acked, and periodically ack them
+   */
+ if (++cq_events_that_need_ack == MAX_ACK_EVENT) {
+    ldout(cct, 20) << __func__ << " ack cq events." << dendl;
+ ibv_ack_cq_events(cq, MAX_ACK_EVENT);
+ cq_events_that_need_ack = 0;
+ }
+
+ return true;
+}
+
+
+Infiniband::CompletionQueue::~CompletionQueue()
+{
+ if (cq) {
+ int r = ibv_destroy_cq(cq);
+ if (r < 0)
+ lderr(cct) << __func__ << " failed to destroy cq: " << cpp_strerror(errno) << dendl;
+ ceph_assert(r == 0);
+ }
+}
+
+int Infiniband::CompletionQueue::init()
+{
+ cq = ibv_create_cq(infiniband.device->ctxt, queue_depth, this, channel->get_channel(), 0);
+ if (!cq) {
+ lderr(cct) << __func__ << " failed to create receive completion queue: "
+ << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+
+ if (ibv_req_notify_cq(cq, 0)) {
+ lderr(cct) << __func__ << " ibv_req_notify_cq failed: " << cpp_strerror(errno) << dendl;
+ ibv_destroy_cq(cq);
+ cq = nullptr;
+ return -1;
+ }
+
+ channel->bind_cq(cq);
+  ldout(cct, 20) << __func__ << " successfully created cq=" << cq << dendl;
+ return 0;
+}
+
+int Infiniband::CompletionQueue::rearm_notify(bool solicited_only)
+{
+ ldout(cct, 20) << __func__ << " started." << dendl;
+ int r = ibv_req_notify_cq(cq, 0);
+ if (r < 0)
+ lderr(cct) << __func__ << " failed to notify cq: " << cpp_strerror(errno) << dendl;
+ return r;
+}
+
+int Infiniband::CompletionQueue::poll_cq(int num_entries, ibv_wc *ret_wc_array) {
+ int r = ibv_poll_cq(cq, num_entries, ret_wc_array);
+ if (r < 0) {
+    lderr(cct) << __func__ << " polling completion queue failed: "
+ << cpp_strerror(errno) << dendl;
+ return -1;
+ }
+ return r;
+}
+
+
+Infiniband::ProtectionDomain::ProtectionDomain(CephContext *cct, Device *device)
+ : pd(ibv_alloc_pd(device->ctxt))
+{
+ if (pd == NULL) {
+ lderr(cct) << __func__ << " failed to allocate infiniband protection domain: " << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+}
+
+Infiniband::ProtectionDomain::~ProtectionDomain()
+{
+ ibv_dealloc_pd(pd);
+}
+
+
+Infiniband::MemoryManager::Chunk::Chunk(ibv_mr* m, uint32_t bytes, char* buffer,
+ uint32_t offset, uint32_t bound, uint32_t lkey, QueuePair* qp)
+ : mr(m), qp(qp), lkey(lkey), bytes(bytes), offset(offset), bound(bound), buffer(buffer)
+{
+}
+
+Infiniband::MemoryManager::Chunk::~Chunk()
+{
+}
+
+uint32_t Infiniband::MemoryManager::Chunk::get_offset()
+{
+ return offset;
+}
+
+uint32_t Infiniband::MemoryManager::Chunk::get_size() const
+{
+ return bound - offset;
+}
+
+void Infiniband::MemoryManager::Chunk::prepare_read(uint32_t b)
+{
+ offset = 0;
+ bound = b;
+}
+
+uint32_t Infiniband::MemoryManager::Chunk::get_bound()
+{
+ return bound;
+}
+
+uint32_t Infiniband::MemoryManager::Chunk::read(char* buf, uint32_t len)
+{
+ uint32_t left = get_size();
+ uint32_t read_len = left <= len ? left : len;
+ memcpy(buf, buffer + offset, read_len);
+ offset += read_len;
+ return read_len;
+}
+
+uint32_t Infiniband::MemoryManager::Chunk::write(char* buf, uint32_t len)
+{
+ uint32_t write_len = (bytes - offset) <= len ? (bytes - offset) : len;
+ memcpy(buffer + offset, buf, write_len);
+ offset += write_len;
+ return write_len;
+}
+
+bool Infiniband::MemoryManager::Chunk::full()
+{
+ return offset == bytes;
+}
+
+void Infiniband::MemoryManager::Chunk::reset_read_chunk()
+{
+ offset = 0;
+ bound = 0;
+}
+
+void Infiniband::MemoryManager::Chunk::reset_write_chunk()
+{
+ offset = 0;
+ bound = bytes;
+}
+
+Infiniband::MemoryManager::Cluster::Cluster(MemoryManager& m, uint32_t s)
+ : manager(m), buffer_size(s)
+{
+}
+
+Infiniband::MemoryManager::Cluster::~Cluster()
+{
+ int r = ibv_dereg_mr(chunk_base->mr);
+ ceph_assert(r == 0);
+ const auto chunk_end = chunk_base + num_chunk;
+ for (auto chunk = chunk_base; chunk != chunk_end; chunk++) {
+ chunk->~Chunk();
+ }
+
+ ::free(chunk_base);
+ manager.free(base);
+}
+
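+// Allocate the cluster's backing arena: one contiguous buffer_size * num
+// region registered as a single memory region, carved into num Chunk
+// descriptors that seed the free_chunks list.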
+int Infiniband::MemoryManager::Cluster::fill(uint32_t num)
+{
+ ceph_assert(!base);
+ num_chunk = num;
+ uint32_t bytes = buffer_size * num;
+
+ base = (char*)manager.malloc(bytes);
+ end = base + bytes;
+ ceph_assert(base);
+ chunk_base = static_cast<Chunk*>(::malloc(sizeof(Chunk) * num));
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(static_cast<void*>(chunk_base), 0, sizeof(Chunk) * num);
+ free_chunks.reserve(num);
+ ibv_mr* m = ibv_reg_mr(manager.pd->pd, base, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+ ceph_assert(m);
+ Chunk* chunk = chunk_base;
+ for (uint32_t offset = 0; offset < bytes; offset += buffer_size){
+ new(chunk) Chunk(m, buffer_size, base + offset, 0, buffer_size, m->lkey);
+ free_chunks.push_back(chunk);
+ chunk++;
+ }
+ return 0;
+}
+
+void Infiniband::MemoryManager::Cluster::take_back(std::vector<Chunk*> &ck)
+{
+ std::lock_guard l{lock};
+ for (auto c : ck) {
+ c->reset_write_chunk();
+ free_chunks.push_back(c);
+ }
+}
+
+int Infiniband::MemoryManager::Cluster::get_buffers(std::vector<Chunk*> &chunks, size_t block_size)
+{
+ std::lock_guard l{lock};
+ uint32_t chunk_buffer_number = (block_size + buffer_size - 1) / buffer_size;
+ chunk_buffer_number = free_chunks.size() < chunk_buffer_number ? free_chunks.size(): chunk_buffer_number;
+ uint32_t r = 0;
+
+ for (r = 0; r < chunk_buffer_number; ++r) {
+ chunks.push_back(free_chunks.back());
+ free_chunks.pop_back();
+ }
+ return r;
+}
+
+bool Infiniband::MemoryManager::MemPoolContext::can_alloc(unsigned nbufs)
+{
+ /* unlimited */
+ if (manager->cct->_conf->ms_async_rdma_receive_buffers <= 0)
+ return true;
+
+ if (n_bufs_allocated + nbufs > (unsigned)manager->cct->_conf->ms_async_rdma_receive_buffers) {
+ lderr(manager->cct) << __func__ << " WARNING: OUT OF RX BUFFERS: allocated: " <<
+ n_bufs_allocated << " requested: " << nbufs <<
+ " limit: " << manager->cct->_conf->ms_async_rdma_receive_buffers << dendl;
+ return false;
+ }
+
+ return true;
+}
+
+void Infiniband::MemoryManager::MemPoolContext::set_stat_logger(PerfCounters *logger) {
+ perf_logger = logger;
+ if (perf_logger != nullptr)
+ perf_logger->set(l_msgr_rdma_rx_bufs_total, n_bufs_allocated);
+}
+
+void Infiniband::MemoryManager::MemPoolContext::update_stats(int nbufs)
+{
+ n_bufs_allocated += nbufs;
+
+ if (!perf_logger)
+ return;
+
+ if (nbufs > 0) {
+ perf_logger->inc(l_msgr_rdma_rx_bufs_total, nbufs);
+ } else {
+ perf_logger->dec(l_msgr_rdma_rx_bufs_total, -nbufs);
+ }
+}
+
+void *Infiniband::MemoryManager::mem_pool::slow_malloc()
+{
+ // this will trigger pool expansion via PoolAllocator::malloc()
+ return PoolAllocator::with_context(ctx, [this] {
+ return boost::pool<PoolAllocator>::malloc();
+ });
+}
+
+Infiniband::MemoryManager::MemPoolContext*
+Infiniband::MemoryManager::PoolAllocator::g_ctx = nullptr;
+
+// lock is taken by mem_pool::slow_malloc()
+ceph::mutex& Infiniband::MemoryManager::PoolAllocator::get_lock()
+{
+ static ceph::mutex lock = ceph::make_mutex("pool-alloc-lock");
+ return lock;
+}
+
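+// Allocate one block for the RX buffer pool: a mem_info header followed by
+// Chunk headers, each carrying ms_async_rdma_buffer_size bytes of payload,
+// and register the chunk area with the RDMA device so the chunks can be
+// posted as receive buffers.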
+char *Infiniband::MemoryManager::PoolAllocator::malloc(const size_type block_size)
+{
+ ceph_assert(g_ctx);
+ MemoryManager *manager = g_ctx->manager;
+ CephContext *cct = manager->cct;
+ size_t chunk_buffer_size = sizeof(Chunk) + cct->_conf->ms_async_rdma_buffer_size;
+ size_t chunk_buffer_number = block_size / chunk_buffer_size;
+
+ if (!g_ctx->can_alloc(chunk_buffer_number))
+ return NULL;
+
+ mem_info *minfo= static_cast<mem_info *>(manager->malloc(block_size + sizeof(mem_info)));
+ if (!minfo) {
+    lderr(cct) << __func__ << " failed to allocate " << chunk_buffer_number << " buffers;"
+	 " block size is: " << block_size + sizeof(mem_info) << dendl;
+ return NULL;
+ }
+
+ minfo->mr = ibv_reg_mr(manager->pd->pd, minfo->chunks, block_size, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+ if (minfo->mr == NULL) {
+    lderr(cct) << __func__ << " failed to register " << block_size << " bytes of rdma memory;"
+                      " releasing allocated memory now." << dendl;
+ manager->free(minfo);
+ return NULL;
+ }
+
+ minfo->nbufs = chunk_buffer_number;
+ // save this chunk context
+ minfo->ctx = g_ctx;
+
+ // note that the memory can be allocated before perf logger is set
+ g_ctx->update_stats(chunk_buffer_number);
+
+ /* initialize chunks */
+ Chunk *chunk = minfo->chunks;
+ for (unsigned i = 0; i < chunk_buffer_number; i++) {
+ new(chunk) Chunk(minfo->mr, cct->_conf->ms_async_rdma_buffer_size, chunk->data, 0, 0, minfo->mr->lkey);
+ chunk = reinterpret_cast<Chunk *>(reinterpret_cast<char *>(chunk) + chunk_buffer_size);
+ }
+
+ return reinterpret_cast<char *>(minfo->chunks);
+}
+
+
+void Infiniband::MemoryManager::PoolAllocator::free(char * const block)
+{
+ mem_info *m;
+ std::lock_guard l{get_lock()};
+
+ Chunk *mem_info_chunk = reinterpret_cast<Chunk *>(block);
+ m = reinterpret_cast<mem_info *>(reinterpret_cast<char *>(mem_info_chunk) - offsetof(mem_info, chunks));
+ m->ctx->update_stats(-m->nbufs);
+ ibv_dereg_mr(m->mr);
+ m->ctx->manager->free(m);
+}
+
+Infiniband::MemoryManager::MemoryManager(CephContext *c, Device *d, ProtectionDomain *p)
+ : cct(c), device(d), pd(p),
+ rxbuf_pool_ctx(this),
+ rxbuf_pool(&rxbuf_pool_ctx, sizeof(Chunk) + c->_conf->ms_async_rdma_buffer_size,
+ c->_conf->ms_async_rdma_receive_buffers > 0 ?
+ // if possible make initial pool size 2 * receive_queue_len
+ // that way there will be no pool expansion upon receive of the
+ // first packet.
+ (c->_conf->ms_async_rdma_receive_buffers < 2 * c->_conf->ms_async_rdma_receive_queue_len ?
+ c->_conf->ms_async_rdma_receive_buffers : 2 * c->_conf->ms_async_rdma_receive_queue_len) :
+ // rx pool is infinite, we can set any initial size that we want
+ 2 * c->_conf->ms_async_rdma_receive_queue_len,
+ device->device_attr.max_mr_size / (sizeof(Chunk) + cct->_conf->ms_async_rdma_buffer_size))
+{
+}
+
+Infiniband::MemoryManager::~MemoryManager()
+{
+ if (send)
+ delete send;
+}
+
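+// Allocate size bytes backed by 2MB huge pages. The mapping is padded by one
+// extra huge page so the real mapping size can be stored at its start
+// (real_size == 0 marks the std::malloc fallback used when mmap fails);
+// huge_pages_free() reads it back to choose between munmap() and std::free().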
+void* Infiniband::MemoryManager::huge_pages_malloc(size_t size)
+{
+ size_t real_size = ALIGN_TO_PAGE_2MB(size) + HUGE_PAGE_SIZE_2MB;
+ char *ptr = (char *)mmap(NULL, real_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB, -1, 0);
+ if (ptr == MAP_FAILED) {
+ ptr = (char *)std::malloc(real_size);
+ if (ptr == NULL) return NULL;
+ real_size = 0;
+ }
+ *((size_t *)ptr) = real_size;
+ return ptr + HUGE_PAGE_SIZE_2MB;
+}
+
+void Infiniband::MemoryManager::huge_pages_free(void *ptr)
+{
+ if (ptr == NULL) return;
+ void *real_ptr = (char *)ptr - HUGE_PAGE_SIZE_2MB;
+ size_t real_size = *((size_t *)real_ptr);
+ ceph_assert(real_size % HUGE_PAGE_SIZE_2MB == 0);
+ if (real_size != 0)
+ munmap(real_ptr, real_size);
+ else
+ std::free(real_ptr);
+}
+
+
+void* Infiniband::MemoryManager::malloc(size_t size)
+{
+ if (cct->_conf->ms_async_rdma_enable_hugepage)
+ return huge_pages_malloc(size);
+ else
+ return std::malloc(size);
+}
+
+void Infiniband::MemoryManager::free(void *ptr)
+{
+ if (cct->_conf->ms_async_rdma_enable_hugepage)
+ huge_pages_free(ptr);
+ else
+ std::free(ptr);
+}
+
+void Infiniband::MemoryManager::create_tx_pool(uint32_t size, uint32_t tx_num)
+{
+ ceph_assert(device);
+ ceph_assert(pd);
+
+ send = new Cluster(*this, size);
+ send->fill(tx_num);
+}
+
+void Infiniband::MemoryManager::return_tx(std::vector<Chunk*> &chunks)
+{
+ send->take_back(chunks);
+}
+
+int Infiniband::MemoryManager::get_send_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+ return send->get_buffers(c, bytes);
+}
+
+static std::atomic<bool> init_prereq = {false};
+
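+// One-time, process-wide RDMA prerequisites: export RDMAV_HUGEPAGES_SAFE when
+// huge pages are enabled, call ibv_fork_init(), and warn if the memlock
+// (RLIMIT_MEMLOCK) limit is not unlimited.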
+void Infiniband::verify_prereq(CephContext *cct) {
+ int rc = 0;
+ ldout(cct, 20) << __func__ << " ms_async_rdma_enable_hugepage value is: " << cct->_conf->ms_async_rdma_enable_hugepage << dendl;
+ if (cct->_conf->ms_async_rdma_enable_hugepage){
+ rc = setenv("RDMAV_HUGEPAGES_SAFE","1",1);
+ ldout(cct, 0) << __func__ << " RDMAV_HUGEPAGES_SAFE is set as: " << getenv("RDMAV_HUGEPAGES_SAFE") << dendl;
+ if (rc) {
+      lderr(cct) << __func__ << " failed to export RDMAV_HUGEPAGES_SAFE. It must be exported before huge pages can be used with RDMA. Application aborts." << dendl;
+ ceph_abort();
+ }
+ }
+
+  // ibv_fork_init() MUST be called before fork() when using RDMA
+ rc = ibv_fork_init();
+ if (rc) {
+    lderr(cct) << __func__ << " failed to call ibv_fork_init(). It must be called before fork() when using RDMA. Application aborts." << dendl;
+ ceph_abort();
+ }
+
+ //Check ulimit
+ struct rlimit limit;
+ getrlimit(RLIMIT_MEMLOCK, &limit);
+ if (limit.rlim_cur != RLIM_INFINITY || limit.rlim_max != RLIM_INFINITY) {
+    lderr(cct) << __func__ << " !!! WARNING !!! For RDMA to work properly the memlock limit (ulimit -l) must be large enough to allow a large amount of registered memory."
+				 " We recommend setting this parameter to infinity" << dendl;
+ }
+ init_prereq = true;
+}
+
+Infiniband::Infiniband(CephContext *cct)
+ : cct(cct),
+ device_name(cct->_conf->ms_async_rdma_device_name),
+ port_num( cct->_conf->ms_async_rdma_port_num)
+{
+ if (!init_prereq)
+ verify_prereq(cct);
+ ldout(cct, 20) << __func__ << " constructing Infiniband..." << dendl;
+}
+
+void Infiniband::init()
+{
+ std::lock_guard l{lock};
+
+ if (initialized)
+ return;
+
+ device_list = new DeviceList(cct);
+ initialized = true;
+
+ device = device_list->get_device(device_name.c_str());
+ ceph_assert(device);
+ device->binding_port(cct, port_num);
+ ib_physical_port = device->active_port->get_port_num();
+ pd = new ProtectionDomain(cct, device);
+ ceph_assert(ceph::NetHandler(cct).set_nonblock(device->ctxt->async_fd) == 0);
+
+ support_srq = cct->_conf->ms_async_rdma_support_srq;
+ if (support_srq) {
+ ceph_assert(device->device_attr.max_srq);
+ rx_queue_len = device->device_attr.max_srq_wr;
+ }
+ else
+ rx_queue_len = device->device_attr.max_qp_wr;
+ if (rx_queue_len > cct->_conf->ms_async_rdma_receive_queue_len) {
+ rx_queue_len = cct->_conf->ms_async_rdma_receive_queue_len;
+ ldout(cct, 1) << __func__ << " assigning: " << rx_queue_len << " receive buffers" << dendl;
+ } else {
+ ldout(cct, 0) << __func__ << " using the max allowed receive buffers: " << rx_queue_len << dendl;
+ }
+
+ // check for the misconfiguration
+ if (cct->_conf->ms_async_rdma_receive_buffers > 0 &&
+ rx_queue_len > (unsigned)cct->_conf->ms_async_rdma_receive_buffers) {
+ lderr(cct) << __func__ << " rdma_receive_queue_len (" <<
+ rx_queue_len << ") > ms_async_rdma_receive_buffers(" <<
+ cct->_conf->ms_async_rdma_receive_buffers << ")." << dendl;
+ ceph_abort();
+ }
+
+ // Keep extra one WR for a beacon to indicate all WCEs were consumed
+ tx_queue_len = device->device_attr.max_qp_wr - 1;
+ if (tx_queue_len > cct->_conf->ms_async_rdma_send_buffers) {
+ tx_queue_len = cct->_conf->ms_async_rdma_send_buffers;
+ ldout(cct, 1) << __func__ << " assigning: " << tx_queue_len << " send buffers" << dendl;
+ } else {
+ ldout(cct, 0) << __func__ << " using the max allowed send buffers: " << tx_queue_len << dendl;
+ }
+
+ //check for the memory region size misconfiguration
+ if ((uint64_t)cct->_conf->ms_async_rdma_buffer_size * tx_queue_len > device->device_attr.max_mr_size) {
+    lderr(cct) << __func__ << " requested TX buffer pool exceeds the device max memory region size " << dendl;
+ ceph_abort();
+ }
+
+  ldout(cct, 1) << __func__ << " device allows " << device->device_attr.max_cqe
+ << " completion entries" << dendl;
+
+ memory_manager = new MemoryManager(cct, device, pd);
+ memory_manager->create_tx_pool(cct->_conf->ms_async_rdma_buffer_size, tx_queue_len);
+
+ if (support_srq) {
+ srq = create_shared_receive_queue(rx_queue_len, MAX_SHARED_RX_SGE_COUNT);
+ post_chunks_to_rq(rx_queue_len, NULL); //add to srq
+ }
+}
+
+Infiniband::~Infiniband()
+{
+ if (!initialized)
+ return;
+ if (support_srq)
+ ibv_destroy_srq(srq);
+ delete memory_manager;
+ delete pd;
+ delete device_list;
+}
+
+/**
+ * Create a shared receive queue. This basically wraps the verbs call.
+ *
+ * \param[in] max_wr
+ * The max number of outstanding work requests in the SRQ.
+ * \param[in] max_sge
+ * The max number of scatter elements per WR.
+ * \return
+ * A valid ibv_srq pointer, or NULL on error.
+ */
+ibv_srq* Infiniband::create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge)
+{
+ ibv_srq_init_attr sia;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&sia, 0, sizeof(sia));
+ sia.srq_context = device->ctxt;
+ sia.attr.max_wr = max_wr;
+ sia.attr.max_sge = max_sge;
+ return ibv_create_srq(pd->pd, &sia);
+}
+
+int Infiniband::get_tx_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+ return memory_manager->get_send_buffers(c, bytes);
+}
+
+/**
+ * Create a new QueuePair. This factory should be used in preference to
+ * the QueuePair constructor directly, since this lets derivatives of
+ * Infiniband, e.g. MockInfiniband (if it existed),
+ * return mocked out QueuePair derivatives.
+ *
+ * \return
+ * QueuePair on success or NULL if init fails
+ * See QueuePair::QueuePair for parameter documentation.
+ */
+Infiniband::QueuePair* Infiniband::create_queue_pair(CephContext *cct, CompletionQueue *tx,
+ CompletionQueue* rx, ibv_qp_type type, struct rdma_cm_id *cm_id)
+{
+ Infiniband::QueuePair *qp = new QueuePair(
+ cct, *this, type, ib_physical_port, srq, tx, rx, tx_queue_len, rx_queue_len, cm_id);
+ if (qp->init()) {
+ delete qp;
+ return NULL;
+ }
+ return qp;
+}
+
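+// Grab up to rq_wr_num rx chunks from the memory manager, chain them into a
+// linked list of receive work requests and post them either to the shared
+// receive queue or to the given QP's receive queue; returns the number of
+// chunks actually posted.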
+int Infiniband::post_chunks_to_rq(int rq_wr_num, QueuePair *qp)
+{
+ int ret = 0;
+ Chunk *chunk = nullptr;
+
+ ibv_recv_wr *rx_work_request = static_cast<ibv_recv_wr*>(::calloc(rq_wr_num, sizeof(ibv_recv_wr)));
+ ibv_sge *isge = static_cast<ibv_sge*>(::calloc(rq_wr_num, sizeof(ibv_sge)));
+ ceph_assert(rx_work_request);
+ ceph_assert(isge);
+
+ int i = 0;
+ while (i < rq_wr_num) {
+ chunk = get_memory_manager()->get_rx_buffer();
+ if (chunk == nullptr) {
+      lderr(cct) << __func__ << " WARNING: out of memory. Requested " << rq_wr_num <<
+	" rx buffers, but only got " << i << "." << dendl;
+ if (i == 0) {
+ ::free(rx_work_request);
+ ::free(isge);
+ return 0;
+ }
+      break; // got some buffers, so post them to the receive queue
+ }
+
+ isge[i].addr = reinterpret_cast<uint64_t>(chunk->data);
+ isge[i].length = chunk->bytes;
+ isge[i].lkey = chunk->lkey;
+
+ rx_work_request[i].wr_id = reinterpret_cast<uint64_t>(chunk);// assign chunk address as work request id
+
+ if (i != 0) {
+ rx_work_request[i - 1].next = &rx_work_request[i];
+ }
+ rx_work_request[i].sg_list = &isge[i];
+ rx_work_request[i].num_sge = 1;
+
+ if (qp && !qp->get_srq()) {
+ chunk->set_qp(qp);
+ qp->add_rq_wr(chunk);
+ }
+ i++;
+ }
+
+ ibv_recv_wr *badworkrequest = nullptr;
+ if (support_srq) {
+ ret = ibv_post_srq_recv(srq, rx_work_request, &badworkrequest);
+ } else {
+ ceph_assert(qp);
+ ret = ibv_post_recv(qp->get_qp(), rx_work_request, &badworkrequest);
+ }
+
+ ::free(rx_work_request);
+ ::free(isge);
+ ceph_assert(badworkrequest == nullptr && ret == 0);
+ return i;
+}
+
+Infiniband::CompletionChannel* Infiniband::create_comp_channel(CephContext *c)
+{
+ Infiniband::CompletionChannel *cc = new Infiniband::CompletionChannel(c, *this);
+ if (cc->init()) {
+ delete cc;
+ return NULL;
+ }
+ return cc;
+}
+
+Infiniband::CompletionQueue* Infiniband::create_comp_queue(
+ CephContext *cct, CompletionChannel *cc)
+{
+ Infiniband::CompletionQueue *cq = new Infiniband::CompletionQueue(
+ cct, *this, CQ_DEPTH, cc);
+ if (cq->init()) {
+ delete cq;
+ return NULL;
+ }
+ return cq;
+}
+
+Infiniband::QueuePair::~QueuePair()
+{
+  ldout(cct, 20) << __func__ << " destroy Queue Pair, qp number: " << qp->qp_num << " left RQ WR " << recv_queue.size() << dendl;
+ if (qp) {
+ ldout(cct, 20) << __func__ << " destroy qp=" << qp << dendl;
+ ceph_assert(!ibv_destroy_qp(qp));
+ }
+
+ for (auto& chunk: recv_queue) {
+ infiniband.get_memory_manager()->release_rx_buffer(chunk);
+ }
+ recv_queue.clear();
+}
+
+/**
+ * Return a string representation of the `status' field from the Verbs
+ * struct `ibv_wc'.
+ *
+ * \param[in] status
+ * The integer status obtained in ibv_wc.status.
+ * \return
+ * A string corresponding to the given status.
+ */
+const char* Infiniband::wc_status_to_string(int status)
+{
+ static const char *lookup[] = {
+ "SUCCESS",
+ "LOC_LEN_ERR",
+ "LOC_QP_OP_ERR",
+ "LOC_EEC_OP_ERR",
+ "LOC_PROT_ERR",
+ "WR_FLUSH_ERR",
+ "MW_BIND_ERR",
+ "BAD_RESP_ERR",
+ "LOC_ACCESS_ERR",
+ "REM_INV_REQ_ERR",
+ "REM_ACCESS_ERR",
+ "REM_OP_ERR",
+ "RETRY_EXC_ERR",
+ "RNR_RETRY_EXC_ERR",
+ "LOC_RDD_VIOL_ERR",
+ "REM_INV_RD_REQ_ERR",
+ "REM_ABORT_ERR",
+ "INV_EECN_ERR",
+ "INV_EEC_STATE_ERR",
+ "FATAL_ERR",
+ "RESP_TIMEOUT_ERR",
+ "GENERAL_ERR"
+ };
+
+ if (status < IBV_WC_SUCCESS || status > IBV_WC_GENERAL_ERR)
+ return "<status out of range!>";
+ return lookup[status];
+}
+
+const char* Infiniband::qp_state_string(int status) {
+ switch(status) {
+ case IBV_QPS_RESET : return "IBV_QPS_RESET";
+ case IBV_QPS_INIT : return "IBV_QPS_INIT";
+ case IBV_QPS_RTR : return "IBV_QPS_RTR";
+ case IBV_QPS_RTS : return "IBV_QPS_RTS";
+ case IBV_QPS_SQD : return "IBV_QPS_SQD";
+ case IBV_QPS_SQE : return "IBV_QPS_SQE";
+ case IBV_QPS_ERR : return "IBV_QPS_ERR";
+ default: return " out of range.";
+ }
+}
diff --git a/src/msg/async/rdma/Infiniband.h b/src/msg/async/rdma/Infiniband.h
new file mode 100644
index 000000000..f18442e4e
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.h
@@ -0,0 +1,591 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INFINIBAND_H
+#define CEPH_INFINIBAND_H
+
+#include <boost/pool/pool.hpp>
+// need this because boost messes with ceph log/assert definitions
+#include "include/ceph_assert.h"
+
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "include/common_fwd.h"
+#include "include/int_types.h"
+#include "include/page.h"
+#include "include/scope_guard.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/ceph_mutex.h"
+#include "common/perf_counters.h"
+#include "msg/msg_types.h"
+#include "msg/async/net_handler.h"
+
+#define HUGE_PAGE_SIZE_2MB (2 * 1024 * 1024)
+#define ALIGN_TO_PAGE_2MB(x) \
+ (((x) + (HUGE_PAGE_SIZE_2MB - 1)) & ~(HUGE_PAGE_SIZE_2MB - 1))
+
+#define PSN_LEN 24
+#define PSN_MSK ((1 << PSN_LEN) - 1)
+
+#define BEACON_WRID 0xDEADBEEF
+
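+// Connection-management metadata exchanged over TCP by
+// QueuePair::send_cm_meta()/recv_cm_meta() before the queue pairs are moved
+// to RTR/RTS.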
+struct ib_cm_meta_t {
+ uint16_t lid;
+ uint32_t local_qpn;
+ uint32_t psn;
+ uint32_t peer_qpn;
+ union ibv_gid gid;
+} __attribute__((packed));
+
+class RDMAStack;
+
+class Port {
+ struct ibv_context* ctxt;
+ int port_num;
+ struct ibv_port_attr port_attr;
+ uint16_t lid;
+ int gid_idx;
+ union ibv_gid gid;
+
+ public:
+ explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
+ uint16_t get_lid() { return lid; }
+ ibv_gid get_gid() { return gid; }
+ int get_port_num() { return port_num; }
+ ibv_port_attr* get_port_attr() { return &port_attr; }
+ int get_gid_idx() { return gid_idx; }
+};
+
+
+class Device {
+ ibv_device *device;
+ const char* name;
+ uint8_t port_cnt = 0;
+ public:
+ explicit Device(CephContext *c, ibv_device* ib_dev);
+ explicit Device(CephContext *c, ibv_context *ib_ctx);
+ ~Device() {
+ if (active_port) {
+ delete active_port;
+ ceph_assert(ibv_close_device(ctxt) == 0);
+ }
+ }
+ const char* get_name() { return name;}
+ uint16_t get_lid() { return active_port->get_lid(); }
+ ibv_gid get_gid() { return active_port->get_gid(); }
+ int get_gid_idx() { return active_port->get_gid_idx(); }
+ void binding_port(CephContext *c, int port_num);
+ struct ibv_context *ctxt;
+ ibv_device_attr device_attr;
+ Port* active_port;
+};
+
+
+class DeviceList {
+ struct ibv_device ** device_list;
+ struct ibv_context ** device_context_list;
+ int num;
+ Device** devices;
+ public:
+ explicit DeviceList(CephContext *cct): device_list(nullptr), device_context_list(nullptr),
+ num(0), devices(nullptr) {
+ device_list = ibv_get_device_list(&num);
+ ceph_assert(device_list);
+ ceph_assert(num);
+ if (cct->_conf->ms_async_rdma_cm) {
+ device_context_list = rdma_get_devices(NULL);
+ ceph_assert(device_context_list);
+ }
+ devices = new Device*[num];
+
+ for (int i = 0;i < num; ++i) {
+ if (cct->_conf->ms_async_rdma_cm) {
+ devices[i] = new Device(cct, device_context_list[i]);
+ } else {
+ devices[i] = new Device(cct, device_list[i]);
+ }
+ }
+ }
+ ~DeviceList() {
+ for (int i=0; i < num; ++i) {
+ delete devices[i];
+ }
+ delete []devices;
+ ibv_free_device_list(device_list);
+ rdma_free_devices(device_context_list);
+ }
+
+ Device* get_device(const char* device_name) {
+ for (int i = 0; i < num; ++i) {
+ if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) {
+ return devices[i];
+ }
+ }
+ return NULL;
+ }
+};
+
+// stat counters
+enum {
+ l_msgr_rdma_dispatcher_first = 94000,
+
+ l_msgr_rdma_polling,
+ l_msgr_rdma_inflight_tx_chunks,
+ l_msgr_rdma_rx_bufs_in_use,
+ l_msgr_rdma_rx_bufs_total,
+
+ l_msgr_rdma_tx_total_wc,
+ l_msgr_rdma_tx_total_wc_errors,
+ l_msgr_rdma_tx_wc_retry_errors,
+ l_msgr_rdma_tx_wc_wr_flush_errors,
+
+ l_msgr_rdma_rx_total_wc,
+ l_msgr_rdma_rx_total_wc_errors,
+ l_msgr_rdma_rx_fin,
+
+ l_msgr_rdma_handshake_errors,
+
+ l_msgr_rdma_total_async_events,
+ l_msgr_rdma_async_last_wqe_events,
+
+ l_msgr_rdma_created_queue_pair,
+ l_msgr_rdma_active_queue_pair,
+
+ l_msgr_rdma_dispatcher_last,
+};
+
+enum {
+ l_msgr_rdma_first = 95000,
+
+ l_msgr_rdma_tx_no_mem,
+ l_msgr_rdma_tx_parital_mem,
+ l_msgr_rdma_tx_failed,
+
+ l_msgr_rdma_tx_chunks,
+ l_msgr_rdma_tx_bytes,
+ l_msgr_rdma_rx_chunks,
+ l_msgr_rdma_rx_bytes,
+ l_msgr_rdma_pending_sent_conns,
+
+ l_msgr_rdma_last,
+};
+
+class RDMADispatcher;
+
+class Infiniband {
+ public:
+ class ProtectionDomain {
+ public:
+ explicit ProtectionDomain(CephContext *cct, Device *device);
+ ~ProtectionDomain();
+
+ ibv_pd* const pd;
+ };
+
+ class QueuePair;
+ class MemoryManager {
+ public:
+ class Chunk {
+ public:
+ Chunk(ibv_mr* m, uint32_t bytes, char* buffer, uint32_t offset = 0, uint32_t bound = 0, uint32_t lkey = 0, QueuePair* qp = nullptr);
+ ~Chunk();
+
+ uint32_t get_offset();
+ uint32_t get_size() const;
+ void prepare_read(uint32_t b);
+ uint32_t get_bound();
+ uint32_t read(char* buf, uint32_t len);
+ uint32_t write(char* buf, uint32_t len);
+ bool full();
+ void reset_read_chunk();
+ void reset_write_chunk();
+ void set_qp(QueuePair *qp) { this->qp = qp; }
+ void clear_qp() { set_qp(nullptr); }
+ QueuePair* get_qp() { return qp; }
+
+ public:
+ ibv_mr* mr;
+ QueuePair *qp;
+ uint32_t lkey;
+ uint32_t bytes;
+ uint32_t offset;
+ uint32_t bound;
+ char* buffer; // TODO: remove buffer/refactor TX
+ char data[0];
+ };
+
+ class Cluster {
+ public:
+ Cluster(MemoryManager& m, uint32_t s);
+ ~Cluster();
+
+ int fill(uint32_t num);
+ void take_back(std::vector<Chunk*> &ck);
+ int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
+ Chunk *get_chunk_by_buffer(const char *c) {
+ uint32_t idx = (c - base) / buffer_size;
+ Chunk *chunk = chunk_base + idx;
+ return chunk;
+ }
+ bool is_my_buffer(const char *c) const {
+ return c >= base && c < end;
+ }
+
+ bool is_valid_chunk(const Chunk* c) const {
+ return c >= chunk_base && c < chunk_base + num_chunk;
+ }
+ MemoryManager& manager;
+ uint32_t buffer_size;
+ uint32_t num_chunk = 0;
+ ceph::mutex lock = ceph::make_mutex("cluster_lock");
+ std::vector<Chunk*> free_chunks;
+ char *base = nullptr;
+ char *end = nullptr;
+ Chunk* chunk_base = nullptr;
+ };
+
+ class MemPoolContext {
+ PerfCounters *perf_logger;
+
+ public:
+ MemoryManager *manager;
+ unsigned n_bufs_allocated;
+ // true if it is possible to alloc
+ // more memory for the pool
+ explicit MemPoolContext(MemoryManager *m) :
+ perf_logger(nullptr),
+ manager(m),
+ n_bufs_allocated(0) {}
+ bool can_alloc(unsigned nbufs);
+ void update_stats(int val);
+ void set_stat_logger(PerfCounters *logger);
+ };
+
+ class PoolAllocator {
+ struct mem_info {
+ ibv_mr *mr;
+ MemPoolContext *ctx;
+ unsigned nbufs;
+ Chunk chunks[0];
+ };
+ public:
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ static char * malloc(const size_type bytes);
+ static void free(char * const block);
+
+ template<typename Func>
+ static std::invoke_result_t<Func> with_context(MemPoolContext* ctx,
+ Func&& func) {
+ std::lock_guard l{get_lock()};
+ g_ctx = ctx;
+ scope_guard reset_ctx{[] { g_ctx = nullptr; }};
+ return std::move(func)();
+ }
+ private:
+ static ceph::mutex& get_lock();
+ static MemPoolContext* g_ctx;
+ };
+
+ /**
+ * modify boost pool so that it is possible to
+ * have a thread safe 'context' when allocating/freeing
+   * the memory. It is needed to allow different pool
+   * configurations and bookkeeping per CephContext and
+   * also to be able to use the same allocator to deal with
+   * both the RX and TX pools.
+ * TODO: use boost pool to allocate TX chunks too
+ */
+ class mem_pool : public boost::pool<PoolAllocator> {
+ private:
+ MemPoolContext *ctx;
+ void *slow_malloc();
+
+ public:
+ ceph::mutex lock = ceph::make_mutex("mem_pool_lock");
+ explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size,
+ const size_type nnext_size = 32,
+ const size_type nmax_size = 0) :
+ pool(nrequested_size, nnext_size, nmax_size),
+ ctx(ctx) { }
+
+ void *malloc() {
+ if (!store().empty())
+ return (store().malloc)();
+ // need to alloc more memory...
+ // slow path code
+ return slow_malloc();
+ }
+ };
+
+ MemoryManager(CephContext *c, Device *d, ProtectionDomain *p);
+ ~MemoryManager();
+
+ void* malloc(size_t size);
+ void free(void *ptr);
+
+ void create_tx_pool(uint32_t size, uint32_t tx_num);
+ void return_tx(std::vector<Chunk*> &chunks);
+ int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
+ bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
+ bool is_valid_chunk(const Chunk* c) { return send->is_valid_chunk(c); }
+ Chunk *get_tx_chunk_by_buffer(const char *c) {
+ return send->get_chunk_by_buffer(c);
+ }
+ uint32_t get_tx_buffer_size() const {
+ return send->buffer_size;
+ }
+
+ Chunk *get_rx_buffer() {
+ std::lock_guard l{rxbuf_pool.lock};
+ return reinterpret_cast<Chunk *>(rxbuf_pool.malloc());
+ }
+
+ void release_rx_buffer(Chunk *chunk) {
+ std::lock_guard l{rxbuf_pool.lock};
+ chunk->clear_qp();
+ rxbuf_pool.free(chunk);
+ }
+
+ void set_rx_stat_logger(PerfCounters *logger) {
+ rxbuf_pool_ctx.set_stat_logger(logger);
+ }
+
+ CephContext *cct;
+ private:
+ // TODO: Cluster -> TxPool txbuf_pool
+ // chunk layout fix
+ //
+ Cluster* send = nullptr;// SEND
+ Device *device;
+ ProtectionDomain *pd;
+ MemPoolContext rxbuf_pool_ctx;
+ mem_pool rxbuf_pool;
+
+
+ void* huge_pages_malloc(size_t size);
+ void huge_pages_free(void *ptr);
+ };
+
+ private:
+ uint32_t tx_queue_len = 0;
+ uint32_t rx_queue_len = 0;
+ uint32_t max_sge = 0;
+ uint8_t ib_physical_port = 0;
+ MemoryManager* memory_manager = nullptr;
+ ibv_srq* srq = nullptr; // shared receive work queue
+ Device *device = NULL;
+ ProtectionDomain *pd = NULL;
+ DeviceList *device_list = nullptr;
+ CephContext *cct;
+ ceph::mutex lock = ceph::make_mutex("IB lock");
+ bool initialized = false;
+ const std::string &device_name;
+ uint8_t port_num;
+ bool support_srq = false;
+
+ public:
+ explicit Infiniband(CephContext *c);
+ ~Infiniband();
+ void init();
+ static void verify_prereq(CephContext *cct);
+
+ class CompletionChannel {
+ static const uint32_t MAX_ACK_EVENT = 5000;
+ CephContext *cct;
+ Infiniband& infiniband;
+ ibv_comp_channel *channel;
+ ibv_cq *cq;
+ uint32_t cq_events_that_need_ack;
+
+ public:
+ CompletionChannel(CephContext *c, Infiniband &ib);
+ ~CompletionChannel();
+ int init();
+ bool get_cq_event();
+ int get_fd() { return channel->fd; }
+ ibv_comp_channel* get_channel() { return channel; }
+ void bind_cq(ibv_cq *c) { cq = c; }
+ void ack_events();
+ };
+
+ // this class encapsulates the creation, use, and destruction of an RC
+ // completion queue.
+ //
+  // You need to call init(); it will create a cq and associate it with the completion channel
+ class CompletionQueue {
+ public:
+ CompletionQueue(CephContext *c, Infiniband &ib,
+ const uint32_t qd, CompletionChannel *cc)
+ : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
+ ~CompletionQueue();
+ int init();
+ int poll_cq(int num_entries, ibv_wc *ret_wc_array);
+
+ ibv_cq* get_cq() const { return cq; }
+ int rearm_notify(bool solicited_only=true);
+ CompletionChannel* get_cc() const { return channel; }
+ private:
+ CephContext *cct;
+ Infiniband& infiniband; // Infiniband to which this QP belongs
+ CompletionChannel *channel;
+ ibv_cq *cq;
+ uint32_t queue_depth;
+ };
+
+ // this class encapsulates the creation, use, and destruction of an RC
+ // queue pair.
+ //
+  // you need to call init(); it will create a qp and bring it to the INIT state.
+ // after obtaining the lid, qpn, and psn of a remote queue pair, one
+ // must call plumb() to bring the queue pair to the RTS state.
+ class QueuePair {
+ public:
+ typedef MemoryManager::Chunk Chunk;
+ QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+ int ib_physical_port, ibv_srq *srq,
+ Infiniband::CompletionQueue* txcq,
+ Infiniband::CompletionQueue* rxcq,
+ uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0);
+ ~QueuePair();
+
+ int modify_qp_to_error();
+ int modify_qp_to_rts();
+ int modify_qp_to_rtr();
+ int modify_qp_to_init();
+ int init();
+
+ /**
+ * Get the initial packet sequence number for this QueuePair.
+ * This is randomly generated on creation. It should not be confused
+ * with the remote side's PSN, which is set in #plumb().
+ */
+ uint32_t get_initial_psn() const { return initial_psn; };
+ /**
+ * Get the local queue pair number for this QueuePair.
+ * QPNs are analogous to UDP/TCP port numbers.
+ */
+ uint32_t get_local_qp_number() const { return qp->qp_num; };
+ /**
+ * Get the remote queue pair number for this QueuePair, as set in #plumb().
+ * QPNs are analogous to UDP/TCP port numbers.
+ */
+ int get_remote_qp_number(uint32_t *rqp) const;
+ /**
+ * Get the remote infiniband address for this QueuePair, as set in #plumb().
+ * LIDs are "local IDs" in infiniband terminology. They are short, locally
+ * routable addresses.
+ */
+ int get_remote_lid(uint16_t *lid) const;
+ /**
+ * Get the state of a QueuePair.
+ */
+ int get_state() const;
+ /*
+ * send/receive connection management meta data
+ */
+ int send_cm_meta(CephContext *cct, int socket_fd);
+ int recv_cm_meta(CephContext *cct, int socket_fd);
+ void wire_gid_to_gid(const char *wgid, ib_cm_meta_t* cm_meta_data);
+ void gid_to_wire_gid(const ib_cm_meta_t& cm_meta_data, char wgid[]);
+ ibv_qp* get_qp() const { return qp; }
+ Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
+ Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
+ int to_dead();
+ bool is_dead() const { return dead; }
+ ib_cm_meta_t& get_peer_cm_meta() { return peer_cm_meta; }
+ ib_cm_meta_t& get_local_cm_meta() { return local_cm_meta; }
+ void add_rq_wr(Chunk* chunk)
+ {
+ if (srq) return;
+
+ std::lock_guard l{lock};
+ recv_queue.push_back(chunk);
+ }
+
+ void remove_rq_wr(Chunk* chunk) {
+ if (srq) return;
+
+ std::lock_guard l{lock};
+ auto it = std::find(recv_queue.begin(), recv_queue.end(), chunk);
+ ceph_assert(it != recv_queue.end());
+ recv_queue.erase(it);
+ }
+ ibv_srq* get_srq() const { return srq; }
+
+ private:
+ CephContext *cct;
+ Infiniband& infiniband; // Infiniband to which this QP belongs
+ ibv_qp_type type; // QP type (IBV_QPT_RC, etc.)
+ ibv_context* ctxt; // device context of the HCA to use
+ int ib_physical_port;
+ ibv_pd* pd; // protection domain
+ ibv_srq* srq; // shared receive queue
+ ibv_qp* qp; // infiniband verbs QP handle
+ struct rdma_cm_id *cm_id;
+ ib_cm_meta_t peer_cm_meta;
+ ib_cm_meta_t local_cm_meta;
+ Infiniband::CompletionQueue* txcq;
+ Infiniband::CompletionQueue* rxcq;
+ uint32_t initial_psn; // initial packet sequence number
+ uint32_t max_send_wr;
+ uint32_t max_recv_wr;
+ uint32_t q_key;
+ bool dead;
+ std::vector<Chunk*> recv_queue;
+ ceph::mutex lock = ceph::make_mutex("queue_pair_lock");
+ };
+
+ public:
+ typedef MemoryManager::Cluster Cluster;
+ typedef MemoryManager::Chunk Chunk;
+ QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*,
+ ibv_qp_type type, struct rdma_cm_id *cm_id);
+ ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
+  // post rx buffers to the srq (or to the given qp's rq), return the number of buffers actually posted
+ int post_chunks_to_rq(int num, QueuePair *qp = nullptr);
+ void post_chunk_to_pool(Chunk* chunk) {
+ QueuePair *qp = chunk->get_qp();
+ if (qp != nullptr) {
+ qp->remove_rq_wr(chunk);
+ }
+ get_memory_manager()->release_rx_buffer(chunk);
+ }
+ int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
+ CompletionChannel *create_comp_channel(CephContext *c);
+ CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);
+ uint8_t get_ib_physical_port() { return ib_physical_port; }
+ uint16_t get_lid() { return device->get_lid(); }
+ ibv_gid get_gid() { return device->get_gid(); }
+ MemoryManager* get_memory_manager() { return memory_manager; }
+ Device* get_device() { return device; }
+ int get_async_fd() { return device->ctxt->async_fd; }
+ bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);}
+ Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
+ static const char* wc_status_to_string(int status);
+ static const char* qp_state_string(int status);
+ uint32_t get_rx_queue_len() const { return rx_queue_len; }
+};
+
+#endif
diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
new file mode 100644
index 000000000..6c79dc54f
--- /dev/null
+++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
@@ -0,0 +1,607 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "RDMAStack.h"
+
+class C_handle_connection_established : public EventCallback {
+ RDMAConnectedSocketImpl *csi;
+ bool active = true;
+ public:
+ C_handle_connection_established(RDMAConnectedSocketImpl *w) : csi(w) {}
+ void do_request(uint64_t fd) final {
+ if (active)
+ csi->handle_connection_established();
+ }
+ void close() {
+ active = false;
+ }
+};
+
+class C_handle_connection_read : public EventCallback {
+ RDMAConnectedSocketImpl *csi;
+ bool active = true;
+ public:
+ explicit C_handle_connection_read(RDMAConnectedSocketImpl *w): csi(w) {}
+ void do_request(uint64_t fd) final {
+ if (active)
+ csi->handle_connection();
+ }
+ void close() {
+ active = false;
+ }
+};
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAConnectedSocketImpl "
+
+RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, std::shared_ptr<Infiniband> &ib,
+ std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+ RDMAWorker *w)
+ : cct(cct), connected(0), error(0), ib(ib),
+ dispatcher(rdma_dispatcher), worker(w),
+ is_server(false), read_handler(new C_handle_connection_read(this)),
+ established_handler(new C_handle_connection_established(this)),
+ active(false), pending(false)
+{
+ if (!cct->_conf->ms_async_rdma_cm) {
+ qp = ib->create_queue_pair(cct, dispatcher->get_tx_cq(), dispatcher->get_rx_cq(), IBV_QPT_RC, NULL);
+ if (!qp) {
+ lderr(cct) << __func__ << " queue pair create failed" << dendl;
+ return;
+ }
+ local_qpn = qp->get_local_qp_number();
+ notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+ dispatcher->register_qp(qp, this);
+ dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+ dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+ }
+}
+
+RDMAConnectedSocketImpl::~RDMAConnectedSocketImpl()
+{
+ ldout(cct, 20) << __func__ << " destruct." << dendl;
+ cleanup();
+ worker->remove_pending_conn(this);
+ dispatcher->schedule_qp_destroy(local_qpn);
+
+ for (unsigned i=0; i < wc.size(); ++i) {
+ dispatcher->post_chunk_to_pool(reinterpret_cast<Chunk*>(wc[i].wr_id));
+ }
+ for (unsigned i=0; i < buffers.size(); ++i) {
+ dispatcher->post_chunk_to_pool(buffers[i]);
+ }
+
+ std::lock_guard l{lock};
+ if (notify_fd >= 0)
+ ::close(notify_fd);
+ if (tcp_fd >= 0)
+ ::close(tcp_fd);
+ error = ECONNRESET;
+}
+
+void RDMAConnectedSocketImpl::pass_wc(std::vector<ibv_wc> &&v)
+{
+ std::lock_guard l{lock};
+ if (wc.empty())
+ wc = std::move(v);
+ else
+ wc.insert(wc.end(), v.begin(), v.end());
+ notify();
+}
+
+void RDMAConnectedSocketImpl::get_wc(std::vector<ibv_wc> &w)
+{
+ std::lock_guard l{lock};
+ if (wc.empty())
+ return ;
+ w.swap(wc);
+}
+
+int RDMAConnectedSocketImpl::activate()
+{
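+  // Bring the queue pair to RTR and then RTS using the connection
+  // management metadata exchanged with the peer over the TCP socket.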
+ qp->get_local_cm_meta().peer_qpn = qp->get_peer_cm_meta().local_qpn;
+ if (qp->modify_qp_to_rtr() != 0)
+ return -1;
+
+ if (qp->modify_qp_to_rts() != 0)
+ return -1;
+
+ if (!is_server) {
+    connected = 1; // mark the connection as established
+ ldout(cct, 20) << __func__ << " handle fake send, wake it up. QP: " << local_qpn << dendl;
+ submit(false);
+ }
+ active = true;
+ peer_qpn = qp->get_local_cm_meta().peer_qpn;
+
+ return 0;
+}
+
+int RDMAConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+ ldout(cct, 20) << __func__ << " nonblock:" << opts.nonblock << ", nodelay:"
+ << opts.nodelay << ", rbuf_size: " << opts.rcbuf_size << dendl;
+ ceph::NetHandler net(cct);
+
+  // we construct a TCP socket to transport the IB handshake messages,
+  // but we shouldn't block while connecting
+ if (opts.nonblock) {
+ tcp_fd = net.nonblock_connect(peer_addr, opts.connect_bind_addr);
+ } else {
+ tcp_fd = net.connect(peer_addr, opts.connect_bind_addr);
+ }
+
+ if (tcp_fd < 0) {
+ return -errno;
+ }
+
+ int r = net.set_socket_options(tcp_fd, opts.nodelay, opts.rcbuf_size);
+ if (r < 0) {
+ ::close(tcp_fd);
+ tcp_fd = -1;
+ return -errno;
+ }
+
+ ldout(cct, 20) << __func__ << " tcp_fd: " << tcp_fd << dendl;
+ net.set_priority(tcp_fd, opts.priority, peer_addr.get_family());
+ r = 0;
+ if (opts.nonblock) {
+ worker->center.create_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE , established_handler);
+ } else {
+ r = handle_connection_established(false);
+ }
+ return r;
+}
+
+int RDMAConnectedSocketImpl::handle_connection_established(bool need_set_fault) {
+ ldout(cct, 20) << __func__ << " start " << dendl;
+  // delete the read/write events registered for connection setup
+ worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+ if (1 == connected) {
+ ldout(cct, 1) << __func__ << " warnning: logic failed " << dendl;
+ if (need_set_fault) {
+ fault();
+ }
+ return -1;
+ }
+ // send handshake msg to server
+ qp->get_local_cm_meta().peer_qpn = 0;
+ int r = qp->send_cm_meta(cct, tcp_fd);
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " send handshake msg failed." << r << dendl;
+ if (need_set_fault) {
+ fault();
+ }
+ return r;
+ }
+ worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+ ldout(cct, 20) << __func__ << " finish " << dendl;
+ return 0;
+}
+
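+// Advance the out-of-band TCP handshake: read the peer's connection
+// management metadata, activate the queue pair once both sides are known,
+// and acknowledge so the peer can finish its side of the handshake.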
+void RDMAConnectedSocketImpl::handle_connection() {
+ ldout(cct, 20) << __func__ << " QP: " << local_qpn << " tcp_fd: " << tcp_fd << " notify_fd: " << notify_fd << dendl;
+ int r = qp->recv_cm_meta(cct, tcp_fd);
+ if (r <= 0) {
+ if (r != -EAGAIN) {
+ dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+ ldout(cct, 1) << __func__ << " recv handshake msg failed." << dendl;
+ fault();
+ }
+ return;
+ }
+
+ if (1 == connected) {
+ ldout(cct, 1) << __func__ << " warnning: logic failed: read len: " << r << dendl;
+ fault();
+ return;
+ }
+
+ if (!is_server) {// first time: cm meta sync + ack from server
+ if (!connected) {
+ r = activate();
+ ceph_assert(!r);
+ }
+ notify();
+ r = qp->send_cm_meta(cct, tcp_fd);
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " send client ack failed." << dendl;
+ dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+ fault();
+ }
+ } else {
+ if (qp->get_peer_cm_meta().peer_qpn == 0) {// first time: cm meta sync from client
+ if (active) {
+ ldout(cct, 10) << __func__ << " server is already active." << dendl;
+ return ;
+ }
+ r = activate();
+ ceph_assert(!r);
+ r = qp->send_cm_meta(cct, tcp_fd);
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " server ack failed." << dendl;
+ dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+ fault();
+ return ;
+ }
+ } else { // second time: cm meta ack from client
+ connected = 1;
+ ldout(cct, 10) << __func__ << " handshake of rdma is done. server connected: " << connected << dendl;
+ //cleanup();
+ submit(false);
+ notify();
+ }
+ }
+}
+
+ssize_t RDMAConnectedSocketImpl::read(char* buf, size_t len)
+{
+ eventfd_t event_val = 0;
+ int r = eventfd_read(notify_fd, &event_val);
+ ldout(cct, 20) << __func__ << " notify_fd : " << event_val << " in " << local_qpn
+ << " r = " << r << dendl;
+
+ if (!active) {
+ ldout(cct, 1) << __func__ << " when ib not active. len: " << len << dendl;
+ return -EAGAIN;
+ }
+
+ if (0 == connected) {
+ ldout(cct, 1) << __func__ << " when ib not connected. len: " << len <<dendl;
+ return -EAGAIN;
+ }
+ ssize_t read = 0;
+ read = read_buffers(buf,len);
+
+ if (is_server && connected == 0) {
+ ldout(cct, 20) << __func__ << " we do not need last handshake, QP: " << local_qpn << " peer QP: " << peer_qpn << dendl;
+ connected = 1; //if so, we don't need the last handshake
+ cleanup();
+ submit(false);
+ }
+
+ if (!buffers.empty()) {
+ notify();
+ }
+
+ if (read == 0 && error)
+ return -error;
+ return read == 0 ? -EAGAIN : read;
+}
+
+void RDMAConnectedSocketImpl::buffer_prefetch(void)
+{
+ std::vector<ibv_wc> cqe;
+ get_wc(cqe);
+ if(cqe.empty())
+ return;
+
+ for(size_t i = 0; i < cqe.size(); ++i) {
+ ibv_wc* response = &cqe[i];
+ ceph_assert(response->status == IBV_WC_SUCCESS);
+ Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+ chunk->prepare_read(response->byte_len);
+
+ if (chunk->get_size() == 0) {
+ chunk->reset_read_chunk();
+ dispatcher->perf_logger->inc(l_msgr_rdma_rx_fin);
+ if (connected) {
+ error = ECONNRESET;
+ ldout(cct, 20) << __func__ << " got remote close msg..." << dendl;
+ }
+ dispatcher->post_chunk_to_pool(chunk);
+ continue;
+ } else {
+ buffers.push_back(chunk);
+ ldout(cct, 25) << __func__ << " buffers add a chunk: " << chunk->get_offset() << ":" << chunk->get_bound() << dendl;
+ }
+ }
+ worker->perf_logger->inc(l_msgr_rdma_rx_chunks, cqe.size());
+}
+
+ssize_t RDMAConnectedSocketImpl::read_buffers(char* buf, size_t len)
+{
+ size_t read_size = 0, tmp = 0;
+ buffer_prefetch();
+ auto pchunk = buffers.begin();
+ while (pchunk != buffers.end()) {
+ tmp = (*pchunk)->read(buf + read_size, len - read_size);
+ read_size += tmp;
+ ldout(cct, 25) << __func__ << " read chunk " << *pchunk << " bytes length" << tmp << " offset: "
+ << (*pchunk)->get_offset() << " ,bound: " << (*pchunk)->get_bound() << dendl;
+
+ if ((*pchunk)->get_size() == 0) {
+ (*pchunk)->reset_read_chunk();
+ dispatcher->post_chunk_to_pool(*pchunk);
+ update_post_backlog();
+ ldout(cct, 25) << __func__ << " read over one chunk " << dendl;
+ pchunk++;
+ }
+
+ if (read_size == len) {
+ break;
+ }
+ }
+
+ buffers.erase(buffers.begin(), pchunk);
+ ldout(cct, 25) << __func__ << " got " << read_size << " bytes, buffers size: " << buffers.size() << dendl;
+ worker->perf_logger->inc(l_msgr_rdma_rx_bytes, read_size);
+ return read_size;
+}
+
+ssize_t RDMAConnectedSocketImpl::send(ceph::buffer::list &bl, bool more)
+{
+ if (error) {
+ if (!active)
+ return -EPIPE;
+ return -error;
+ }
+ size_t bytes = bl.length();
+ if (!bytes)
+ return 0;
+ {
+ std::lock_guard l{lock};
+ pending_bl.claim_append(bl);
+ if (!connected) {
+ ldout(cct, 20) << __func__ << " fake send to upper, QP: " << local_qpn << dendl;
+ return bytes;
+ }
+ }
+ ldout(cct, 20) << __func__ << " QP: " << local_qpn << dendl;
+ ssize_t r = submit(more);
+ if (r < 0 && r != -EAGAIN)
+ return r;
+ return bytes;
+}
+
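+// Copy up to req_copy_len bytes from the pending bufferlist segments in
+// [start, end) into registered tx chunks obtained from the worker; returns
+// the number of bytes actually copied.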
+size_t RDMAConnectedSocketImpl::tx_copy_chunk(std::vector<Chunk*> &tx_buffers,
+ size_t req_copy_len, decltype(std::cbegin(pending_bl.buffers()))& start,
+ const decltype(std::cbegin(pending_bl.buffers()))& end)
+{
+ ceph_assert(start != end);
+ auto chunk_idx = tx_buffers.size();
+ if (0 == worker->get_reged_mem(this, tx_buffers, req_copy_len)) {
+ ldout(cct, 1) << __func__ << " no enough buffers in worker " << worker << dendl;
+ worker->perf_logger->inc(l_msgr_rdma_tx_no_mem);
+ return 0;
+ }
+
+ Chunk *current_chunk = tx_buffers[chunk_idx];
+ size_t write_len = 0;
+ while (start != end) {
+ const uintptr_t addr = reinterpret_cast<uintptr_t>(start->c_str());
+
+ size_t slice_write_len = 0;
+ while (slice_write_len < start->length()) {
+ size_t real_len = current_chunk->write((char*)addr + slice_write_len, start->length() - slice_write_len);
+
+ slice_write_len += real_len;
+ write_len += real_len;
+ req_copy_len -= real_len;
+
+ if (current_chunk->full()) {
+ if (++chunk_idx == tx_buffers.size())
+ return write_len;
+ current_chunk = tx_buffers[chunk_idx];
+ }
+ }
+
+ ++start;
+ }
+ ceph_assert(req_copy_len == 0);
+ return write_len;
+}
+
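+// Flush pending_bl to the wire: buffers that already live in registered tx
+// memory are sent zero-copy, everything else is staged through
+// tx_copy_chunk(), and the resulting chunks are posted as send work requests.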
+ssize_t RDMAConnectedSocketImpl::submit(bool more)
+{
+ if (error)
+ return -error;
+ std::lock_guard l{lock};
+ size_t bytes = pending_bl.length();
+ ldout(cct, 20) << __func__ << " we need " << bytes << " bytes. iov size: "
+ << pending_bl.get_num_buffers() << dendl;
+ if (!bytes)
+ return 0;
+
+ std::vector<Chunk*> tx_buffers;
+ auto it = std::cbegin(pending_bl.buffers());
+ auto copy_start = it;
+ size_t total_copied = 0, wait_copy_len = 0;
+ while (it != pending_bl.buffers().end()) {
+ if (ib->is_tx_buffer(it->raw_c_str())) {
+ if (wait_copy_len) {
+ size_t copied = tx_copy_chunk(tx_buffers, wait_copy_len, copy_start, it);
+ total_copied += copied;
+ if (copied < wait_copy_len)
+ goto sending;
+ wait_copy_len = 0;
+ }
+ ceph_assert(copy_start == it);
+ tx_buffers.push_back(ib->get_tx_chunk_by_buffer(it->raw_c_str()));
+ total_copied += it->length();
+ ++copy_start;
+ } else {
+ wait_copy_len += it->length();
+ }
+ ++it;
+ }
+ if (wait_copy_len)
+ total_copied += tx_copy_chunk(tx_buffers, wait_copy_len, copy_start, it);
+
+ sending:
+ if (total_copied == 0)
+ return -EAGAIN;
+ ceph_assert(total_copied <= pending_bl.length());
+ ceph::buffer::list swapped;
+ if (total_copied < pending_bl.length()) {
+ worker->perf_logger->inc(l_msgr_rdma_tx_parital_mem);
+ pending_bl.splice(total_copied, pending_bl.length() - total_copied, &swapped);
+ pending_bl.swap(swapped);
+ } else {
+ pending_bl.clear();
+ }
+
+ ldout(cct, 20) << __func__ << " left bytes: " << pending_bl.length() << " in buffers "
+ << pending_bl.get_num_buffers() << " tx chunks " << tx_buffers.size() << dendl;
+
+ int r = post_work_request(tx_buffers);
+ if (r < 0)
+ return r;
+
+ ldout(cct, 20) << __func__ << " finished sending " << total_copied << " bytes." << dendl;
+ return pending_bl.length() ? -EAGAIN : 0;
+}
+
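+// Build one scatter-gather element and one signaled IBV_WR_SEND work request
+// per tx chunk, chain the requests together and post the list to the send
+// queue in a single ibv_post_send() call.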
+int RDMAConnectedSocketImpl::post_work_request(std::vector<Chunk*> &tx_buffers)
+{
+ ldout(cct, 20) << __func__ << " QP: " << local_qpn << " " << tx_buffers[0] << dendl;
+ auto current_buffer = tx_buffers.begin();
+ ibv_sge isge[tx_buffers.size()];
+ uint32_t current_sge = 0;
+ ibv_send_wr iswr[tx_buffers.size()];
+ uint32_t current_swr = 0;
+ ibv_send_wr* pre_wr = NULL;
+ uint32_t num = 0;
+
+ // FIPS zeroization audit 20191115: these memsets are not security related.
+ memset(iswr, 0, sizeof(iswr));
+ memset(isge, 0, sizeof(isge));
+
+ while (current_buffer != tx_buffers.end()) {
+ isge[current_sge].addr = reinterpret_cast<uint64_t>((*current_buffer)->buffer);
+ isge[current_sge].length = (*current_buffer)->get_offset();
+ isge[current_sge].lkey = (*current_buffer)->mr->lkey;
+ ldout(cct, 25) << __func__ << " sending buffer: " << *current_buffer << " length: " << isge[current_sge].length << dendl;
+
+ iswr[current_swr].wr_id = reinterpret_cast<uint64_t>(*current_buffer);
+ iswr[current_swr].next = NULL;
+ iswr[current_swr].sg_list = &isge[current_sge];
+ iswr[current_swr].num_sge = 1;
+ iswr[current_swr].opcode = IBV_WR_SEND;
+ iswr[current_swr].send_flags = IBV_SEND_SIGNALED;
+
+ num++;
+ worker->perf_logger->inc(l_msgr_rdma_tx_bytes, isge[current_sge].length);
+ if (pre_wr)
+ pre_wr->next = &iswr[current_swr];
+ pre_wr = &iswr[current_swr];
+ ++current_sge;
+ ++current_swr;
+ ++current_buffer;
+ }
+
+ ibv_send_wr *bad_tx_work_request = nullptr;
+ if (ibv_post_send(qp->get_qp(), iswr, &bad_tx_work_request)) {
+ ldout(cct, 1) << __func__ << " failed to send data"
+ << " (most probably should be peer not ready): "
+ << cpp_strerror(errno) << dendl;
+ worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+ return -errno;
+ }
+ worker->perf_logger->inc(l_msgr_rdma_tx_chunks, tx_buffers.size());
+ ldout(cct, 20) << __func__ << " qp state is " << get_qp_state() << dendl;
+ return 0;
+}
+
+void RDMAConnectedSocketImpl::fin() {
+ ibv_send_wr wr;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&wr, 0, sizeof(wr));
+
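+  // A zero-length signaled send acts as the FIN message: the peer sees a
+  // 0-byte completion and treats it as a remote close. wr_id carries the QP
+  // pointer so the tx completion handler can tell it apart from data chunks.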
+ wr.wr_id = reinterpret_cast<uint64_t>(qp);
+ wr.num_sge = 0;
+ wr.opcode = IBV_WR_SEND;
+ wr.send_flags = IBV_SEND_SIGNALED;
+ ibv_send_wr* bad_tx_work_request = nullptr;
+ if (ibv_post_send(qp->get_qp(), &wr, &bad_tx_work_request)) {
+ ldout(cct, 1) << __func__ << " failed to send message="
+ << " ibv_post_send failed(most probably should be peer not ready): "
+ << cpp_strerror(errno) << dendl;
+ worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+ return ;
+ }
+}
+
+void RDMAConnectedSocketImpl::cleanup() {
+ if (read_handler && tcp_fd >= 0) {
+ (static_cast<C_handle_connection_read*>(read_handler))->close();
+ worker->center.submit_to(worker->center.get_id(), [this]() {
+ worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+ }, false);
+ delete read_handler;
+ read_handler = nullptr;
+ }
+ if (established_handler) {
+ (static_cast<C_handle_connection_established*>(established_handler))->close();
+ delete established_handler;
+ established_handler = nullptr;
+ }
+}
+
+void RDMAConnectedSocketImpl::notify()
+{
+ eventfd_t event_val = 1;
+ int r = eventfd_write(notify_fd, event_val);
+ ceph_assert(r == 0);
+}
+
+void RDMAConnectedSocketImpl::shutdown()
+{
+ if (!error)
+ fin();
+ error = ECONNRESET;
+ active = false;
+}
+
+void RDMAConnectedSocketImpl::close()
+{
+ if (!error)
+ fin();
+ error = ECONNRESET;
+ active = false;
+}
+
+void RDMAConnectedSocketImpl::set_priority(int sd, int prio, int domain) {
+ ceph::NetHandler net(cct);
+ net.set_priority(sd, prio, domain);
+}
+
+void RDMAConnectedSocketImpl::fault()
+{
+ ldout(cct, 1) << __func__ << " tcp fd " << tcp_fd << dendl;
+ error = ECONNRESET;
+ connected = 1;
+ notify();
+}
+
+void RDMAConnectedSocketImpl::set_accept_fd(int sd)
+{
+ tcp_fd = sd;
+ is_server = true;
+ worker->center.submit_to(worker->center.get_id(), [this]() {
+ worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+ }, true);
+}
+
+void RDMAConnectedSocketImpl::post_chunks_to_rq(int num)
+{
+ post_backlog += num - ib->post_chunks_to_rq(num, qp);
+}
+
+void RDMAConnectedSocketImpl::update_post_backlog()
+{
+ if (post_backlog)
+ post_backlog -= post_backlog - dispatcher->post_chunks_to_rq(post_backlog, qp);
+}
diff --git a/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
new file mode 100644
index 000000000..606dbd281
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
@@ -0,0 +1,183 @@
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPConnectedSocketImpl "
+
+#define TIMEOUT_MS 3000
+#define RETRY_COUNT 7
+
+RDMAIWARPConnectedSocketImpl::RDMAIWARPConnectedSocketImpl(CephContext *cct, std::shared_ptr<Infiniband>& ib,
+ std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+ RDMAWorker *w, RDMACMInfo *info)
+ : RDMAConnectedSocketImpl(cct, ib, rdma_dispatcher, w), cm_con_handler(new C_handle_cm_connection(this))
+{
+ status = IDLE;
+ notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+ if (info) {
+ is_server = true;
+ cm_id = info->cm_id;
+ cm_channel = info->cm_channel;
+ status = RDMA_ID_CREATED;
+ peer_qpn = info->qp_num;
+ if (alloc_resource()) {
+ close_notify();
+ return;
+ }
+ worker->center.submit_to(worker->center.get_id(), [this]() {
+ worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+ status = CHANNEL_FD_CREATED;
+ }, false);
+ status = RESOURCE_ALLOCATED;
+ qp->get_local_cm_meta().peer_qpn = peer_qpn;
+ qp->get_peer_cm_meta().local_qpn = peer_qpn;
+ } else {
+ is_server = false;
+ cm_channel = rdma_create_event_channel();
+ rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP);
+ status = RDMA_ID_CREATED;
+ ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+ }
+}
+
+RDMAIWARPConnectedSocketImpl::~RDMAIWARPConnectedSocketImpl() {
+ ldout(cct, 20) << __func__ << " destruct." << dendl;
+ std::unique_lock l(close_mtx);
+ close_condition.wait(l, [&] { return closed; });
+ if (status >= RDMA_ID_CREATED) {
+ rdma_destroy_id(cm_id);
+ rdma_destroy_event_channel(cm_channel);
+ }
+}
+
+int RDMAIWARPConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+ worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+ status = CHANNEL_FD_CREATED;
+ if (rdma_resolve_addr(cm_id, NULL, const_cast<struct sockaddr*>(peer_addr.get_sockaddr()), TIMEOUT_MS)) {
+ lderr(cct) << __func__ << " failed to resolve addr" << dendl;
+ return -1;
+ }
+ return 0;
+}
+
+void RDMAIWARPConnectedSocketImpl::close() {
+ error = ECONNRESET;
+ active = false;
+ if (status >= CONNECTED) {
+ rdma_disconnect(cm_id);
+ }
+ close_notify();
+}
+
+void RDMAIWARPConnectedSocketImpl::shutdown() {
+ error = ECONNRESET;
+ active = false;
+}
+
+void RDMAIWARPConnectedSocketImpl::handle_cm_connection() {
+ struct rdma_cm_event *event;
+ rdma_get_cm_event(cm_channel, &event);
+ ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(event->event)
+ << " (cm id: " << cm_id << ")" << dendl;
+ struct rdma_conn_param cm_params;
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ status = ADDR_RESOLVED;
+ if (rdma_resolve_route(cm_id, TIMEOUT_MS)) {
+ lderr(cct) << __func__ << " failed to resolve rdma addr" << dendl;
+ notify();
+ }
+ break;
+
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ status = ROUTE_RESOLVED;
+ if (alloc_resource()) {
+ lderr(cct) << __func__ << " failed to alloc resource while resolving the route" << dendl;
+ connected = -ECONNREFUSED;
+ notify();
+ break;
+ }
+
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&cm_params, 0, sizeof(cm_params));
+ cm_params.retry_count = RETRY_COUNT;
+ cm_params.qp_num = local_qpn;
+ if (rdma_connect(cm_id, &cm_params)) {
+ lderr(cct) << __func__ << " failed to connect remote rdma port" << dendl;
+ connected = -ECONNREFUSED;
+ notify();
+ }
+ break;
+
+ case RDMA_CM_EVENT_ESTABLISHED:
+ ldout(cct, 20) << __func__ << " qp_num=" << cm_id->qp->qp_num << dendl;
+ status = CONNECTED;
+ if (!is_server) {
+ peer_qpn = event->param.conn.qp_num;
+ activate();
+ qp->get_local_cm_meta().peer_qpn = peer_qpn;
+ qp->get_peer_cm_meta().local_qpn = peer_qpn;
+ notify();
+ }
+ break;
+
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_REJECTED:
+ lderr(cct) << __func__ << " rdma connection rejected" << dendl;
+ connected = -ECONNREFUSED;
+ notify();
+ break;
+
+ case RDMA_CM_EVENT_DISCONNECTED:
+ status = DISCONNECTED;
+ close_notify();
+ if (!error) {
+ error = ECONNRESET;
+ notify();
+ }
+ break;
+
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ break;
+
+ default:
+ ceph_abort_msg("unhandled event");
+ break;
+ }
+ rdma_ack_cm_event(event);
+}
+
+void RDMAIWARPConnectedSocketImpl::activate() {
+ ldout(cct, 30) << __func__ << dendl;
+ active = true;
+ connected = 1;
+}
+
+int RDMAIWARPConnectedSocketImpl::alloc_resource() {
+ ldout(cct, 30) << __func__ << dendl;
+ qp = ib->create_queue_pair(cct, dispatcher->get_tx_cq(),
+ dispatcher->get_rx_cq(), IBV_QPT_RC, cm_id);
+ if (!qp) {
+ return -1;
+ }
+ local_qpn = qp->get_local_qp_number();
+ dispatcher->register_qp(qp, this);
+ dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+ dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+ return 0;
+}
+
+void RDMAIWARPConnectedSocketImpl::close_notify() {
+ ldout(cct, 30) << __func__ << dendl;
+ if (status >= CHANNEL_FD_CREATED) {
+ worker->center.delete_file_event(cm_channel->fd, EVENT_READABLE);
+ }
+ std::unique_lock l(close_mtx);
+ if (!closed) {
+ closed = true;
+ close_condition.notify_all();
+ }
+}
diff --git a/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
new file mode 100644
index 000000000..0500b4420
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
@@ -0,0 +1,119 @@
+#include <poll.h>
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPServerSocketImpl "
+
+RDMAIWARPServerSocketImpl::RDMAIWARPServerSocketImpl(
+ CephContext *cct, std::shared_ptr<Infiniband>& ib,
+ std::shared_ptr<RDMADispatcher>& rdma_dispatcher, RDMAWorker *w,
+ entity_addr_t& a, unsigned addr_slot)
+ : RDMAServerSocketImpl(cct, ib, rdma_dispatcher, w, a, addr_slot)
+{
+}
+
+int RDMAIWARPServerSocketImpl::listen(entity_addr_t &sa,
+ const SocketOptions &opt)
+{
+ ldout(cct, 20) << __func__ << " bind to rdma point" << dendl;
+ cm_channel = rdma_create_event_channel();
+ rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP);
+ ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+ int rc = rdma_bind_addr(cm_id, const_cast<struct sockaddr*>(sa.get_sockaddr()));
+ if (rc < 0) {
+ rc = -errno;
+ ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+ << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+ goto err;
+ }
+ rc = rdma_listen(cm_id, 128);
+ if (rc < 0) {
+ rc = -errno;
+ ldout(cct, 10) << __func__ << " unable to listen to " << sa.get_sockaddr()
+ << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+ goto err;
+ }
+ server_setup_socket = cm_channel->fd;
+ rc = net.set_nonblock(server_setup_socket);
+ if (rc < 0) {
+ goto err;
+ }
+ ldout(cct, 20) << __func__ << " fd of cm_channel is " << server_setup_socket << dendl;
+ return 0;
+
+err:
+ server_setup_socket = -1;
+ rdma_destroy_id(cm_id);
+ rdma_destroy_event_channel(cm_channel);
+ return rc;
+}
+
+int RDMAIWARPServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt,
+ entity_addr_t *out, Worker *w)
+{
+ ldout(cct, 15) << __func__ << dendl;
+
+ ceph_assert(sock);
+ struct pollfd pfd = {
+ .fd = cm_channel->fd,
+ .events = POLLIN,
+ .revents = 0,
+ };
+ int ret = poll(&pfd, 1, 0);
+ ceph_assert(ret >= 0);
+ if (!ret)
+ return -EAGAIN;
+
+ struct rdma_cm_event *cm_event;
+ rdma_get_cm_event(cm_channel, &cm_event);
+ ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(cm_event->event) << dendl;
+
+ struct rdma_cm_id *event_cm_id = cm_event->id;
+ struct rdma_event_channel *event_channel = rdma_create_event_channel();
+
+ if (net.set_nonblock(event_channel->fd) < 0) {
+ lderr(cct) << __func__ << " failed to switch event channel to non-block, close event channel " << dendl;
+ rdma_destroy_event_channel(event_channel);
+ rdma_ack_cm_event(cm_event);
+ return -errno;
+ }
+
+ rdma_migrate_id(event_cm_id, event_channel);
+
+ struct rdma_conn_param *remote_conn_param = &cm_event->param.conn;
+ struct rdma_conn_param local_conn_param;
+
+ RDMACMInfo info(event_cm_id, event_channel, remote_conn_param->qp_num);
+ RDMAIWARPConnectedSocketImpl* server =
+ new RDMAIWARPConnectedSocketImpl(cct, ib, dispatcher, dynamic_cast<RDMAWorker*>(w), &info);
+
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&local_conn_param, 0, sizeof(local_conn_param));
+ local_conn_param.qp_num = server->get_local_qpn();
+
+ if (rdma_accept(event_cm_id, &local_conn_param)) {
+ return -EAGAIN;
+ }
+ server->activate();
+ ldout(cct, 20) << __func__ << " accepted a new QP" << dendl;
+
+ rdma_ack_cm_event(cm_event);
+
+ std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+ *sock = ConnectedSocket(std::move(csi));
+ struct sockaddr *addr = &event_cm_id->route.addr.dst_addr;
+ out->set_sockaddr(addr);
+
+ return 0;
+}
+
+void RDMAIWARPServerSocketImpl::abort_accept()
+{
+ if (server_setup_socket >= 0) {
+ rdma_destroy_id(cm_id);
+ rdma_destroy_event_channel(cm_channel);
+ }
+}
diff --git a/src/msg/async/rdma/RDMAServerSocketImpl.cc b/src/msg/async/rdma/RDMAServerSocketImpl.cc
new file mode 100644
index 000000000..665faa931
--- /dev/null
+++ b/src/msg/async/rdma/RDMAServerSocketImpl.cc
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAServerSocketImpl "
+
+RDMAServerSocketImpl::RDMAServerSocketImpl(
+ CephContext *cct, std::shared_ptr<Infiniband>& ib,
+ std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+ RDMAWorker *w, entity_addr_t& a, unsigned slot)
+ : ServerSocketImpl(a.get_type(), slot),
+ cct(cct), net(cct), server_setup_socket(-1), ib(ib),
+ dispatcher(rdma_dispatcher), worker(w), sa(a)
+{
+}
+
+int RDMAServerSocketImpl::listen(entity_addr_t &sa, const SocketOptions &opt)
+{
+ int rc = 0;
+ server_setup_socket = net.create_socket(sa.get_family(), true);
+ if (server_setup_socket < 0) {
+ rc = -errno;
+ lderr(cct) << __func__ << " failed to create server socket: "
+ << cpp_strerror(errno) << dendl;
+ return rc;
+ }
+
+ rc = net.set_nonblock(server_setup_socket);
+ if (rc < 0) {
+ goto err;
+ }
+
+ rc = net.set_socket_options(server_setup_socket, opt.nodelay, opt.rcbuf_size);
+ if (rc < 0) {
+ goto err;
+ }
+
+ rc = ::bind(server_setup_socket, sa.get_sockaddr(), sa.get_sockaddr_len());
+ if (rc < 0) {
+ rc = -errno;
+ ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+ << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+ goto err;
+ }
+
+ rc = ::listen(server_setup_socket, cct->_conf->ms_tcp_listen_backlog);
+ if (rc < 0) {
+ rc = -errno;
+ lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(errno) << dendl;
+ goto err;
+ }
+
+ ldout(cct, 20) << __func__ << " bind to " << sa.get_sockaddr() << " on port " << sa.get_port() << dendl;
+ return 0;
+
+err:
+ ::close(server_setup_socket);
+ server_setup_socket = -1;
+ return rc;
+}
+
+int RDMAServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w)
+{
+ ldout(cct, 15) << __func__ << dendl;
+
+ ceph_assert(sock);
+
+ sockaddr_storage ss;
+ socklen_t slen = sizeof(ss);
+ int sd = accept_cloexec(server_setup_socket, (sockaddr*)&ss, &slen);
+ if (sd < 0) {
+ return -errno;
+ }
+
+ int r = net.set_nonblock(sd);
+ if (r < 0) {
+ ::close(sd);
+ return -errno;
+ }
+
+ r = net.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+ if (r < 0) {
+ ::close(sd);
+ return -errno;
+ }
+
+  ceph_assert(NULL != out); //out should not be NULL when accepting a connection
+
+ out->set_type(addr_type);
+ out->set_sockaddr((sockaddr*)&ss);
+ net.set_priority(sd, opt.priority, out->get_family());
+
+ RDMAConnectedSocketImpl* server;
+ //Worker* w = dispatcher->get_stack()->get_worker();
+ server = new RDMAConnectedSocketImpl(cct, ib, dispatcher, dynamic_cast<RDMAWorker*>(w));
+ if (!server->get_qp()) {
+ lderr(cct) << __func__ << " server->qp is null" << dendl;
+    // can't use delete on server here, its destructor would fail.
+ server->cleanup();
+ ::close(sd);
+ return -1;
+ }
+ server->set_accept_fd(sd);
+ ldout(cct, 20) << __func__ << " accepted a new QP, tcp_fd: " << sd << dendl;
+ std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+ *sock = ConnectedSocket(std::move(csi));
+
+ return 0;
+}
+
+void RDMAServerSocketImpl::abort_accept()
+{
+ if (server_setup_socket >= 0)
+ ::close(server_setup_socket);
+}
diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc
new file mode 100644
index 000000000..12db599d6
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.cc
@@ -0,0 +1,812 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <poll.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "include/str_list.h"
+#include "include/compat.h"
+#include "common/Cycles.h"
+#include "common/deleter.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "RDMAStack "
+
+RDMADispatcher::~RDMADispatcher()
+{
+ ldout(cct, 20) << __func__ << " destructing rdma dispatcher" << dendl;
+ polling_stop();
+
+ ceph_assert(qp_conns.empty());
+ ceph_assert(num_qp_conn == 0);
+ ceph_assert(dead_queue_pairs.empty());
+}
+
+RDMADispatcher::RDMADispatcher(CephContext* c, std::shared_ptr<Infiniband>& ib)
+ : cct(c), ib(ib)
+{
+ PerfCountersBuilder plb(cct, "AsyncMessenger::RDMADispatcher", l_msgr_rdma_dispatcher_first, l_msgr_rdma_dispatcher_last);
+
+ plb.add_u64_counter(l_msgr_rdma_polling, "polling", "Whether dispatcher thread is polling");
+ plb.add_u64_counter(l_msgr_rdma_inflight_tx_chunks, "inflight_tx_chunks", "The number of inflight tx chunks");
+ plb.add_u64_counter(l_msgr_rdma_rx_bufs_in_use, "rx_bufs_in_use", "The number of rx buffers that are holding data and being processed");
+ plb.add_u64_counter(l_msgr_rdma_rx_bufs_total, "rx_bufs_total", "The total number of rx buffers");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc, "tx_total_wc", "The number of tx work completions");
+ plb.add_u64_counter(l_msgr_rdma_tx_total_wc_errors, "tx_total_wc_errors", "The number of tx errors");
+ plb.add_u64_counter(l_msgr_rdma_tx_wc_retry_errors, "tx_retry_errors", "The number of tx retry errors");
+ plb.add_u64_counter(l_msgr_rdma_tx_wc_wr_flush_errors, "tx_wr_flush_errors", "The number of tx work request flush errors");
+
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc, "rx_total_wc", "The total number of rx work completions");
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc_errors, "rx_total_wc_errors", "The total number of rx error work completions");
+  plb.add_u64_counter(l_msgr_rdma_rx_fin, "rx_fin", "The number of rx finish work requests");
+
+ plb.add_u64_counter(l_msgr_rdma_total_async_events, "total_async_events", "The number of async events");
+ plb.add_u64_counter(l_msgr_rdma_async_last_wqe_events, "async_last_wqe_events", "The number of last wqe events");
+
+ plb.add_u64_counter(l_msgr_rdma_handshake_errors, "handshake_errors", "The number of handshake errors");
+
+
+  plb.add_u64_counter(l_msgr_rdma_created_queue_pair, "created_queue_pair", "Created queue pair number");
+  plb.add_u64_counter(l_msgr_rdma_active_queue_pair, "active_queue_pair", "Active queue pair number");
+
+ perf_logger = plb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(perf_logger);
+ Cycles::init();
+}
+
+void RDMADispatcher::polling_start()
+{
+ // take lock because listen/connect can happen from different worker threads
+ std::lock_guard l{lock};
+
+ if (t.joinable())
+ return; // dispatcher thread already running
+
+ ib->get_memory_manager()->set_rx_stat_logger(perf_logger);
+
+ tx_cc = ib->create_comp_channel(cct);
+ ceph_assert(tx_cc);
+ rx_cc = ib->create_comp_channel(cct);
+ ceph_assert(rx_cc);
+ tx_cq = ib->create_comp_queue(cct, tx_cc);
+ ceph_assert(tx_cq);
+ rx_cq = ib->create_comp_queue(cct, rx_cc);
+ ceph_assert(rx_cq);
+
+ t = std::thread(&RDMADispatcher::polling, this);
+ ceph_pthread_setname(t.native_handle(), "rdma-polling");
+}
+
+void RDMADispatcher::polling_stop()
+{
+ {
+ std::lock_guard l{lock};
+ done = true;
+ }
+
+ if (!t.joinable())
+ return;
+
+ t.join();
+
+ tx_cc->ack_events();
+ rx_cc->ack_events();
+ delete tx_cq;
+ delete rx_cq;
+ delete tx_cc;
+ delete rx_cc;
+}
+
+void RDMADispatcher::handle_async_event()
+{
+ ldout(cct, 30) << __func__ << dendl;
+ while (1) {
+ ibv_async_event async_event;
+ if (ibv_get_async_event(ib->get_device()->ctxt, &async_event)) {
+ if (errno != EAGAIN)
+ lderr(cct) << __func__ << " ibv_get_async_event failed. (errno=" << errno
+ << " " << cpp_strerror(errno) << ")" << dendl;
+ return;
+ }
+ perf_logger->inc(l_msgr_rdma_total_async_events);
+ ldout(cct, 1) << __func__ << "Event : " << ibv_event_type_str(async_event.event_type) << dendl;
+
+ switch (async_event.event_type) {
+ /***********************CQ events********************/
+ case IBV_EVENT_CQ_ERR:
+ lderr(cct) << __func__ << " Fatal Error, effect all QP bound with same CQ, "
+ << " CQ Overflow, dev = " << ib->get_device()->ctxt
+ << " Need destroy and recreate resource " << dendl;
+ break;
+ /***********************QP events********************/
+ case IBV_EVENT_QP_FATAL:
+ {
+ /* Error occurred on a QP and it transitioned to error state */
+ ibv_qp* ib_qp = async_event.element.qp;
+ uint32_t qpn = ib_qp->qp_num;
+ QueuePair* qp = get_qp(qpn);
+ lderr(cct) << __func__ << " Fatal Error, event associate qp number: " << qpn
+ << " Queue Pair status: " << Infiniband::qp_state_string(qp->get_state())
+ << " Event : " << ibv_event_type_str(async_event.event_type) << dendl;
+ }
+ break;
+ case IBV_EVENT_QP_LAST_WQE_REACHED:
+ {
+ /*
+ * 1. The QP bound with SRQ is in IBV_QPS_ERR state & no more WQE on the RQ of the QP
+ * Reason: QP is force switched into Error before posting Beacon WR.
+ * The QP's WRs will be flushed into CQ with IBV_WC_WR_FLUSH_ERR status
+ * For SRQ, only WRs on the QP which is switched into Error status will be flushed.
+ * Handle: Only confirm that qp enter into dead queue pairs
+ * 2. The CQE with error was generated for the last WQE
+ * Handle: output error log
+ */
+ perf_logger->inc(l_msgr_rdma_async_last_wqe_events);
+ ibv_qp* ib_qp = async_event.element.qp;
+ uint32_t qpn = ib_qp->qp_num;
+ std::lock_guard l{lock};
+ RDMAConnectedSocketImpl *conn = get_conn_lockless(qpn);
+ QueuePair* qp = get_qp_lockless(qpn);
+
+ if (qp && !qp->is_dead()) {
+ lderr(cct) << __func__ << " QP not dead, event associate qp number: " << qpn
+ << " Queue Pair status: " << Infiniband::qp_state_string(qp->get_state())
+ << " Event : " << ibv_event_type_str(async_event.event_type) << dendl;
+ }
+ if (!conn) {
+ ldout(cct, 20) << __func__ << " Connection's QP maybe entered into dead status. "
+ << " qp number: " << qpn << dendl;
+ } else {
+ conn->fault();
+ if (qp) {
+ if (!cct->_conf->ms_async_rdma_cm) {
+ enqueue_dead_qp_lockless(qpn);
+ }
+ }
+ }
+ }
+ break;
+ case IBV_EVENT_QP_REQ_ERR:
+ /* Invalid Request Local Work Queue Error */
+ [[fallthrough]];
+ case IBV_EVENT_QP_ACCESS_ERR:
+ /* Local access violation error */
+ [[fallthrough]];
+ case IBV_EVENT_COMM_EST:
+ /* Communication was established on a QP */
+ [[fallthrough]];
+ case IBV_EVENT_SQ_DRAINED:
+ /* Send Queue was drained of outstanding messages in progress */
+ [[fallthrough]];
+ case IBV_EVENT_PATH_MIG:
+ /* A connection has migrated to the alternate path */
+ [[fallthrough]];
+ case IBV_EVENT_PATH_MIG_ERR:
+ /* A connection failed to migrate to the alternate path */
+ break;
+ /***********************SRQ events*******************/
+ case IBV_EVENT_SRQ_ERR:
+ /* Error occurred on an SRQ */
+ [[fallthrough]];
+ case IBV_EVENT_SRQ_LIMIT_REACHED:
+ /* SRQ limit was reached */
+ break;
+ /***********************Port events******************/
+ case IBV_EVENT_PORT_ACTIVE:
+ /* Link became active on a port */
+ [[fallthrough]];
+ case IBV_EVENT_PORT_ERR:
+ /* Link became unavailable on a port */
+ [[fallthrough]];
+ case IBV_EVENT_LID_CHANGE:
+ /* LID was changed on a port */
+ [[fallthrough]];
+ case IBV_EVENT_PKEY_CHANGE:
+ /* P_Key table was changed on a port */
+ [[fallthrough]];
+ case IBV_EVENT_SM_CHANGE:
+ /* SM was changed on a port */
+ [[fallthrough]];
+ case IBV_EVENT_CLIENT_REREGISTER:
+ /* SM sent a CLIENT_REREGISTER request to a port */
+ [[fallthrough]];
+ case IBV_EVENT_GID_CHANGE:
+ /* GID table was changed on a port */
+ break;
+
+ /***********************CA events******************/
+ //CA events:
+ case IBV_EVENT_DEVICE_FATAL:
+ /* CA is in FATAL state */
+ lderr(cct) << __func__ << " ibv_get_async_event: dev = " << ib->get_device()->ctxt
+ << " evt: " << ibv_event_type_str(async_event.event_type) << dendl;
+ break;
+ default:
+ lderr(cct) << __func__ << " ibv_get_async_event: dev = " << ib->get_device()->ctxt
+ << " unknown event: " << async_event.event_type << dendl;
+ break;
+ }
+ ibv_ack_async_event(&async_event);
+ }
+}
+
+void RDMADispatcher::post_chunk_to_pool(Chunk* chunk)
+{
+ std::lock_guard l{lock};
+ ib->post_chunk_to_pool(chunk);
+ perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+}
+
+int RDMADispatcher::post_chunks_to_rq(int num, QueuePair *qp)
+{
+ std::lock_guard l{lock};
+ return ib->post_chunks_to_rq(num, qp);
+}
+
+void RDMADispatcher::polling()
+{
+  static constexpr int MAX_COMPLETIONS = 32;
+ ibv_wc wc[MAX_COMPLETIONS];
+
+ std::map<RDMAConnectedSocketImpl*, std::vector<ibv_wc> > polled;
+ std::vector<ibv_wc> tx_cqe;
+ ldout(cct, 20) << __func__ << " going to poll tx cq: " << tx_cq << " rx cq: " << rx_cq << dendl;
+ uint64_t last_inactive = Cycles::rdtsc();
+ bool rearmed = false;
+ int r = 0;
+
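+  // Busy-poll both CQs; once they have been idle for ms_async_rdma_polling_us
+  // microseconds, rearm the completion-channel notifications and block in
+  // poll(2) until new events (or shutdown) arrive.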
+ while (true) {
+ int tx_ret = tx_cq->poll_cq(MAX_COMPLETIONS, wc);
+ if (tx_ret > 0) {
+ ldout(cct, 20) << __func__ << " tx completion queue got " << tx_ret
+ << " responses."<< dendl;
+ handle_tx_event(wc, tx_ret);
+ }
+
+ int rx_ret = rx_cq->poll_cq(MAX_COMPLETIONS, wc);
+ if (rx_ret > 0) {
+ ldout(cct, 20) << __func__ << " rx completion queue got " << rx_ret
+ << " responses."<< dendl;
+ handle_rx_event(wc, rx_ret);
+ }
+
+ if (!tx_ret && !rx_ret) {
+ perf_logger->set(l_msgr_rdma_inflight_tx_chunks, inflight);
+ //
+      // Clean up dead QPs when the rx/tx CQs are idle. We could destroy QPs
+      // even earlier, right after the beacon has been received, but we have
+      // two CQs (rx & tx), so the beacon WC can be popped from the tx CQ
+      // before other WCs are fully consumed from the rx CQ. For safety, we
+      // wait for the beacon and then for "no events" from the CQs.
+      //
+      // Checking the vector without locks is totally fine, since we
+      // use it only as a hint (accuracy is not important here).
+ //
+ if (!dead_queue_pairs.empty()) {
+ decltype(dead_queue_pairs) dead_qps;
+ {
+ std::lock_guard l{lock};
+ dead_queue_pairs.swap(dead_qps);
+ }
+
+ for (auto& qp: dead_qps) {
+ perf_logger->dec(l_msgr_rdma_active_queue_pair);
+ ldout(cct, 10) << __func__ << " finally delete qp = " << qp << dendl;
+ delete qp;
+ }
+ }
+
+ if (!num_qp_conn && done && dead_queue_pairs.empty())
+ break;
+
+ uint64_t now = Cycles::rdtsc();
+ if (Cycles::to_microseconds(now - last_inactive) > cct->_conf->ms_async_rdma_polling_us) {
+ handle_async_event();
+ if (!rearmed) {
+          // Rearm the CQ notifications and poll once more (via continue) so
+          // that no event arriving between polling and rearming is missed.
+ tx_cq->rearm_notify();
+ rx_cq->rearm_notify();
+ rearmed = true;
+ continue;
+ }
+
+ struct pollfd channel_poll[2];
+ channel_poll[0].fd = tx_cc->get_fd();
+ channel_poll[0].events = POLLIN;
+ channel_poll[0].revents = 0;
+ channel_poll[1].fd = rx_cc->get_fd();
+ channel_poll[1].events = POLLIN;
+ channel_poll[1].revents = 0;
+ r = 0;
+ perf_logger->set(l_msgr_rdma_polling, 0);
+ while (!done && r == 0) {
+ r = TEMP_FAILURE_RETRY(poll(channel_poll, 2, 100));
+ if (r < 0) {
+ r = -errno;
+ lderr(cct) << __func__ << " poll failed " << r << dendl;
+ ceph_abort();
+ }
+ }
+ if (r > 0 && tx_cc->get_cq_event())
+ ldout(cct, 20) << __func__ << " got tx cq event." << dendl;
+ if (r > 0 && rx_cc->get_cq_event())
+ ldout(cct, 20) << __func__ << " got rx cq event." << dendl;
+ last_inactive = Cycles::rdtsc();
+ perf_logger->set(l_msgr_rdma_polling, 1);
+ rearmed = false;
+ }
+ }
+ }
+}
+
+void RDMADispatcher::notify_pending_workers() {
+ if (num_pending_workers) {
+ RDMAWorker *w = nullptr;
+ {
+ std::lock_guard l{w_lock};
+ if (!pending_workers.empty()) {
+ w = pending_workers.front();
+ pending_workers.pop_front();
+ --num_pending_workers;
+ }
+ }
+ if (w)
+ w->notify_worker();
+ }
+}
+
+void RDMADispatcher::register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi)
+{
+ std::lock_guard l{lock};
+ ceph_assert(!qp_conns.count(qp->get_local_qp_number()));
+ qp_conns[qp->get_local_qp_number()] = std::make_pair(qp, csi);
+ ++num_qp_conn;
+}
+
+RDMAConnectedSocketImpl* RDMADispatcher::get_conn_lockless(uint32_t qp)
+{
+ auto it = qp_conns.find(qp);
+ if (it == qp_conns.end())
+ return nullptr;
+ if (it->second.first->is_dead())
+ return nullptr;
+ return it->second.second;
+}
+
+Infiniband::QueuePair* RDMADispatcher::get_qp_lockless(uint32_t qp)
+{
+  // Try to find the QP in qp_conns first.
+ auto it = qp_conns.find(qp);
+ if (it != qp_conns.end())
+ return it->second.first;
+
+ // Try again in dead_queue_pairs.
+ for (auto &i: dead_queue_pairs)
+ if (i->get_local_qp_number() == qp)
+ return i;
+
+ return nullptr;
+}
+
+Infiniband::QueuePair* RDMADispatcher::get_qp(uint32_t qp)
+{
+ std::lock_guard l{lock};
+ return get_qp_lockless(qp);
+}
+
+void RDMADispatcher::enqueue_dead_qp_lockless(uint32_t qpn)
+{
+ auto it = qp_conns.find(qpn);
+ if (it == qp_conns.end()) {
+ lderr(cct) << __func__ << " QP [" << qpn << "] is not registered." << dendl;
+ return ;
+ }
+ QueuePair *qp = it->second.first;
+ dead_queue_pairs.push_back(qp);
+ qp_conns.erase(it);
+ --num_qp_conn;
+}
+
+void RDMADispatcher::enqueue_dead_qp(uint32_t qpn)
+{
+ std::lock_guard l{lock};
+ enqueue_dead_qp_lockless(qpn);
+}
+
+void RDMADispatcher::schedule_qp_destroy(uint32_t qpn)
+{
+ std::lock_guard l{lock};
+ auto it = qp_conns.find(qpn);
+ if (it == qp_conns.end()) {
+ lderr(cct) << __func__ << " QP [" << qpn << "] is not registered." << dendl;
+ return;
+ }
+ QueuePair *qp = it->second.first;
+ if (qp->to_dead()) {
+ //
+ // Failed to switch to dead. This is abnormal, but we can't
+ // do anything, so just destroy QP.
+ //
+ dead_queue_pairs.push_back(qp);
+ qp_conns.erase(it);
+ --num_qp_conn;
+ } else {
+ //
+ // Successfully switched to dead, thus keep entry in the map.
+    // But zero out the socket pointer so that get_conn_lockless()
+    // returns null.
+ it->second.second = nullptr;
+ }
+}
+
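+// Process tx work completions: beacon completions queue their QP for
+// destruction, data chunk completions are returned to the tx pool, and error
+// statuses are logged and mapped onto the owning socket.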
+void RDMADispatcher::handle_tx_event(ibv_wc *cqe, int n)
+{
+ std::vector<Chunk*> tx_chunks;
+
+ for (int i = 0; i < n; ++i) {
+ ibv_wc* response = &cqe[i];
+
+ // If it's beacon WR, enqueue the QP to be destroyed later
+ if (response->wr_id == BEACON_WRID) {
+ enqueue_dead_qp(response->qp_num);
+ continue;
+ }
+
+ ldout(cct, 20) << __func__ << " QP number: " << response->qp_num << " len: " << response->byte_len
+ << " status: " << ib->wc_status_to_string(response->status) << dendl;
+
+ if (response->status != IBV_WC_SUCCESS) {
+ switch(response->status) {
+ case IBV_WC_RETRY_EXC_ERR:
+ {
+ perf_logger->inc(l_msgr_rdma_tx_wc_retry_errors);
+
+ ldout(cct, 1) << __func__ << " Responder ACK timeout, possible disconnect, or Remote QP in bad state "
+ << " WCE status(" << response->status << "): " << ib->wc_status_to_string(response->status)
+ << " WCE QP number " << response->qp_num << " Opcode " << response->opcode
+ << " wr_id: 0x" << std::hex << response->wr_id << std::dec << dendl;
+
+ std::lock_guard l{lock};
+ RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+ if (conn) {
+ ldout(cct, 1) << __func__ << " SQ WR return error, remote Queue Pair, qp number: "
+ << conn->get_peer_qpn() << dendl;
+ }
+ }
+ break;
+ case IBV_WC_WR_FLUSH_ERR:
+ {
+ perf_logger->inc(l_msgr_rdma_tx_wc_wr_flush_errors);
+
+ std::lock_guard l{lock};
+ QueuePair *qp = get_qp_lockless(response->qp_num);
+ if (qp) {
+ ldout(cct, 20) << __func__ << " qp state is " << Infiniband::qp_state_string(qp->get_state()) << dendl;
+ }
+ if (qp && qp->is_dead()) {
+ ldout(cct, 20) << __func__ << " outstanding SQ WR is flushed into CQ since QueuePair is dead " << dendl;
+ } else {
+ lderr(cct) << __func__ << " Invalid/Unsupported request to consume outstanding SQ WR,"
+ << " WCE status(" << response->status << "): " << ib->wc_status_to_string(response->status)
+ << " WCE QP number " << response->qp_num << " Opcode " << response->opcode
+ << " wr_id: 0x" << std::hex << response->wr_id << std::dec << dendl;
+
+ RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+ if (conn) {
+ ldout(cct, 1) << __func__ << " SQ WR return error, remote Queue Pair, qp number: "
+ << conn->get_peer_qpn() << dendl;
+ }
+ }
+ }
+ break;
+
+ default:
+ {
+ lderr(cct) << __func__ << " SQ WR return error,"
+ << " WCE status(" << response->status << "): " << ib->wc_status_to_string(response->status)
+ << " WCE QP number " << response->qp_num << " Opcode " << response->opcode
+ << " wr_id: 0x" << std::hex << response->wr_id << std::dec << dendl;
+
+ std::lock_guard l{lock};
+ RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+ if (conn && conn->is_connected()) {
+ ldout(cct, 20) << __func__ << " SQ WR return error Queue Pair error state is : " << conn->get_qp_state()
+ << " remote Queue Pair, qp number: " << conn->get_peer_qpn() << dendl;
+ conn->fault();
+ } else {
+ ldout(cct, 1) << __func__ << " Disconnected, qp_num = " << response->qp_num << " discard event" << dendl;
+ }
+ }
+ break;
+ }
+ }
+
+ auto chunk = reinterpret_cast<Chunk *>(response->wr_id);
+ //TX completion may come either from
+ // 1) regular send message, WCE wr_id points to chunk
+ // 2) 'fin' message, wr_id points to the QP
+ if (ib->get_memory_manager()->is_valid_chunk(chunk)) {
+ tx_chunks.push_back(chunk);
+ } else if (reinterpret_cast<QueuePair*>(response->wr_id)->get_local_qp_number() == response->qp_num ) {
+ ldout(cct, 1) << __func__ << " sending of the disconnect msg completed" << dendl;
+ } else {
+ ldout(cct, 1) << __func__ << " not tx buffer, chunk " << chunk << dendl;
+ ceph_abort();
+ }
+ }
+
+ perf_logger->inc(l_msgr_rdma_tx_total_wc, n);
+ post_tx_buffer(tx_chunks);
+}
+
+/**
+ * Return the given chunks to the tx free pool and wake up any workers
+ * waiting for registered tx memory.
+ *
+ * \param[in] chunks
+ *      The chunks to return.
+ */
+void RDMADispatcher::post_tx_buffer(std::vector<Chunk*> &chunks)
+{
+ if (chunks.empty())
+ return ;
+
+ inflight -= chunks.size();
+ ib->get_memory_manager()->return_tx(chunks);
+ ldout(cct, 30) << __func__ << " release " << chunks.size()
+ << " chunks, inflight " << inflight << dendl;
+ notify_pending_workers();
+}
+
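+// Dispatch received work completions to their owning sockets; buffers that
+// cannot be delivered (dead QP or error status) are returned to the rx pool.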
+void RDMADispatcher::handle_rx_event(ibv_wc *cqe, int rx_number)
+{
+ perf_logger->inc(l_msgr_rdma_rx_total_wc, rx_number);
+ perf_logger->inc(l_msgr_rdma_rx_bufs_in_use, rx_number);
+
+ std::map<RDMAConnectedSocketImpl*, std::vector<ibv_wc> > polled;
+ std::lock_guard l{lock};//make sure connected socket alive when pass wc
+
+ for (int i = 0; i < rx_number; ++i) {
+ ibv_wc* response = &cqe[i];
+ Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+ RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+ QueuePair *qp = get_qp_lockless(response->qp_num);
+
+ switch (response->status) {
+ case IBV_WC_SUCCESS:
+ ceph_assert(response->opcode == IBV_WC_RECV);
+ if (!conn) {
+ ldout(cct, 1) << __func__ << " csi with qpn " << response->qp_num << " may be dead. chunk 0x"
+                      << std::hex << chunk << " will be returned to the pool." << std::dec << dendl;
+ ib->post_chunk_to_pool(chunk);
+ perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+ } else {
+ conn->post_chunks_to_rq(1);
+ polled[conn].push_back(*response);
+
+ if (qp != nullptr && !qp->get_srq()) {
+ qp->remove_rq_wr(chunk);
+ chunk->clear_qp();
+ }
+ }
+ break;
+
+ case IBV_WC_WR_FLUSH_ERR:
+ perf_logger->inc(l_msgr_rdma_rx_total_wc_errors);
+
+ if (qp) {
+ ldout(cct, 20) << __func__ << " qp state is " << Infiniband::qp_state_string(qp->get_state()) << dendl;
+ }
+ if (qp && qp->is_dead()) {
+ ldout(cct, 20) << __func__ << " outstanding RQ WR is flushed into CQ since QueuePair is dead " << dendl;
+ } else {
+ ldout(cct, 1) << __func__ << " RQ WR return error,"
+ << " WCE status(" << response->status << "): " << ib->wc_status_to_string(response->status)
+ << " WCE QP number " << response->qp_num << " Opcode " << response->opcode
+ << " wr_id: 0x" << std::hex << response->wr_id << std::dec << dendl;
+ if (conn) {
+ ldout(cct, 1) << __func__ << " RQ WR return error, remote Queue Pair, qp number: "
+ << conn->get_peer_qpn() << dendl;
+ }
+ }
+
+ ib->post_chunk_to_pool(chunk);
+ perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+ break;
+
+ default:
+ perf_logger->inc(l_msgr_rdma_rx_total_wc_errors);
+
+ ldout(cct, 1) << __func__ << " RQ WR return error,"
+ << " WCE status(" << response->status << "): " << ib->wc_status_to_string(response->status)
+ << " WCE QP number " << response->qp_num << " Opcode " << response->opcode
+ << " wr_id: 0x" << std::hex << response->wr_id << std::dec << dendl;
+ if (conn && conn->is_connected())
+ conn->fault();
+
+ ib->post_chunk_to_pool(chunk);
+ perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+ break;
+ }
+ }
+
+ for (auto &i : polled)
+ i.first->pass_wc(std::move(i.second));
+ polled.clear();
+}
+
+RDMAWorker::RDMAWorker(CephContext *c, unsigned worker_id)
+ : Worker(c, worker_id),
+ tx_handler(new C_handle_cq_tx(this))
+{
+ // initialize perf_logger
+ char name[128];
+  snprintf(name, sizeof(name), "AsyncMessenger::RDMAWorker-%u", id);
+ PerfCountersBuilder plb(cct, name, l_msgr_rdma_first, l_msgr_rdma_last);
+
+  plb.add_u64_counter(l_msgr_rdma_tx_no_mem, "tx_no_mem", "The number of times no tx buffer was available");
+  plb.add_u64_counter(l_msgr_rdma_tx_parital_mem, "tx_parital_mem", "The number of partially satisfied tx buffer requests");
+  plb.add_u64_counter(l_msgr_rdma_tx_failed, "tx_failed_post", "The number of failed tx posts");
+
+ plb.add_u64_counter(l_msgr_rdma_tx_chunks, "tx_chunks", "The number of tx chunks transmitted");
+ plb.add_u64_counter(l_msgr_rdma_tx_bytes, "tx_bytes", "The bytes of tx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_rx_chunks, "rx_chunks", "The number of rx chunks received");
+  plb.add_u64_counter(l_msgr_rdma_rx_bytes, "rx_bytes", "The bytes of rx chunks received", NULL, 0, unit_t(UNIT_BYTES));
+ plb.add_u64_counter(l_msgr_rdma_pending_sent_conns, "pending_sent_conns", "The count of pending sent conns");
+
+ perf_logger = plb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(perf_logger);
+}
+
+RDMAWorker::~RDMAWorker()
+{
+ delete tx_handler;
+}
+
+void RDMAWorker::initialize()
+{
+ ceph_assert(dispatcher);
+}
+
+int RDMAWorker::listen(entity_addr_t &sa, unsigned addr_slot,
+ const SocketOptions &opt,ServerSocket *sock)
+{
+ ib->init();
+ dispatcher->polling_start();
+
+ RDMAServerSocketImpl *p;
+ if (cct->_conf->ms_async_rdma_type == "iwarp") {
+ p = new RDMAIWARPServerSocketImpl(cct, ib, dispatcher, this, sa, addr_slot);
+ } else {
+ p = new RDMAServerSocketImpl(cct, ib, dispatcher, this, sa, addr_slot);
+ }
+ int r = p->listen(sa, opt);
+ if (r < 0) {
+ delete p;
+ return r;
+ }
+
+ *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(p));
+ return 0;
+}
+
+int RDMAWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+ ib->init();
+ dispatcher->polling_start();
+
+ RDMAConnectedSocketImpl* p;
+ if (cct->_conf->ms_async_rdma_type == "iwarp") {
+ p = new RDMAIWARPConnectedSocketImpl(cct, ib, dispatcher, this);
+ } else {
+ p = new RDMAConnectedSocketImpl(cct, ib, dispatcher, this);
+ }
+ int r = p->try_connect(addr, opts);
+
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " try connecting failed." << dendl;
+ delete p;
+ return r;
+ }
+ std::unique_ptr<RDMAConnectedSocketImpl> csi(p);
+ *socket = ConnectedSocket(std::move(csi));
+ return 0;
+}
+
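+// Reserve registered tx chunks for this socket; if the pool cannot satisfy
+// the whole request, the socket is queued as pending and this worker is
+// registered to be notified once tx chunks are returned.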
+int RDMAWorker::get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes)
+{
+ ceph_assert(center.in_thread());
+ int r = ib->get_tx_buffers(c, bytes);
+ size_t got = ib->get_memory_manager()->get_tx_buffer_size() * r;
+ ldout(cct, 30) << __func__ << " need " << bytes << " bytes, reserve " << got << " registered bytes, inflight " << dispatcher->inflight << dendl;
+ dispatcher->inflight += r;
+ if (got >= bytes)
+ return r;
+
+ if (o) {
+ if (!o->is_pending()) {
+ pending_sent_conns.push_back(o);
+ perf_logger->inc(l_msgr_rdma_pending_sent_conns, 1);
+ o->set_pending(1);
+ }
+ dispatcher->make_pending_worker(this);
+ }
+ return r;
+}
+
+
+void RDMAWorker::handle_pending_message()
+{
+ ldout(cct, 20) << __func__ << " pending conns " << pending_sent_conns.size() << dendl;
+ while (!pending_sent_conns.empty()) {
+ RDMAConnectedSocketImpl *o = pending_sent_conns.front();
+ pending_sent_conns.pop_front();
+ ssize_t r = o->submit(false);
+ ldout(cct, 20) << __func__ << " sent pending bl socket=" << o << " r=" << r << dendl;
+ if (r < 0) {
+ if (r == -EAGAIN) {
+ pending_sent_conns.push_back(o);
+ dispatcher->make_pending_worker(this);
+ return ;
+ }
+ o->fault();
+ }
+ o->set_pending(0);
+ perf_logger->dec(l_msgr_rdma_pending_sent_conns, 1);
+ }
+ dispatcher->notify_pending_workers();
+}
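[Editorial note] get_reged_mem() and handle_pending_message() above implement a simple deferred-send scheme: when registered tx buffers run out, the socket is parked on `pending_sent_conns` and retried once the dispatcher wakes the worker again. A minimal, self-contained C++ model of that pattern is sketched below; `Conn`, `Worker::park`, and `try_send` are invented names for illustration, not Ceph APIs.

    #include <cerrno>
    #include <cstdio>
    #include <deque>

    // Illustrative model only: a connection that may run out of "buffers".
    struct Conn {
      bool pending = false;
      int budget = 0;                       // registered buffers currently available
      int try_send() { return budget-- > 0 ? 0 : -EAGAIN; }
    };

    struct Worker {
      std::deque<Conn*> pending_sent_conns;

      void park(Conn *c) {                  // mirrors the tail of get_reged_mem()
        if (!c->pending) { c->pending = true; pending_sent_conns.push_back(c); }
      }
      void handle_pending() {               // mirrors handle_pending_message()
        while (!pending_sent_conns.empty()) {
          Conn *c = pending_sent_conns.front();
          pending_sent_conns.pop_front();
          if (c->try_send() == -EAGAIN) {   // still short on buffers: requeue, retry later
            pending_sent_conns.push_back(c);
            return;
          }
          c->pending = false;               // send went through, drop it from the queue
        }
      }
    };

    int main() {
      Conn a;                               // no buffers yet: first attempt hits -EAGAIN
      Worker w;
      w.park(&a);
      w.handle_pending();                   // requeues a
      a.budget = 1;                         // buffers became available
      w.handle_pending();                   // drains it this time
      std::printf("still pending: %zu\n", w.pending_sent_conns.size());
      return 0;
    }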
+
+RDMAStack::RDMAStack(CephContext *cct)
+ : NetworkStack(cct), ib(std::make_shared<Infiniband>(cct)),
+ rdma_dispatcher(std::make_shared<RDMADispatcher>(cct, ib))
+{
+ ldout(cct, 20) << __func__ << " constructing RDMAStack..." << dendl;
+ ldout(cct, 20) << " creating RDMAStack:" << this << " with dispatcher:" << rdma_dispatcher.get() << dendl;
+}
+
+RDMAStack::~RDMAStack()
+{
+ if (cct->_conf->ms_async_rdma_enable_hugepage) {
+ unsetenv("RDMAV_HUGEPAGES_SAFE"); //remove env variable on destruction
+ }
+}
+
+Worker* RDMAStack::create_worker(CephContext *c, unsigned worker_id)
+{
+ auto w = new RDMAWorker(c, worker_id);
+ w->set_dispatcher(rdma_dispatcher);
+ w->set_ib(ib);
+ return w;
+}
+
+void RDMAStack::spawn_worker(std::function<void ()> &&func)
+{
+ threads.emplace_back(std::move(func));
+}
+
+void RDMAStack::join_worker(unsigned i)
+{
+ ceph_assert(threads.size() > i && threads[i].joinable());
+ threads[i].join();
+}
diff --git a/src/msg/async/rdma/RDMAStack.h b/src/msg/async/rdma/RDMAStack.h
new file mode 100644
index 000000000..a36fd5fb7
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.h
@@ -0,0 +1,344 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_RDMASTACK_H
+#define CEPH_MSG_RDMASTACK_H
+
+#include <sys/eventfd.h>
+
+#include <list>
+#include <vector>
+#include <thread>
+
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "msg/async/Stack.h"
+#include "Infiniband.h"
+
+class RDMAConnectedSocketImpl;
+class RDMAServerSocketImpl;
+class RDMAStack;
+class RDMAWorker;
+
+class RDMADispatcher {
+ typedef Infiniband::MemoryManager::Chunk Chunk;
+ typedef Infiniband::QueuePair QueuePair;
+
+ std::thread t;
+ CephContext *cct;
+ std::shared_ptr<Infiniband> ib;
+ Infiniband::CompletionQueue* tx_cq = nullptr;
+ Infiniband::CompletionQueue* rx_cq = nullptr;
+ Infiniband::CompletionChannel *tx_cc = nullptr, *rx_cc = nullptr;
+ bool done = false;
+ std::atomic<uint64_t> num_qp_conn = {0};
+ // protect `qp_conns`, `dead_queue_pairs`
+ ceph::mutex lock = ceph::make_mutex("RDMADispatcher::lock");
+ // qp_num -> InfRcConnection
+ // The main usage of `qp_conns` is looking up connection by qp_num,
+ // so the lifecycle of element in `qp_conns` is the lifecycle of qp.
+ //
+ // Moving a queue pair into the dead state (see the illustrative verbs
+ // sketch after this class declaration):
+ /**
+ * 1. The connection calls mark_down().
+ * 2. The Queue Pair is moved into the Error state (QueuePair::to_dead()).
+ * 3. A beacon work request is posted.
+ * 4. Wait for the beacon completion, which indicates the queues are drained.
+ * 5. Destroy the QP by calling ibv_destroy_qp().
+ *
+ * @param qp the queue pair to move into the dead state
+ */
+ ceph::unordered_map<uint32_t, std::pair<QueuePair*, RDMAConnectedSocketImpl*> > qp_conns;
+
+ /// if a queue pair is closed when transmit buffers are active
+ /// on it, the transmit buffers never get returned via tx_cq. To
+ /// work around this problem, don't delete queue pairs immediately. Instead,
+ /// save them in this vector and delete them at a safe time, when there are
+ /// no outstanding transmit buffers to be lost.
+ std::vector<QueuePair*> dead_queue_pairs;
+
+ std::atomic<uint64_t> num_pending_workers = {0};
+ // protect pending workers
+ ceph::mutex w_lock =
+ ceph::make_mutex("RDMADispatcher::for worker pending list");
+ // fixme: lockfree
+ std::list<RDMAWorker*> pending_workers;
+ void enqueue_dead_qp_lockless(uint32_t qp);
+ void enqueue_dead_qp(uint32_t qpn);
+
+ public:
+ PerfCounters *perf_logger;
+
+ explicit RDMADispatcher(CephContext* c, std::shared_ptr<Infiniband>& ib);
+ virtual ~RDMADispatcher();
+ void handle_async_event();
+
+ void polling_start();
+ void polling_stop();
+ void polling();
+ void register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi);
+ void make_pending_worker(RDMAWorker* w) {
+ std::lock_guard l{w_lock};
+ auto it = std::find(pending_workers.begin(), pending_workers.end(), w);
+ if (it != pending_workers.end())
+ return;
+ pending_workers.push_back(w);
+ ++num_pending_workers;
+ }
+ RDMAConnectedSocketImpl* get_conn_lockless(uint32_t qp);
+ QueuePair* get_qp_lockless(uint32_t qp);
+ QueuePair* get_qp(uint32_t qp);
+ void schedule_qp_destroy(uint32_t qp);
+ Infiniband::CompletionQueue* get_tx_cq() const { return tx_cq; }
+ Infiniband::CompletionQueue* get_rx_cq() const { return rx_cq; }
+ void notify_pending_workers();
+ void handle_tx_event(ibv_wc *cqe, int n);
+ void post_tx_buffer(std::vector<Chunk*> &chunks);
+ void handle_rx_event(ibv_wc *cqe, int rx_number);
+
+ std::atomic<uint64_t> inflight = {0};
+
+ void post_chunk_to_pool(Chunk* chunk);
+ int post_chunks_to_rq(int num, QueuePair *qp = nullptr);
+};
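[Editorial note] For reference, here is a rough standalone sketch of the teardown steps listed in the comment inside this class, expressed directly in libibverbs calls. `BEACON_WRID` and the helper names are invented for illustration, and the busy-wait stands in for the dispatcher's polling loop; error handling is elided.

    #include <infiniband/verbs.h>

    static const uint64_t BEACON_WRID = 0xdeadbeefcafebabeULL;  // made-up sentinel

    // Step 2: move the QP into the Error state so no new work is accepted.
    int qp_to_dead(struct ibv_qp *qp) {
      struct ibv_qp_attr attr = {};
      attr.qp_state = IBV_QPS_ERR;
      return ibv_modify_qp(qp, &attr, IBV_QP_STATE);
    }

    // Step 3: post a zero-length "beacon" send; once its (flushed) completion
    // shows up on the CQ, every earlier work request has been drained.
    int post_beacon(struct ibv_qp *qp) {
      struct ibv_send_wr wr = {}, *bad = nullptr;
      wr.wr_id = BEACON_WRID;
      wr.opcode = IBV_WR_SEND;
      return ibv_post_send(qp, &wr, &bad);
    }

    // Steps 4-5: wait for the beacon completion, then it is safe to destroy.
    void drain_and_destroy(struct ibv_cq *cq, struct ibv_qp *qp) {
      struct ibv_wc wc;
      for (;;) {
        int n = ibv_poll_cq(cq, 1, &wc);
        if (n > 0 && wc.wr_id == BEACON_WRID)
          break;                       // all prior WRs have flushed through the CQ
      }
      ibv_destroy_qp(qp);
    }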
+
+class RDMAWorker : public Worker {
+ typedef Infiniband::CompletionQueue CompletionQueue;
+ typedef Infiniband::CompletionChannel CompletionChannel;
+ typedef Infiniband::MemoryManager::Chunk Chunk;
+ typedef Infiniband::MemoryManager MemoryManager;
+ typedef std::vector<Chunk*>::iterator ChunkIter;
+ std::shared_ptr<Infiniband> ib;
+ EventCallbackRef tx_handler;
+ std::list<RDMAConnectedSocketImpl*> pending_sent_conns;
+ std::shared_ptr<RDMADispatcher> dispatcher;
+ ceph::mutex lock = ceph::make_mutex("RDMAWorker::lock");
+
+ class C_handle_cq_tx : public EventCallback {
+ RDMAWorker *worker;
+ public:
+ explicit C_handle_cq_tx(RDMAWorker *w): worker(w) {}
+ void do_request(uint64_t fd) {
+ worker->handle_pending_message();
+ }
+ };
+
+ public:
+ PerfCounters *perf_logger;
+ explicit RDMAWorker(CephContext *c, unsigned i);
+ virtual ~RDMAWorker();
+ virtual int listen(entity_addr_t &addr,
+ unsigned addr_slot,
+ const SocketOptions &opts, ServerSocket *) override;
+ virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+ virtual void initialize() override;
+ int get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes);
+ void remove_pending_conn(RDMAConnectedSocketImpl *o) {
+ ceph_assert(center.in_thread());
+ pending_sent_conns.remove(o);
+ }
+ void handle_pending_message();
+ void set_dispatcher(std::shared_ptr<RDMADispatcher>& dispatcher) { this->dispatcher = dispatcher; }
+ void set_ib(std::shared_ptr<Infiniband> &ib) {this->ib = ib;}
+ void notify_worker() {
+ center.dispatch_event_external(tx_handler);
+ }
+};
+
+struct RDMACMInfo {
+ RDMACMInfo(rdma_cm_id *cid, rdma_event_channel *cm_channel_, uint32_t qp_num_)
+ : cm_id(cid), cm_channel(cm_channel_), qp_num(qp_num_) {}
+ rdma_cm_id *cm_id;
+ rdma_event_channel *cm_channel;
+ uint32_t qp_num;
+};
+
+class RDMAConnectedSocketImpl : public ConnectedSocketImpl {
+ public:
+ typedef Infiniband::MemoryManager::Chunk Chunk;
+ typedef Infiniband::CompletionChannel CompletionChannel;
+ typedef Infiniband::CompletionQueue CompletionQueue;
+
+ protected:
+ CephContext *cct;
+ Infiniband::QueuePair *qp;
+ uint32_t peer_qpn = 0;
+ uint32_t local_qpn = 0;
+ int connected;
+ int error;
+ std::shared_ptr<Infiniband> ib;
+ std::shared_ptr<RDMADispatcher> dispatcher;
+ RDMAWorker* worker;
+ std::vector<Chunk*> buffers;
+ int notify_fd = -1;
+ ceph::buffer::list pending_bl;
+
+ ceph::mutex lock = ceph::make_mutex("RDMAConnectedSocketImpl::lock");
+ std::vector<ibv_wc> wc;
+ bool is_server;
+ EventCallbackRef read_handler;
+ EventCallbackRef established_handler;
+ int tcp_fd = -1;
+ bool active; // is the qp active?
+ bool pending;
+ int post_backlog = 0;
+
+ void notify();
+ void buffer_prefetch(void);
+ ssize_t read_buffers(char* buf, size_t len);
+ int post_work_request(std::vector<Chunk*>&);
+ size_t tx_copy_chunk(std::vector<Chunk*> &tx_buffers, size_t req_copy_len,
+ decltype(std::cbegin(pending_bl.buffers()))& start,
+ const decltype(std::cbegin(pending_bl.buffers()))& end);
+
+ public:
+ RDMAConnectedSocketImpl(CephContext *cct, std::shared_ptr<Infiniband>& ib,
+ std::shared_ptr<RDMADispatcher>& rdma_dispatcher, RDMAWorker *w);
+ virtual ~RDMAConnectedSocketImpl();
+
+ void pass_wc(std::vector<ibv_wc> &&v);
+ void get_wc(std::vector<ibv_wc> &w);
+ virtual int is_connected() override { return connected; }
+
+ virtual ssize_t read(char* buf, size_t len) override;
+ virtual ssize_t send(ceph::buffer::list &bl, bool more) override;
+ virtual void shutdown() override;
+ virtual void close() override;
+ virtual int fd() const override { return notify_fd; }
+ virtual void set_priority(int sd, int prio, int domain) override;
+ void fault();
+ const char* get_qp_state() { return Infiniband::qp_state_string(qp->get_state()); }
+ uint32_t get_peer_qpn () const { return peer_qpn; }
+ uint32_t get_local_qpn () const { return local_qpn; }
+ Infiniband::QueuePair* get_qp () const { return qp; }
+ ssize_t submit(bool more);
+ int activate();
+ void fin();
+ void handle_connection();
+ int handle_connection_established(bool need_set_fault = true);
+ void cleanup();
+ void set_accept_fd(int sd);
+ virtual int try_connect(const entity_addr_t&, const SocketOptions &opt);
+ bool is_pending() {return pending;}
+ void set_pending(bool val) {pending = val;}
+ void post_chunks_to_rq(int num);
+ void update_post_backlog();
+};
+
+enum RDMA_CM_STATUS {
+ IDLE = 1,
+ RDMA_ID_CREATED,
+ CHANNEL_FD_CREATED,
+ RESOURCE_ALLOCATED,
+ ADDR_RESOLVED,
+ ROUTE_RESOLVED,
+ CONNECTED,
+ DISCONNECTED,
+ ERROR
+};
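[Editorial note] These states track the standard librdmacm active-connection sequence. A condensed, synchronous sketch of that sequence follows; blocking event waits stand in for the event-channel callbacks used by `C_handle_cm_connection`, and error handling, QP creation, and cleanup are elided.

    #include <rdma/rdma_cma.h>

    // Sketch only: the librdmacm calls behind the ADDR_RESOLVED /
    // ROUTE_RESOLVED / CONNECTED states above.
    int cm_connect_sync(struct sockaddr *dst) {
      struct rdma_event_channel *ch = rdma_create_event_channel();  // CHANNEL_FD_CREATED
      struct rdma_cm_id *id = nullptr;
      if (!ch || rdma_create_id(ch, &id, nullptr, RDMA_PS_TCP))     // RDMA_ID_CREATED
        return -1;

      struct rdma_cm_event *ev = nullptr;
      rdma_resolve_addr(id, nullptr, dst, 2000 /*ms*/);
      rdma_get_cm_event(ch, &ev);        // expect RDMA_CM_EVENT_ADDR_RESOLVED
      rdma_ack_cm_event(ev);             //  -> ADDR_RESOLVED

      rdma_resolve_route(id, 2000 /*ms*/);
      rdma_get_cm_event(ch, &ev);        // expect RDMA_CM_EVENT_ROUTE_RESOLVED
      rdma_ack_cm_event(ev);             //  -> ROUTE_RESOLVED
                                         // (QP/CQ resources would be allocated here)
      struct rdma_conn_param param = {};
      rdma_connect(id, &param);
      rdma_get_cm_event(ch, &ev);        // expect RDMA_CM_EVENT_ESTABLISHED
      rdma_ack_cm_event(ev);             //  -> CONNECTED
      return 0;
    }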
+
+class RDMAIWARPConnectedSocketImpl : public RDMAConnectedSocketImpl {
+ public:
+ RDMAIWARPConnectedSocketImpl(CephContext *cct, std::shared_ptr<Infiniband>& ib,
+ std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+ RDMAWorker *w, RDMACMInfo *info = nullptr);
+ ~RDMAIWARPConnectedSocketImpl();
+ virtual int try_connect(const entity_addr_t&, const SocketOptions &opt) override;
+ virtual void close() override;
+ virtual void shutdown() override;
+ virtual void handle_cm_connection();
+ void activate();
+ int alloc_resource();
+ void close_notify();
+
+ private:
+ rdma_cm_id *cm_id = nullptr;
+ rdma_event_channel *cm_channel = nullptr;
+ EventCallbackRef cm_con_handler;
+ std::mutex close_mtx;
+ std::condition_variable close_condition;
+ bool closed = false;
+ RDMA_CM_STATUS status = IDLE;
+
+
+ class C_handle_cm_connection : public EventCallback {
+ RDMAIWARPConnectedSocketImpl *csi;
+ public:
+ C_handle_cm_connection(RDMAIWARPConnectedSocketImpl *w): csi(w) {}
+ void do_request(uint64_t fd) {
+ csi->handle_cm_connection();
+ }
+ };
+};
+
+class RDMAServerSocketImpl : public ServerSocketImpl {
+ protected:
+ CephContext *cct;
+ ceph::NetHandler net;
+ int server_setup_socket;
+ std::shared_ptr<Infiniband> ib;
+ std::shared_ptr<RDMADispatcher> dispatcher;
+ RDMAWorker *worker;
+ entity_addr_t sa;
+
+ public:
+ RDMAServerSocketImpl(CephContext *cct, std::shared_ptr<Infiniband>& ib,
+ std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+ RDMAWorker *w, entity_addr_t& a, unsigned slot);
+
+ virtual int listen(entity_addr_t &sa, const SocketOptions &opt);
+ virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+ virtual void abort_accept() override;
+ virtual int fd() const override { return server_setup_socket; }
+};
+
+class RDMAIWARPServerSocketImpl : public RDMAServerSocketImpl {
+ public:
+ RDMAIWARPServerSocketImpl(
+ CephContext *cct, std::shared_ptr<Infiniband>& ib,
+ std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+ RDMAWorker* w, entity_addr_t& addr, unsigned addr_slot);
+ virtual int listen(entity_addr_t &sa, const SocketOptions &opt) override;
+ virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+ virtual void abort_accept() override;
+ private:
+ rdma_cm_id *cm_id = nullptr;
+ rdma_event_channel *cm_channel = nullptr;
+};
+
+class RDMAStack : public NetworkStack {
+ std::vector<std::thread> threads;
+ PerfCounters *perf_counter;
+ std::shared_ptr<Infiniband> ib;
+ std::shared_ptr<RDMADispatcher> rdma_dispatcher;
+
+ std::atomic<bool> fork_finished = {false};
+
+ virtual Worker* create_worker(CephContext *c, unsigned worker_id) override;
+
+ public:
+ explicit RDMAStack(CephContext *cct);
+ virtual ~RDMAStack();
+ virtual bool nonblock_connect_need_writable_event() const override { return false; }
+
+ virtual void spawn_worker(std::function<void ()> &&func) override;
+ virtual void join_worker(unsigned i) override;
+ virtual bool is_ready() override { return fork_finished.load(); };
+ virtual void ready() override { fork_finished = true; };
+};
+
+
+#endif