Adding upstream version 14.2.21.upstream/14.2.21 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
commit: 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree: e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/msg/async
parent: Initial commit. (diff)
download: ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
74 files changed, 26668 insertions, 0 deletions
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc
new file mode 100644
index 00000000..b78d84a3
--- /dev/null
+++ b/src/msg/async/AsyncConnection.cc
@@ -0,0 +1,771 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+
+#include "include/Context.h"
+#include "include/random.h"
+#include "common/errno.h"
+#include "AsyncMessenger.h"
+#include "AsyncConnection.h"
+
+#include "ProtocolV1.h"
+#include "ProtocolV2.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "common/EventTrace.h"
+
+// Constant to limit starting sequence number to 2^31.  Nothing special about it, just a big number.  PLR
+#define SEQ_MASK  0x7fffffff
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+// Writes the per-connection log prefix (own addresses, peer addresses,
+// protocol object, connection mode, port, state name and lossy flag) to
+// *_dout and returns that stream so the caller can keep appending to it.
+ostream& AsyncConnection::_conn_prefix(std::ostream *_dout) {
+  std::ostream &out = *_dout;
+  const char *proto_tag = msgr2 ? " msgr2=" : " legacy=";
+
+  out << "-- " << async_msgr->get_myaddrs() << " >> " << *peer_addrs;
+  out << " conn(" << this << proto_tag << protocol.get();
+  out << " " << ceph_con_mode_name(protocol->auth_meta->con_mode);
+  out << " :" << port;
+  out << " s=" << get_state_name(state);
+  out << " l=" << policy.lossy;
+  out << ").";
+  return out;
+}
+
+// Notes:
+// 1. Don't dispatch any event once the connection is closed! Doing so may keep
+//    the AsyncConnection alive even after its AsyncMessenger is gone.
+
+const uint32_t AsyncConnection::TCP_PREFETCH_MIN_SIZE = 512;
+
+// Event callback that hands a fired event id to
+// AsyncConnection::wakeup_from() on the connection it references.
+class C_time_wakeup : public EventCallback {
+ public:
+  explicit C_time_wakeup(AsyncConnectionRef connection)
+    : conn(connection) {}
+
+  void do_request(uint64_t event_id) override {
+    conn->wakeup_from(event_id);
+  }
+
+ private:
+  AsyncConnectionRef conn;
+};
+
+// Event callback that runs AsyncConnection::process() on the connection it
+// references; the event id passed in is not used.
+class C_handle_read : public EventCallback {
+ public:
+  explicit C_handle_read(AsyncConnectionRef connection)
+    : conn(connection) {}
+
+  void do_request(uint64_t /*fd_or_id*/) override {
+    conn->process();
+  }
+
+ private:
+  AsyncConnectionRef conn;
+};
+
+// Event callback that runs AsyncConnection::handle_write() on the connection
+// it references; the fd passed in is not used.
+class C_handle_write : public EventCallback {
+ public:
+  explicit C_handle_write(AsyncConnectionRef connection)
+    : conn(connection) {}
+
+  void do_request(uint64_t /*fd*/) override {
+    conn->handle_write();
+  }
+
+ private:
+  AsyncConnectionRef conn;
+};
+
+// Event callback that runs AsyncConnection::handle_write_callback() on the
+// connection it references; the fd passed in is not used.
+class C_handle_write_callback : public EventCallback {
+ public:
+  explicit C_handle_write_callback(AsyncConnectionRef connection)
+    : conn(connection) {}
+
+  void do_request(uint64_t /*fd*/) override {
+    conn->handle_write_callback();
+  }
+
+ private:
+  AsyncConnectionRef conn;
+};
+
+// One-shot event callback: runs AsyncConnection::cleanup() on the connection
+// it references and then destroys itself.
+class C_clean_handler : public EventCallback {
+ public:
+  explicit C_clean_handler(AsyncConnectionRef connection)
+    : conn(connection) {}
+
+  void do_request(uint64_t /*id*/) override {
+    conn->cleanup();
+    // self-owned: nothing else releases this object
+    delete this;
+  }
+
+ private:
+  AsyncConnectionRef conn;
+};
+
+// Event callback that hands a fired event id to AsyncConnection::tick() on
+// the connection it references.
+class C_tick_wakeup : public EventCallback {
+ public:
+  explicit C_tick_wakeup(AsyncConnectionRef connection)
+    : conn(connection) {}
+
+  void do_request(uint64_t event_id) override {
+    conn->tick(event_id);
+  }
+
+ private:
+  AsyncConnectionRef conn;
+};
+
+
+// Builds a connection that belongs to messenger `m` and is serviced by worker
+// `w`.  Allocates the per-connection event callbacks and the receive prefetch
+// buffer, then picks the protocol implementation: the loopback protocol when
+// `local` is set, otherwise ProtocolV2 when `m2` is set, otherwise ProtocolV1.
+AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q,
+                                 Worker *w, bool m2, bool local)
+  : Connection(cct, m),
+    delay_state(nullptr),
+    async_msgr(m),
+    conn_id(q->get_id()),
+    logger(w->get_perf_counter()),
+    state(STATE_NONE),
+    port(-1),
+    dispatch_queue(q),
+    recv_buf(nullptr),
+    recv_max_prefetch(std::max<int64_t>(msgr->cct->_conf->ms_tcp_prefetch_max_size, TCP_PREFETCH_MIN_SIZE)),
+    recv_start(0),
+    recv_end(0),
+    last_active(ceph::coarse_mono_clock::now()),
+    connect_timeout_us(cct->_conf->ms_connection_ready_timeout*1000*1000),
+    inactive_timeout_us(cct->_conf->ms_connection_idle_timeout*1000*1000),
+    msgr2(m2),
+    state_offset(0),
+    worker(w),
+    center(&w->center),
+    read_buffer(nullptr)
+{
+#ifdef UNIT_TESTS_BUILT
+  this->interceptor = m->interceptor;
+#endif
+
+  // one callback object per kind of event this connection registers
+  read_handler = new C_handle_read(this);
+  write_handler = new C_handle_write(this);
+  write_callback_handler = new C_handle_write_callback(this);
+  wakeup_handler = new C_time_wakeup(this);
+  tick_handler = new C_tick_wakeup(this);
+
+  // twice recv_max_prefetch, see "read_until"
+  recv_buf = new char[2*recv_max_prefetch];
+
+  Protocol *proto = nullptr;
+  if (local) {
+    proto = new LoopbackProtocolV1(this);
+  } else if (m2) {
+    proto = new ProtocolV2(this);
+  } else {
+    proto = new ProtocolV1(this);
+  }
+  protocol.reset(proto);
+
+  logger->inc(l_msgr_created_connections);
+}
+
+// Releases the receive buffer.  The delay queue must already have been torn
+// down by the time the connection is destroyed.
+AsyncConnection::~AsyncConnection()
+{
+  // delete[] on a null pointer is a no-op, so no guard is required
+  delete[] recv_buf;
+  ceph_assert(!delay_state);
+}
+
+// Returns the connection mode reported by the protocol object.
+int AsyncConnection::get_con_mode() const
+{
+  const int mode = protocol->get_con_mode();
+  return mode;
+}
+
+// Creates the DelayedDelivery queue for this connection, unless one already
+// exists, when the "ms_inject_delay_type" option value contains the entity
+// type name of the peer.
+void AsyncConnection::maybe_start_delay_thread()
+{
+  if (delay_state) {
+    return;
+  }
+
+  async_msgr->cct->_conf.with_val<std::string>(
+    "ms_inject_delay_type",
+    [this](const string& delay_types) {
+      const bool peer_selected =
+        delay_types.find(ceph_entity_type_name(peer_type)) != string::npos;
+      if (!peer_selected) {
+        return;
+      }
+      ldout(msgr->cct, 1) << __func__ << " setting up a delay queue"
+                          << dendl;
+      delay_state = new DelayedDelivery(async_msgr, center, dispatch_queue,
+                                        conn_id);
+    });
+}
+
+
+// Tries to read `len` bytes into `buffer` via read_until().
+//
+// Returns read_until()'s result.  When that result is positive (bytes are
+// still missing) the length, buffer and callback are remembered so that
+// process() can resume the read later; otherwise nothing is stored.
+ssize_t AsyncConnection::read(unsigned len, char *buffer,
+                              std::function<void(char *, ssize_t)> callback) {
+  const char *phase = pendingReadLen ? " continue" : " start";
+  ldout(async_msgr->cct, 20) << __func__ << phase << " len=" << len << dendl;
+
+  const ssize_t remaining = read_until(len, buffer);
+  if (remaining <= 0) {
+    return remaining;
+  }
+
+  readCallback = callback;
+  pendingReadLen = len;
+  read_buffer = buffer;
+  return remaining;
+}
+
+// Fill `p` with `len` bytes from the socket.
+//
+// This function may be called several times for the same request until the
+// buffer is complete, so the caller must pass the same `p` and `len` on every
+// call; progress between calls is kept in "state_offset".
+//
+// To reduce the overhead of small reads it reads ahead: "recv_buf" holds
+// bytes that were read from the socket but not yet handed out, delimited by
+// "recv_start" and "recv_end".
+//
+// Returns the number of bytes still missing; 0 means the buffer is complete.
+// Returns < 0 on error.
+ssize_t AsyncConnection::read_until(unsigned len, char *p)
+{
+  ldout(async_msgr->cct, 25) << __func__ << " len is " << len << " state_offset is "
+                             << state_offset << dendl;
+
+  // fault injection for testing: randomly shut the socket down
+  if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) {
+    if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl;
+      cs.shutdown();
+    }
+  }
+
+  ssize_t r = 0;
+  // bytes still missing from p
+  uint64_t left = len - state_offset;
+  // first serve the request from bytes already sitting in the prefetch buffer
+  if (recv_end > recv_start) {
+    uint64_t to_read = std::min<uint64_t>(recv_end - recv_start, left);
+    memcpy(p, recv_buf+recv_start, to_read);
+    recv_start += to_read;
+    left -= to_read;
+    ldout(async_msgr->cct, 25) << __func__ << " got " << to_read << " in buffer "
+                               << " left is " << left << " buffer still has "
+                               << recv_end - recv_start << dendl;
+    if (left == 0) {
+      return 0;
+    }
+    state_offset += to_read;
+  }
+
+  recv_end = recv_start = 0;
+  /* nothing left in the prefetch buffer */
+  if (left > (uint64_t)recv_max_prefetch) {
+    /* this was a large read, we don't prefetch for these */
+    // read straight into p until it is full or read_bulk() returns 0
+    do {
+      r = read_bulk(p+state_offset, left);
+      ldout(async_msgr->cct, 25) << __func__ << " read_bulk left is " << left << " got " << r << dendl;
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl;
+        return -1;
+      } else if (r == static_cast<int>(left)) {
+        state_offset = 0;
+        return 0;
+      }
+      state_offset += r;
+      left -= r;
+    } while (r > 0);
+  } else {
+    // small read: read up to recv_max_prefetch bytes into recv_buf per
+    // attempt and copy out only what was asked for
+    do {
+      r = read_bulk(recv_buf+recv_end, recv_max_prefetch);
+      ldout(async_msgr->cct, 25) << __func__ << " read_bulk recv_end is " << recv_end
+                                 << " left is " << left << " got " << r << dendl;
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl;
+        return -1;
+      }
+      recv_end += r;
+      if (r >= static_cast<int>(left)) {
+        // request satisfied: hand out the missing bytes; anything read
+        // beyond them stays in recv_buf (recv_start..recv_end)
+        recv_start = len - state_offset;
+        memcpy(p+state_offset, recv_buf, recv_start);
+        state_offset = 0;
+        return 0;
+      }
+      left -= r;
+    } while (r > 0);
+    // not enough data yet: move what was read into p and empty the
+    // prefetch buffer
+    memcpy(p+state_offset, recv_buf, recv_end-recv_start);
+    state_offset += (recv_end - recv_start);
+    recv_end = recv_start = 0;
+  }
+  ldout(async_msgr->cct, 25) << __func__ << " need len " << len << " remaining "
+                             << len - state_offset << " bytes" << dendl;
+  return len - state_offset;
+}
+
+/* Reads up to `len` bytes from the connected socket into `buf`.
+ *
+ * Returns the number of bytes read (> 0), or 0 when the socket reports
+ * -EAGAIN.  Returns -1 when the read fails with any other error or when the
+ * peer has closed the connection (the socket read returned 0); the socket
+ * should then be closed.  A read that fails with -EINTR is retried. */
+ssize_t AsyncConnection::read_bulk(char *buf, unsigned len)
+{
+  ssize_t nread;
+  do {
+    nread = cs.read(buf, len);
+  } while (nread == -EINTR);
+
+  if (nread == -EAGAIN) {
+    return 0;
+  }
+  if (nread < 0) {
+    // nread holds a negated errno value.  strerror() does not understand
+    // negative values, so format it with cpp_strerror(), which the send path
+    // in this file already uses for the same kind of value.
+    ldout(async_msgr->cct, 1) << __func__ << " reading from fd=" << cs.fd()
+                              << " : " << cpp_strerror(nread) << dendl;
+    return -1;
+  }
+  if (nread == 0) {
+    ldout(async_msgr->cct, 1) << __func__ << " peer close file descriptor "
+                              << cs.fd() << dendl;
+    return -1;
+  }
+  return nread;
+}
+
+// Appends `bl` to the outgoing buffer (taking over its contents) and tries to
+// send it, all under write_lock.
+//
+// Returns _try_send()'s result.  `callback` is remembered only when that
+// result is positive, i.e. when data is still queued.
+ssize_t AsyncConnection::write(bufferlist &bl,
+                               std::function<void(ssize_t)> callback,
+                               bool more) {
+  std::lock_guard<std::mutex> guard(write_lock);
+
+  outgoing_bl.claim_append(bl);
+  const ssize_t pending = _try_send(more);
+  if (pending > 0) {
+    writeCallback = callback;
+  }
+  return pending;
+}
+
+// Pushes the outgoing buffer to the socket and keeps the writable-event
+// registration in step with whether data is still queued.
+//
+// Returns the number of bytes still queued in outgoing_bl, or the negative
+// value returned by the socket send on error.
+ssize_t AsyncConnection::_try_send(bool more)
+{
+  const auto& conf = async_msgr->cct->_conf;
+  if (conf->ms_inject_socket_failures && cs &&
+      rand() % conf->ms_inject_socket_failures == 0) {
+    ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl;
+    cs.shutdown();
+  }
+
+  ceph_assert(center->in_thread());
+  ldout(async_msgr->cct, 25) << __func__ << " cs.send " << outgoing_bl.length()
+                             << " bytes" << dendl;
+
+  const ssize_t sent = cs.send(outgoing_bl, more);
+  if (sent < 0) {
+    ldout(async_msgr->cct, 1) << __func__ << " send error: " << cpp_strerror(sent) << dendl;
+    return sent;
+  }
+
+  ldout(async_msgr->cct, 10) << __func__ << " sent bytes " << sent
+                             << " remaining bytes " << outgoing_bl.length() << dendl;
+
+  const bool still_queued = is_queued();
+  if (still_queued && !open_write) {
+    // data left over: ask to be told when the socket becomes writable
+    center->create_file_event(cs.fd(), EVENT_WRITABLE, write_handler);
+    open_write = true;
+  } else if (!still_queued && open_write) {
+    // everything flushed: stop listening for writability and, if a write
+    // callback is stored, schedule the write-callback handler
+    center->delete_file_event(cs.fd(), EVENT_WRITABLE);
+    open_write = false;
+    if (writeCallback) {
+      center->dispatch_event_external(write_callback_handler);
+    }
+  }
+
+  return outgoing_bl.length();
+}
+
+// Sleeps for the duration configured in ms_inject_internal_delays; does
+// nothing when that option is zero.
+void AsyncConnection::inject_delay() {
+  if (!async_msgr->cct->_conf->ms_inject_internal_delays) {
+    return;
+  }
+
+  ldout(async_msgr->cct, 10) << __func__ << " sleep for " <<
+    async_msgr->cct->_conf->ms_inject_internal_delays << dendl;
+
+  utime_t pause;
+  pause.set_from_double(async_msgr->cct->_conf->ms_inject_internal_delays);
+  pause.sleep();
+}
+
+// Entry point of the read handler: advances the connection state machine.
+//
+// Runs with "lock" held.  Depending on "state" it either returns immediately
+// (NONE / CLOSED), drives the non-blocking connect (CONNECTING /
+// CONNECTING_RE), registers the accepted socket for reads (ACCEPTING), or
+// resumes a pending partial read (CONNECTION_ESTABLISHED).  Paths that leave
+// the switch with `break` end by calling protocol->read_event().
+void AsyncConnection::process() {
+  std::lock_guard<std::mutex> l(lock);
+  last_active = ceph::coarse_mono_clock::now();
+  recv_start_time = ceph::mono_clock::now();
+
+  ldout(async_msgr->cct, 20) << __func__ << dendl;
+
+  switch (state) {
+    case STATE_NONE: {
+      ldout(async_msgr->cct, 20) << __func__ << " enter none state" << dendl;
+      return;
+    }
+    case STATE_CLOSED: {
+      ldout(async_msgr->cct, 20) << __func__ << " socket closed" << dendl;
+      return;
+    }
+    case STATE_CONNECTING: {
+      ceph_assert(!policy.server);
+
+      // clear timer (if any) since we are connecting/re-connecting
+      if (last_tick_id) {
+        center->delete_time_event(last_tick_id);
+        last_tick_id = 0;
+      }
+
+      // drop any socket left over from a previous attempt
+      if (cs) {
+        center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE);
+        cs.close();
+      }
+
+      SocketOptions opts;
+      opts.priority = async_msgr->get_socket_priority();
+      opts.connect_bind_addr = msgr->get_myaddrs().front();
+      ssize_t r = worker->connect(target_addr, opts, &cs);
+      if (r < 0) {
+        protocol->fault();
+        return;
+      }
+
+      center->create_file_event(cs.fd(), EVENT_READABLE, read_handler);
+      state = STATE_CONNECTING_RE;
+      // intentional fall through: check the connect result right away
+    }
+    case STATE_CONNECTING_RE: {
+      // r < 0: connect failed, r == 0: still in progress, r > 0: connected
+      ssize_t r = cs.is_connected();
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " reconnect failed to "
+                                  << target_addr << dendl;
+        if (r == -ECONNREFUSED) {
+          ldout(async_msgr->cct, 2)
+              << __func__ << " connection refused!" << dendl;
+          dispatch_queue->queue_refused(this);
+        }
+        protocol->fault();
+        return;
+      } else if (r == 0) {
+        ldout(async_msgr->cct, 10)
+            << __func__ << " nonblock connect inprogress" << dendl;
+        // some network stacks signal connect completion through a writable
+        // event; route it to the read handler so process() runs again
+        if (async_msgr->get_stack()->nonblock_connect_need_writable_event()) {
+          center->create_file_event(cs.fd(), EVENT_WRITABLE,
+                                    read_handler);
+        }
+        logger->tinc(l_msgr_running_recv_time,
+               ceph::mono_clock::now() - recv_start_time);
+        return;
+      }
+
+      center->delete_file_event(cs.fd(), EVENT_WRITABLE);
+      ldout(async_msgr->cct, 10)
+          << __func__ << " connect successfully, ready to send banner" << dendl;
+      state = STATE_CONNECTION_ESTABLISHED;
+      ceph_assert(last_tick_id == 0);
+      // exclude TCP nonblock connect time
+      last_connect_started = ceph::coarse_mono_clock::now();
+      // arm the connect timeout; tick() faults the connection if it expires
+      // before the protocol reports itself connected
+      last_tick_id = center->create_time_event(
+        connect_timeout_us, tick_handler);
+      break;
+    }
+
+    case STATE_ACCEPTING: {
+      center->create_file_event(cs.fd(), EVENT_READABLE, read_handler);
+      state = STATE_CONNECTION_ESTABLISHED;
+
+      break;
+    }
+
+    case STATE_CONNECTION_ESTABLISHED: {
+      // a previous read() could not be completed: try to finish it before
+      // anything else
+      if (pendingReadLen) {
+        ssize_t r = read(*pendingReadLen, read_buffer, readCallback);
+        if (r <= 0) { // read all bytes, or an error occurred
+          // clear the pending state before invoking the callback
+          pendingReadLen.reset();
+          char *buf_tmp = read_buffer;
+          read_buffer = nullptr;
+          readCallback(buf_tmp, r);
+        }
+        return;
+      }
+      break;
+    }
+  }
+
+  protocol->read_event();
+
+  logger->tinc(l_msgr_running_recv_time,
+               ceph::mono_clock::now() - recv_start_time);
+}
+
+// Reports whether the protocol object considers the connection established.
+bool AsyncConnection::is_connected()
+{
+  const bool connected = protocol->is_connected();
+  return connected;
+}
+
+// Records the peer's type and addresses, takes the messenger policy for that
+// type, stores `target` as the address to connect to and starts connecting.
+// Runs under "lock".
+void AsyncConnection::connect(const entity_addrvec_t &addrs, int type,
+                              entity_addr_t &target)
+{
+  std::lock_guard<std::mutex> guard(lock);
+
+  set_peer_type(type);
+  set_peer_addrs(addrs);
+  policy = msgr->get_policy(type);
+  target_addr = target;
+
+  _connect();
+}
+
+// Switches to STATE_CONNECTING, notifies the protocol and queues the read
+// handler on the event center.
+void AsyncConnection::_connect()
+{
+  ldout(async_msgr->cct, 10) << __func__ << dendl;
+
+  state = STATE_CONNECTING;
+  protocol->connect();
+
+  // Reschedule the connection instead of processing it here, in order to
+  // avoid a lock dependency: this may be called by an external thread
+  // (send_message).
+  center->dispatch_event_external(read_handler);
+}
+
+// Adopts an accepted socket: stores it together with the listening and peer
+// addresses, switches to STATE_ACCEPTING, notifies the protocol and queues
+// the read handler on the event center.
+void AsyncConnection::accept(ConnectedSocket socket,
+                             const entity_addr_t &listen_addr,
+                             const entity_addr_t &peer_addr)
+{
+  ldout(async_msgr->cct, 10) << __func__ << " sd=" << socket.fd()
+                             << " listen_addr " << listen_addr
+                             << " peer_addr " << peer_addr << dendl;
+  ceph_assert(socket.fd() >= 0);
+
+  std::lock_guard<std::mutex> guard(lock);
+
+  cs = std::move(socket);
+  socket_addr = listen_addr;
+  // until we know better
+  target_addr = peer_addr;
+  state = STATE_ACCEPTING;
+  protocol->accept();
+
+  // Reschedule the connection in order to avoid a lock dependency.
+  center->dispatch_event_external(read_handler);
+}
+
+// Queues message `m` for delivery on this connection.
+//
+// Fills in a default priority when none is set, stamps the source name and
+// the connection on the message, then either delivers it locally (when the
+// peer addresses equal our own, i.e. a loopback connection) or hands it to
+// the protocol.  A message for a loopback connection that is not connected
+// is dropped.  Always returns 0.
+int AsyncConnection::send_message(Message *m)
+{
+  FUNCTRACE(async_msgr->cct);
+  lgeneric_subdout(async_msgr->cct, ms, 1)
+    << "-- " << async_msgr->get_myaddrs() << " --> "
+    << get_peer_addrs() << " -- "
+    << *m << " -- " << m << " con "
+    << this
+    << dendl;
+
+  // optimistic think it's ok to encode(actually may broken now)
+  if (!m->get_priority()) {
+    m->set_priority(async_msgr->get_default_send_priority());
+  }
+
+  m->get_header().src = async_msgr->get_myname();
+  m->set_connection(this);
+
+  if (m->get_type() == CEPH_MSG_OSD_OP) {
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_BEGIN", true);
+  } else if (m->get_type() == CEPH_MSG_OSD_OPREPLY) {
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_BEGIN", true);
+  }
+
+  const bool loopback = (async_msgr->get_myaddrs() == get_peer_addrs());
+  if (!loopback) {
+    // local messages are deliberately not counted: they are too lightweight
+    // and would distort the numbers users look at
+    logger->inc(l_msgr_send_messages);
+    protocol->send_message(m);
+    return 0;
+  }
+
+  ldout(async_msgr->cct, 20) << __func__ << " " << *m << " local" << dendl;
+  std::lock_guard<std::mutex> guard(write_lock);
+  if (!protocol->is_connected()) {
+    ldout(async_msgr->cct, 10) << __func__ << " loopback connection closed."
+                               << " Drop message " << m << dendl;
+    m->put();
+    return 0;
+  }
+  dispatch_queue->local_delivery(m, m->get_priority());
+  return 0;
+}
+
+// Picks the first non-legacy address in `av` whose address family matches
+// socket_addr.  It may be an any: or v2: address; a v1 address is never
+// returned.  Returns a default-constructed address when nothing matches.
+entity_addr_t AsyncConnection::_infer_target_addr(const entity_addrvec_t& av)
+{
+  for (auto& candidate : av.v) {
+    const bool usable = !candidate.is_legacy() &&
+      candidate.get_family() == socket_addr.get_family();
+    if (usable) {
+      ldout(async_msgr->cct,10) << __func__ << " " << av << " -> " << candidate << dendl;
+      return candidate;
+    }
+  }
+
+  ldout(async_msgr->cct,10) << __func__ << " " << av << " -> nothing to match "
+                            << socket_addr << dendl;
+  return {};
+}
+
+// Resets the transport after a failure: shuts the socket down, flushes any
+// delayed messages, and discards buffered receive and send data.
+void AsyncConnection::fault()
+{
+  shutdown_socket();
+  open_write = false;
+
+  // queue delayed items immediately
+  if (delay_state) {
+    delay_state->flush();
+  }
+
+  // forget partially received data and anything still waiting to be sent
+  recv_end = 0;
+  recv_start = 0;
+  state_offset = 0;
+  outgoing_bl.clear();
+}
+
+// Closes the connection: drops the stored write callback, discards queued
+// dispatch work for this connection, unregisters from the messenger,
+// releases the worker, marks the state CLOSED and schedules the final
+// cleanup on the event center.
+void AsyncConnection::_stop()
+{
+  writeCallback.reset();
+  dispatch_queue->discard_queue(conn_id);
+  async_msgr->unregister_conn(this);
+  worker->release_worker();
+
+  state = STATE_CLOSED;
+  open_write = false;
+  state_offset = 0;
+
+  // Cleanup is queued as an event rather than run directly, so that events
+  // already in the queue get processed first.
+  EventCallbackRef cleaner(new C_clean_handler(this));
+  center->dispatch_event_external(cleaner);
+}
+
+// True when the outgoing buffer still holds unsent bytes.
+bool AsyncConnection::is_queued() const
+{
+  return outgoing_bl.length() != 0;
+}
+
+// Cancels every time event this connection registered (including the tick
+// timer) and, if a socket is open, removes its file events, shuts it down
+// and closes it.
+void AsyncConnection::shutdown_socket()
+{
+  for (auto &&event_id : register_time_events) {
+    center->delete_time_event(event_id);
+  }
+  register_time_events.clear();
+
+  if (last_tick_id) {
+    center->delete_time_event(last_tick_id);
+    last_tick_id = 0;
+  }
+
+  if (!cs) {
+    return;
+  }
+  center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE);
+  cs.shutdown();
+  cs.close();
+}
+
+// Time-event handler for one delayed message: forgets the fired event id,
+// then, unless dispatch is stopped or the queue is empty, takes the oldest
+// queued message and dispatches it (fast dispatch when the messenger allows
+// it, otherwise through the dispatch queue).
+void AsyncConnection::DelayedDelivery::do_request(uint64_t id)
+{
+  Message *next = nullptr;
+  {
+    std::lock_guard<std::mutex> guard(delay_lock);
+    register_time_events.erase(id);
+    if (stop_dispatch || delay_queue.empty()) {
+      return;
+    }
+    next = delay_queue.front();
+    delay_queue.pop_front();
+  }
+
+  // dispatch outside of delay_lock
+  if (!msgr->ms_can_fast_dispatch(next)) {
+    dispatch_queue->enqueue(next, next->get_priority(), conn_id);
+    return;
+  }
+  dispatch_queue->fast_dispatch(next);
+}
+
+// Drops every delayed message without delivering it.  Dispatch is paused
+// while a task submitted to the event center releases each message's
+// dispatch-throttle budget and reference, cancels the outstanding time
+// events and then re-enables dispatch.
+void AsyncConnection::DelayedDelivery::discard()
+{
+  stop_dispatch = true;
+  center->submit_to(
+    center->get_id(),
+    [this]() mutable {
+      std::lock_guard<std::mutex> guard(delay_lock);
+      for (; !delay_queue.empty(); delay_queue.pop_front()) {
+        Message *dropped = delay_queue.front();
+        dispatch_queue->dispatch_throttle_release(
+          dropped->get_dispatch_throttle_size());
+        dropped->put();
+      }
+      for (const uint64_t event_id : register_time_events) {
+        center->delete_time_event(event_id);
+      }
+      register_time_events.clear();
+      stop_dispatch = false;
+    },
+    true);
+}
+
+// Delivers every delayed message immediately.  Dispatch is paused while a
+// task submitted to the event center dispatches each queued message (fast
+// dispatch when the messenger allows it, otherwise through the dispatch
+// queue), cancels the outstanding time events and then re-enables dispatch.
+void AsyncConnection::DelayedDelivery::flush()
+{
+  stop_dispatch = true;
+  center->submit_to(
+    center->get_id(),
+    [this]() mutable {
+      std::lock_guard<std::mutex> guard(delay_lock);
+      for (; !delay_queue.empty(); delay_queue.pop_front()) {
+        Message *pending = delay_queue.front();
+        if (!msgr->ms_can_fast_dispatch(pending)) {
+          dispatch_queue->enqueue(pending, pending->get_priority(), conn_id);
+        } else {
+          dispatch_queue->fast_dispatch(pending);
+        }
+      }
+      for (const uint64_t event_id : register_time_events) {
+        center->delete_time_event(event_id);
+      }
+      register_time_events.clear();
+      stop_dispatch = false;
+    },
+    true);
+}
+
+// Forwards the keepalive request to the protocol object.
+void AsyncConnection::send_keepalive() {
+  protocol->send_keepalive();
+}
+
+// Stops the protocol while holding "lock".
+void AsyncConnection::mark_down() {
+  ldout(async_msgr->cct, 1) << __func__ << dendl;
+
+  std::lock_guard<std::mutex> guard(lock);
+  protocol->stop();
+}
+
+// Write-event entry point: logs and lets the protocol handle the write event.
+void AsyncConnection::handle_write() {
+  ldout(async_msgr->cct, 10) << __func__ << dendl;
+
+  protocol->write_event();
+}
+
+// Runs the stored write callback, if any, with argument 0.
+//
+// Holds "lock" for the whole call.  write_lock is held only while the
+// callback is taken out of writeCallback; it is released before the callback
+// is invoked.
+void AsyncConnection::handle_write_callback() {
+  std::lock_guard<std::mutex> l(lock);
+  last_active = ceph::coarse_mono_clock::now();
+  recv_start_time = ceph::mono_clock::now();
+
+  // RAII guard: write_lock is released on every exit path, including an
+  // exception thrown while copying the std::function below
+  std::unique_lock<std::mutex> wl(write_lock);
+  if (!writeCallback) {
+    return;
+  }
+  auto callback = *writeCallback;
+  writeCallback.reset();
+
+  // do not hold write_lock while running the callback
+  wl.unlock();
+  callback(0);
+}
+
+// Stops the protocol and, when `queue_reset` is set and the connection was
+// not already closed, queues a reset notification for it.
+//
+// "lock" is held only while the state is inspected and the protocol is
+// stopped; the reset is queued after it has been released.
+void AsyncConnection::stop(bool queue_reset) {
+  bool need_queue_reset = false;
+  {
+    // scoped guard so that the lock is released even if stopping throws
+    std::lock_guard<std::mutex> l(lock);
+    need_queue_reset = (state != STATE_CLOSED) && queue_reset;
+    protocol->stop();
+  }
+  if (need_queue_reset) {
+    dispatch_queue->queue_reset(this);
+  }
+}
+
+// Final teardown: shuts the socket down, then frees the event callbacks and
+// the delay queue owned by this connection.
+void AsyncConnection::cleanup()
+{
+  shutdown_socket();
+
+  for (EventCallbackRef handler : {read_handler, write_handler,
+                                   write_callback_handler, wakeup_handler,
+                                   tick_handler}) {
+    delete handler;
+  }
+
+  // deleting a null pointer is a no-op, so no guard is required
+  delete delay_state;
+  delay_state = nullptr;
+}
+
+// Time-event handler: forgets the fired event id (under "lock") and then
+// runs process(), which takes "lock" itself.
+void AsyncConnection::wakeup_from(uint64_t id)
+{
+  {
+    std::lock_guard<std::mutex> guard(lock);
+    register_time_events.erase(id);
+  }
+  process();
+}
+
+// Timer handler that enforces the connect and idle timeouts.
+//
+// While the connection is not yet connected, it faults once
+// connect_timeout_us has elapsed since last_connect_started.  Once connected,
+// it faults when the connection has been idle (since last_active) for longer
+// than inactive_timeout_us.  Otherwise the timer is re-armed with the
+// timeout that applies.
+void AsyncConnection::tick(uint64_t id)
+{
+  using std::chrono::duration_cast;
+  using std::chrono::microseconds;
+
+  const auto now = ceph::coarse_mono_clock::now();
+  ldout(async_msgr->cct, 20) << __func__ << " last_id=" << last_tick_id
+                             << " last_active=" << last_active << dendl;
+
+  std::lock_guard<std::mutex> guard(lock);
+  last_tick_id = 0;
+
+  if (is_connected()) {
+    const auto idle_period =
+      duration_cast<microseconds>(now - last_active).count();
+    if (inactive_timeout_us < static_cast<uint64_t>(idle_period)) {
+      ldout(async_msgr->cct, 1) << __func__ << " idle (" << idle_period
+                                << ") for more than " << inactive_timeout_us
+                                << " us, fault."
+                                << dendl;
+      protocol->fault();
+      return;
+    }
+    last_tick_id = center->create_time_event(inactive_timeout_us, tick_handler);
+    return;
+  }
+
+  const auto connecting_for =
+    duration_cast<microseconds>(now - last_connect_started).count();
+  if (connect_timeout_us <= static_cast<uint64_t>(connecting_for)) {
+    ldout(async_msgr->cct, 1) << __func__ << " see no progress in more than "
+                              << connect_timeout_us
+                              << " us during connecting, fault."
+                              << dendl;
+    protocol->fault();
+    return;
+  }
+  last_tick_id = center->create_time_event(connect_timeout_us, tick_handler);
+}
diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h
new file mode 100644
index 00000000..0c2512c8
--- /dev/null
+++ b/src/msg/async/AsyncConnection.h
@@ -0,0 +1,238 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNCCONNECTION_H
+#define CEPH_MSG_ASYNCCONNECTION_H
+
+#include <atomic>
+#include <pthread.h>
+#include <climits>
+#include <list>
+#include <mutex>
+#include <map>
+#include <functional>
+#include <optional>
+
+#include "auth/AuthSessionHandler.h"
+#include "common/ceph_time.h"
+#include "common/perf_counters.h"
+#include "include/buffer.h"
+#include "msg/Connection.h"
+#include "msg/Messenger.h"
+
+#include "Event.h"
+#include "Stack.h"
+
+class AsyncMessenger;
+class DispatchQueue;
+class Worker;
+class Protocol;
+
+static const int ASYNC_IOV_MAX = (IOV_MAX >= 1024 ? IOV_MAX / 4 : IOV_MAX);
+
+/*
+ * AsyncConnection maintains a logical session between two endpoints. In other
+ * words, a pair of addresses identifies exactly one AsyncConnection.
+ * AsyncConnection handles network faults and read/write transactions. If the
+ * underlying file descriptor breaks, AsyncConnection keeps the message queue
+ * and sequence and tries to reconnect to the peer endpoint.
+ */
+class AsyncConnection : public Connection {
+
+  // --- socket I/O helpers ---
+  // read() tries to fill `buffer` with `len` bytes and remembers the request
+  // (and `callback`) when bytes are still missing; read_until() does the
+  // actual buffering; read_bulk() is a single socket read.
+  ssize_t read(unsigned len, char *buffer,
+               std::function<void(char *, ssize_t)> callback);
+  ssize_t read_until(unsigned needed, char *p);
+  ssize_t read_bulk(char *buf, unsigned len);
+
+  // write() appends `bl` to the outgoing buffer and calls _try_send(), which
+  // pushes the outgoing buffer to the socket.
+  ssize_t write(bufferlist &bl, std::function<void(ssize_t)> callback,
+                bool more=false);
+  ssize_t _try_send(bool more=false);
+
+  // --- state transitions ---
+  void _connect();
+  void _stop();
+  void fault();
+  void inject_delay();
+
+  // true when the outgoing buffer still holds unsent bytes
+  bool is_queued() const;
+  void shutdown_socket();
+
+   /**
+   * The DelayedDelivery is for injecting delays into Message delivery off
+   * the socket. It is only enabled if delays are requested, and if they
+   * are then it pulls Messages off the DelayQueue and puts them into the
+   * AsyncMessenger event queue.
+   */
+  class DelayedDelivery : public EventCallback {
+    std::set<uint64_t> register_time_events; // need to delete it if stop
+    std::deque<Message*> delay_queue;        // delayed messages, oldest first
+    std::mutex delay_lock;
+    AsyncMessenger *msgr;
+    EventCenter *center;
+    DispatchQueue *dispatch_queue;
+    uint64_t conn_id;
+    // set while discard()/flush() are in progress; do_request() does not
+    // dispatch while it is true
+    std::atomic_bool stop_dispatch;
+
+   public:
+    explicit DelayedDelivery(AsyncMessenger *omsgr, EventCenter *c,
+                             DispatchQueue *q, uint64_t cid)
+      : msgr(omsgr), center(c), dispatch_queue(q), conn_id(cid),
+        stop_dispatch(false) { }
+    // both containers must have been drained (see discard()/flush()) before
+    // the object is destroyed
+    ~DelayedDelivery() override {
+      ceph_assert(register_time_events.empty());
+      ceph_assert(delay_queue.empty());
+    }
+    void set_center(EventCenter *c) { center = c; }
+    // time-event handler: dispatches the oldest queued message
+    void do_request(uint64_t id) override;
+    // queues `m` and registers a time event `delay_period` seconds from now
+    // (the period is converted to microseconds)
+    void queue(double delay_period, Message *m) {
+      std::lock_guard<std::mutex> l(delay_lock);
+      delay_queue.push_back(m);
+      register_time_events.insert(center->create_time_event(delay_period*1000000, this));
+    }
+    // drops all queued messages without delivering them
+    void discard();
+    // NOTE(review): reads delay_queue and register_time_events without
+    // taking delay_lock — confirm callers tolerate that
+    bool ready() const { return !stop_dispatch && delay_queue.empty() && register_time_events.empty(); }
+    // delivers all queued messages immediately
+    void flush();
+  } *delay_state;
+
+ public:
+  AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q,
+		  Worker *w, bool is_msgr2, bool local);
+  ~AsyncConnection() override;
+  // creates delay_state when delay injection is configured for the peer type
+  void maybe_start_delay_thread();
+
+  // writes the per-connection log prefix to *_dout
+  ostream& _conn_prefix(std::ostream *_dout);
+
+  bool is_connected() override;
+
+  // Only call right after the AsyncConnection has been constructed
+  void connect(const entity_addrvec_t& addrs, int type, entity_addr_t& target);
+
+  // Only call right after the AsyncConnection has been constructed
+  void accept(ConnectedSocket socket,
+	      const entity_addr_t &listen_addr,
+	      const entity_addr_t &peer_addr);
+  int send_message(Message *m) override;
+
+  void send_keepalive() override;
+  void mark_down() override;
+  // marks the connection's policy as lossy
+  void mark_disposable() override {
+    std::lock_guard<std::mutex> l(lock);
+    policy.lossy = true;
+  }
+
+  entity_addr_t get_peer_socket_addr() const override {
+    return target_addr;
+  }
+
+  int get_con_mode() const override;
+
+ private:
+  // Connection states driven by process().
+  enum {
+    STATE_NONE,
+    STATE_CONNECTING,
+    STATE_CONNECTING_RE,
+    STATE_ACCEPTING,
+    STATE_CONNECTION_ESTABLISHED,
+    STATE_CLOSED
+  };
+
+  // lower bound applied to the configured prefetch size
+  static const uint32_t TCP_PREFETCH_MIN_SIZE;
+  // Maps a state value to its name; the table order must match the enum
+  // above.  `state` is used as an index without a range check.
+  static const char *get_state_name(int state) {
+      const char* const statenames[] = {"STATE_NONE",
+                                        "STATE_CONNECTING",
+                                        "STATE_CONNECTING_RE",
+                                        "STATE_ACCEPTING",
+                                        "STATE_CONNECTION_ESTABLISHED",
+                                        "STATE_CLOSED"};
+      return statenames[state];
+  }
+
+  AsyncMessenger *async_msgr;
+  uint64_t conn_id;
+  PerfCounters *logger;
+  int state;
+  ConnectedSocket cs;
+  int port;
+  Messenger::Policy policy;
+
+  DispatchQueue *dispatch_queue;
+
+  // lockfree, only used in own thread
+  bufferlist outgoing_bl;
+  // true while a writable file event is registered for the socket
+  bool open_write = false;
+
+  std::mutex write_lock;
+
+  std::mutex lock;
+  // event callbacks, allocated in the constructor and freed in cleanup()
+  EventCallbackRef read_handler;
+  EventCallbackRef write_handler;
+  EventCallbackRef write_callback_handler;
+  EventCallbackRef wakeup_handler;
+  EventCallbackRef tick_handler;
+  // prefetch buffer of 2*recv_max_prefetch bytes; unread data lives in
+  // [recv_start, recv_end)
+  char *recv_buf;
+  uint32_t recv_max_prefetch;
+  uint32_t recv_start;
+  uint32_t recv_end;
+  set<uint64_t> register_time_events; // need to delete it if stop
+  ceph::coarse_mono_clock::time_point last_connect_started;
+  ceph::coarse_mono_clock::time_point last_active;
+  ceph::mono_clock::time_point recv_start_time;
+  // id of the pending tick() time event, 0 when none is armed
+  uint64_t last_tick_id = 0;
+  const uint64_t connect_timeout_us;
+  const uint64_t inactive_timeout_us;
+
+  // This section holds temporary variables used by state transitions
+
+  // Accepting state
+  bool msgr2 = false;
+  entity_addr_t socket_addr;  ///< local socket addr
+  entity_addr_t target_addr;  ///< which of the peer_addrs we're connecting to (as client) or should reconnect to (as peer)
+
+  entity_addr_t _infer_target_addr(const entity_addrvec_t& av);
+
+  // used only by "read_until"
+  uint64_t state_offset;
+  Worker *worker;
+  EventCenter *center;
+
+  std::unique_ptr<Protocol> protocol;
+
+  // callback stored by write() while data is still queued
+  std::optional<std::function<void(ssize_t)>> writeCallback;
+  // state of an unfinished read(): callback, requested length and buffer
+  std::function<void(char *, ssize_t)> readCallback;
+  std::optional<unsigned> pendingReadLen;
+  char *read_buffer;
+
+ public:
+  // used by eventcallback
+  void handle_write();
+  void handle_write_callback();
+  void process();
+  void wakeup_from(uint64_t id);
+  void tick(uint64_t id);
+  // NOTE(review): no definition of local_deliver() is visible in this part
+  // of the sources — confirm it is still needed
+  void local_deliver();
+  void stop(bool queue_reset);
+  void cleanup();
+  PerfCounters *get_perf_counter() {
+    return logger;
+  }
+
+  friend class Protocol;
+  friend class ProtocolV1;
+  friend class ProtocolV2;
+}; /* AsyncConnection */
+
+typedef boost::intrusive_ptr<AsyncConnection> AsyncConnectionRef;
+
+#endif
diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc
new file mode 100644
index 00000000..2b1488c4
--- /dev/null
+++ b/src/msg/async/AsyncMessenger.cc
@@ -0,0 +1,949 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "acconfig.h"
+
+#include <iostream>
+#include <fstream>
+
+#include "AsyncMessenger.h"
+
+#include "common/config.h"
+#include "common/Timer.h"
+#include "common/errno.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "common/EventTrace.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+/// Log-line prefix for AsyncMessenger methods: "-- <my addrs> ".
+static ostream& _prefix(std::ostream *_dout, AsyncMessenger *m) {
+  std::ostream& out = *_dout;
+  out << "-- " << m->get_myaddrs() << " ";
+  return out;
+}
+
+/// Log-line prefix for Processor methods; the Processor argument only
+/// selects the overload and is not printed.
+static ostream& _prefix(std::ostream *_dout, Processor *p) {
+  std::ostream& out = *_dout;
+  out << " Processor -- ";
+  return out;
+}
+
+
+/*******************
+ * Processor
+ */
+
+/// Event callback that forwards every invocation to Processor::accept().
+class Processor::C_processor_accept : public EventCallback {
+ public:
+  explicit C_processor_accept(Processor *p)
+    : pro(p)
+  {}
+
+  // the event id is not used; any event simply triggers an accept pass
+  void do_request(uint64_t id) override { pro->accept(); }
+
+ private:
+  Processor *pro;
+};
+
+/// Builds a Processor for messenger `r` running on worker `w`; `c` is
+/// handed to the NetHandler and the accept callback is allocated here.
+Processor::Processor(AsyncMessenger *r, Worker *w, CephContext *c)
+  : msgr(r),
+    net(c),
+    worker(w),
+    listen_handler(new C_processor_accept(this))
+{
+}
+
+/**
+ * Bind and listen on every address in bind_addrs, one listen socket per
+ * address.
+ *
+ * @param bind_addrs   addresses to bind; an entry whose port is 0 gets a
+ *                     port from [ms_bind_port_min, ms_bind_port_max]
+ * @param avoid_ports  ports that are skipped while scanning that range
+ * @param bound_addrs  out: starts as a copy of bind_addrs and has the port
+ *                     of each entry updated as ports are tried
+ * @return 0 on success; otherwise a negative value (the last listen result,
+ *         or the initial -1 if no attempt was made) after aborting the
+ *         sockets that were bound for earlier addresses
+ */
+int Processor::bind(const entity_addrvec_t &bind_addrs,
+		    const set<int>& avoid_ports,
+		    entity_addrvec_t* bound_addrs)
+{
+  const auto& conf = msgr->cct->_conf;
+  // bind to socket(s)
+  ldout(msgr->cct, 10) << __func__ << " " << bind_addrs << dendl;
+
+  SocketOptions opts;
+  opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay;
+  opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf;
+
+  // one listen socket slot per requested address
+  listen_sockets.resize(bind_addrs.v.size());
+  *bound_addrs = bind_addrs;
+
+  for (unsigned k = 0; k < bind_addrs.v.size(); ++k) {
+    // work on the caller-visible copy so chosen ports are reported back
+    auto& listen_addr = bound_addrs->v[k];
+
+    /* bind to port */
+    int r = -1;
+
+    // up to ms_bind_retry_count attempts, sleeping between attempts
+    for (int i = 0; i < conf->ms_bind_retry_count; i++) {
+      if (i > 0) {
+	lderr(msgr->cct) << __func__ << " was unable to bind. Trying again in "
+			 << conf->ms_bind_retry_delay << " seconds " << dendl;
+	sleep(conf->ms_bind_retry_delay);
+      }
+
+      if (listen_addr.get_port()) {
+	// explicit port requested: a single listen attempt per retry round
+	// NOTE(review): the lambda captures locals by reference and r is read
+	// right after the call, so this presumably relies on submit_to running
+	// the callback before it returns when the last argument is false --
+	// confirm against EventCenter::submit_to.
+	worker->center.submit_to(
+	  worker->center.get_id(),
+	  [this, k, &listen_addr, &opts, &r]() {
+	    r = worker->listen(listen_addr, k, opts, &listen_sockets[k]);
+	  }, false);
+	if (r < 0) {
+	  lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr
+			   << ": " << cpp_strerror(r) << dendl;
+	  continue;
+	}
+      } else {
+	// try a range of ports
+	for (int port = msgr->cct->_conf->ms_bind_port_min;
+	     port <= msgr->cct->_conf->ms_bind_port_max;
+	     port++) {
+	  if (avoid_ports.count(port))
+	    continue;
+
+	  listen_addr.set_port(port);
+	  worker->center.submit_to(
+	    worker->center.get_id(),
+	    [this, k, &listen_addr, &opts, &r]() {
+	      r = worker->listen(listen_addr, k, opts, &listen_sockets[k]);
+	    }, false);
+	  // first port that listens successfully wins
+	  if (r == 0)
+	    break;
+	}
+	if (r < 0) {
+	  lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr
+			   << " on any port in range "
+			   << msgr->cct->_conf->ms_bind_port_min
+			   << "-" << msgr->cct->_conf->ms_bind_port_max << ": "
+			   << cpp_strerror(r) << dendl;
+	  listen_addr.set_port(0); // Clear port before retry, otherwise we shall fail again.
+	  continue;
+	}
+	ldout(msgr->cct, 10) << __func__ << " bound on random port "
+			     << listen_addr << dendl;
+      }
+      if (r == 0) {
+	break;
+      }
+    }
+
+    // It seems that binding completely failed, return with that exit status
+    if (r < 0) {
+      lderr(msgr->cct) << __func__ << " was unable to bind after "
+		       << conf->ms_bind_retry_count
+		       << " attempts: " << cpp_strerror(r) << dendl;
+      for (unsigned j = 0; j < k; ++j) {
+	// clean up previous bind
+	listen_sockets[j].abort_accept();
+      }
+      return r;
+    }
+  }
+
+  ldout(msgr->cct, 10) << __func__ << " bound to " << *bound_addrs << dendl;
+  return 0;
+}
+
+/// Registers every valid listen socket for readable events on the worker's
+/// event center, with listen_handler as the callback.
+void Processor::start()
+{
+  ldout(msgr->cct, 1) << __func__ << dendl;
+
+  // the registration itself is submitted to the worker's event center
+  worker->center.submit_to(worker->center.get_id(), [this]() {
+      for (auto& sock : listen_sockets) {
+        if (!sock)
+          continue;   // slot without a usable socket
+        worker->center.create_file_event(sock.fd(), EVENT_READABLE,
+                                         listen_handler);
+      }
+    }, false);
+}
+
+/**
+ * Accept pending connections on all listen sockets.
+ *
+ * For each listen socket this loops until accept() reports -EAGAIN and
+ * hands every accepted socket to the messenger via add_accept().  Errors
+ * other than -EINTR, -EAGAIN and -ECONNABORTED are counted; once the
+ * count since the last success exceeds ms_max_accept_failures the process
+ * aborts.
+ */
+void Processor::accept()
+{
+  SocketOptions opts;
+  opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay;
+  opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf;
+  opts.priority = msgr->get_socket_priority();
+
+  for (auto& listen_socket : listen_sockets) {
+    ldout(msgr->cct, 10) << __func__ << " listen_fd=" << listen_socket.fd()
+			 << dendl;
+    // failures since the last successful accept on this socket
+    unsigned accept_error_num = 0;
+
+    while (true) {
+      entity_addr_t addr;
+      ConnectedSocket cli_socket;
+      // pick the worker that will own the new connection: ask the stack for
+      // one unless it uses local listen tables, in which case this
+      // Processor's own worker is used and its reference count is bumped
+      Worker *w = worker;
+      if (!msgr->get_stack()->support_local_listen_table())
+	w = msgr->get_stack()->get_worker();
+      else
+	++w->references;
+      int r = listen_socket.accept(&cli_socket, opts, &addr, w);
+      if (r == 0) {
+	ldout(msgr->cct, 10) << __func__ << " accepted incoming on sd "
+			     << cli_socket.fd() << dendl;
+
+	// the local address is looked up by the slot this socket was bound to
+	msgr->add_accept(
+	  w, std::move(cli_socket),
+	  msgr->get_myaddrs().v[listen_socket.get_addr_slot()],
+	  addr);
+	accept_error_num = 0;
+	continue;
+      } else {
+	// nothing was accepted: give back the worker reference
+	// NOTE(review): on the get_worker() path this presumably balances an
+	// increment done inside get_worker() -- confirm.
+	--w->references;
+	if (r == -EINTR) {
+	  continue;
+	} else if (r == -EAGAIN) {
+	  // no more pending connections on this socket
+	  break;
+	} else if (r == -EMFILE || r == -ENFILE) {
+	  lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd()
+			   << " errno " << r << " " << cpp_strerror(r) << dendl;
+	  if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+	    lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+	    ceph_abort();
+	  }
+	  continue;
+	} else if (r == -ECONNABORTED) {
+	  // not counted as a failure; just try the next pending connection
+	  ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd()
+			      << " errno " << r << " " << cpp_strerror(r) << dendl;
+	  continue;
+	} else {
+	  lderr(msgr->cct) << __func__ << " no incoming connection?"
+			   << " errno " << r << " " << cpp_strerror(r) << dendl;
+	  if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+	    lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+	    ceph_abort();
+	  }
+	  continue;
+	}
+      }
+    }
+  }
+}
+
+/// Unregisters the readable event of every valid listen socket and aborts
+/// its accept; the work is submitted to the worker's event center.
+void Processor::stop()
+{
+  ldout(msgr->cct,10) << __func__ << dendl;
+
+  worker->center.submit_to(worker->center.get_id(), [this]() {
+      for (auto& sock : listen_sockets) {
+        if (!sock)
+          continue;   // nothing was bound in this slot
+        worker->center.delete_file_event(sock.fd(), EVENT_READABLE);
+        sock.abort_accept();
+      }
+    }, false);
+}
+
+
+/**
+ * Holder for the NetworkStack created for one transport type.
+ *
+ * The stack is created lazily by ready() and stopped when the holder is
+ * destroyed.
+ */
+struct StackSingleton {
+  CephContext *cct;
+  std::shared_ptr<NetworkStack> stack;  ///< empty until ready() has run
+
+  explicit StackSingleton(CephContext *c): cct(c) {}
+
+  /// Creates the stack of the given type on first call; later calls are no-ops.
+  void ready(std::string &type) {
+    if (!stack)
+      stack = NetworkStack::create(cct, type);
+  }
+
+  ~StackSingleton() {
+    // ready() may never have been called (or may have left the pointer
+    // empty), so do not dereference it blindly.
+    if (stack)
+      stack->stop();
+  }
+};
+
+
+/// Event callback that asks the messenger to reap its dead connections.
+class C_handle_reap : public EventCallback {
+ public:
+  explicit C_handle_reap(AsyncMessenger *m)
+    : msgr(m)
+  {}
+
+  // judge whether is a time event
+  void do_request(uint64_t id) override { msgr->reap_dead(); }
+
+ private:
+  AsyncMessenger *msgr;
+};
+
+/*******************
+ * AsyncMessenger
+ */
+
+/**
+ * Construct the messenger: select the network stack from `type`, start
+ * it, create the loopback connection and one Processor per listening
+ * worker.
+ *
+ * The messenger starts out in the stopped, unbound state.
+ */
+AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name,
+                               const std::string &type, string mname, uint64_t _nonce)
+  : SimplePolicyMessenger(cct, name,mname, _nonce),
+    dispatch_queue(cct, this, mname),
+    lock("AsyncMessenger::lock"),
+    nonce(_nonce), need_addr(true), did_bind(false),
+    global_seq(0), deleted_lock("AsyncMessenger::deleted_lock"),
+    cluster_protocol(0), stopped(true)
+{
+  // transport is derived from the messenger type string; "rdma" wins over
+  // "dpdk", and anything else falls back to "posix"
+  const bool wants_rdma = type.find("rdma") != std::string::npos;
+  const bool wants_dpdk = type.find("dpdk") != std::string::npos;
+  std::string transport_type =
+    wants_rdma ? "rdma" : (wants_dpdk ? "dpdk" : "posix");
+
+  // one stack holder per transport type, looked up (or created) on the context
+  auto& holder = cct->lookup_or_create_singleton_object<StackSingleton>(
+    "AsyncMessenger::NetworkStack::" + transport_type, true, cct);
+  holder.ready(transport_type);
+  stack = holder.stack.get();
+  stack->start();
+
+  local_worker = stack->get_worker();
+  local_connection = new AsyncConnection(cct, this, &dispatch_queue,
+                                         local_worker, true, true);
+  init_local_connection();
+  reap_handler = new C_handle_reap(this);
+
+  // a single Processor unless the stack keeps a listen table per worker
+  const unsigned processor_num =
+    stack->support_local_listen_table() ? stack->get_num_worker() : 1;
+  for (unsigned idx = 0; idx < processor_num; ++idx) {
+    processors.push_back(new Processor(this, stack->get_worker(idx), cct));
+  }
+}
+
+/**
+ * Destroy the AsyncMessenger: release the reap callback and the
+ * Processors. The messenger must no longer be bound at this point.
+ */
+AsyncMessenger::~AsyncMessenger()
+{
+  delete reap_handler;
+  ceph_assert(!did_bind); // either we didn't bind or we shut down the Processor
+  for (auto proc : processors) {
+    delete proc;
+  }
+}
+
+/**
+ * Mark the network stack ready, perform a bind that was postponed by
+ * bindv() (aborting if it fails), then start the Processors and the
+ * dispatch queue.
+ */
+void AsyncMessenger::ready()
+{
+  ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl;
+
+  stack->ready();
+  // a bind requested before the stack was ready was parked; run it now
+  if (pending_bind && bindv(pending_bind_addrs) != 0) {
+    lderr(cct) << __func__ << " postponed bind failed" << dendl;
+    ceph_abort();
+  }
+
+  Mutex::Locker l(lock);
+  for (auto proc : processors) {
+    proc->start();
+  }
+  dispatch_queue.start();
+}
+
+/**
+ * Stop accepting, mark all connections down and wake up wait().
+ *
+ * @return always 0
+ */
+int AsyncMessenger::shutdown()
+{
+  ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl;
+
+  // done!  clean up.
+  for (auto proc : processors) {
+    proc->stop();
+  }
+  mark_down_all();
+  // break ref cycles on the loopback connection
+  local_connection->set_priv(NULL);
+  local_connection->mark_down();
+  did_bind = false;
+  {
+    // signal and flag the stop while holding the lock
+    Mutex::Locker l(lock);
+    stop_cond.Signal();
+    stopped = true;
+  }
+  stack->drain();
+  return 0;
+}
+
+/**
+ * Single-address front end for bindv().
+ *
+ * @param bind_addr address to bind; a default-constructed address is
+ *                  turned into a legacy address of the configured family
+ * @return the result of bindv()
+ */
+int AsyncMessenger::bind(const entity_addr_t &bind_addr)
+{
+  ldout(cct,10) << __func__ << " " << bind_addr << dendl;
+  // old bind() can take entity_addr_t(). new bindv() can take a
+  // 0.0.0.0-like address but needs type and family to be set.
+  entity_addr_t a = bind_addr;
+  if (a == entity_addr_t()) {
+    a.set_type(entity_addr_t::TYPE_LEGACY);
+    a.set_family(cct->_conf->ms_bind_ipv6 ? AF_INET6 : AF_INET);
+  }
+  return bindv(entity_addrvec_t(a));
+}
+
+/**
+ * Bind every Processor to bind_addrs.
+ *
+ * @return -1 if the messenger is already started (and no bind is pending),
+ *         0 if the bind was done or postponed because the network stack
+ *         is not ready yet, otherwise the error from the first Processor
+ */
+int AsyncMessenger::bindv(const entity_addrvec_t &bind_addrs)
+{
+  {
+    // state checks happen under the lock; the bind itself does not
+    Mutex::Locker l(lock);
+
+    if (!pending_bind && started) {
+      ldout(cct,10) << __func__ << " already started" << dendl;
+      return -1;
+    }
+
+    ldout(cct,10) << __func__ << " " << bind_addrs << dendl;
+
+    if (!stack->is_ready()) {
+      ldout(cct, 10) << __func__ << " Network Stack is not ready for bind yet - postponed" << dendl;
+      pending_bind_addrs = bind_addrs;
+      pending_bind = true;
+      return 0;
+    }
+  }
+
+  // bind to a socket
+  set<int> avoid_ports;
+  entity_addrvec_t bound_addrs;
+  unsigned bound_count = 0;
+  for (auto proc : processors) {
+    const int r = proc->bind(bind_addrs, avoid_ports, &bound_addrs);
+    if (r) {
+      // Note: this is related to local tcp listen table problem.
+      // The posix backend (default kernel implementation) shares the listen
+      // table in the kernel, so all threads use the same table and only one
+      // thread needs to bind. Other backends (like dpdk) use a local listen
+      // table, so the tcp port has to be bound for each worker. A failure of
+      // the first worker is treated as an ordinary error (e.g. port in use)
+      // and returned; a failure of a later worker after the first one
+      // succeeded is not expected, hence the assert.
+      ceph_assert(bound_count == 0);
+      return r;
+    }
+    ++bound_count;
+  }
+  _finish_bind(bind_addrs, bound_addrs);
+  return 0;
+}
+
+/**
+ * Stop listening, take a new nonce and bind again on fresh ports.
+ *
+ * The ports currently in use are added to the caller's avoid set so the
+ * new bind cannot pick them again.
+ *
+ * @param avoid_ports ports the caller wants to keep clear of
+ * @return 0 on success, otherwise the error from the first Processor's bind
+ */
+int AsyncMessenger::rebind(const set<int>& avoid_ports)
+{
+  ldout(cct,1) << __func__ << " rebind avoid " << avoid_ports << dendl;
+  ceph_assert(did_bind);
+
+  for (auto &&p : processors)
+    p->stop();
+  mark_down_all();
+
+  // adjust the nonce; we want our entity_addr_t to be truly unique.
+  nonce += 1000000;
+  ldout(cct, 10) << __func__ << " new nonce " << nonce
+		 << " and addr " << get_myaddrs() << dendl;
+
+  entity_addrvec_t bound_addrs;
+  entity_addrvec_t bind_addrs = get_myaddrs();
+  // avoid the caller's ports plus every port we were bound to until now,
+  // and clear the ports so bind picks new ones from the configured range
+  set<int> new_avoid(avoid_ports);
+  for (auto& a : bind_addrs.v) {
+    new_avoid.insert(a.get_port());
+    a.set_port(0);
+  }
+  ldout(cct, 10) << __func__ << " will try " << bind_addrs
+		 << " and avoid ports " << new_avoid << dendl;
+  unsigned i = 0;
+  for (auto &&p : processors) {
+    // pass the extended set (not just avoid_ports), otherwise the ports we
+    // just released could be chosen again
+    int r = p->bind(bind_addrs, new_avoid, &bound_addrs);
+    if (r) {
+      // only the first Processor is allowed to fail; see bindv()
+      ceph_assert(i == 0);
+      return r;
+    }
+    ++i;
+  }
+  _finish_bind(bind_addrs, bound_addrs);
+  for (auto &&p : processors) {
+    p->start();
+  }
+  return 0;
+}
+
+/**
+ * Record bind_addr as our address for clients that bind before
+ * connecting. Does nothing unless ms_bind_before_connect is set.
+ *
+ * @return 0 if nothing had to be done or the address was recorded,
+ *         -1 if the messenger was already started
+ */
+int AsyncMessenger::client_bind(const entity_addr_t &bind_addr)
+{
+  if (!cct->_conf->ms_bind_before_connect) {
+    return 0;
+  }
+
+  Mutex::Locker l(lock);
+  if (did_bind) {
+    // a real bind already set our addresses
+    return 0;
+  }
+  if (started) {
+    ldout(cct, 10) << __func__ << " already started" << dendl;
+    return -1;
+  }
+
+  ldout(cct, 10) << __func__ << " " << bind_addr << dendl;
+  set_myaddrs(entity_addrvec_t(bind_addr));
+  return 0;
+}
+
+/**
+ * Publish the result of a successful bind as our addresses.
+ *
+ * @param bind_addrs   addresses that were requested
+ * @param listen_addrs addresses actually listened on; used when the first
+ *                     of our addresses still has port 0
+ */
+void AsyncMessenger::_finish_bind(const entity_addrvec_t& bind_addrs,
+				  const entity_addrvec_t& listen_addrs)
+{
+  set_myaddrs(bind_addrs);
+  // any requested address with a concrete IP is offered to learned_addr()
+  for (auto& requested : bind_addrs.v) {
+    if (requested.is_blank_ip())
+      continue;
+    learned_addr(requested);
+  }
+
+  if (get_myaddrs().front().get_port() == 0) {
+    set_myaddrs(listen_addrs);
+  }
+
+  // stamp the current nonce on every address
+  entity_addrvec_t stamped = *my_addrs;
+  for (auto& addr : stamped.v) {
+    addr.set_nonce(nonce);
+  }
+  set_myaddrs(stamped);
+
+  init_local_connection();
+
+  ldout(cct,1) << __func__ << " bind my_addrs is " << get_myaddrs() << dendl;
+  did_bind = true;
+}
+
+/**
+ * Mark the messenger started. If it never bound, stamp the nonce on its
+ * addresses and refresh the loopback connection.
+ *
+ * @return always 0
+ */
+int AsyncMessenger::start()
+{
+  Mutex::Locker l(lock);
+  ldout(cct,1) << __func__ << " start" << dendl;
+
+  // register at least one entity, first!
+  ceph_assert(my_name.type() >= 0);
+
+  ceph_assert(!started);
+  started = true;
+  stopped = false;
+
+  if (did_bind) {
+    // the bind path already finalised our addresses
+    return 0;
+  }
+
+  entity_addrvec_t stamped = *my_addrs;
+  for (auto& addr : stamped.v) {
+    addr.nonce = nonce;
+  }
+  set_myaddrs(stamped);
+  _init_local_connection();
+  return 0;
+}
+
+/**
+ * Block until the messenger has been stopped, then shut down the
+ * dispatch queue, close all connections and drain the stack.
+ * Returns immediately if the messenger was never started.
+ */
+void AsyncMessenger::wait()
+{
+  {
+    Mutex::Locker l(lock);
+    if (!started) {
+      return;
+    }
+    if (!stopped) {
+      stop_cond.Wait(lock);
+    }
+  }
+
+  dispatch_queue.shutdown();
+  if (dispatch_queue.is_started()) {
+    ldout(cct, 10) << __func__ << ": waiting for dispatch queue" << dendl;
+    dispatch_queue.wait();
+    dispatch_queue.discard_local();
+    ldout(cct, 10) << __func__ << ": dispatch queue is stopped" << dendl;
+  }
+
+  // close all connections
+  shutdown_connections(false);
+  stack->drain();
+
+  ldout(cct, 10) << __func__ << ": done." << dendl;
+  ldout(cct, 1) << __func__ << " complete." << dendl;
+  started = false;
+}
+
+/**
+ * Wrap a freshly accepted socket in an AsyncConnection on worker `w`,
+ * start its accept handling and track it in accepting_conns.
+ * The protocol version follows the listen address.
+ */
+void AsyncMessenger::add_accept(Worker *w, ConnectedSocket cli_socket,
+				const entity_addr_t &listen_addr,
+				const entity_addr_t &peer_addr)
+{
+  Mutex::Locker l(lock);
+  const bool use_msgr2 = listen_addr.is_msgr2();
+  AsyncConnectionRef accepted =
+    new AsyncConnection(cct, this, &dispatch_queue, w, use_msgr2, false);
+  accepted->accept(std::move(cli_socket), listen_addr, peer_addr);
+  accepting_conns.insert(accepted);
+}
+
+/**
+ * Create an outgoing connection to `addrs` and register it in conns.
+ *
+ * The caller must hold `lock`, and no connection may be registered for
+ * `addrs` yet; both conditions are asserted.
+ *
+ * @return the new connection
+ */
+AsyncConnectionRef AsyncMessenger::create_connect(
+  const entity_addrvec_t& addrs, int type)
+{
+  ceph_assert(lock.is_locked());
+
+  ldout(cct, 10) << __func__ << " " << addrs
+      << ", creating connection and registering" << dendl;
+
+  // Decide which address to dial: the first one that is msgr2 or legacy.
+  // FIXME: for ipv4 vs ipv6, check whether local host can handle ipv6 before
+  // trying it?  for now, just pick whichever is listed first.
+  entity_addr_t target;
+  for (auto& candidate : addrs.v) {
+    if (candidate.is_msgr2() || candidate.is_legacy()) {
+      target = candidate;
+      break;
+    }
+  }
+
+  // create connection
+  Worker *w = stack->get_worker();
+  const bool use_msgr2 = target.is_msgr2();
+  AsyncConnectionRef conn =
+    new AsyncConnection(cct, this, &dispatch_queue, w, use_msgr2, false);
+  conn->connect(addrs, type, target);
+  ceph_assert(!conns.count(addrs));
+  ldout(cct, 10) << __func__ << " " << conn << " " << addrs << " "
+		 << *conn->peer_addrs << dendl;
+  conns[addrs] = conn;
+  w->get_perf_counter()->inc(l_msgr_active_connections);
+
+  return conn;
+}
+
+
+/// Returns the messenger's loopback connection (local_connection).
+ConnectionRef AsyncMessenger::get_loopback_connection()
+{
+  ConnectionRef loopback = local_connection;
+  return loopback;
+}
+
+/// True if outgoing connections may use the peer's v2 address: either we
+/// are not bound, or one of our own addresses is msgr2.
+bool AsyncMessenger::should_use_msgr2()
+{
+  // If we are bound to v1 only and connect to a v2 peer, we cannot use the
+  // peer's v2 address. Otherwise the connection would be asymmetrical:
+  // the peer would have to use v1 to reach us while we use v2, and
+  // connection race detection etc would totally break down (among other
+  // things). Or the other end would be confused that we advertise
+  // ourselves with a v1 address only (the one we bound to) but connected
+  // with protocol v2.
+  if (!did_bind) {
+    return true;
+  }
+  return get_myaddrs().has_msgr2();
+}
+
+/**
+ * Restrict a peer's address vector to what we are allowed to dial.
+ *
+ * @return `addrs` unchanged when msgr2 may be used, otherwise a copy
+ *         without the msgr2 entries
+ */
+entity_addrvec_t AsyncMessenger::_filter_addrs(int type,
+					       const entity_addrvec_t& addrs)
+{
+  if (should_use_msgr2()) {
+    return addrs;
+  }
+
+  ldout(cct, 10) << __func__ << " " << addrs << " type " << type
+		 << " limiting to v1 ()" << dendl;
+  entity_addrvec_t r;
+  for (auto& addr : addrs.v) {
+    if (!addr.is_msgr2()) {
+      r.v.push_back(addr);
+    }
+  }
+  return r;
+}
+
+/**
+ * Send message `m` to the entity of the given type at `addrs`.
+ *
+ * @return 0 once the message has been handed to submit_message(),
+ *         -EINVAL (after dropping the message reference) if `addrs` is empty
+ */
+int AsyncMessenger::send_to(Message *m, int type, const entity_addrvec_t& addrs)
+{
+  Mutex::Locker l(lock);
+
+  FUNCTRACE(cct);
+  ceph_assert(m);
+
+  // trace OSD ops and their replies by object name
+  switch (m->get_type()) {
+  case CEPH_MSG_OSD_OP:
+    OID_EVENT_TRACE(static_cast<MOSDOp *>(m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP");
+    break;
+  case CEPH_MSG_OSD_OPREPLY:
+    OID_EVENT_TRACE(static_cast<MOSDOpReply *>(m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP_REPLY");
+    break;
+  default:
+    break;
+  }
+
+  ldout(cct, 1) << __func__ << "--> " << ceph_entity_type_name(type) << " "
+      << addrs << " -- " << *m << " -- ?+"
+      << m->get_data().length() << " " << m << dendl;
+
+  if (addrs.empty()) {
+    ldout(cct,0) << __func__ <<  " message " << *m
+        << " with empty dest " << addrs << dendl;
+    m->put();
+    return -EINVAL;
+  }
+
+  const entity_addrvec_t av = _filter_addrs(type, addrs);
+  AsyncConnectionRef existing = _lookup_conn(av);
+  submit_message(m, existing, av, type);
+  return 0;
+}
+
+/**
+ * Return a connection to `addrs`: the loopback connection if the target
+ * is ourselves, an already registered connection if there is one, or a
+ * newly created one.
+ */
+ConnectionRef AsyncMessenger::connect_to(int type, const entity_addrvec_t& addrs)
+{
+  Mutex::Locker l(lock);
+
+  // local: the full vector matches ours, or a single address that is ours
+  if (*my_addrs == addrs ||
+      (addrs.v.size() == 1 && my_addrs->contains(addrs.front()))) {
+    return local_connection;
+  }
+
+  const entity_addrvec_t av = _filter_addrs(type, addrs);
+
+  AsyncConnectionRef conn = _lookup_conn(av);
+  if (!conn) {
+    conn = create_connect(av, type);
+    ldout(cct, 10) << __func__ << " " << av << " new " << conn << dendl;
+  } else {
+    ldout(cct, 10) << __func__ << " " << av << " existing " << conn << dendl;
+  }
+
+  return conn;
+}
+
+/**
+ * Deliver `m` over `con` if given, over the loopback connection if the
+ * destination is ourselves, or over a newly created connection.
+ *
+ * When no connection exists and the policy for dest_type is a server
+ * policy, the message is dropped instead.
+ */
+void AsyncMessenger::submit_message(Message *m, AsyncConnectionRef con,
+                                    const entity_addrvec_t& dest_addrs,
+				    int dest_type)
+{
+  // optional debugging aid: hexdump the encoded message, then discard the
+  // encoding again
+  if (cct->_conf->ms_dump_on_send) {
+    m->encode(-1, MSG_CRC_ALL);
+    ldout(cct, 0) << __func__ << " submit_message " << *m << "\n";
+    m->get_payload().hexdump(*_dout);
+    if (m->get_data().length() > 0) {
+      *_dout << " data:\n";
+      m->get_data().hexdump(*_dout);
+    }
+    *_dout << dendl;
+    m->clear_payload();
+  }
+
+  // existing connection?
+  if (con) {
+    con->send_message(m);
+    return;
+  }
+
+  // local?
+  if (*my_addrs == dest_addrs ||
+      (dest_addrs.v.size() == 1 && my_addrs->contains(dest_addrs.front()))) {
+    local_connection->send_message(m);
+    return;
+  }
+
+  // remote, no existing connection.
+  const Policy& policy = get_policy(dest_type);
+  if (!policy.server) {
+    ldout(cct,20) << __func__ << " " << *m << " remote, " << dest_addrs
+		  << ", new connection." << dendl;
+    con = create_connect(dest_addrs, dest_type);
+    con->send_message(m);
+    return;
+  }
+
+  ldout(cct, 20) << __func__ << " " << *m << " remote, " << dest_addrs
+      << ", lossy server for target type "
+      << ceph_entity_type_name(dest_type) << ", no session, dropping." << dendl;
+  m->put();
+}
+
+/**
+ * For each of our addresses that has no IP set, fill it in from the
+ * first entry of `addrs` with the same address family, keeping our own
+ * type, port and nonce. Addresses that already have an IP are untouched.
+ *
+ * @return true if at least one address was filled in
+ */
+bool AsyncMessenger::set_addr_unknowns(const entity_addrvec_t &addrs)
+{
+  ldout(cct,1) << __func__ << " " << addrs << dendl;
+  bool changed = false;
+  Mutex::Locker l(lock);
+
+  entity_addrvec_t updated = *my_addrs;
+  for (auto& mine : updated.v) {
+    if (!mine.is_blank_ip()) {
+      continue;
+    }
+    // remember what must survive the overwrite below
+    const int keep_type = mine.get_type();
+    const int keep_port = mine.get_port();
+    const uint32_t keep_nonce = mine.get_nonce();
+    for (auto& given : addrs.v) {
+      if (mine.get_family() != given.get_family()) {
+	continue;
+      }
+      ldout(cct,1) << __func__ << " assuming my addr " << mine
+		   << " matches provided addr " << given << dendl;
+      mine = given;
+      mine.set_nonce(keep_nonce);
+      mine.set_type(keep_type);
+      mine.set_port(keep_port);
+      changed = true;
+      break;
+    }
+  }
+  set_myaddrs(updated);
+  if (changed) {
+    _init_local_connection();
+  }
+  ldout(cct,1) << __func__ << " now " << *my_addrs << dendl;
+  return changed;
+}
+
+/// Replaces our addresses with `addrs`, stamped with our nonce, and
+/// refreshes the loopback connection.
+void AsyncMessenger::set_addrs(const entity_addrvec_t &addrs)
+{
+  Mutex::Locker l(lock);
+  entity_addrvec_t stamped = addrs;
+  for (auto& addr : stamped.v) {
+    addr.set_nonce(nonce);
+  }
+  set_myaddrs(stamped);
+  _init_local_connection();
+}
+
+/**
+ * Stop every accepting and registered connection and empty the
+ * bookkeeping containers, including the lazily deleted connections.
+ *
+ * @param queue_reset forwarded to AsyncConnection::stop()
+ */
+void AsyncMessenger::shutdown_connections(bool queue_reset)
+{
+  ldout(cct,1) << __func__ << " " << dendl;
+  Mutex::Locker guard(lock);
+
+  // connections that are still in the accept phase
+  for (AsyncConnectionRef p : accepting_conns) {
+    ldout(cct, 5) << __func__ << " accepting_conn " << p.get() << dendl;
+    p->stop(queue_reset);
+  }
+  accepting_conns.clear();
+
+  // registered connections: unregister first, then stop
+  while (!conns.empty()) {
+    auto first = conns.begin();
+    AsyncConnectionRef p = first->second;
+    ldout(cct, 5) << __func__ << " mark down " << first->first << " " << p << dendl;
+    conns.erase(first);
+    p->get_perf_counter()->dec(l_msgr_active_connections);
+    p->stop(queue_reset);
+  }
+
+  {
+    Mutex::Locker deleted_guard(deleted_lock);
+    while (!deleted_conns.empty()) {
+      auto first = deleted_conns.begin();
+      AsyncConnectionRef p = *first;
+      ldout(cct, 5) << __func__ << " delete " << p << dendl;
+      deleted_conns.erase(first);
+    }
+  }
+}
+
+/// Stops the connection registered for `addrs`, if there is one.
+void AsyncMessenger::mark_down_addrs(const entity_addrvec_t& addrs)
+{
+  Mutex::Locker l(lock);
+  AsyncConnectionRef found = _lookup_conn(addrs);
+  if (!found) {
+    ldout(cct, 1) << __func__ << " " << addrs << " -- connection dne" << dendl;
+    return;
+  }
+  ldout(cct, 1) << __func__ << " " << addrs << " -- " << found << dendl;
+  found->stop(true);
+}
+
+/**
+ * Protocol version to use with a peer of the given type.
+ *
+ * @param peer_type entity type of the peer
+ * @param connect   true when we are the connecting side
+ * @return cluster_protocol for a peer of our own type; otherwise the
+ *         client protocol of the peer's type (when connecting) or of our
+ *         own type, and 0 for any other type
+ */
+int AsyncMessenger::get_proto_version(int peer_type, bool connect) const
+{
+  const int my_type = my_name.type();
+
+  // internal
+  if (peer_type == my_type) {
+    return cluster_protocol;
+  }
+
+  // public
+  const int serving_type = connect ? peer_type : my_type;
+  switch (serving_type) {
+    case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL;
+    case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL;
+    case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL;
+    default: break;
+  }
+  return 0;
+}
+
+/**
+ * Register an accepted connection in conns under its peer addresses and
+ * remove it from accepting_conns.
+ *
+ * @return 0 on success (including when `conn` itself is already
+ *         registered), -1 if a different connection that is not pending
+ *         deletion is registered for the same peer addresses
+ */
+int AsyncMessenger::accept_conn(AsyncConnectionRef conn)
+{
+  Mutex::Locker l(lock);
+  auto it = conns.find(*conn->peer_addrs);
+  if (it != conns.end()) {
+    AsyncConnectionRef existing = it->second;
+
+    // lazy delete, see "deleted_conns"
+    // If conn already in, we will return 0
+    // this inner locker shadows the outer `l`; deleted_lock is taken while
+    // lock is held
+    Mutex::Locker l(deleted_lock);
+    if (deleted_conns.erase(existing)) {
+      // the registered connection was only waiting to be reaped: replace it
+      conns.erase(it);
+    } else if (conn != existing) {
+      return -1;
+    }
+  }
+  ldout(cct, 10) << __func__ << " " << conn << " " << *conn->peer_addrs << dendl;
+  conns[*conn->peer_addrs] = conn;
+  conn->get_perf_counter()->inc(l_msgr_active_connections);
+  accepting_conns.erase(conn);
+  return 0;
+}
+
+
+/**
+ * Learn our own address from the address a peer reported for us.
+ *
+ * Only acts while need_addr is set; the first successful call clears it.
+ *
+ * @param peer_addr_for_me our address as reported by the peer
+ * @return true if this call updated our addresses, false if the address
+ *         was no longer needed
+ */
+bool AsyncMessenger::learned_addr(const entity_addr_t &peer_addr_for_me)
+{
+  // be careful here: multiple threads may block here, and readers of
+  // my_addr do NOT hold any lock.
+
+  // this always goes from true -> false under the protection of the
+  // mutex.  if it is already false, we need not retake the mutex at
+  // all.
+  if (!need_addr)
+    return false;
+  std::lock_guard l(lock);
+  // re-check now that the lock is held
+  if (need_addr) {
+    if (my_addrs->empty()) {
+      // no address at all yet: adopt the reported one with our nonce
+      auto a = peer_addr_for_me;
+      a.set_type(entity_addr_t::TYPE_ANY);
+      a.set_nonce(nonce);
+      if (!did_bind) {
+	a.set_port(0);
+      }
+      set_myaddrs(entity_addrvec_t(a));
+      ldout(cct,10) << __func__ << " had no addrs" << dendl;
+    } else {
+      // fix all addrs of the same family, regardless of type (msgr2 vs legacy)
+      entity_addrvec_t newaddrs = *my_addrs;
+      for (auto& a : newaddrs.v) {
+	if (a.is_blank_ip() &&
+	    a.get_family() == peer_addr_for_me.get_family()) {
+	  entity_addr_t t = peer_addr_for_me;
+	  if (!did_bind) {
+	    t.set_type(entity_addr_t::TYPE_ANY);
+	    t.set_port(0);
+	  } else {	  
+	    // bound: keep the type and port we are listening on
+	    t.set_type(a.get_type());
+	    t.set_port(a.get_port());
+	  }
+	  t.set_nonce(a.get_nonce());
+	  ldout(cct,10) << __func__ << " " << a << " -> " << t << dendl;
+	  a = t;
+	}
+      }
+      set_myaddrs(newaddrs);
+    }
+    ldout(cct, 1) << __func__ << " learned my addr " << *my_addrs
+		  << " (peer_addr_for_me " << peer_addr_for_me << ")" << dendl;
+    _init_local_connection();
+    need_addr = false;
+    return true;
+  }
+  return false;
+}
+
+/**
+ * Drop every connection queued in deleted_conns from the messenger's
+ * bookkeeping.
+ *
+ * @return number of connections reaped
+ */
+int AsyncMessenger::reap_dead()
+{
+  ldout(cct, 1) << __func__ << " start" << dendl;
+
+  Mutex::Locker l1(lock);
+  Mutex::Locker l2(deleted_lock);
+
+  int reaped = 0;
+  while (!deleted_conns.empty()) {
+    auto dead_it = deleted_conns.begin();
+    AsyncConnectionRef dead = *dead_it;
+    ldout(cct, 5) << __func__ << " delete " << dead << dendl;
+    // only unregister the entry if it still refers to this very connection
+    auto registered = conns.find(*dead->peer_addrs);
+    if (registered != conns.end() && registered->second == dead) {
+      conns.erase(registered);
+    }
+    accepting_conns.erase(dead);
+    deleted_conns.erase(dead_it);
+    ++reaped;
+  }
+
+  return reaped;
+}
diff --git a/src/msg/async/AsyncMessenger.h b/src/msg/async/AsyncMessenger.h
new file mode 100644
index 00000000..98bf9d52
--- /dev/null
+++ b/src/msg/async/AsyncMessenger.h
@@ -0,0 +1,426 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNCMESSENGER_H
+#define CEPH_ASYNCMESSENGER_H
+
+#include <map>
+#include <mutex>
+
+#include "include/types.h"
+#include "include/xlist.h"
+#include "include/spinlock.h"
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "msg/SimplePolicyMessenger.h"
+#include "msg/DispatchQueue.h"
+#include "AsyncConnection.h"
+#include "Event.h"
+
+#include "include/ceph_assert.h"
+
+class AsyncMessenger;
+
+/**
+ * If the Messenger binds to a specific address, the Processor runs
+ * and listens for incoming connections.
+ *
+ * Each Processor is tied to one AsyncMessenger and one Worker and keeps
+ * its own set of listening sockets.
+ */
+class Processor {
+  AsyncMessenger *msgr;
+  NetHandler net;
+  Worker *worker;
+  vector<ServerSocket> listen_sockets;
+  // deleted in the destructor, i.e. owned by this Processor
+  EventCallbackRef listen_handler;
+
+  class C_processor_accept;
+
+ public:
+  Processor(AsyncMessenger *r, Worker *w, CephContext *c);
+  ~Processor() { delete listen_handler; };
+
+  void stop();
+  /**
+   * @param bind_addrs the addresses to bind to
+   * @param avoid_ports ports that must not be used -- presumably only
+   * relevant when a port has to be picked; TODO confirm in the .cc
+   * @param bound_addrs out-parameter; presumably receives the addresses
+   * that were actually bound -- TODO confirm in the .cc
+   */
+  int bind(const entity_addrvec_t &bind_addrs,
+	   const set<int>& avoid_ports,
+	   entity_addrvec_t* bound_addrs);
+  void start();
+  void accept();
+};
+
+/*
+ * AsyncMessenger maintains a set of asynchronous connections. It may own
+ * a bind address, in which case the accepted connections are also managed
+ * by the AsyncMessenger.
+ *
+ */
+
+class AsyncMessenger : public SimplePolicyMessenger {
+  // First we have the public Messenger interface implementation...
+public:
+  /**
+   * Initialize the AsyncMessenger!
+   *
+   * @param cct The CephContext to use
+   * @param name The name to assign ourselves
+   * @param type presumably the transport/network stack type (compare the
+   * ms_type member) -- TODO confirm against the constructor
+   * @param mname a name for this messenger; NOTE(review): its exact use is
+   * not visible in this header
+   * @param _nonce A unique ID to use for this AsyncMessenger. It should not
+   * be a value that will be repeated if the daemon restarts.
+   */
+  AsyncMessenger(CephContext *cct, entity_name_t name, const std::string &type,
+                 string mname, uint64_t _nonce);
+
+  /**
+   * Destroy the AsyncMessenger. Pretty simple since all the work is done
+   * elsewhere.
+   */
+  ~AsyncMessenger() override;
+
+  /** @defgroup Accessors
+   * @{
+   */
+  bool set_addr_unknowns(const entity_addrvec_t &addr) override;
+  void set_addrs(const entity_addrvec_t &addrs) override;
+
+  /// Number of entries currently queued in the dispatch queue.
+  int get_dispatch_queue_len() override {
+    return dispatch_queue.get_queue_len();
+  }
+
+  /// Forwards to DispatchQueue::get_max_age() for the given time.
+  double get_dispatch_queue_max_age(utime_t now) override {
+    return dispatch_queue.get_max_age(now);
+  }
+  /** @} Accessors */
+
+  /**
+   * @defgroup Configuration functions
+   * @{
+   */
+  /// May only be called before the messenger is started or bound.
+  void set_cluster_protocol(int p) override {
+    ceph_assert(!started && !did_bind);
+    cluster_protocol = p;
+  }
+
+  int bind(const entity_addr_t& bind_addr) override;
+  int rebind(const set<int>& avoid_ports) override;
+  int client_bind(const entity_addr_t& bind_addr) override;
+
+  int bindv(const entity_addrvec_t& bind_addrs) override;
+
+  bool should_use_msgr2() override;
+
+  /** @} Configuration functions */
+
+  /**
+   * @defgroup Startup/Shutdown
+   * @{
+   */
+  int start() override;
+  void wait() override;
+  int shutdown() override;
+
+  /** @} // Startup/Shutdown */
+
+  /**
+   * @defgroup Messaging
+   * @{
+   */
+  int send_to(Message *m, int type, const entity_addrvec_t& addrs) override;
+
+  /** @} // Messaging */
+
+  /**
+   * @defgroup Connection Management
+   * @{
+   */
+  ConnectionRef connect_to(int type,
+			   const entity_addrvec_t& addrs) override;
+  ConnectionRef get_loopback_connection() override;
+  /// Single-address convenience wrapper around mark_down_addrs().
+  void mark_down(const entity_addr_t& addr) override {
+    mark_down_addrs(entity_addrvec_t(addr));
+  }
+  void mark_down_addrs(const entity_addrvec_t& addrs) override;
+  /// Calls shutdown_connections() with queue_reset = true.
+  void mark_down_all() override {
+    shutdown_connections(true);
+  }
+  /** @} // Connection Management */
+
+  /**
+   * @defgroup Inner classes
+   * @{
+   */
+
+  /**
+   * @} // Inner classes
+   */
+
+protected:
+  /**
+   * @defgroup Messenger Interfaces
+   * @{
+   */
+  /**
+   * Start up the DispatchQueue thread once we have somebody to dispatch to.
+   */
+  void ready() override;
+  /** @} // Messenger Interfaces */
+
+private:
+
+  /**
+   * @defgroup Utility functions
+   * @{
+   */
+
+  /**
+   * Create a connection associated with the given entity (of the given type).
+   * Initiate the connection. (This function returning does not guarantee
+   * connection success.)
+   *
+   * @param addrs The address(es) of the entity to connect to.
+   * @param type The peer type of the entity at the address.
+   *
+   * @return a pointer to the newly-created connection. Caller does not own a
+   * reference; take one if you need it.
+   */
+  AsyncConnectionRef create_connect(const entity_addrvec_t& addrs, int type);
+
+  /**
+   * Queue up a Message for delivery to the entity specified
+   * by dest_addrs and dest_type.
+   * submit_message() is responsible for creating
+   * new AsyncConnection (and closing old ones) as necessary.
+   *
+   * @param m The Message to queue up. This function eats a reference.
+   * @param con The existing Connection to use, or NULL if you don't know of one.
+   * @param dest_addrs The address(es) to send the Message to.
+   * @param dest_type The peer type of the address we're sending to
+   *
+   * NOTE(review): the original comment ended with "just drop silently under
+   * failure", which suggests the message is dropped silently when delivery
+   * fails -- confirm against the implementation.
+   */
+  void submit_message(Message *m, AsyncConnectionRef con,
+                      const entity_addrvec_t& dest_addrs, int dest_type);
+
+  void _finish_bind(const entity_addrvec_t& bind_addrs,
+		    const entity_addrvec_t& listen_addrs);
+
+  entity_addrvec_t _filter_addrs(int type,
+				 const entity_addrvec_t& addrs);
+
+ private:
+  /// unregister_conn() schedules reap_handler once deleted_conns holds at
+  /// least this many entries
+  static const uint64_t ReapDeadConnectionThreshold = 5;
+
+  NetworkStack *stack;
+  std::vector<Processor*> processors;
+  friend class Processor;
+  DispatchQueue dispatch_queue;
+
+  // the worker run messenger's cron jobs
+  Worker *local_worker;
+
+  std::string ms_type;
+
+  /// overall lock used for AsyncMessenger data structures
+  Mutex lock;
+  // AsyncMessenger stuff
+  /// approximately unique ID set by the Constructor for use in entity_addr_t
+  uint64_t nonce;
+
+  /// true, specifying we haven't learned our addr; set false when we find it.
+  // maybe this should be protected by the lock?
+  bool need_addr;
+
+  /**
+   * set to bind addresses if bind was called before NetworkStack was ready to
+   * bind
+   */
+  entity_addrvec_t pending_bind_addrs;
+
+  /**
+   * false; set to true if a pending bind exists
+   */
+  bool pending_bind = false;
+
+  /**
+   *  The following aren't lock-protected since you shouldn't be able to race
+   *  the only writers.
+   */
+
+  /**
+   *  false; set to true if the AsyncMessenger bound to a specific address;
+   *  and set false again by Accepter::stop().
+   *  NOTE(review): no Accepter appears in this header (listening is handled
+   *  by Processor), so the reference above may be stale -- confirm.
+   */
+  bool did_bind;
+  /// counter for the global seq our connection protocol uses
+  __u32 global_seq;
+  /// lock to protect the global_seq
+  ceph::spinlock global_seq_lock;
+
+  /**
+   * hash map of addresses to AsyncConnection
+   *
+   * NOTE: an AsyncConnection* with state CLOSED may still be in the map but
+   * is considered invalid and can be replaced by anyone holding the msgr lock
+   */
+  ceph::unordered_map<entity_addrvec_t, AsyncConnectionRef> conns;
+
+  /**
+   * set of connections that are in the process of being accepted
+   *
+   * These are not yet in the conns map.
+   */
+  set<AsyncConnectionRef> accepting_conns;
+
+  /**
+   * set of closed connections that still need to be cleaned up
+   *
+   * AsyncMessenger and AsyncConnection follow a lock-ordering rule:
+   * AsyncMessenger::lock may be taken first and AsyncConnection::lock
+   * second, but never the other way round. The rule exists to avoid
+   * deadlock. So when an AsyncConnection wants to unregister itself from
+   * the AsyncMessenger it merely queues itself in this set and is deleted
+   * lazily. "_lookup_conn" must ensure it never returns an
+   * AsyncConnection that is in this set.
+   */
+  Mutex deleted_lock;
+  set<AsyncConnectionRef> deleted_conns;
+
+  /// event dispatched by unregister_conn() once enough dead connections
+  /// have piled up
+  EventCallbackRef reap_handler;
+
+  /// internal cluster protocol version, if any, for talking to entities of the same type.
+  int cluster_protocol;
+
+  Cond  stop_cond;
+  bool stopped;
+
+  /**
+   * Look up the connection registered for the given address vector.
+   * Caller must hold lock.
+   *
+   * @return the registered connection, or NULL if there is none or if the
+   * registered one was already queued in deleted_conns (in which case it
+   * is removed from both deleted_conns and conns here).
+   */
+  AsyncConnectionRef _lookup_conn(const entity_addrvec_t& k) {
+    ceph_assert(lock.is_locked());
+    auto p = conns.find(k);
+    if (p == conns.end())
+      return NULL;
+
+    // lazy delete, see "deleted_conns"
+    Mutex::Locker l(deleted_lock);
+    if (deleted_conns.erase(p->second)) {
+      conns.erase(p);
+      return NULL;
+    }
+
+    return p->second;
+  }
+
+  /**
+   * Point local_connection at our own addresses and name type, enable all
+   * features on it and deliver the fast-connect notification.
+   * Caller must hold lock.
+   */
+  void _init_local_connection() {
+    ceph_assert(lock.is_locked());
+    local_connection->peer_addrs = *my_addrs;
+    local_connection->peer_type = my_name.type();
+    local_connection->set_features(CEPH_FEATURES_ALL);
+    ms_deliver_handle_fast_connect(local_connection.get());
+  }
+
+  void shutdown_connections(bool queue_reset);
+
+public:
+
+  /// con used for sending messages to ourselves
+  AsyncConnectionRef local_connection;
+
+  /**
+   * @defgroup AsyncMessenger internals
+   * @{
+   */
+  /**
+   * This wraps _lookup_conn, taking the messenger lock around the call.
+   */
+  AsyncConnectionRef lookup_conn(const entity_addrvec_t& k) {
+    Mutex::Locker l(lock);
+    return _lookup_conn(k);
+  }
+
+  int accept_conn(AsyncConnectionRef conn);
+  bool learned_addr(const entity_addr_t &peer_addr_for_me);
+  void add_accept(Worker *w, ConnectedSocket cli_socket,
+		  const entity_addr_t &listen_addr,
+		  const entity_addr_t &peer_addr);
+  NetworkStack *get_stack() {
+    return stack;
+  }
+
+  uint64_t get_nonce() const {
+    return nonce;
+  }
+
+  /**
+   * Increment the global sequence for this AsyncMessenger and return it.
+   * This is for the connect protocol, although it doesn't hurt if somebody
+   * else calls it.
+   *
+   * @param old if larger than the current counter, the counter is first
+   * raised to this value, so the result is always greater than old.
+   *
+   * @return a global sequence ID that nobody else has seen.
+   */
+  __u32 get_global_seq(__u32 old=0) {
+    std::lock_guard<ceph::spinlock> lg(global_seq_lock);
+
+    if (old > global_seq)
+      global_seq = old;
+    __u32 ret = ++global_seq;
+
+    return ret;
+  }
+  /**
+   * Get the protocol version we support for the given peer type: either
+   * a peer protocol (if it matches our own), the protocol version for the
+   * peer (if we're connecting), or our protocol version (if we're accepting).
+   */
+  int get_proto_version(int peer_type, bool connect) const;
+
+  /**
+   * Fill in the address and peer type for the local connection, which
+   * is used for delivering messages back to ourself.
+   */
+  void init_local_connection() {
+    Mutex::Locker l(lock);
+    _init_local_connection();
+  }
+
+  /**
+   * Unregister connection from `conns`
+   *
+   * Only deleted_lock is taken here: the connection is queued in
+   * deleted_conns, and once ReapDeadConnectionThreshold entries have
+   * accumulated reap_handler is dispatched to local_worker's event center.
+   *
+   * See "deleted_conns"
+   */
+  void unregister_conn(AsyncConnectionRef conn) {
+    Mutex::Locker l(deleted_lock);
+    conn->get_perf_counter()->dec(l_msgr_active_connections);
+    deleted_conns.emplace(std::move(conn));
+
+    if (deleted_conns.size() >= ReapDeadConnectionThreshold) {
+      local_worker->center.dispatch_event_external(reap_handler);
+    }
+  }
+
+  /**
+   * Reap dead connection from `deleted_conns`
+   *
+   * @return the number of dead connections
+   *
+   * See "deleted_conns"
+   */
+  int reap_dead();
+
+  /**
+   * @} // AsyncMessenger Internals
+   */
+} ;
+
+#endif /* CEPH_ASYNCMESSENGER_H */
diff --git a/src/msg/async/Event.cc b/src/msg/async/Event.cc
new file mode 100644
index 00000000..6b5e4c7c
--- /dev/null
+++ b/src/msg/async/Event.cc
@@ -0,0 +1,471 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/errno.h"
+#include "Event.h"
+
+#ifdef HAVE_DPDK
+#include "dpdk/EventDPDK.h"
+#endif
+
+#ifdef HAVE_EPOLL
+#include "EventEpoll.h"
+#else
+#ifdef HAVE_KQUEUE
+#include "EventKqueue.h"
+#else
+#include "EventSelect.h"
+#endif
+#endif
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "EventCallback "
+/// Callback that empties the descriptor it is invoked for; set_owner()
+/// registers it on the read end of the notify pipe.
+class C_handle_notify : public EventCallback {
+  EventCenter *center;
+  CephContext *cct;
+
+ public:
+  C_handle_notify(EventCenter *c, CephContext *cc): center(c), cct(cc) {}
+
+  /// Keep reading from fd_or_id until a read returns no data.
+  void do_request(uint64_t fd_or_id) override {
+    char scratch[256];
+    for (;;) {
+      int nread = read(fd_or_id, scratch, sizeof(scratch));
+      if (nread > 0)
+        continue;  // there may be more, keep draining
+      // EAGAIN is not reported; any other read error gets a log line
+      if (nread < 0 && errno != EAGAIN) {
+        ldout(cct, 1) << __func__ << " read notify pipe failed: " << cpp_strerror(errno) << dendl;
+      }
+      break;
+    }
+  }
+};
+
+#undef dout_prefix
+#define dout_prefix _event_prefix(_dout)
+
+/**
+ * Construct a Poller and append it to its EventCenter's poller list.
+ *
+ * \param center
+ *      EventCenter object through which the poller will be invoked.
+ * \param name
+ *      Human readable name that can be printed out in debugging messages
+ *      about the poller. The name of the superclass is probably sufficient
+ *      for most cases.
+ */
+EventCenter::Poller::Poller(EventCenter* center, const string& name)
+    : owner(center), poller_name(name), slot(center->pollers.size())
+{
+  // slot was set to the current size, i.e. the index this push_back fills
+  center->pollers.push_back(this);
+}
+
+/**
+ * Destroy a Poller, removing it from its EventCenter's poller list.
+ */
+EventCenter::Poller::~Poller()
+{
+  // Erase this Poller from the vector by overwriting it with the
+  // poller that used to be the last one in the vector.
+  //
+  // Note: this approach is reentrant (it is safe to delete a
+  // poller from a poller callback, which means that the poll
+  // method is in the middle of scanning the list of all pollers;
+  // the worst that will happen is that the poller that got moved
+  // may not be invoked in the current scan).
+  owner->pollers[slot] = owner->pollers.back();
+  // the moved poller must learn its new index (a no-op when this poller
+  // was itself the last one)
+  owner->pollers[slot]->slot = slot;
+  owner->pollers.pop_back();
+  // -1 marks this poller as no longer being in the list
+  slot = -1;
+}
+
+/// Emit this center's log prefix (address, nevent, next time-event id) to
+/// *_dout and hand the stream back for chaining.
+ostream& EventCenter::_event_prefix(std::ostream *_dout)
+{
+  std::ostream &out = *_dout;
+  out << "Event(" << this << " nevent=" << nevent;
+  out << " time_id=" << time_event_next_id << ").";
+  return out;
+}
+
+/**
+ * One-time setup of the center: pick and initialize the event driver,
+ * size the file-event table and, if the driver needs wakeups, create the
+ * non-blocking notify pipe.
+ *
+ * @param n number of file-event slots to start with
+ * @param i index of this center (stored in idx)
+ * @param t center type; "dpdk" selects the DPDK driver, anything else the
+ *          platform driver (epoll, kqueue or select)
+ * @return 0 on success; -1 if no driver could be created, otherwise the
+ *         negative error of the failing step
+ */
+int EventCenter::init(int n, unsigned i, const std::string &t)
+{
+  // can't init multi times
+  ceph_assert(nevent == 0);
+
+  type = t;
+  idx = i;
+
+  // NOTE: when "dpdk" is requested but HAVE_DPDK is not defined, nothing is
+  // assigned here, so driver keeps its previous value and the !driver check
+  // below reports the failure.
+  if (t == "dpdk") {
+#ifdef HAVE_DPDK
+    driver = new DPDKDriver(cct);
+#endif
+  } else {
+#ifdef HAVE_EPOLL
+  driver = new EpollDriver(cct);
+#else
+#ifdef HAVE_KQUEUE
+  driver = new KqueueDriver(cct);
+#else
+  driver = new SelectDriver(cct);
+#endif
+#endif
+  }
+
+  if (!driver) {
+    lderr(cct) << __func__ << " failed to create event driver " << dendl;
+    return -1;
+  }
+
+  int r = driver->init(this, n);
+  if (r < 0) {
+    lderr(cct) << __func__ << " failed to init event driver." << dendl;
+    return r;
+  }
+
+  file_events.resize(n);
+  nevent = n;
+
+  // drivers that never need a wakeup do not get a notify pipe
+  if (!driver->need_wakeup())
+    return 0;
+
+  int fds[2];
+  if (pipe_cloexec(fds) < 0) {
+    // capture errno before logging so the returned code is the pipe error
+    int e = errno;
+    lderr(cct) << __func__ << " can't create notify pipe: " << cpp_strerror(e) << dendl;
+    return -e;
+  }
+
+  // both ends are stored before set_nonblock so that the destructor closes
+  // them even if one of the calls below fails
+  notify_receive_fd = fds[0];
+  notify_send_fd = fds[1];
+  r = net.set_nonblock(notify_receive_fd);
+  if (r < 0) {
+    return r;
+  }
+  r = net.set_nonblock(notify_send_fd);
+  if (r < 0) {
+    return r;
+  }
+
+  return r;
+}
+
+/**
+ * Run every externally queued callback that is still pending, drop all
+ * time events, close the notify pipe and release the driver and the
+ * notify handler.
+ */
+EventCenter::~EventCenter()
+{
+  {
+    std::lock_guard<std::mutex> guard(external_lock);
+    for (; !external_events.empty(); external_events.pop_front()) {
+      EventCallbackRef pending = external_events.front();
+      if (pending)
+        pending->do_request(0);
+    }
+  }
+  time_events.clear();
+  //assert(time_events.empty());
+
+  if (notify_receive_fd >= 0)
+    ::close(notify_receive_fd);
+  if (notify_send_fd >= 0)
+    ::close(notify_send_fd);
+
+  delete driver;
+  // deleting a null pointer is a no-op, so no guard is needed
+  delete notify_handler;
+}
+
+
+/**
+ * Record the calling thread as the owner of this center. On the first
+ * call the center is also published in the per-type table of associated
+ * centers and, if the driver needs wakeups, the notify-pipe handler is
+ * installed.
+ */
+void EventCenter::set_owner()
+{
+  owner = pthread_self();
+  ldout(cct, 2) << __func__ << " idx=" << idx << " owner=" << owner << dendl;
+
+  // the remaining setup only happens once
+  if (global_centers)
+    return;
+
+  global_centers = &cct->lookup_or_create_singleton_object<
+    EventCenter::AssociatedCenters>(
+      "AsyncMessenger::EventCenter::global_center::" + type, true);
+  ceph_assert(global_centers);
+  global_centers->centers[idx] = this;
+
+  if (!driver->need_wakeup())
+    return;
+
+  notify_handler = new C_handle_notify(this, cct);
+  int r = create_file_event(notify_receive_fd, EVENT_READABLE, notify_handler);
+  ceph_assert(r == 0);
+}
+
+/**
+ * Register callback ctxt for the events in mask on descriptor fd.
+ * Must be called from the owning thread.
+ *
+ * The file-event table (and the driver's) is grown when fd does not fit.
+ *
+ * @param fd   descriptor to watch
+ * @param mask EVENT_READABLE and/or EVENT_WRITABLE
+ * @param ctxt callback stored for each direction present in mask
+ * @return 0 on success (including when the current mask already equals
+ *         mask), -ERANGE if the driver cannot be resized; a failing
+ *         add_event aborts the process.
+ */
+int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt)
+{
+  ceph_assert(in_thread());
+  int r = 0;
+  if (fd >= nevent) {
+    // grow by factors of four until fd fits
+    // NOTE(review): with nevent == 0 this loop would never terminate;
+    // assumes init() has already set nevent -- confirm for all callers.
+    int new_size = nevent << 2;
+    while (fd >= new_size)
+      new_size <<= 2;
+    ldout(cct, 20) << __func__ << " event count exceed " << nevent << ", expand to " << new_size << dendl;
+    r = driver->resize_events(new_size);
+    if (r < 0) {
+      lderr(cct) << __func__ << " event count is exceed." << dendl;
+      return -ERANGE;
+    }
+    file_events.resize(new_size);
+    nevent = new_size;
+  }
+
+  EventCenter::FileEvent *event = _get_file_event(fd);
+  ldout(cct, 20) << __func__ << " create event started fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+  // nothing to do when exactly this mask is already registered; note that
+  // ctxt is not stored in that case
+  if (event->mask == mask)
+    return 0;
+
+  r = driver->add_event(fd, event->mask, mask);
+  if (r < 0) {
+    // Actually we don't allow any failed error code, caller doesn't prepare to
+    // handle error status. So now we need to assert failure here. In practice,
+    // add_event shouldn't report error, otherwise it must be a innermost bug!
+    lderr(cct) << __func__ << " add event failed, ret=" << r << " fd=" << fd
+               << " mask=" << mask << " original mask is " << event->mask << dendl;
+    ceph_abort_msg("BUG!");
+    return r;
+  }
+
+  event->mask |= mask;
+  if (mask & EVENT_READABLE) {
+    event->read_cb = ctxt;
+  }
+  if (mask & EVENT_WRITABLE) {
+    event->write_cb = ctxt;
+  }
+  // at this point event->mask already holds the merged mask, so the value
+  // logged as "original mask" below is the updated one
+  ldout(cct, 20) << __func__ << " create event end fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+  return 0;
+}
+
+/**
+ * Stop watching the events in mask on descriptor fd and forget the
+ * matching callbacks. Must be called from the owning thread with fd >= 0.
+ *
+ * Descriptors beyond the current table size and descriptors without any
+ * registered event are ignored. A failing del_event aborts the process.
+ *
+ * @param fd   descriptor to update
+ * @param mask EVENT_READABLE and/or EVENT_WRITABLE to remove
+ */
+void EventCenter::delete_file_event(int fd, int mask)
+{
+  ceph_assert(in_thread() && fd >= 0);
+  if (fd >= nevent) {
+    // the separator before "mask=" keeps the two numbers apart in the log
+    ldout(cct, 1) << __func__ << " delete event fd=" << fd << " is equal or greater than nevent=" << nevent
+                  << " mask=" << mask << dendl;
+    return ;
+  }
+  EventCenter::FileEvent *event = _get_file_event(fd);
+  ldout(cct, 30) << __func__ << " delete event started fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+  // nothing registered on this descriptor
+  if (!event->mask)
+    return ;
+
+  int r = driver->del_event(fd, event->mask, mask);
+  if (r < 0) {
+    // see create_file_event
+    ceph_abort_msg("BUG!");
+  }
+
+  if (mask & EVENT_READABLE && event->read_cb) {
+    event->read_cb = nullptr;
+  }
+  if (mask & EVENT_WRITABLE && event->write_cb) {
+    event->write_cb = nullptr;
+  }
+
+  event->mask = event->mask & (~mask);
+  // event->mask is the remaining mask by now
+  ldout(cct, 30) << __func__ << " delete event end fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+}
+
+/**
+ * Schedule ctxt to run once the given number of microseconds has passed.
+ * Must be called from the owning thread.
+ *
+ * @return the id of the new time event (usable with delete_time_event)
+ */
+uint64_t EventCenter::create_time_event(uint64_t microseconds, EventCallbackRef ctxt)
+{
+  ceph_assert(in_thread());
+  const uint64_t id = time_event_next_id++;
+
+  ldout(cct, 30) << __func__ << " id=" << id << " trigger after " << microseconds << "us"<< dendl;
+
+  EventCenter::TimeEvent event;
+  event.id = id;
+  event.time_cb = ctxt;
+  const clock_type::time_point expire =
+      clock_type::now() + std::chrono::microseconds(microseconds);
+
+  // remember where the entry lives so it can be erased by id later
+  auto pos = time_events.emplace(expire, event);
+  event_map[id] = pos;
+
+  return id;
+}
+
+/**
+ * Cancel the time event with the given id, if it is still pending.
+ * Ids that were never handed out (0, or not yet allocated) are ignored.
+ * Must be called from the owning thread.
+ */
+void EventCenter::delete_time_event(uint64_t id)
+{
+  ceph_assert(in_thread());
+  ldout(cct, 30) << __func__ << " id=" << id << dendl;
+  if (id == 0 || id >= time_event_next_id)
+    return ;
+
+  auto entry = event_map.find(id);
+  if (entry != event_map.end()) {
+    time_events.erase(entry->second);
+    event_map.erase(entry);
+    return ;
+  }
+
+  ldout(cct, 10) << __func__ << " id=" << id << " not found" << dendl;
+}
+
+/**
+ * Wake the center up by writing one byte to the notify pipe. Nothing is
+ * written when pollers exist or when the driver needs no wakeup. A write
+ * error other than EAGAIN aborts the process.
+ */
+void EventCenter::wakeup()
+{
+  // No need to wake up since we never sleep
+  const bool never_sleeps = !pollers.empty() || !driver->need_wakeup();
+  if (never_sleeps)
+    return ;
+
+  ldout(cct, 20) << __func__ << dendl;
+  // wake up "event_wait"
+  char token = 'c';
+  int written = write(notify_send_fd, &token, sizeof(token));
+  if (written < 0 && errno != EAGAIN) {
+    ldout(cct, 1) << __func__ << " write notify pipe failed: " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+}
+
+/**
+ * Run the callbacks of all time events whose expiry is not later than the
+ * time sampled on entry, earliest first.
+ *
+ * @return the number of time events that were processed
+ */
+int EventCenter::process_time_events()
+{
+  const clock_type::time_point now = clock_type::now();
+  ldout(cct, 30) << __func__ << " cur time is " << now << dendl;
+
+  int fired = 0;
+  while (!time_events.empty()) {
+    auto head = time_events.begin();
+    if (now < head->first)
+      break;  // the earliest event is not due yet, so none of them is
+
+    // copy what we need, then drop the entry before its callback runs
+    const uint64_t id = head->second.id;
+    EventCallbackRef cb = head->second.time_cb;
+    time_events.erase(head);
+    event_map.erase(id);
+    ldout(cct, 30) << __func__ << " process time event: id=" << id << dendl;
+    ++fired;
+    cb->do_request(id);
+  }
+
+  return fired;
+}
+
+/**
+ * One pass of the event loop: wait for file events (for at most
+ * timeout_microseconds, less if a time event is due earlier, and not at
+ * all when pollers or external events exist), then run fired file
+ * callbacks, due time events, queued external events and finally the
+ * pollers.
+ *
+ * @param timeout_microseconds upper bound for the blocking wait
+ * @param working_dur if non-null, receives the time spent after the wait
+ *        returned
+ * @return the number of events handled in this pass
+ */
+int EventCenter::process_events(unsigned timeout_microseconds,  ceph::timespan *working_dur)
+{
+  struct timeval tv;
+  int numevents;
+  bool trigger_time = false;
+  auto now = clock_type::now();
+
+  auto it = time_events.begin();
+  bool blocking = pollers.empty() && !external_num_events.load();
+  // If exists external events or poller, don't block
+  if (!blocking) {
+    if (it != time_events.end() && now >= it->first)
+      trigger_time = true;
+    tv.tv_sec = 0;
+    tv.tv_usec = 0;
+  } else {
+    clock_type::time_point shortest;
+    shortest = now + std::chrono::microseconds(timeout_microseconds); 
+
+    // shorten the wait if the earliest time event is due before the timeout
+    if (it != time_events.end() && shortest >= it->first) {
+      ldout(cct, 30) << __func__ << " shortest is " << shortest << " it->first is " << it->first << dendl;
+      shortest = it->first;
+      trigger_time = true;
+      if (shortest > now) {
+        timeout_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(
+            shortest - now).count();
+      } else {
+        // already overdue: do not wait at all
+        shortest = now;
+        timeout_microseconds = 0;
+      }
+    }
+    tv.tv_sec = timeout_microseconds / 1000000;
+    tv.tv_usec = timeout_microseconds % 1000000;
+  }
+
+  ldout(cct, 30) << __func__ << " wait second " << tv.tv_sec << " usec " << tv.tv_usec << dendl;
+  vector<FiredFileEvent> fired_events;
+  numevents = driver->event_wait(fired_events, &tv);
+  // everything from here on counts as working time
+  auto working_start = ceph::mono_clock::now();
+  for (int j = 0; j < numevents; j++) {
+    int rfired = 0;
+    FileEvent *event;
+    EventCallbackRef cb;
+    event = _get_file_event(fired_events[j].fd);
+
+    /* note the event->mask & mask & ... code: maybe an already processed
+    * event removed an element that fired and we still didn't
+    * processed, so we check if the event is still valid. */
+    if (event->mask & fired_events[j].mask & EVENT_READABLE) {
+      rfired = 1;
+      cb = event->read_cb;
+      cb->do_request(fired_events[j].fd);
+    }
+
+    if (event->mask & fired_events[j].mask & EVENT_WRITABLE) {
+      // do not run the same callback twice when it serves both directions
+      if (!rfired || event->read_cb != event->write_cb) {
+        cb = event->write_cb;
+        cb->do_request(fired_events[j].fd);
+      }
+    }
+
+    ldout(cct, 30) << __func__ << " event_wq process is " << fired_events[j].fd << " mask is " << fired_events[j].mask << dendl;
+  }
+
+  if (trigger_time)
+    numevents += process_time_events();
+
+  if (external_num_events.load()) {
+    // take the whole queue under the lock, run the callbacks outside of it
+    external_lock.lock();
+    deque<EventCallbackRef> cur_process;
+    cur_process.swap(external_events);
+    external_num_events.store(0);
+    external_lock.unlock();
+    numevents += cur_process.size();
+    while (!cur_process.empty()) {
+      EventCallbackRef e = cur_process.front();
+      ldout(cct, 30) << __func__ << " do " << e << dendl;
+      e->do_request(0);
+      cur_process.pop_front();
+    }
+  }
+
+  // pollers only run when nothing else was handled in a non-blocking pass
+  if (!numevents && !blocking) {
+    for (uint32_t i = 0; i < pollers.size(); i++)
+      numevents += pollers[i]->poll();
+  }
+
+  if (working_dur)
+    *working_dur = ceph::mono_clock::now() - working_start;
+  return numevents;
+}
+
+/**
+ * Queue callback e to be run by the center's own thread. A callback that
+ * is already the most recently queued entry is not queued again. The
+ * center is woken up when the queue goes from empty to non-empty and the
+ * caller is not the owning thread.
+ */
+void EventCenter::dispatch_event_external(EventCallbackRef e)
+{
+  uint64_t pending = 0;
+  {
+    std::lock_guard guard{external_lock};
+    const bool already_last =
+        external_num_events > 0 && external_events.back() == e;
+    if (already_last)
+      return;
+    external_events.push_back(e);
+    pending = ++external_num_events;
+  }
+  if (pending == 1 && !in_thread())
+    wakeup();
+
+  ldout(cct, 30) << __func__ << " " << e << " pending " << pending << dendl;
+}
diff --git a/src/msg/async/Event.h b/src/msg/async/Event.h
new file mode 100644
index 00000000..6736060e
--- /dev/null
+++ b/src/msg/async/Event.h
@@ -0,0 +1,266 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENT_H
+#define CEPH_MSG_EVENT_H
+
+#ifdef __APPLE__
+#include <AvailabilityMacros.h>
+#endif
+
+// We use epoll, kqueue, evport, select in descending order by performance.
+#if defined(__linux__)
+#define HAVE_EPOLL 1
+#endif
+
+#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__)
+#define HAVE_KQUEUE 1
+#endif
+
+#ifdef __sun
+#include <sys/feature_tests.h>
+#ifdef _DTRACE_VERSION
+#define HAVE_EVPORT 1
+#endif
+#endif
+
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+
+#include "common/ceph_time.h"
+#include "common/dout.h"
+#include "net_handler.h"
+
+#define EVENT_NONE 0
+#define EVENT_READABLE 1
+#define EVENT_WRITABLE 2
+
+class EventCenter;
+
+/// Interface for callbacks run by an EventCenter.
+class EventCallback {
+
+ public:
+  /// Invoked when the event fires. The argument is the file descriptor for
+  /// file events and the event id for time events; externally dispatched
+  /// events are invoked with 0.
+  virtual void do_request(uint64_t fd_or_id) = 0;
+  virtual ~EventCallback() {}       // we want a virtual destructor!!!
+};
+
+typedef EventCallback* EventCallbackRef;
+
+/// One ready descriptor as reported by EventDriver::event_wait().
+struct FiredFileEvent {
+  int fd;    // descriptor that became ready
+  int mask;  // EVENT_READABLE / EVENT_WRITABLE bits that fired
+};
+
+/*
+ * EventDriver wraps the event mechanism offered by the operating system.
+ * For example, Linux uses epoll(2), the BSDs use kqueue(2), and select is
+ * the fallback when nothing better is available.
+ */
+class EventDriver {
+ public:
+  virtual ~EventDriver() {}       // we want a virtual destructor!!!
+  /// Prepare the driver for up to nevent descriptors; negative on error.
+  virtual int init(EventCenter *center, int nevent) = 0;
+  /// Start watching mask on fd; cur_mask is what is already being watched.
+  virtual int add_event(int fd, int cur_mask, int mask) = 0;
+  /// Stop watching del_mask on fd; cur_mask is what is currently watched.
+  virtual int del_event(int fd, int cur_mask, int del_mask) = 0;
+  /// Wait up to *tp for events and append them to fired_events; the return
+  /// value is used by the caller as the number of fired events.
+  virtual int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) = 0;
+  /// Grow the driver to newsize descriptors; negative on error.
+  virtual int resize_events(int newsize) = 0;
+  /// Whether a sleeping event_wait() must be woken through the notify pipe.
+  virtual bool need_wakeup() { return true; }
+};
+
+/*
+ * EventCenter maintains a set of file descriptors and handles the events
+ * registered on them, together with time events and callbacks dispatched
+ * from other threads.
+ */
+class EventCenter {
+ public:
+  // should be enough;
+  static const int MAX_EVENTCENTER = 24;
+
+ private:
+  using clock_type = ceph::coarse_mono_clock;
+
+  /// Table of centers indexed by EventCenter::idx, used by submit_to().
+  struct AssociatedCenters {
+    EventCenter *centers[MAX_EVENTCENTER];
+    AssociatedCenters() {
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(centers, 0, MAX_EVENTCENTER * sizeof(EventCenter*));
+    }
+  };
+
+  /// Per-descriptor registration: watched mask plus one callback per
+  /// direction.
+  struct FileEvent {
+    int mask;
+    EventCallbackRef read_cb;
+    EventCallbackRef write_cb;
+    FileEvent(): mask(0), read_cb(NULL), write_cb(NULL) {}
+  };
+
+  /// A pending timer: its id and the callback to run on expiry.
+  struct TimeEvent {
+    uint64_t id;
+    EventCallbackRef time_cb;
+
+    TimeEvent(): id(0), time_cb(NULL) {}
+  };
+
+ public:
+  /**
+     * A Poller object is invoked once each time through the dispatcher's
+     * inner polling loop.
+     */
+  class Poller {
+   public:
+    explicit Poller(EventCenter* center, const string& pollerName);
+    virtual ~Poller();
+
+    /**
+     * This method is defined by a subclass and invoked once by the
+     * center during each pass through its inner polling loop.
+     *
+     * \return
+     *      1 means that this poller did useful work during this call.
+     *      0 means that the poller found no work to do.
+     */
+    virtual int poll() = 0;
+
+   private:
+    /// The EventCenter object that owns this Poller.
+    EventCenter* owner;
+
+    /// Human-readable string name given to the poller to make it
+    /// easy to identify for debugging. For most pollers just passing
+    /// in the subclass name probably makes sense.
+    string poller_name;
+
+    /// Index of this Poller in EventCenter::pollers.  Allows deletion
+    /// without having to scan all the entries in pollers. The destructor
+    /// sets it to -1 once the poller has been removed from the list.
+    int slot;
+  };
+
+ private:
+  CephContext *cct;
+  std::string type;
+  // current number of slots in file_events
+  int nevent;
+  // Used only to external event
+  pthread_t owner = 0;
+  std::mutex external_lock;
+  std::atomic_ulong external_num_events;
+  deque<EventCallbackRef> external_events;
+  vector<FileEvent> file_events;
+  EventDriver *driver;
+  std::multimap<clock_type::time_point, TimeEvent> time_events;
+  // Keeps track of all of the pollers currently defined.  We don't
+  // use an intrusive list here because it isn't reentrant: we need
+  // to add/remove elements while the center is traversing the list.
+  std::vector<Poller*> pollers;
+  // time event id -> position in time_events, for deletion by id
+  std::map<uint64_t, std::multimap<clock_type::time_point, TimeEvent>::iterator> event_map;
+  uint64_t time_event_next_id;
+  int notify_receive_fd;
+  int notify_send_fd;
+  NetHandler net;
+  EventCallbackRef notify_handler;
+  unsigned idx;
+  AssociatedCenters *global_centers = nullptr;
+
+  int process_time_events();
+  /// Slot for fd; fd must be below nevent.
+  FileEvent *_get_file_event(int fd) {
+    ceph_assert(fd < nevent);
+    return &file_events[fd];
+  }
+
+ public:
+  explicit EventCenter(CephContext *c):
+    cct(c), nevent(0),
+    external_num_events(0),
+    driver(NULL), time_event_next_id(1),
+    notify_receive_fd(-1), notify_send_fd(-1), net(c),
+    notify_handler(NULL), idx(0) { }
+  ~EventCenter();
+  ostream& _event_prefix(std::ostream *_dout);
+
+  int init(int nevent, unsigned idx, const std::string &t);
+  void set_owner();
+  pthread_t get_owner() const { return owner; }
+  unsigned get_id() const { return idx; }
+
+  EventDriver *get_driver() { return driver; }
+
+  // Used by internal thread
+  int create_file_event(int fd, int mask, EventCallbackRef ctxt);
+  // the delay is given in microseconds (the definition converts it with
+  // std::chrono::microseconds)
+  uint64_t create_time_event(uint64_t microseconds, EventCallbackRef ctxt);
+  void delete_file_event(int fd, int mask);
+  void delete_time_event(uint64_t id);
+  int process_events(unsigned timeout_microseconds, ceph::timespan *working_dur = nullptr);
+  void wakeup();
+
+  // Used by external thread
+  void dispatch_event_external(EventCallbackRef e);
+  /// True when the caller runs on the thread recorded by set_owner().
+  inline bool in_thread() const {
+    return pthread_equal(pthread_self(), owner);
+  }
+
+ private:
+  /// Wraps a callable so it can be queued as an external event. With
+  /// nonwait set the object deletes itself after running; otherwise the
+  /// submitter blocks in wait() until the callable has run.
+  template <typename func>
+  class C_submit_event : public EventCallback {
+    std::mutex lock;
+    std::condition_variable cond;
+    bool done = false;
+    func f;
+    bool nonwait;
+   public:
+    C_submit_event(func &&_f, bool nw)
+      : f(std::move(_f)), nonwait(nw) {}
+    void do_request(uint64_t id) override {
+      f();
+      lock.lock();
+      cond.notify_all();
+      done = true;
+      // read nonwait before unlocking: in the waiting case the object lives
+      // on the waiter's stack and may be gone once the lock is released
+      bool del = nonwait;
+      lock.unlock();
+      if (del)
+        delete this;
+    }
+    void wait() {
+      ceph_assert(!nonwait);
+      std::unique_lock<std::mutex> l(lock);
+      while (!done)
+        cond.wait(l);
+    }
+  };
+
+ public:
+  /**
+   * Run f on the thread of the center with index i.
+   *
+   * When nowait is false and the caller already is that thread, f runs
+   * inline. Otherwise f is queued as an external event; with nowait the
+   * call returns immediately, without it the call blocks until f has run.
+   */
+  template <typename func>
+  void submit_to(int i, func &&f, bool nowait = false) {
+    ceph_assert(i < MAX_EVENTCENTER && global_centers);
+    EventCenter *c = global_centers->centers[i];
+    ceph_assert(c);
+    if (!nowait && c->in_thread()) {
+      f();
+      return ;
+    }
+    if (nowait) {
+      // heap-allocated: the event deletes itself after it has run
+      C_submit_event<func> *event = new C_submit_event<func>(std::move(f), true);
+      c->dispatch_event_external(event);
+    } else {
+      C_submit_event<func> event(std::move(f), false);
+      c->dispatch_event_external(&event);
+      event.wait();
+    }
+  };
+};
+
+#endif
diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc
new file mode 100644
index 00000000..37b46973
--- /dev/null
+++ b/src/msg/async/EventEpoll.cc
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include <fcntl.h>
+#include "EventEpoll.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "EpollDriver."
+
+/**
+ * Allocate the result buffer and create the epoll instance.
+ *
+ * @param c      owning event center (not used by this driver)
+ * @param nevent capacity of the buffer later handed to epoll_wait()
+ * @return 0 on success, -ENOMEM if the buffer cannot be allocated, otherwise
+ *         the negated errno of the failing epoll_create()/fcntl() call
+ */
+int EpollDriver::init(EventCenter *c, int nevent)
+{
+  events = (struct epoll_event*)malloc(sizeof(struct epoll_event)*nevent);
+  if (!events) {
+    lderr(cct) << __func__ << " unable to malloc memory. " << dendl;
+    return -ENOMEM;
+  }
+  memset(events, 0, sizeof(struct epoll_event)*nevent);
+
+  epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */
+  if (epfd == -1) {
+    // Snapshot errno first: the logging below may overwrite it.
+    int e = errno;
+    lderr(cct) << __func__ << " unable to do epoll_create: "
+                       << cpp_strerror(e) << dendl;
+    return -e;
+  }
+  if (::fcntl(epfd, F_SETFD, FD_CLOEXEC) == -1) {
+    int e = errno;
+    ::close(epfd);
+    // Forget the descriptor so the destructor does not close it a second
+    // time (by then the number may belong to an unrelated file).
+    epfd = -1;
+    lderr(cct) << __func__ << " unable to set cloexec: "
+                       << cpp_strerror(e) << dendl;
+
+    return -e;
+  }
+
+  size = nevent;
+
+  return 0;
+}
+
+/**
+ * Start watching `fd` for the events in `add_mask` in addition to those it is
+ * already watched for (`cur_mask`).
+ *
+ * @param fd       descriptor to (re)register
+ * @param cur_mask events currently registered; EVENT_NONE selects
+ *                 EPOLL_CTL_ADD, anything else EPOLL_CTL_MOD
+ * @param add_mask EVENT_READABLE / EVENT_WRITABLE bits to add
+ * @return 0 on success, negated errno from epoll_ctl() on failure
+ */
+int EpollDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask
+                 << " add_mask=" << add_mask << " to " << epfd << dendl;
+  struct epoll_event ee;
+  /* If the fd was already monitored for some event, we need a MOD
+   * operation. Otherwise we need an ADD operation. */
+  int op;
+  op = cur_mask == EVENT_NONE ? EPOLL_CTL_ADD: EPOLL_CTL_MOD;
+
+  ee.events = EPOLLET;
+  add_mask |= cur_mask; /* Merge old events */
+  if (add_mask & EVENT_READABLE)
+    ee.events |= EPOLLIN;
+  if (add_mask & EVENT_WRITABLE)
+    ee.events |= EPOLLOUT;
+  ee.data.u64 = 0; /* avoid valgrind warning */
+  ee.data.fd = fd;
+  if (epoll_ctl(epfd, op, fd, &ee) == -1) {
+    // Capture errno before logging, which may overwrite it.
+    int e = errno;
+    lderr(cct) << __func__ << " epoll_ctl: add fd=" << fd << " failed. "
+               << cpp_strerror(e) << dendl;
+    return -e;
+  }
+
+  return 0;
+}
+
+/**
+ * Stop watching `fd` for the events in `delmask`.
+ *
+ * If any events remain (cur_mask & ~delmask) the registration is modified,
+ * otherwise the descriptor is removed from the epoll set.
+ *
+ * @return 0 on success, negated errno from epoll_ctl() on failure
+ */
+int EpollDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
+                 << " delmask=" << delmask << " to " << epfd << dendl;
+  struct epoll_event ee;
+  int mask = cur_mask & (~delmask);
+
+  // NOTE(review): add_event() registers with EPOLLET but this MOD path does
+  // not set it, so the fd appears to fall back to level-triggered mode after
+  // a partial delete -- confirm whether that is intended.
+  ee.events = 0;
+  if (mask & EVENT_READABLE) ee.events |= EPOLLIN;
+  if (mask & EVENT_WRITABLE) ee.events |= EPOLLOUT;
+  ee.data.u64 = 0; /* avoid valgrind warning */
+  ee.data.fd = fd;
+  if (mask != EVENT_NONE) {
+    if (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee) < 0) {
+      // Capture errno before logging, which may overwrite it.
+      int e = errno;
+      lderr(cct) << __func__ << " epoll_ctl: modify fd=" << fd << " mask=" << mask
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  } else {
+    /* Note, Kernel < 2.6.9 requires a non null event pointer even for
+     * EPOLL_CTL_DEL. */
+    if (epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &ee) < 0) {
+      int e = errno;
+      lderr(cct) << __func__ << " epoll_ctl: delete fd=" << fd
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  }
+  return 0;
+}
+
+// Resize request from the event center.  This driver does nothing with
+// `newsize` and always reports success.
+int EpollDriver::resize_events(int newsize)
+{
+  return 0;
+}
+
+/**
+ * Wait for events and translate them into FiredFileEvent entries.
+ *
+ * @param fired_events resized to the number of ready descriptors and filled
+ *                     with their fd and EVENT_READABLE/EVENT_WRITABLE mask
+ * @param tvp          maximum wait; NULL is passed to epoll_wait() as -1
+ * @return number of fired events; 0 when epoll_wait() returned 0 or failed
+ */
+int EpollDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  // epoll_wait() takes milliseconds.
+  int timeout_ms = -1;
+  if (tvp)
+    timeout_ms = tvp->tv_sec*1000 + tvp->tv_usec/1000;
+
+  const int nready = epoll_wait(epfd, events, size, timeout_ms);
+  if (nready <= 0)
+    return 0;
+
+  fired_events.resize(nready);
+  for (int idx = 0; idx < nready; ++idx) {
+    const struct epoll_event &ev = events[idx];
+    int mask = 0;
+
+    if (ev.events & EPOLLIN)
+      mask |= EVENT_READABLE;
+    if (ev.events & EPOLLOUT)
+      mask |= EVENT_WRITABLE;
+    // Errors and hang-ups are reported as both readable and writable.
+    if (ev.events & (EPOLLERR | EPOLLHUP))
+      mask |= EVENT_READABLE | EVENT_WRITABLE;
+
+    fired_events[idx].fd = ev.data.fd;
+    fired_events[idx].mask = mask;
+  }
+  return nready;
+}
diff --git a/src/msg/async/EventEpoll.h b/src/msg/async/EventEpoll.h
new file mode 100644
index 00000000..abc4b8bb
--- /dev/null
+++ b/src/msg/async/EventEpoll.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTEPOLL_H
+#define CEPH_MSG_EVENTEPOLL_H
+
+#include <unistd.h>
+#include <sys/epoll.h>
+
+#include "Event.h"
+
+// EventDriver backend implemented with epoll.
+class EpollDriver : public EventDriver {
+  int epfd;                    // epoll descriptor, -1 while not created
+  struct epoll_event *events;  // malloc'ed buffer filled by epoll_wait()
+  CephContext *cct;
+  int size;                    // capacity of `events`, set by init()
+
+ public:
+  explicit EpollDriver(CephContext *c)
+    : epfd(-1),
+      events(nullptr),
+      cct(c),
+      size(0) {}
+
+  ~EpollDriver() override {
+    if (epfd != -1) {
+      close(epfd);
+    }
+    // free() accepts a null pointer, so no guard is required.
+    free(events);
+  }
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(vector<FiredFileEvent> &fired_events,
+                 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/EventKqueue.cc b/src/msg/async/EventKqueue.cc
new file mode 100644
index 00000000..d6ba4a3d
--- /dev/null
+++ b/src/msg/async/EventKqueue.cc
@@ -0,0 +1,267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "EventKqueue.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "KqueueDriver."
+
+#define KEVENT_NOWAIT 0
+
+/**
+ * Probe whether `kqfd` still refers to a usable kqueue by issuing a kevent()
+ * call with no changes and no result slots.
+ *
+ * @return kqfd when the probe succeeds, negated errno otherwise
+ */
+int KqueueDriver::test_kqfd() {
+  struct kevent ke[1];
+  if (kevent(kqfd, ke, 0, NULL, 0, KEVENT_NOWAIT) == -1) {
+    // Capture errno before logging, which may overwrite it.
+    int e = errno;
+    ldout(cct,0) << __func__ << " invalid kqfd = " << kqfd
+                 << ": " << cpp_strerror(e) << dendl;
+    return -e;
+  }
+  return kqfd;
+}
+
+/**
+ * Re-register every event recorded in `sav_events` on the current `kqfd`.
+ *
+ * The bookkeeping table holds `sav_max` entries and can outgrow `size` (the
+ * capacity of the kevent result buffer) once resize_events() has run, so the
+ * walk is bounded by `sav_max`.
+ *
+ * @return 0 on success, negated errno from kevent() on the first failure
+ */
+int KqueueDriver::restore_events() {
+  struct kevent ke[2];
+
+  ldout(cct,30) << __func__ << " on kqfd = " << kqfd << dendl;
+  for (int i = 0; i < sav_max; i++) {
+    int num = 0;
+    if (sav_events[i].mask == 0 )
+      continue;
+    ldout(cct,30) << __func__ << " restore kqfd = " << kqfd
+                  << " fd = " << i << " mask " << sav_events[i].mask << dendl;
+    // NOTE(review): add_event() registers with EV_ADD|EV_CLEAR while this
+    // path uses plain EV_ADD -- confirm whether the difference is intended.
+    if (sav_events[i].mask & EVENT_READABLE)
+      EV_SET(&ke[num++], i, EVFILT_READ, EV_ADD, 0, 0, NULL);
+    if (sav_events[i].mask & EVENT_WRITABLE)
+      EV_SET(&ke[num++], i, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
+    if (num) {
+      if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) {
+        // Capture errno before logging, which may overwrite it.
+        int e = errno;
+        ldout(cct,0) << __func__ << " unable to add event: "
+                     << cpp_strerror(e) << dendl;
+        return -e;
+      }
+    }
+  }
+  return 0;
+}
+
+/**
+ * Make sure `kqfd` is usable from the calling thread.
+ *
+ * When the calling thread differs from `mythread`, or the existing kqfd fails
+ * the test_kqfd() probe, a fresh kqueue is created and all saved events are
+ * restored onto it.
+ *
+ * @param funcname name of the caller, used only as log prefix
+ * @return 0 on success, a negative error code when kqueue() or
+ *         restore_events() fails
+ */
+int KqueueDriver::test_thread_change(const char* funcname) {
+  // check to see if we changed thread, because that invalidates
+  // the kqfd and we need to restore that
+  int oldkqfd = kqfd;
+
+  if (!pthread_equal(mythread, pthread_self())) {
+    ldout(cct,20) << funcname << " We changed thread from " << mythread
+                  << " to " << pthread_self() << dendl;
+    mythread = pthread_self();
+    kqfd = -1;
+  } else if ((kqfd != -1) && (test_kqfd() < 0)) {
+    // should this ever happen?
+    // It would be strange to change kqfd with thread change.
+    // Might need to change this into a ceph_assert() in the future.
+    ldout(cct,0) << funcname << " Warning: Recreating old kqfd. "
+                 << "This should not happen!!!"  << dendl;
+    kqfd = -1;
+  }
+  if (kqfd == -1) {
+    kqfd = kqueue();
+    // Snapshot errno right away: the debug log below may overwrite it.
+    const int kq_errno = errno;
+    ldout(cct,30) << funcname << " kqueue: new kqfd = " << kqfd
+                  << " (was: " << oldkqfd << ")"
+                  << dendl;
+    if (kqfd < 0) {
+      lderr(cct) << funcname << " unable to do kqueue: "
+                             << cpp_strerror(kq_errno) << dendl;
+      return -kq_errno;
+    }
+    // restore_events() already returns a negated errno; pass it through
+    // instead of re-reading errno after more logging.
+    int r = restore_events();
+    if (r < 0) {
+      lderr(cct) << funcname << " unable restore all events "
+                             << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+  return 0;
+}
+
+/**
+ * Allocate the kevent result buffer and the saved-event table, both sized
+ * for `nevent` entries and zero-filled.  The kqueue descriptor itself is
+ * created lazily, so `kqfd` is left at -1.
+ *
+ * @return 0 on success, -ENOMEM when either allocation fails
+ */
+int KqueueDriver::init(EventCenter *c, int nevent)
+{
+  // Remember the initialising thread: a change of thread kills the kqfd.
+  mythread = pthread_self();
+
+  // Buffer that receives the events returned by kevent().
+  const size_t res_bytes = sizeof(struct kevent)*nevent;
+  res_events = static_cast<struct kevent*>(malloc(res_bytes));
+  if (res_events == NULL) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+                           << cpp_strerror(errno) << dendl;
+    return -ENOMEM;
+  }
+  memset(res_events, 0, res_bytes);
+  size = nevent;
+
+  // Table of everything registered so far, so it can be replayed when the
+  // thread ID changes.
+  const size_t sav_bytes = sizeof(struct SaveEvent)*nevent;
+  sav_events = static_cast<struct SaveEvent*>(malloc(sav_bytes));
+  if (sav_events == NULL) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+                           << cpp_strerror(errno) << dendl;
+    return -ENOMEM;
+  }
+  memset(sav_events, 0, sav_bytes);
+  sav_max = nevent;
+
+  // Delay assigning a descriptor until it is really needed.
+  kqfd = -1;
+  return 0;
+}
+
+/**
+ * Register `fd` for the events in `add_mask` and record the resulting mask
+ * in the saved-event table.
+ *
+ * @param fd       descriptor to register
+ * @param cur_mask events already registered for fd
+ * @param add_mask EVENT_READABLE / EVENT_WRITABLE bits to add
+ * @return 0 on success, a negative error code otherwise
+ */
+int KqueueDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  struct kevent ke[2];
+  int num = 0;
+
+  ldout(cct,30) << __func__ << " add event kqfd = " << kqfd << " fd = " << fd
+        << " cur_mask = " << cur_mask << " add_mask = " << add_mask
+        << dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  // Make room in the bookkeeping table before touching the kernel.  Grow by
+  // the usual 5000-entry step, but never less than what `fd` needs, so the
+  // store below is always in bounds.
+  if (fd >= sav_max) {
+    int want = sav_max + 5000;
+    if (fd >= want)
+      want = fd + 1;
+    r = resize_events(want);
+    if (r < 0)
+      return r;
+  }
+
+  if (add_mask & EVENT_READABLE)
+    EV_SET(&ke[num++], fd, EVFILT_READ, EV_ADD|EV_CLEAR, 0, 0, NULL);
+  if (add_mask & EVENT_WRITABLE)
+    EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_ADD|EV_CLEAR, 0, 0, NULL);
+
+  if (num) {
+    if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) {
+      // Capture errno before logging, which may overwrite it.
+      int e = errno;
+      lderr(cct) << __func__ << " unable to add event: "
+                             << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  }
+  // keep what we set
+  sav_events[fd].mask = cur_mask | add_mask;
+  return 0;
+}
+
+/**
+ * Remove the events in `del_mask` that are currently registered for `fd`
+ * and update the saved-event table accordingly.
+ *
+ * @param fd       descriptor to update
+ * @param cur_mask events currently registered for fd
+ * @param del_mask EVENT_READABLE / EVENT_WRITABLE bits to remove
+ * @return 0 on success, a negative error code otherwise
+ */
+int KqueueDriver::del_event(int fd, int cur_mask, int del_mask)
+{
+  struct kevent ke[2];
+  int num = 0;
+  // Only filters that are actually registered can be deleted.
+  int mask = cur_mask & del_mask;
+
+  ldout(cct,30) << __func__ << " delete event kqfd = " << kqfd
+        << " fd = " << fd << " cur_mask = " << cur_mask
+        << " del_mask = " << del_mask << dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (mask & EVENT_READABLE)
+    EV_SET(&ke[num++], fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+  if (mask & EVENT_WRITABLE)
+    EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
+
+  if (num) {
+    if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) < 0) {
+      // Capture errno before logging, which may overwrite it.
+      int e = errno;
+      lderr(cct) << __func__ << " kevent: delete fd=" << fd << " mask=" << mask
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  }
+  // keep the administration (never write outside the table)
+  if (fd >= 0 && fd < sav_max)
+    sav_events[fd].mask = cur_mask & ~del_mask;
+  return 0;
+}
+
+/**
+ * Grow the saved-event table to `newsize` entries.  Requests that do not
+ * exceed the current capacity (`sav_max`) are ignored.
+ *
+ * @return 0 on success; allocation failure trips ceph_assert (and would
+ *         return -ENOMEM if the assertion were compiled out)
+ */
+int KqueueDriver::resize_events(int newsize)
+{
+  ldout(cct,30) << __func__ << " kqfd = " << kqfd << " newsize = " << newsize
+                << dendl;
+  if (newsize > sav_max) {
+    // Keep the old pointer until realloc() is known to have succeeded.
+    struct SaveEvent *grown =
+      (struct SaveEvent*)realloc(sav_events, sizeof(struct SaveEvent)*newsize);
+    if (!grown) {
+      lderr(cct) << __func__ << " unable to realloc memory: "
+                             << cpp_strerror(errno) << dendl;
+      ceph_assert(grown);
+      return -ENOMEM;
+    }
+    sav_events = grown;
+    // Zero only the newly added tail.  It starts at the old table capacity
+    // (sav_max), not at `size`, which is the kevent result-buffer capacity.
+    memset(&sav_events[sav_max], 0, sizeof(struct SaveEvent)*(newsize-sav_max));
+    sav_max = newsize;
+  }
+  return 0;
+}
+
+/**
+ * Wait for events on the kqueue and translate them into FiredFileEvent
+ * entries.
+ *
+ * @param fired_events resized to the number of returned kevents and filled
+ *                     with their fd and EVENT_READABLE/EVENT_WRITABLE mask
+ * @param tvp          maximum wait; when NULL, kevent() is called with a
+ *                     null timeout pointer
+ * @return number of fired events (0 on timeout), or a negative error code
+ */
+int KqueueDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  int retval, numevents = 0;
+  // Zero-initialised so the timeout log below never prints indeterminate
+  // values when no timeout was supplied.
+  struct timespec timeout = {0, 0};
+
+  ldout(cct,10) << __func__ << " kqfd = " << kqfd << dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (tvp != NULL) {
+      timeout.tv_sec = tvp->tv_sec;
+      timeout.tv_nsec = tvp->tv_usec * 1000;
+      ldout(cct,20) << __func__ << " "
+                << timeout.tv_sec << " sec "
+                << timeout.tv_nsec << " nsec"
+                << dendl;
+      retval = kevent(kqfd, NULL, 0, res_events, size, &timeout);
+  } else {
+      ldout(cct,30) << __func__ << " event_wait: " << " NULL" << dendl;
+      // KEVENT_NOWAIT is 0, i.e. a null timeout pointer.
+      retval = kevent(kqfd, NULL, 0, res_events, size, KEVENT_NOWAIT);
+  }
+  // Snapshot errno before any further logging can overwrite it.
+  const int kevent_errno = errno;
+
+  ldout(cct,25) << __func__ << " kevent retval: " << retval << dendl;
+  if (retval < 0) {
+    lderr(cct) << __func__ << " kqueue error: "
+                           << cpp_strerror(kevent_errno) << dendl;
+    return -kevent_errno;
+  } else if (retval == 0) {
+    ldout(cct,5) << __func__ << " Hit timeout("
+                 << timeout.tv_sec << " sec "
+                 << timeout.tv_nsec << " nsec"
+                 << ")." << dendl;
+  } else {
+    numevents = retval;
+    fired_events.resize(numevents);
+    for (int j = 0; j < numevents; j++) {
+      int mask = 0;
+      struct kevent *e = res_events + j;
+
+      if (e->filter == EVFILT_READ) mask |= EVENT_READABLE;
+      if (e->filter == EVFILT_WRITE) mask |= EVENT_WRITABLE;
+      // Errors are surfaced as both readable and writable.
+      if (e->flags & EV_ERROR) mask |= EVENT_READABLE|EVENT_WRITABLE;
+      fired_events[j].fd = (int)e->ident;
+      fired_events[j].mask = mask;
+
+    }
+  }
+  return numevents;
+}
diff --git a/src/msg/async/EventKqueue.h b/src/msg/async/EventKqueue.h
new file mode 100644
index 00000000..24863a93
--- /dev/null
+++ b/src/msg/async/EventKqueue.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTKQUEUE_H
+#define CEPH_MSG_EVENTKQUEUE_H
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <unistd.h>
+
+#include "Event.h"
+
+// EventDriver backend implemented with kqueue.
+class KqueueDriver : public EventDriver {
+  int kqfd;                   // kqueue descriptor, -1 while not created
+  pthread_t mythread;         // thread the kqfd was last validated for
+  struct kevent *res_events;  // malloc'ed buffer receiving kevent() results
+  CephContext *cct;
+  int size;                   // capacity of res_events
+
+  // Keep what we set on the kqfd
+  struct SaveEvent{
+    int fd;
+    int mask;
+  };
+  struct SaveEvent *sav_events;  // indexed by fd, sav_max entries
+  int sav_max;
+  int restore_events();
+  int test_kqfd();
+  int test_thread_change(const char* funcname);
+
+ public:
+  // Every member is initialised here: the destructor inspects sav_events
+  // even when init() was never called or failed early.
+  explicit KqueueDriver(CephContext *c)
+    : kqfd(-1), mythread(), res_events(NULL), cct(c),
+      size(0), sav_events(NULL), sav_max(0) {}
+  virtual ~KqueueDriver() {
+    if (kqfd != -1)
+      close(kqfd);
+
+    if (res_events)
+      free(res_events);
+    size = 0;
+    if (sav_events)
+      free(sav_events);
+    sav_max = 0;
+  }
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(vector<FiredFileEvent> &fired_events,
+                 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/EventSelect.cc b/src/msg/async/EventSelect.cc
new file mode 100644
index 00000000..fdee6ebc
--- /dev/null
+++ b/src/msg/async/EventSelect.cc
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "EventSelect.h"
+
+#include <unistd.h>
+#include <sys/select.h>
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "SelectDriver."
+
+// Reset both descriptor sets and the highest tracked fd.  Logs a notice that
+// this driver is not meant for production use.  Always returns 0.
+int SelectDriver::init(EventCenter *c, int nevent)
+{
+  ldout(cct, 0) << "Select isn't suitable for production env, just avoid "
+                << "compiling error or special purpose" << dendl;
+  max_fd = 0;
+  FD_ZERO(&wfds);
+  FD_ZERO(&rfds);
+  return 0;
+}
+
+// Mark `fd` in the read and/or write set according to the union of the
+// current and the added mask, and raise max_fd when needed.  Always
+// returns 0.
+// NOTE(review): fd is not checked against FD_SETSIZE before FD_SET --
+// confirm callers never pass larger descriptors.
+int SelectDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 10) << __func__ << " add event to fd=" << fd << " mask=" << add_mask
+                 << dendl;
+
+  const int wanted = add_mask | cur_mask;
+  if (wanted & EVENT_WRITABLE)
+    FD_SET(fd, &wfds);
+  if (wanted & EVENT_READABLE)
+    FD_SET(fd, &rfds);
+  max_fd = (fd > max_fd) ? fd : max_fd;
+
+  return 0;
+}
+
+// Clear `fd` from the read and/or write set for each bit present in
+// `delmask`.  max_fd is left unchanged.  Always returns 0.
+int SelectDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 10) << __func__ << " del event fd=" << fd << " cur mask=" << cur_mask
+                 << dendl;
+
+  const bool drop_write = (delmask & EVENT_WRITABLE) != 0;
+  const bool drop_read = (delmask & EVENT_READABLE) != 0;
+  if (drop_write)
+    FD_CLR(fd, &wfds);
+  if (drop_read)
+    FD_CLR(fd, &rfds);
+  return 0;
+}
+
+// Resize request from the event center.  This driver does nothing with
+// `newsize` and always reports success.
+int SelectDriver::resize_events(int newsize)
+{
+  return 0;
+}
+
+/**
+ * Run select() on copies of the tracked sets and append one FiredFileEvent
+ * per ready descriptor.
+ *
+ * @param fired_events receives the ready descriptors (appended)
+ * @param tvp          timeout handed straight to select()
+ * @return number of events appended; 0 when select() returned 0 or failed
+ */
+int SelectDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  // select() overwrites the sets it is given, so work on scratch copies.
+  _rfds = rfds;
+  _wfds = wfds;
+
+  if (select(max_fd+1, &_rfds, &_wfds, NULL, tvp) <= 0)
+    return 0;
+
+  int fired = 0;
+  for (int fd = 0; fd <= max_fd; ++fd) {
+    const int mask = (FD_ISSET(fd, &_rfds) ? EVENT_READABLE : 0)
+                   | (FD_ISSET(fd, &_wfds) ? EVENT_WRITABLE : 0);
+    if (!mask)
+      continue;
+
+    struct FiredFileEvent fe;
+    fe.fd = fd;
+    fe.mask = mask;
+    fired_events.push_back(fe);
+    ++fired;
+  }
+  return fired;
+}
diff --git a/src/msg/async/EventSelect.h b/src/msg/async/EventSelect.h
new file mode 100644
index 00000000..1b75da0b
--- /dev/null
+++ b/src/msg/async/EventSelect.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTSELECT_H
+#define CEPH_MSG_EVENTSELECT_H
+
+#include "Event.h"
+
+// EventDriver backend implemented with select().
+class SelectDriver : public EventDriver {
+  // Descriptors currently watched for reading / writing.
+  fd_set rfds, wfds;
+  // Scratch copies passed to select(): it is not safe to reuse FD sets
+  // after select() has modified them.
+  fd_set _rfds, _wfds;
+  int max_fd;   // highest descriptor ever added
+  CephContext *cct;
+
+ public:
+  explicit SelectDriver(CephContext *c)
+    : max_fd(0),
+      cct(c) {}
+  ~SelectDriver() override {}
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(vector<FiredFileEvent> &fired_events,
+                 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/PosixStack.cc b/src/msg/async/PosixStack.cc
new file mode 100644
index 00000000..e9c8d404
--- /dev/null
+++ b/src/msg/async/PosixStack.cc
@@ -0,0 +1,293 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <errno.h>
+
+#include <algorithm>
+
+#include "PosixStack.h"
+
+#include "include/buffer.h"
+#include "include/str_list.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "common/dout.h"
+#include "msg/Messenger.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "PosixStack "
+
+// ConnectedSocketImpl backed by a plain POSIX socket descriptor.
+class PosixConnectedSocketImpl final : public ConnectedSocketImpl {
+  NetHandler &handler;
+  int _fd;
+  entity_addr_t sa;
+  bool connected;
+
+ public:
+  explicit PosixConnectedSocketImpl(NetHandler &h, const entity_addr_t &sa, int f, bool connected)
+      : handler(h), _fd(f), sa(sa), connected(connected) {}
+
+  // Returns 1 once connected, 0 while handler.reconnect() reports a positive
+  // (not yet finished) status, or the negative value reconnect() returned.
+  int is_connected() override {
+    if (connected)
+      return 1;
+
+    int r = handler.reconnect(sa, _fd);
+    if (r == 0) {
+      connected = true;
+      return 1;
+    } else if (r < 0) {
+      return r;
+    } else {
+      return 0;
+    }
+  }
+
+  // Zero-copy reads are not available on this socket type.
+  ssize_t zero_copy_read(bufferptr&) override {
+    return -EOPNOTSUPP;
+  }
+
+  // Thin wrapper around ::read(); failures are returned as negated errno.
+  ssize_t read(char *buf, size_t len) override {
+    ssize_t r = ::read(_fd, buf, len);
+    if (r < 0)
+      r = -errno;
+    return r;
+  }
+
+  // return the sent length
+  // < 0 means error occurred
+  //
+  // Keeps calling sendmsg() until `len` bytes went out or the socket would
+  // block (EAGAIN); EINTR is retried.  msg.msg_iov / msg.msg_iovlen are
+  // advanced in place past whatever has been sent.
+  static ssize_t do_sendmsg(int fd, struct msghdr &msg, unsigned len, bool more)
+  {
+    size_t sent = 0;
+    while (1) {
+      MSGR_SIGPIPE_STOPPER;
+      ssize_t r;
+      r = ::sendmsg(fd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
+      if (r < 0) {
+        if (errno == EINTR) {
+          continue;
+        } else if (errno == EAGAIN) {
+          break;
+        }
+        return -errno;
+      }
+
+      sent += r;
+      if (len == sent) break;
+
+      // Partial send: skip the iovec entries that were fully written and
+      // trim the first one that was only partly written.
+      while (r > 0) {
+        if (msg.msg_iov[0].iov_len <= (size_t)r) {
+          // drain this whole item
+          r -= msg.msg_iov[0].iov_len;
+          msg.msg_iov++;
+          msg.msg_iovlen--;
+        } else {
+          msg.msg_iov[0].iov_base = (char *)msg.msg_iov[0].iov_base + r;
+          msg.msg_iov[0].iov_len -= r;
+          break;
+        }
+      }
+    }
+    return (ssize_t)sent;
+  }
+
+  // Send as much of `bl` as the socket accepts, in batches of at most
+  // IOV_MAX buffers per sendmsg() call.  Returns the number of bytes sent,
+  // or a negative error code from do_sendmsg().
+  ssize_t send(bufferlist &bl, bool more) override {
+    size_t sent_bytes = 0;
+    auto pb = std::cbegin(bl.buffers());
+    uint64_t left_pbrs = std::size(bl.buffers());
+    while (left_pbrs) {
+      struct msghdr msg;
+      struct iovec msgvec[IOV_MAX];
+      uint64_t size = std::min<uint64_t>(left_pbrs, IOV_MAX);
+      left_pbrs -= size;
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(&msg, 0, sizeof(msg));
+      msg.msg_iovlen = size;
+      msg.msg_iov = msgvec;
+      unsigned msglen = 0;
+      for (auto iov = msgvec; iov != msgvec + size; iov++) {
+	iov->iov_base = (void*)(pb->c_str());
+	iov->iov_len = pb->length();
+	msglen += pb->length();
+	++pb;
+      }
+      // MSG_MORE is requested while further batches remain or the caller
+      // asked for it.
+      ssize_t r = do_sendmsg(_fd, msg, msglen, left_pbrs || more);
+      if (r < 0)
+        return r;
+
+      // "r" is the number of bytes of this batch that were actually sent
+      sent_bytes += r;
+      if (static_cast<unsigned>(r) < msglen)
+        break;
+      // only a fully sent batch continues with the next one
+    }
+
+    // Drop what was sent from the front of bl.
+    // NOTE(review): presumably splice() moves the unsent tail into `swapped`
+    // so that the swap leaves bl holding only that tail -- confirm against
+    // bufferlist::splice.
+    if (sent_bytes) {
+      bufferlist swapped;
+      if (sent_bytes < bl.length()) {
+        bl.splice(sent_bytes, bl.length()-sent_bytes, &swapped);
+        bl.swap(swapped);
+      } else {
+        bl.clear();
+      }
+    }
+
+    return static_cast<ssize_t>(sent_bytes);
+  }
+  void shutdown() override {
+    ::shutdown(_fd, SHUT_RDWR);
+  }
+  void close() override {
+    ::close(_fd);
+  }
+  int fd() const override {
+    return _fd;
+  }
+  int socket_fd() const override {
+    return _fd;
+  }
+  friend class PosixServerSocketImpl;
+  friend class PosixNetworkStack;
+};
+
+// ServerSocketImpl wrapping a listening POSIX socket descriptor.
+class PosixServerSocketImpl : public ServerSocketImpl {
+  NetHandler &handler;
+  int _fd;  // listening descriptor
+
+ public:
+  explicit PosixServerSocketImpl(NetHandler &h, int f,
+                                 const entity_addr_t& listen_addr, unsigned slot)
+    : ServerSocketImpl(listen_addr.get_type(), slot),
+      handler(h),
+      _fd(f) {}
+
+  int accept(ConnectedSocket *sock, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+
+  // Closes the listening descriptor.
+  void abort_accept() override { ::close(_fd); }
+
+  int fd() const override { return _fd; }
+};
+
+/**
+ * Accept one pending connection on the listening socket.
+ *
+ * The new descriptor is made non-blocking, gets the configured socket
+ * options and priority, and is wrapped into `*sock`.
+ *
+ * @param sock receives the connected socket (must not be NULL)
+ * @param opt  socket options applied to the accepted descriptor
+ * @param out  receives the peer address (must not be NULL)
+ * @param w    worker handling the connection (not used here)
+ * @return 0 on success, negated errno on failure
+ */
+int PosixServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) {
+  ceph_assert(sock);
+  sockaddr_storage ss;
+  socklen_t slen = sizeof(ss);
+  int sd = accept_cloexec(_fd, (sockaddr*)&ss, &slen);
+  if (sd < 0) {
+    return -errno;
+  }
+
+  int r = handler.set_nonblock(sd);
+  if (r < 0) {
+    // Read errno before ::close(), which may overwrite it.
+    int e = errno;
+    ::close(sd);
+    return -e;
+  }
+
+  r = handler.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    int e = errno;
+    ::close(sd);
+    return -e;
+  }
+
+  ceph_assert(NULL != out); //out should not be NULL in accept connection
+
+  out->set_type(addr_type);
+  out->set_sockaddr((sockaddr*)&ss);
+  handler.set_priority(sd, opt.priority, out->get_family());
+
+  std::unique_ptr<PosixConnectedSocketImpl> csi(new PosixConnectedSocketImpl(handler, *out, sd, true));
+  *sock = ConnectedSocket(std::move(csi));
+  return 0;
+}
+
+// Worker initialisation hook; this worker has nothing to set up.
+void PosixWorker::initialize()
+{
+}
+
+/**
+ * Create a non-blocking listening socket bound to `sa`.
+ *
+ * @param sa        address to bind to
+ * @param addr_slot slot index forwarded to the PosixServerSocketImpl
+ * @param opt       socket options (nodelay, receive buffer size)
+ * @param sock      receives the listening socket on success
+ * @return 0 on success, negated errno on failure
+ */
+int PosixWorker::listen(entity_addr_t &sa,
+                        unsigned addr_slot,
+                        const SocketOptions &opt,
+                        ServerSocket *sock)
+{
+  int listen_sd = net.create_socket(sa.get_family(), true);
+  if (listen_sd < 0) {
+    return -errno;
+  }
+
+  int r = net.set_nonblock(listen_sd);
+  if (r < 0) {
+    // Read errno before ::close(), which may overwrite it.
+    int e = errno;
+    ::close(listen_sd);
+    return -e;
+  }
+
+  r = net.set_socket_options(listen_sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    int e = errno;
+    ::close(listen_sd);
+    return -e;
+  }
+
+  r = ::bind(listen_sd, sa.get_sockaddr(), sa.get_sockaddr_len());
+  if (r < 0) {
+    r = -errno;
+    // Log the address itself; streaming get_sockaddr() would print a raw
+    // pointer value.
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa
+                   << ": " << cpp_strerror(r) << dendl;
+    ::close(listen_sd);
+    return r;
+  }
+
+  r = ::listen(listen_sd, cct->_conf->ms_tcp_listen_backlog);
+  if (r < 0) {
+    r = -errno;
+    lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(r) << dendl;
+    ::close(listen_sd);
+    return r;
+  }
+
+  *sock = ServerSocket(
+          std::unique_ptr<PosixServerSocketImpl>(
+            new PosixServerSocketImpl(net, listen_sd, sa, addr_slot)));
+  return 0;
+}
+
+/**
+ * Open a connection to `addr`, blocking or non-blocking depending on
+ * opts.nonblock, and wrap the descriptor into `*socket`.
+ *
+ * A non-blocking socket starts out flagged as not yet connected.
+ *
+ * @return 0 on success, -errno when the connect helper reports a negative
+ *         descriptor
+ */
+int PosixWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) {
+  const int sd = opts.nonblock
+    ? net.nonblock_connect(addr, opts.connect_bind_addr)
+    : net.connect(addr, opts.connect_bind_addr);
+  if (sd < 0)
+    return -errno;
+
+  net.set_priority(sd, opts.priority, addr.get_family());
+
+  const bool already_connected = !opts.nonblock;
+  std::unique_ptr<PosixConnectedSocketImpl> impl(
+      new PosixConnectedSocketImpl(net, addr, sd, already_connected));
+  *socket = ConnectedSocket(std::move(impl));
+  return 0;
+}
+
+// Forwards both arguments to the NetworkStack base; nothing else is
+// initialised here.
+PosixNetworkStack::PosixNetworkStack(CephContext *c, const string &t)
+    : NetworkStack(c, t)
+{
+}
diff --git a/src/msg/async/PosixStack.h b/src/msg/async/PosixStack.h
new file mode 100644
index 00000000..f1aaccd4
--- /dev/null
+++ b/src/msg/async/PosixStack.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNC_POSIXSTACK_H
+#define CEPH_MSG_ASYNC_POSIXSTACK_H
+
+#include <thread>
+
+#include "msg/msg_types.h"
+#include "msg/async/net_handler.h"
+
+#include "Stack.h"
+
+// Worker that creates listening and connected sockets through a NetHandler.
+class PosixWorker : public Worker {
+  NetHandler net;
+  void initialize() override;
+
+ public:
+  PosixWorker(CephContext *c, unsigned i)
+    : Worker(c, i),
+      net(c) {}
+
+  // Create a listening socket for `sa`; see PosixStack.cc.
+  int listen(entity_addr_t &sa, unsigned addr_slot,
+             const SocketOptions &opt, ServerSocket *socks) override;
+
+  // Open an outgoing connection to `addr`; see PosixStack.cc.
+  int connect(const entity_addr_t &addr, const SocketOptions &opts,
+              ConnectedSocket *socket) override;
+};
+
+// NetworkStack whose workers each run on their own std::thread.
+class PosixNetworkStack : public NetworkStack {
+  vector<std::thread> threads;  // indexed by worker id
+
+ public:
+  explicit PosixNetworkStack(CephContext *c, const string &t);
+
+  // Start worker `i` on a new thread running `func`.
+  void spawn_worker(unsigned i, std::function<void ()> &&func) override {
+    // Only ever grow the vector: shrinking it would destroy std::thread
+    // objects for higher worker ids, and destroying a joinable thread calls
+    // std::terminate().
+    if (threads.size() <= i)
+      threads.resize(i+1);
+    threads[i] = std::thread(func);
+  }
+  // Wait for worker `i` to finish; it must have been spawned before.
+  void join_worker(unsigned i) override {
+    ceph_assert(threads.size() > i && threads[i].joinable());
+    threads[i].join();
+  }
+};
+
+#endif //CEPH_MSG_ASYNC_POSIXSTACK_H
diff --git a/src/msg/async/Protocol.cc b/src/msg/async/Protocol.cc
new file mode 100644
index 00000000..4bdc065e
--- /dev/null
+++ b/src/msg/async/Protocol.cc
@@ -0,0 +1,14 @@
+#include "Protocol.h"
+
+#include "AsyncConnection.h"
+#include "AsyncMessenger.h"
+
+// Bind the protocol to `connection`: remember the protocol type, cache the
+// connection's messenger and that messenger's CephContext, and start with a
+// fresh AuthConnectionMeta.  `connection` is dereferenced, so it must not be
+// null.
+Protocol::Protocol(int type, AsyncConnection *connection)
+  : proto_type(type),
+    connection(connection),
+    messenger(connection->async_msgr),
+    cct(connection->async_msgr->cct) {
+  auth_meta.reset(new AuthConnectionMeta());
+}
+
+// Out-of-line destructor with an empty body.
+Protocol::~Protocol() {}
diff --git a/src/msg/async/Protocol.h b/src/msg/async/Protocol.h
new file mode 100644
index 00000000..cccba183
--- /dev/null
+++ b/src/msg/async/Protocol.h
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_
+#define _MSG_ASYNC_PROTOCOL_
+
+#include <list>
+#include <map>
+
+#include "AsyncConnection.h"
+#include "include/buffer.h"
+#include "include/msgr.h"
+
+/*
+ * Continuation Helper Classes
+ */
+
+#include <memory>
+#include <tuple>
+
+// Abstract continuation over an object of type C: call() performs one step
+// on `foo` and hands back the continuation to run next, or nullptr to stop.
+template <class C>
+class Ct {
+public:
+  virtual ~Ct() = default;
+  virtual Ct<C> *call(C *foo) const = 0;
+};
+
+// Continuation bound to a member function of C taking Args...  The arguments
+// for the next invocation are stored by setParams() and replayed by call().
+template <class C, typename... Args>
+class CtFun : public Ct<C> {
+private:
+  using fn_t = Ct<C> *(C::*)(Args...);
+  fn_t _f;
+  std::tuple<Args...> _params;
+
+  // Expand the stored tuple into the member function's argument list.
+  template <std::size_t... Idx>
+  inline Ct<C> *_call(C *target, std::index_sequence<Idx...>) const {
+    return (target->*_f)(std::get<Idx>(_params)...);
+  }
+
+public:
+  CtFun(fn_t f) : _f(f) {}
+
+  inline void setParams(Args... args) {
+    _params = std::make_tuple(args...);
+  }
+  inline Ct<C> *call(C *foo) const override {
+    using indices_t = std::index_sequence_for<Args...>;
+    return _call(foo, indices_t());
+  }
+};
+
+using rx_buffer_t =
+    std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>;
+
+// Continuation bound to a member function of C that receives a receive-buffer
+// node (moved in) together with an int result code.
+template <class C>
+class CtRxNode : public Ct<C> {
+  using fn_t = Ct<C> *(C::*)(rx_buffer_t&&, int r);
+  fn_t _f;
+
+public:
+  // mutable so that the const call() can move the node out to the handler
+  mutable rx_buffer_t node;
+  // Initialised so that call() never reads an indeterminate value if it is
+  // invoked before setParams().
+  int r = 0;
+
+  CtRxNode(fn_t f) : _f(f) {}
+  // Store the node and result code that the next call() will pass on.
+  void setParams(rx_buffer_t &&node, int r) {
+    this->node = std::move(node);
+    this->r = r;
+  }
+  // Invoke the bound member function on foo, giving up ownership of node.
+  inline Ct<C> *call(C *foo) const override {
+    return (foo->*_f)(std::move(node), r);
+  }
+};
+
+template <class C> using CONTINUATION_TYPE = CtFun<C>;  // handler takes no arguments
+template <class C> using CONTINUATION_TX_TYPE = CtFun<C, int>;  // handler takes (int)
+template <class C> using CONTINUATION_RX_TYPE = CtFun<C, char*, int>;  // handler takes (char*, int)
+template <class C> using CONTINUATION_RXBPTR_TYPE = CtRxNode<C>;  // handler takes (rx_buffer_t&&, int)
+// Declares member F##_cont: a CtFun bound to C::F with the listed argument types.
+#define CONTINUATION_DECL(C, F, ...)                    \
+  CtFun<C, ##__VA_ARGS__> F##_cont { (&C::F) };
+// CONTINUATION names that member; CONTINUE stores its arguments and yields its address.
+#define CONTINUATION(F) F##_cont
+#define CONTINUE(F, ...) (F##_cont.setParams(__VA_ARGS__), &F##_cont)
+// Runs CT on `this`, then whatever each call() returns, until a call returns null.
+#define CONTINUATION_RUN(CT)                                      \
+  {                                                               \
+    Ct<std::remove_reference<decltype(*this)>::type> *_cont = &CT;\
+    do {                                                          \
+      _cont = _cont->call(this);                                  \
+    } while (_cont);                                              \
+  }
+// CONTINUATION_DECL with (char *, int) arguments.
+#define READ_HANDLER_CONTINUATION_DECL(C, F) \
+  CONTINUATION_DECL(C, F, char *, int)
+// Declares member F##_cont as a CtRxNode bound to C::F.
+#define READ_BPTR_HANDLER_CONTINUATION_DECL(C, F) \
+  CtRxNode<C> F##_cont { (&C::F) };
+// CONTINUATION_DECL with a single int argument.
+#define WRITE_HANDLER_CONTINUATION_DECL(C, F) CONTINUATION_DECL(C, F, int)
+
+//////////////////////////////////////////////////////////////////////
+
+class AsyncMessenger;
+
+class Protocol {  // Abstract base of the messenger protocol implementations (e.g. ProtocolV1)
+public:
+  const int proto_type;  // protocol identifier passed to the constructor
+protected:
+  AsyncConnection *connection;  // connection this protocol instance works on
+  AsyncMessenger *messenger;  // set from connection->async_msgr by the constructor
+  CephContext *cct;  // set from connection->async_msgr->cct by the constructor
+public:
+  std::shared_ptr<AuthConnectionMeta> auth_meta;  // allocated by the constructor; read by get_con_mode()
+  // NOTE: the data members above are initialised in their declaration order.
+public:
+  Protocol(int type, AsyncConnection *connection);
+  virtual ~Protocol();
+  // ---- operations every concrete protocol must implement ----
+  // prepare protocol for connecting to peer
+  virtual void connect() = 0;
+  // prepare protocol for accepting peer connections
+  virtual void accept() = 0;
+  // true -> protocol is ready for sending messages
+  virtual bool is_connected() = 0;
+  // stop connection
+  virtual void stop() = 0;
+  // signal and handle connection failure
+  virtual void fault() = 0;
+  // send message
+  virtual void send_message(Message *m) = 0;
+  // send keepalive
+  virtual void send_keepalive() = 0;
+  // event entry points and the "is anything still queued" query
+  virtual void read_event() = 0;
+  virtual void write_event() = 0;
+  virtual bool is_queued() = 0;
+  // connection mode as recorded in auth_meta
+  int get_con_mode() const {
+    return auth_meta->con_mode;
+  }
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_ */
diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc
new file mode 100644
index 00000000..9a7ab9d4
--- /dev/null
+++ b/src/msg/async/ProtocolV1.cc
@@ -0,0 +1,2547 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ProtocolV1.h"
+
+#include "common/errno.h"
+
+#include "AsyncConnection.h"
+#include "AsyncMessenger.h"
+#include "common/EventTrace.h"
+#include "include/random.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+// Log-line prefix for this protocol instance: local and peer addresses, the
+// connection and protocol pointers, port, state, sequence numbers, lossiness.
+ostream &ProtocolV1::_conn_prefix(std::ostream *_dout) {
+  std::ostream &out = *_dout;
+  out << "--1- " << messenger->get_myaddrs() << " >> "
+      << *connection->peer_addrs << " conn(" << connection << " " << this;
+  out << " :" << connection->port << " s=" << get_state_name(state);
+  out << " pgs=" << peer_global_seq << " cs=" << connect_seq;
+  return out << " l=" << connection->policy.lossy << ").";
+}
+
+#define WRITE(B, C) write(CONTINUATION(C), B)
+
+#define READ(L, C) read(CONTINUATION(C), L)
+
+#define READB(L, B, C) read(CONTINUATION(C), L, B)
+
+// Constant to limit starting sequence number to 2^31.  Nothing special about
+// it, just a big number.  PLR
+#define SEQ_MASK 0x7fffffff
+
+const int ASYNC_COALESCE_THRESHOLD = 256;
+
+using namespace std;
+
+// Append to `data` a single buffer to read `len` bytes into, laid out to
+// match the data alignment implied by `off`.
+static void alloc_aligned_buffer(bufferlist &data, unsigned len, unsigned off) {
+  const auto misalign = off & ~CEPH_PAGE_MASK;
+  unsigned head = 0;
+  unsigned remaining = len;
+  unsigned alloc_len = 0;
+  if (misalign) {
+    // unaligned start: reserve a whole leading page for the head bytes
+    alloc_len += CEPH_PAGE_SIZE;
+    head = std::min<uint64_t>(CEPH_PAGE_SIZE - misalign, remaining);
+    remaining -= head;
+  }
+  alloc_len += remaining;
+
+  bufferptr ptr(buffer::create_small_page_aligned(alloc_len));
+  if (head) {
+    // start inside the leading page so that the head ends on its boundary
+    ptr.set_offset(CEPH_PAGE_SIZE - head);
+  }
+  data.push_back(std::move(ptr));
+}
+
+/**
+ * Protocol V1
+ **/
+
+// Start in state NONE with writes disabled and a 4096-byte scratch buffer
+// for reads that do not supply their own destination.
+ProtocolV1::ProtocolV1(AsyncConnection *connection)
+    : Protocol(1, connection),
+      temp_buffer(new char[4096]),
+      can_write(WriteStatus::NOWRITE),
+      keepalive(false),
+      connect_seq(0),
+      peer_global_seq(0),
+      msg_left(0),
+      cur_msg_size(0),
+      replacing(false),
+      is_reset_from_peer(false),
+      once_ready(false),
+      state(NONE),
+      global_seq(0),
+      authorizer(nullptr),
+      wait_for_seq(false) {}
+
+ProtocolV1::~ProtocolV1() {
+  // nothing may still be queued or awaiting an ack at teardown
+  ceph_assert(out_q.empty());
+  ceph_assert(sent.empty());
+
+  delete[] temp_buffer;
+  delete authorizer;  // deleting a null pointer is a no-op
+}
+
+// Put the protocol into START_CONNECT and wipe the per-attempt connect state.
+void ProtocolV1::connect() {
+  state = START_CONNECT;
+
+  // drop whatever a previous connect attempt left behind
+  delete authorizer;
+  authorizer = nullptr;
+  authorizer_buf.clear();
+
+  // FIPS zeroization audit 20191115: these memsets are not security related.
+  memset(&connect_msg, 0, sizeof(connect_msg));
+  memset(&connect_reply, 0, sizeof(connect_reply));
+
+  global_seq = messenger->get_global_seq();
+}
+
+void ProtocolV1::accept() {
+  // accepting side: just record the starting state
+  state = START_ACCEPT;
+}
+
+// Connected means the write status is currently CANWRITE.
+bool ProtocolV1::is_connected() {
+  const auto status = can_write.load();
+  return status == WriteStatus::CANWRITE;
+}
+
+// Close the connection: flush delayed deliveries, discard receive state and
+// queued output under write_lock, and mark the protocol CLOSED.
+void ProtocolV1::stop() {
+  ldout(cct, 20) << __func__ << dendl;
+  const bool already_closed = (state == CLOSED);
+  if (already_closed) {
+    return;
+  }
+
+  if (connection->delay_state) {
+    connection->delay_state->flush();
+  }
+
+  ldout(cct, 2) << __func__ << dendl;
+  std::lock_guard<std::mutex> write_guard(connection->write_lock);
+
+  reset_recv_state();
+  discard_out_queue();
+
+  connection->_stop();
+
+  can_write = WriteStatus::CLOSED;
+  state = CLOSED;
+}
+
+void ProtocolV1::fault() {  // React to a connection failure: close, go to standby, or set up a reconnect.
+  ldout(cct, 20) << __func__ << dendl;
+  // nothing to do if the protocol never started or is already closed
+  if (state == CLOSED || state == NONE) {
+    ldout(cct, 10) << __func__ << " connection is already closed" << dendl;
+    return;
+  }
+  // lossy connections are closed instead of retried, unless still in START_CONNECT/CONNECTING
+  if (connection->policy.lossy && state != START_CONNECT &&
+      state != CONNECTING) {
+    ldout(cct, 1) << __func__ << " on lossy channel, failing" << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return;
+  }
+
+  connection->write_lock.lock();
+  can_write = WriteStatus::NOWRITE;
+  is_reset_from_peer = false;
+
+  // requeue sent items
+  requeue_sent();
+  // never became ready, nothing queued, still in an accept state and not being replaced: close
+  if (!once_ready && out_q.empty() && state >= START_ACCEPT &&
+      state <= ACCEPTING_WAIT_CONNECT_MSG_AUTH && !replacing) {
+    ldout(cct, 10) << __func__ << " with nothing to send and in the half "
+                   << " accept state just closed" << dendl;
+    connection->write_lock.unlock();  // stop() takes write_lock itself
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return;
+  }
+  replacing = false;
+
+  connection->fault();
+
+  reset_recv_state();
+  // standby policy with no pending output or keepalive: park in STANDBY
+  if (connection->policy.standby && out_q.empty() && !keepalive &&
+      state != WAIT) {
+    ldout(cct, 10) << __func__ << " with nothing to send, going to standby"
+                   << dendl;
+    state = STANDBY;
+    connection->write_lock.unlock();
+    return;
+  }
+
+  connection->write_lock.unlock();
+  // failed in an early connect state (or in WAIT): retry later from a timer
+  if ((state >= START_CONNECT && state <= CONNECTING_SEND_CONNECT_MSG) ||
+      state == WAIT) {
+    // backoff: max delay in WAIT; otherwise start at the initial delay and double up to the max
+    if (state == WAIT) {
+      backoff.set_from_double(cct->_conf->ms_max_backoff);
+    } else if (backoff == utime_t()) {
+      backoff.set_from_double(cct->_conf->ms_initial_backoff);
+    } else {
+      backoff += backoff;
+      if (backoff > cct->_conf->ms_max_backoff)
+        backoff.set_from_double(cct->_conf->ms_max_backoff);
+    }
+
+    global_seq = messenger->get_global_seq();
+    state = START_CONNECT;
+    connection->state = AsyncConnection::STATE_CONNECTING;
+    ldout(cct, 10) << __func__ << " waiting " << backoff << dendl;
+    // register a wakeup timer; the delay passed is backoff in nanoseconds / 1000
+    connection->register_time_events.insert(
+        connection->center->create_time_event(backoff.to_nsec() / 1000,
+                                              connection->wakeup_handler));
+  } else {
+    // policy may be empty while the state is still an accept state
+    if (connection->policy.server) {
+      ldout(cct, 0) << __func__ << " server, going to standby" << dendl;
+      state = STANDBY;
+    } else {
+      ldout(cct, 0) << __func__ << " initiating reconnect" << dendl;
+      connect_seq++;
+      global_seq = messenger->get_global_seq();
+      state = START_CONNECT;
+      connection->state = AsyncConnection::STATE_CONNECTING;
+    }
+    backoff = utime_t();  // reset the backoff delay
+    connection->center->dispatch_event_external(connection->read_handler);
+  }
+}
+
+void ProtocolV1::send_message(Message *m) {  // Queue m by priority for the write handler, or drop it if the connection is closed.
+  bufferlist bl;
+  uint64_t f = connection->get_features();
+  // f is sampled before write_lock is taken and compared again under the lock below
+  // TODO: Currently not all messages supports reencode like MOSDMap, so here
+  // only let fast dispatch support messages prepare message
+  bool can_fast_prepare = messenger->ms_can_fast_dispatch(m);
+  if (can_fast_prepare) {
+    prepare_send_message(f, m, bl);
+  }
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  // "features" changes will change the payload encoding
+  if (can_fast_prepare &&
+      (can_write == WriteStatus::NOWRITE || connection->get_features() != f)) {
+    // ensure the correctness of message encoding
+    bl.clear();
+    m->clear_payload();
+    ldout(cct, 5) << __func__ << " clear encoded buffer previous " << f
+                  << " != " << connection->get_features() << dendl;
+  }
+  if (can_write == WriteStatus::CLOSED) {
+    ldout(cct, 10) << __func__ << " connection closed."
+                   << " Drop message " << m << dendl;
+    m->put();  // release the reference instead of queueing
+  } else {
+    m->trace.event("async enqueueing message");
+    out_q[m->get_priority()].emplace_back(std::move(bl), m);  // bl is empty if the message was not (or no longer) pre-encoded
+    ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m
+                   << dendl;  // NOTE(review): logged on every enqueue, not only after a denied write
+    if (can_write != WriteStatus::REPLACING && !write_in_progress) {  // skip if a write is already pending or we are being replaced
+      write_in_progress = true;
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+}
+
+// Encode m for the given feature set and append its payload, middle and
+// data sections, in that order, to bl.
+void ProtocolV1::prepare_send_message(uint64_t features, Message *m,
+                                      bufferlist &bl) {
+  ldout(cct, 20) << __func__ << " m " << *m << dendl;
+
+  // log whether this is a first encode or a re-encode of an existing payload
+  const bool has_payload = !m->empty_payload();
+  if (has_payload) {
+    ldout(cct, 20) << __func__ << " half-reencoding features " << features
+                   << " " << m << " " << *m << dendl;
+  } else {
+    ldout(cct, 20) << __func__ << " encoding features " << features << " " << m
+                   << " " << *m << dendl;
+  }
+
+  m->encode(features, messenger->crcflags);
+
+  // copy the three encoded sections out of *m
+  bl.append(m->get_payload());
+  bl.append(m->get_middle());
+  bl.append(m->get_data());
+}
+
+// Flag a keepalive and kick the write handler, unless the connection is closed.
+void ProtocolV1::send_keepalive() {
+  ldout(cct, 10) << __func__ << dendl;
+  std::lock_guard<std::mutex> write_guard(connection->write_lock);
+  if (can_write == WriteStatus::CLOSED) {
+    return;
+  }
+  keepalive = true;
+  connection->center->dispatch_event_external(connection->write_handler);
+}
+
+// Resume the continuation chain that matches the current state; states not
+// listed here have nothing to resume.
+void ProtocolV1::read_event() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  Ct<ProtocolV1> *resume = nullptr;
+  switch (state) {
+    case START_CONNECT:
+      resume = &CONTINUATION(send_client_banner);
+      break;
+    case START_ACCEPT:
+      resume = &CONTINUATION(send_server_banner);
+      break;
+    case OPENED:
+      resume = &CONTINUATION(wait_message);
+      break;
+    case THROTTLE_MESSAGE:
+      resume = &CONTINUATION(throttle_message);
+      break;
+    case THROTTLE_BYTES:
+      resume = &CONTINUATION(throttle_bytes);
+      break;
+    case THROTTLE_DISPATCH_QUEUE:
+      resume = &CONTINUATION(throttle_dispatch_queue);
+      break;
+    default:
+      break;
+  }
+
+  if (!resume) {
+    return;
+  }
+  CONTINUATION_RUN(*resume);
+}
+
+void ProtocolV1::write_event() {  // Write handler: drain queued messages when writable, otherwise reconnect or flush leftover bytes.
+  ldout(cct, 10) << __func__ << dendl;
+  ssize_t r = 0;
+
+  connection->write_lock.lock();
+  if (can_write == WriteStatus::CANWRITE) {
+    if (keepalive) {
+      append_keepalive_or_ack();
+      keepalive = false;
+    }
+
+    auto start = ceph::mono_clock::now();
+    bool more;
+    do {
+      bufferlist data;
+      Message *m = _get_next_outgoing(&data);
+      if (!m) {
+        break;
+      }
+
+      if (!connection->policy.lossy) {
+        // put on sent list
+        sent.push_back(m);
+        m->get();
+      }
+      more = !out_q.empty();
+      connection->write_lock.unlock();  // encoding and the write itself run without write_lock
+
+      // send_message or requeue messages may not encode message
+      if (!data.length()) {
+        prepare_send_message(connection->get_features(), m, data);
+      }
+
+      r = write_message(m, data, more);
+
+      connection->write_lock.lock();
+      if (r == 0) {
+        ;
+      } else if (r < 0) {
+        ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+        break;
+      } else if (r > 0)
+        break;
+    } while (can_write == WriteStatus::CANWRITE);
+    write_in_progress = false;
+    connection->write_lock.unlock();
+
+    // r > 0 means data is still left to send, so there is no need to call _try_send.
+    if (r == 0) {
+      uint64_t left = ack_left;
+      if (left) {
+        ceph_le64 s;
+        s = in_seq;
+        connection->outgoing_bl.append(CEPH_MSGR_TAG_ACK);
+        connection->outgoing_bl.append((char *)&s, sizeof(s));
+        ldout(cct, 10) << __func__ << " try send msg ack, acked " << left
+                       << " messages" << dendl;
+        ack_left -= left;
+        left = ack_left;  // non-zero only if ack_left grew since it was sampled above
+        r = connection->_try_send(left);
+      } else if (is_queued()) {
+        r = connection->_try_send();
+      }
+    }
+
+    connection->logger->tinc(l_msgr_running_send_time,
+                             ceph::mono_clock::now() - start);
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+      connection->lock.lock();
+      fault();
+      connection->lock.unlock();
+      return;
+    }
+  } else {
+    write_in_progress = false;
+    connection->write_lock.unlock();  // released so that connection->lock is taken before write_lock
+    connection->lock.lock();
+    connection->write_lock.lock();
+    if (state == STANDBY && !connection->policy.server && is_queued()) {
+      ldout(cct, 10) << __func__ << " policy.server is false" << dendl;
+      connection->_connect();
+    } else if (connection->cs && state != NONE && state != CLOSED &&
+               state != START_CONNECT) {
+      r = connection->_try_send();
+      if (r < 0) {
+        ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl;
+        connection->write_lock.unlock();  // fault() locks write_lock itself
+        fault();
+        connection->lock.unlock();
+        return;
+      }
+    }
+    connection->write_lock.unlock();
+    connection->lock.unlock();
+  }
+}
+
+// True if a message is waiting in out_q or the connection reports queued data.
+bool ProtocolV1::is_queued() {
+  if (!out_q.empty()) {
+    return true;
+  }
+  return connection->is_queued();
+}
+
+// Run the given continuation chain to completion; a null pointer is ignored.
+void ProtocolV1::run_continuation(CtPtr pcontinuation) {
+  if (!pcontinuation) {
+    return;
+  }
+  CONTINUATION_RUN(*pcontinuation);
+}
+
+// Read len bytes into buffer (temp_buffer when null).  A result <= 0 is
+// handed to `next` right away by returning it; otherwise nullptr is returned
+// and the callback given to connection->read() runs `next`.
+CtPtr ProtocolV1::read(CONTINUATION_RX_TYPE<ProtocolV1> &next,
+                       int len, char *buffer) {
+  if (buffer == nullptr) {
+    buffer = temp_buffer;
+  }
+
+  auto on_done = [&next, this](char *buf, int rc) {
+    next.setParams(buf, rc);
+    CONTINUATION_RUN(next);
+  };
+  const ssize_t r = connection->read(len, buffer, on_done);
+  if (r > 0) {
+    return nullptr;
+  }
+
+  next.setParams(buffer, r);
+  return &next;
+}
+
+// Write buffer through the connection.  A result <= 0 is handed to `next`
+// right away by returning it; otherwise nullptr is returned and the callback
+// given to connection->write() runs `next`.
+CtPtr ProtocolV1::write(CONTINUATION_TX_TYPE<ProtocolV1> &next,
+                        bufferlist &buffer) {
+  auto on_done = [&next, this](int rc) {
+    next.setParams(rc);
+    CONTINUATION_RUN(next);
+  };
+  const ssize_t r = connection->write(buffer, on_done);
+  if (r > 0) {
+    return nullptr;
+  }
+
+  next.setParams(r);
+  return &next;
+}
+
+// Enter the OPENED state: re-arm the tick timer, allow writes (kicking the
+// write handler if something is queued) and start waiting for messages.
+CtPtr ProtocolV1::ready() {
+  ldout(cct, 25) << __func__ << dendl;
+
+  // make sure no pending tick timer
+  if (connection->last_tick_id) {
+    connection->center->delete_time_event(connection->last_tick_id);
+  }
+  connection->last_tick_id = connection->center->create_time_event(
+      connection->inactive_timeout_us, connection->tick_handler);
+
+  {
+    std::lock_guard<std::mutex> write_guard(connection->write_lock);
+    can_write = WriteStatus::CANWRITE;
+    if (is_queued()) {
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+  connection->maybe_start_delay_thread();
+
+  state = OPENED;
+  return wait_message();
+}
+
+// Read the one-byte tag that starts the next frame while in OPENED.
+CtPtr ProtocolV1::wait_message() {
+  if (state == OPENED) {
+    ldout(cct, 20) << __func__ << dendl;
+    return READ(sizeof(char), handle_message);
+  }
+
+  // must have changed due to a replace
+  return nullptr;
+}
+
+// Dispatch on the tag byte just read: either start reading the tag's body
+// or handle the tag in place.
+CtPtr ProtocolV1::handle_message(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read tag failed" << dendl;
+    return _fault();
+  }
+
+  const char tag = buffer[0];
+  ldout(cct, 20) << __func__ << " process tag " << (int)tag << dendl;
+
+  if (tag == CEPH_MSGR_TAG_KEEPALIVE) {
+    ldout(cct, 20) << __func__ << " got KEEPALIVE" << dendl;
+    connection->set_last_keepalive(ceph_clock_now());
+    return nullptr;
+  }
+  if (tag == CEPH_MSGR_TAG_KEEPALIVE2) {
+    return READ(sizeof(ceph_timespec), handle_keepalive2);
+  }
+  if (tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+    return READ(sizeof(ceph_timespec), handle_keepalive2_ack);
+  }
+  if (tag == CEPH_MSGR_TAG_ACK) {
+    return READ(sizeof(ceph_le64), handle_tag_ack);
+  }
+  if (tag == CEPH_MSGR_TAG_MSG) {
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+    ltt_recv_stamp = ceph_clock_now();
+#endif
+    recv_stamp = ceph_clock_now();
+    ldout(cct, 20) << __func__ << " begin MSG" << dendl;
+    return READ(sizeof(ceph_msg_header), handle_message_header);
+  }
+  if (tag == CEPH_MSGR_TAG_CLOSE) {
+    ldout(cct, 20) << __func__ << " got CLOSE" << dendl;
+    stop();
+    return nullptr;
+  }
+
+  // anything else is a protocol error
+  ldout(cct, 0) << __func__ << " bad tag " << (int)tag << dendl;
+  return _fault();
+}
+
+// Handle the timestamp that follows a KEEPALIVE2 tag: queue a KEEPALIVE2_ACK
+// echoing it and record that a keepalive arrived.
+CtPtr ProtocolV1::handle_keepalive2(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+
+  ceph_timespec *peer_ts = (ceph_timespec *)buffer;
+  utime_t kp_t = utime_t(*peer_ts);
+  {
+    // the ack is appended to the outgoing buffer under write_lock
+    std::lock_guard<std::mutex> write_guard(connection->write_lock);
+    append_keepalive_or_ack(true, &kp_t);
+  }
+
+  ldout(cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl;
+  connection->set_last_keepalive(ceph_clock_now());
+
+  // let the write handler push the ack out if we are able to write
+  if (is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// Append to the outgoing buffer either a KEEPALIVE2_ACK carrying *tp (when
+// ack is set), a KEEPALIVE2 carrying the current time (when the connection
+// has CEPH_FEATURE_MSGR_KEEPALIVE2), or a bare KEEPALIVE tag.
+void ProtocolV1::append_keepalive_or_ack(bool ack, utime_t *tp) {
+  ldout(cct, 10) << __func__ << dendl;
+
+  if (ack) {
+    ceph_assert(tp);
+    struct ceph_timespec ts;
+    tp->encode_timeval(&ts);
+    connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE2_ACK);
+    connection->outgoing_bl.append((char *)&ts, sizeof(ts));
+    return;
+  }
+
+  if (!connection->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+    connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE);
+    return;
+  }
+
+  struct ceph_timespec ts;
+  utime_t now = ceph_clock_now();
+  now.encode_timeval(&ts);
+  connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE2);
+  connection->outgoing_bl.append((char *)&ts, sizeof(ts));
+}
+
+// Handle the timestamp that follows a KEEPALIVE2_ACK tag by recording it on
+// the connection, then go back to waiting for the next tag.
+CtPtr ProtocolV1::handle_keepalive2_ack(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
+    return _fault();
+  }
+
+  ceph_timespec *acked_ts = (ceph_timespec *)buffer;
+  connection->set_last_keepalive_ack(utime_t(*acked_ts));
+  ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
+
+  return CONTINUE(wait_message);
+}
+
+// Handle the sequence number that follows an ACK tag: drop from `sent` the
+// messages it covers (at most 128 per call) and release their references.
+CtPtr ProtocolV1::handle_tag_ack(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read ack seq failed" << dendl;
+    return _fault();
+  }
+
+  ceph_le64 seq;
+  seq = *(ceph_le64 *)buffer;
+  ldout(cct, 20) << __func__ << " got ACK" << dendl;
+
+  ldout(cct, 15) << __func__ << " got ack seq " << seq << dendl;
+
+  // trim sent list: collect under write_lock, release after unlocking
+  static const int max_pending = 128;
+  Message *pending[max_pending];
+  int count = 0;
+  {
+    std::lock_guard<std::mutex> write_guard(connection->write_lock);
+    while (!sent.empty() && sent.front()->get_seq() <= seq &&
+           count < max_pending) {
+      Message *acked = sent.front();
+      sent.pop_front();
+      pending[count++] = acked;
+      ldout(cct, 10) << __func__ << " got ack seq " << seq
+                     << " >= " << acked->get_seq() << " on " << acked << " "
+                     << *acked << dendl;
+    }
+  }
+  for (Message **p = pending; p != pending + count; ++p) {
+    (*p)->put();
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// Take over the message header just read, verify its crc when header crcs
+// are enabled, clear the per-message buffers and move on to throttling.
+CtPtr ProtocolV1::handle_message_header(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read message header failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got MSG header" << dendl;
+
+  current_header = *((ceph_msg_header *)buffer);
+
+  ldout(cct, 20) << __func__ << " got envelope type=" << current_header.type
+                 << " src " << entity_name_t(current_header.src)
+                 << " front=" << current_header.front_len
+                 << " data=" << current_header.data_len
+                 << " off " << current_header.data_off << dendl;
+
+  const bool verify_crc = (messenger->crcflags & MSG_CRC_HEADER) != 0;
+  if (verify_crc) {
+    // the checksum spans the header minus the size of its crc field
+    const __u32 header_crc =
+        ceph_crc32c(0, (unsigned char *)&current_header,
+                    sizeof(current_header) - sizeof(current_header.crc));
+    if (header_crc != current_header.crc) {
+      ldout(cct, 0) << __func__ << " got bad header crc " << header_crc
+                    << " != " << current_header.crc << dendl;
+      return _fault();
+    }
+  }
+
+  // start the new message with empty buffers
+  data_buf.clear();
+  front.clear();
+  middle.clear();
+  data.clear();
+
+  state = THROTTLE_MESSAGE;
+  return CONTINUE(throttle_message);
+}
+
+// Take one slot from the policy's message throttler, if there is one.  On
+// failure, make sure a wakeup timer is registered and stop the chain.
+CtPtr ProtocolV1::throttle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto &msg_throttler = connection->policy.throttler_messages;
+  if (msg_throttler) {
+    ldout(cct, 10) << __func__ << " wants " << 1
+                   << " message from policy throttler "
+                   << msg_throttler->get_current() << "/"
+                   << msg_throttler->get_max() << dendl;
+    const bool granted = msg_throttler->get_or_fail();
+    if (!granted) {
+      ldout(cct, 10) << __func__ << " wants 1 message from policy throttle "
+                     << msg_throttler->get_current() << "/"
+                     << msg_throttler->get_max() << " failed, just wait."
+                     << dendl;
+      // The dispatch threads will not drain a full message queue quickly,
+      // so it is fine to wait a millisecond before trying again.
+      if (connection->register_time_events.empty()) {
+        auto timer_id = connection->center->create_time_event(
+            1000, connection->wakeup_handler);
+        connection->register_time_events.insert(timer_id);
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_BYTES;
+  return CONTINUE(throttle_bytes);
+}
+
+// Reserve front + middle + data bytes from the policy's byte throttler, if
+// there is one.  On failure, make sure a wakeup timer is registered and stop
+// the chain.
+CtPtr ProtocolV1::throttle_bytes() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  cur_msg_size = current_header.front_len + current_header.middle_len +
+                 current_header.data_len;
+
+  auto &byte_throttler = connection->policy.throttler_bytes;
+  if (cur_msg_size && byte_throttler) {
+    ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                   << " bytes from policy throttler "
+                   << byte_throttler->get_current() << "/"
+                   << byte_throttler->get_max() << dendl;
+    const bool granted = byte_throttler->get_or_fail(cur_msg_size);
+    if (!granted) {
+      ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                     << " bytes from policy throttler "
+                     << byte_throttler->get_current() << "/"
+                     << byte_throttler->get_max() << " failed, just wait."
+                     << dendl;
+      // The dispatch threads will not drain a full message queue quickly,
+      // so it is fine to wait a millisecond before trying again.
+      if (connection->register_time_events.empty()) {
+        auto timer_id = connection->center->create_time_event(
+            1000, connection->wakeup_handler);
+        connection->register_time_events.insert(timer_id);
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_DISPATCH_QUEUE;
+  return CONTINUE(throttle_dispatch_queue);
+}
+
+// Reserve cur_msg_size bytes from the dispatch queue's throttler.  On
+// failure, make sure a wakeup timer is registered and stop the chain;
+// otherwise stamp the time and start reading the message front.
+CtPtr ProtocolV1::throttle_dispatch_queue() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (cur_msg_size) {
+    auto &dispatch_throttler = connection->dispatch_queue->dispatch_throttler;
+    const bool granted = dispatch_throttler.get_or_fail(cur_msg_size);
+    if (!granted) {
+      ldout(cct, 10)
+          << __func__ << " wants " << cur_msg_size
+          << " bytes from dispatch throttle "
+          << dispatch_throttler.get_current() << "/"
+          << dispatch_throttler.get_max()
+          << " failed, just wait." << dendl;
+      // The dispatch threads will not drain a full message queue quickly,
+      // so it is fine to wait a millisecond before trying again.
+      if (connection->register_time_events.empty()) {
+        auto timer_id = connection->center->create_time_event(
+            1000, connection->wakeup_handler);
+        connection->register_time_events.insert(timer_id);
+      }
+      return nullptr;
+    }
+  }
+
+  throttle_stamp = ceph_clock_now();
+
+  state = READ_MESSAGE_FRONT;
+  return read_message_front();
+}
+
+// Read the front section, allocating its buffer on first use; a message
+// without a front goes straight on to the middle section.
+CtPtr ProtocolV1::read_message_front() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const unsigned front_len = current_header.front_len;
+  if (!front_len) {
+    return read_message_middle();
+  }
+
+  if (!front.length()) {
+    front.push_back(buffer::create(front_len));
+  }
+  return READB(front_len, front.c_str(), handle_message_front);
+}
+
+CtPtr ProtocolV1::handle_message_front(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read message front failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got front " << front.length() << dendl;
+
+  return read_message_middle();
+}
+
+// Read the middle section, allocating its buffer on first use; a message
+// without a middle goes straight on to preparing the data read.
+CtPtr ProtocolV1::read_message_middle() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (!current_header.middle_len) {
+    return read_message_data_prepare();
+  }
+
+  if (!middle.length()) {
+    middle.push_back(buffer::create(current_header.middle_len));
+  }
+  return READB(current_header.middle_len, middle.c_str(),
+               handle_message_middle);
+}
+
+CtPtr ProtocolV1::handle_message_middle(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read message middle failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got middle " << middle.length() << dendl;
+
+  return read_message_data_prepare();
+}
+
+CtPtr ProtocolV1::read_message_data_prepare() {  // Allocate the receive buffer for the data section (if any) and start reading it.
+  ldout(cct, 20) << __func__ << dendl;
+
+  unsigned data_len = le32_to_cpu(current_header.data_len);
+  unsigned data_off = le32_to_cpu(current_header.data_off);
+
+  if (data_len) {
+    // get a buffer
+#if 0
+    // rx_buffers is broken by design... see
+    //  http://tracker.ceph.com/issues/22480
+    map<ceph_tid_t, pair<bufferlist, int> >::iterator p =
+        connection->rx_buffers.find(current_header.tid);
+    if (p != connection->rx_buffers.end()) {
+      ldout(cct, 10) << __func__ << " seleting rx buffer v " << p->second.second
+                     << " at offset " << data_off << " len "
+                     << p->second.first.length() << dendl;
+      data_buf = p->second.first;
+      // make sure it's big enough
+      if (data_buf.length() < data_len)
+        data_buf.push_back(buffer::create(data_len - data_buf.length()));
+      data_blp = data_buf.begin();
+    } else {
+      ldout(cct, 20) << __func__ << " allocating new rx buffer at offset "
+                     << data_off << dendl;
+      alloc_aligned_buffer(data_buf, data_len, data_off);
+      data_blp = data_buf.begin();
+    }
+#else
+    ldout(cct, 20) << __func__ << " allocating new rx buffer at offset "
+		   << data_off << dendl;
+    alloc_aligned_buffer(data_buf, data_len, data_off);  // the only path compiled in: always allocate a fresh buffer
+    data_blp = data_buf.begin();  // read cursor starts at the beginning of that buffer
+#endif
+  }
+  // number of data bytes still to read; zero when the message has no data section
+  msg_left = data_len;
+
+  return CONTINUE(read_message_data);
+}
+
+// Read the next piece of the data section into the buffer under the cursor,
+// or move on to the footer once nothing is left to read.
+CtPtr ProtocolV1::read_message_data() {
+  ldout(cct, 20) << __func__ << " msg_left=" << msg_left << dendl;
+
+  if (msg_left <= 0) {
+    return read_message_footer();
+  }
+
+  bufferptr cur = data_blp.get_current_ptr();
+  // never read past the current buffer or past the end of the message
+  unsigned read_len = std::min(cur.length(), msg_left);
+
+  return READB(read_len, cur.c_str(), handle_message_data);
+}
+
+// Completion of one data read: account for the bytes just received, attach
+// them to `data`, and loop back for the rest.
+CtPtr ProtocolV1::handle_message_data(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read data error " << dendl;
+    return _fault();
+  }
+
+  // recompute the length that read_message_data() asked for
+  bufferptr cur = data_blp.get_current_ptr();
+  const unsigned got = std::min(cur.length(), msg_left);
+  ceph_assert(got < std::numeric_limits<int>::max());
+
+  data_blp.advance(got);
+  data.append(cur, 0, got);
+  msg_left -= got;
+
+  return CONTINUE(read_message_data);
+}
+
+// Read the message footer; which footer struct is expected depends on
+// whether the connection has CEPH_FEATURE_MSG_AUTH.
+CtPtr ProtocolV1::read_message_footer() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  state = READ_FOOTER_AND_DISPATCH;
+
+  const bool has_msg_auth = connection->has_feature(CEPH_FEATURE_MSG_AUTH);
+  const unsigned len = has_msg_auth ? sizeof(ceph_msg_footer)
+                                    : sizeof(ceph_msg_footer_old);
+
+  return READ(len, handle_message_footer);
+}
+
+// Continuation for the footer read issued by read_message_footer().
+//
+// Normalises the footer (an old-format footer gets sig = 0), rejects
+// messages whose footer lacks CEPH_MSG_FOOTER_COMPLETE, decodes
+// front/middle/data into a Message, checks its signature when
+// session_security is set, discards messages whose seq is not newer than
+// in_seq, and otherwise hands the message to the delay queue, to fast
+// dispatch, or to the regular dispatch queue.
+//
+// @param buffer  bytes of the footer that was read
+// @param r       read result; negative means the read failed
+// @return CONTINUE(wait_message) on success, _fault() on error, or nullptr
+//         when an old message was discarded
+CtPtr ProtocolV1::handle_message_footer(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read footer data error " << dendl;
+    return _fault();
+  }
+
+  ceph_msg_footer footer;
+  ceph_msg_footer_old old_footer;
+
+  // Same feature test as read_message_footer() used to size the read.
+  if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+    footer = *((ceph_msg_footer *)buffer);
+  } else {
+    // widen the old footer; it carries no signature
+    old_footer = *((ceph_msg_footer_old *)buffer);
+    footer.front_crc = old_footer.front_crc;
+    footer.middle_crc = old_footer.middle_crc;
+    footer.data_crc = old_footer.data_crc;
+    footer.sig = 0;
+    footer.flags = old_footer.flags;
+  }
+
+  // a footer without the COMPLETE flag is treated as an aborted message
+  int aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;
+  ldout(cct, 10) << __func__ << " aborted = " << aborted << dendl;
+  if (aborted) {
+    ldout(cct, 0) << __func__ << " got " << front.length() << " + "
+                  << middle.length() << " + " << data.length()
+                  << " byte message.. ABORTED" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got " << front.length() << " + "
+                 << middle.length() << " + " << data.length() << " byte message"
+                 << dendl;
+  Message *message = decode_message(cct, messenger->crcflags, current_header,
+                                    footer, front, middle, data, connection);
+  if (!message) {
+    ldout(cct, 1) << __func__ << " decode message failed " << dendl;
+    return _fault();
+  }
+
+  //
+  //  Check the signature if one should be present.  A zero return indicates
+  //  success. PLR
+  //
+
+  if (session_security.get() == NULL) {
+    ldout(cct, 10) << __func__ << " no session security set" << dendl;
+  } else {
+    if (session_security->check_message_signature(message)) {
+      ldout(cct, 0) << __func__ << " Signature check failed" << dendl;
+      message->put();
+      return _fault();
+    }
+  }
+  // attach the policy throttlers to the message
+  message->set_byte_throttler(connection->policy.throttler_bytes);
+  message->set_message_throttler(connection->policy.throttler_messages);
+
+  // store reservation size in message, so we don't get confused
+  // by messages entering the dispatch queue through other paths.
+  message->set_dispatch_throttle_size(cur_msg_size);
+
+  message->set_recv_stamp(recv_stamp);
+  message->set_throttle_stamp(throttle_stamp);
+  message->set_recv_complete_stamp(ceph_clock_now());
+
+  // check received seq#.  if it is old, drop the message.
+  // note that incoming messages may skip ahead.  this is convenient for the
+  // client side queueing because messages can't be renumbered, but the (kernel)
+  // client will occasionally pull a message out of the sent queue to send
+  // elsewhere.  in that case it doesn't matter if we "got" it or not.
+  uint64_t cur_seq = in_seq;
+  if (message->get_seq() <= cur_seq) {
+    ldout(cct, 0) << __func__ << " got old message " << message->get_seq()
+                  << " <= " << cur_seq << " " << message << " " << *message
+                  << ", discarding" << dendl;
+    message->put();
+    if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
+        cct->_conf->ms_die_on_old_message) {
+      ceph_assert(0 == "old msgs despite reconnect_seq feature");
+    }
+    // NOTE(review): this path returns without changing `state` or clearing
+    // front/middle/data -- confirm the caller copes with a nullptr here.
+    return nullptr;
+  }
+  // a gap in the sequence is only logged, unless configured to be fatal
+  if (message->get_seq() > cur_seq + 1) {
+    ldout(cct, 0) << __func__ << " missed message?  skipped from seq "
+                  << cur_seq << " to " << message->get_seq() << dendl;
+    if (cct->_conf->ms_die_on_skipped_message) {
+      ceph_assert(0 == "skipped incoming seq");
+    }
+  }
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+  // tracing only: elapsed time between ltt_recv_stamp and now, in usecs
+  if (message->get_type() == CEPH_MSG_OSD_OP ||
+      message->get_type() == CEPH_MSG_OSD_OPREPLY) {
+    utime_t ltt_processed_stamp = ceph_clock_now();
+    double usecs_elapsed =
+        (ltt_processed_stamp.to_nsec() - ltt_recv_stamp.to_nsec()) / 1000;
+    ostringstream buf;
+    if (message->get_type() == CEPH_MSG_OSD_OP)
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP",
+                           false);
+    else
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY",
+                           false);
+  }
+#endif
+
+  // note last received message.
+  in_seq = message->get_seq();
+  ldout(cct, 5) << " rx " << message->get_source() << " seq "
+                << message->get_seq() << " " << message << " " << *message
+                << dendl;
+
+  // on non-lossy connections one more ack is owed and the write handler is
+  // kicked at the end of this function
+  bool need_dispatch_writer = false;
+  if (!connection->policy.lossy) {
+    ack_left++;
+    need_dispatch_writer = true;
+  }
+
+  state = OPENED;
+
+  connection->logger->inc(l_msgr_recv_messages);
+  connection->logger->inc(
+      l_msgr_recv_bytes,
+      cur_msg_size + sizeof(ceph_msg_header) + sizeof(ceph_msg_footer));
+
+  messenger->ms_fast_preprocess(message);
+  auto fast_dispatch_time = ceph::mono_clock::now();
+  connection->logger->tinc(l_msgr_running_recv_time,
+                           fast_dispatch_time - connection->recv_start_time);
+  if (connection->delay_state) {
+    // delay injection: with probability ms_inject_delay_probability pick a
+    // delay of up to ms_inject_delay_max; the message always goes through
+    // delay_state (with a zero delay otherwise)
+    double delay_period = 0;
+    if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) {
+      delay_period =
+          cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
+      ldout(cct, 1) << "queue_received will delay after "
+                    << (ceph_clock_now() + delay_period) << " on " << message
+                    << " " << *message << dendl;
+    }
+    connection->delay_state->queue(delay_period, message);
+  } else if (messenger->ms_can_fast_dispatch(message)) {
+    // connection->lock is released around fast_dispatch() and re-taken
+    // afterwards
+    connection->lock.unlock();
+    connection->dispatch_queue->fast_dispatch(message);
+    connection->recv_start_time = ceph::mono_clock::now();
+    connection->logger->tinc(l_msgr_running_fast_dispatch_time,
+                             connection->recv_start_time - fast_dispatch_time);
+    connection->lock.lock();
+  } else {
+    connection->dispatch_queue->enqueue(message, message->get_priority(),
+                                        connection->conn_id);
+  }
+
+  // clean up local buffer references
+  data_buf.clear();
+  front.clear();
+  middle.clear();
+  data.clear();
+
+  if (need_dispatch_writer && connection->is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// Resets the session state of this connection.
+//
+// Under connection->write_lock: discards delayed and queued incoming
+// messages for this connection, discards the outgoing queues, queues a
+// remote-reset notification, re-randomizes out_seq and zeroes the
+// in/connect sequence numbers and the pending ack count.  Writing is
+// disabled (can_write = NOWRITE) and once_ready is cleared.
+//
+// outgoing_bl is deliberately NOT cleared here; the caller has to do that
+// (see the note in the body).
+void ProtocolV1::session_reset() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (connection->delay_state) {
+    connection->delay_state->discard();
+  }
+
+  connection->dispatch_queue->discard_queue(connection->conn_id);
+  // requires write_lock, which is held via `l` above
+  discard_out_queue();
+  // note: we need to clear outgoing_bl here, but session_reset may be
+  // called by other thread, so let caller clear this itself!
+  // outgoing_bl.clear();
+
+  connection->dispatch_queue->queue_remote_reset(connection);
+
+  randomize_out_seq();
+
+  in_seq = 0;
+  connect_seq = 0;
+  // it's safe to directly set 0, double locked
+  ack_left = 0;
+  once_ready = false;
+  can_write = WriteStatus::NOWRITE;
+}
+
+// Picks the starting value of out_seq.
+//
+// When the connection features include CEPH_FEATURE_MSG_AUTH the value is
+// drawn at random from [0, SEQ_MASK]; otherwise it is 0.
+void ProtocolV1::randomize_out_seq() {
+  const bool msg_auth = connection->get_features() & CEPH_FEATURE_MSG_AUTH;
+  if (!msg_auth) {
+    // previously, seq #'s always started at 0.
+    out_seq = 0;
+    return;
+  }
+
+  // Set out_seq to a random value, so CRC won't be predictable.
+  const auto seed = ceph::util::generate_random_number<uint64_t>(0, SEQ_MASK);
+  ldout(cct, 10) << __func__ << " randomize_out_seq " << seed << dendl;
+  out_seq = seed;
+}
+
+// Serialises message `m` into connection->outgoing_bl and tries to send it.
+//
+// Wire order appended to outgoing_bl: CEPH_MSGR_TAG_MSG, the header, the
+// payload `bl`, then the footer (new or old format depending on
+// CEPH_FEATURE_MSG_AUTH).  Must run in the connection's event-center thread
+// (asserted).  One reference on `m` is dropped before returning.
+//
+// @param m     message to send; gets the next out_seq assigned
+// @param bl    already-encoded payload; copied when small and fragmented,
+//              otherwise claimed (moved) into outgoing_bl
+// @param more  forwarded to connection->_try_send()
+// @return the result of connection->_try_send(): negative on error
+ssize_t ProtocolV1::write_message(Message *m, bufferlist &bl, bool more) {
+  FUNCTRACE(cct);
+  ceph_assert(connection->center->in_thread());
+  m->set_seq(++out_seq);
+
+  if (messenger->crcflags & MSG_CRC_HEADER) {
+    m->calc_header_crc();
+  }
+
+  ceph_msg_header &header = m->get_header();
+  ceph_msg_footer &footer = m->get_footer();
+
+  // TODO: let sign_message could be reentry?
+  // Now that we have all the crcs calculated, handle the
+  // digital signature for the message, if the AsyncConnection has session
+  // security set up.  Some session security options do not
+  // actually calculate and check the signature, but they should
+  // handle the calls to sign_message and check_signature.  PLR
+  if (session_security.get() == NULL) {
+    ldout(cct, 20) << __func__ << " no session security" << dendl;
+  } else {
+    // a non-zero return is logged as a failure; sending continues either way
+    if (session_security->sign_message(m)) {
+      ldout(cct, 20) << __func__ << " failed to sign m=" << m
+                     << "): sig = " << footer.sig << dendl;
+    } else {
+      ldout(cct, 20) << __func__ << " signed m=" << m
+                     << "): sig = " << footer.sig << dendl;
+    }
+  }
+
+  connection->outgoing_bl.append(CEPH_MSGR_TAG_MSG);
+  connection->outgoing_bl.append((char *)&header, sizeof(header));
+
+  ldout(cct, 20) << __func__ << " sending message type=" << header.type
+                 << " src " << entity_name_t(header.src)
+                 << " front=" << header.front_len << " data=" << header.data_len
+                 << " off " << header.data_off << dendl;
+
+  // small payloads made of several buffers are copied into outgoing_bl;
+  // everything else is claimed without copying
+  if ((bl.length() <= ASYNC_COALESCE_THRESHOLD) && (bl.buffers().size() > 1)) {
+    for (const auto &pb : bl.buffers()) {
+      connection->outgoing_bl.append((char *)pb.c_str(), pb.length());
+    }
+  } else {
+    connection->outgoing_bl.claim_append(bl);
+  }
+
+  // send footer; if receiver doesn't support signatures, use the old footer
+  // format
+  ceph_msg_footer_old old_footer;
+  if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+    connection->outgoing_bl.append((char *)&footer, sizeof(footer));
+  } else {
+    if (messenger->crcflags & MSG_CRC_HEADER) {
+      old_footer.front_crc = footer.front_crc;
+      old_footer.middle_crc = footer.middle_crc;
+      old_footer.data_crc = footer.data_crc;
+    } else {
+      old_footer.front_crc = old_footer.middle_crc = 0;
+    }
+    // data_crc is decided by MSG_CRC_DATA alone; this overrides the
+    // assignment made in the branch above
+    old_footer.data_crc =
+        messenger->crcflags & MSG_CRC_DATA ? footer.data_crc : 0;
+    old_footer.flags = footer.flags;
+    connection->outgoing_bl.append((char *)&old_footer, sizeof(old_footer));
+  }
+
+  m->trace.event("async writing message");
+  ldout(cct, 20) << __func__ << " sending " << m->get_seq() << " " << m
+                 << dendl;
+  // remember the queued size so the bytes actually sent can be counted
+  ssize_t total_send_size = connection->outgoing_bl.length();
+  ssize_t rc = connection->_try_send(more);
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " error sending " << m << ", "
+                  << cpp_strerror(rc) << dendl;
+  } else {
+    connection->logger->inc(
+        l_msgr_send_bytes, total_send_size - connection->outgoing_bl.length());
+    ldout(cct, 10) << __func__ << " sending " << m
+                   << (rc ? " continuely." : " done.") << dendl;
+  }
+  if (m->get_type() == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false);
+  else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false);
+  m->put();
+
+  return rc;
+}
+
+// Moves every message in `sent` back to the front of the highest-priority
+// out queue, preserving their order, and rewinds out_seq by the number of
+// messages moved.  Always clears write_in_progress.
+void ProtocolV1::requeue_sent() {
+  write_in_progress = false;
+  if (sent.empty()) {
+    return;
+  }
+
+  auto &resend_q = out_q[CEPH_MSG_PRIO_HIGHEST];
+  out_seq -= sent.size();
+
+  // walk newest-to-oldest so that push_front leaves the oldest message first
+  for (auto it = sent.rbegin(); it != sent.rend(); ++it) {
+    Message *msg = *it;
+    ldout(cct, 10) << __func__ << " " << *msg << " for resend "
+                   << " (" << msg->get_seq() << ")" << dendl;
+    resend_q.push_front(make_pair(bufferlist(), msg));
+  }
+  sent.clear();
+}
+
+// Drops requeued messages that the peer has already acknowledged.
+//
+// Under connection->write_lock, pops messages from the front of the
+// highest-priority out queue while their seq is non-zero and <= `seq`,
+// releasing one reference on each.  The queue is removed from out_q once it
+// becomes empty.
+//
+// @param out_seq  starting value for the returned counter (shadows the
+//                 member of the same name)
+// @param seq      highest sequence number acknowledged by the peer
+// @return `seq` if there is no highest-priority queue at all, otherwise
+//         `out_seq` plus the number of messages discarded
+uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
+  ldout(cct, 10) << __func__ << " " << seq << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  // single lookup instead of count() + operator[] + erase(key)
+  auto q_it = out_q.find(CEPH_MSG_PRIO_HIGHEST);
+  if (q_it == out_q.end()) {
+    return seq;
+  }
+  auto &rq = q_it->second;
+  uint64_t count = out_seq;
+  while (!rq.empty()) {
+    // only the message pointer is needed; do not copy the queued
+    // (bufferlist, Message*) pair
+    Message *m = rq.front().second;
+    if (m->get_seq() == 0 || m->get_seq() > seq) break;
+    ldout(cct, 10) << __func__ << " " << *m << " for resend seq "
+                   << m->get_seq() << " <= " << seq << ", discarding"
+                   << dendl;
+    m->put();
+    rq.pop_front();
+    count++;
+  }
+  if (rq.empty()) out_q.erase(q_it);
+  return count;
+}
+
+/*
+ * Tears down the outgoing message queues: releases one reference on every
+ * message in `sent` and in every priority list of `out_q`, empties both
+ * containers and clears write_in_progress.
+ *
+ * Must hold write_lock prior to calling.
+ */
+void ProtocolV1::discard_out_queue() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  for (Message *m : sent) {
+    ldout(cct, 20) << __func__ << " discard " << m << dendl;
+    m->put();
+  }
+  sent.clear();
+
+  for (auto &prio_queue : out_q) {
+    for (auto &entry : prio_queue.second) {
+      ldout(cct, 20) << __func__ << " discard " << entry.second << dendl;
+      entry.second->put();
+    }
+  }
+  out_q.clear();
+
+  write_in_progress = false;
+}
+
+// Frees the authorizer, but only while in CONNECTING_SEND_CONNECT_MSG; in
+// any other state this just logs.
+void ProtocolV1::reset_security()
+{
+  ldout(cct, 5) << __func__ << dendl;
+
+  // clean up state internal variables and states
+  if (state != CONNECTING_SEND_CONNECT_MSG) {
+    return;
+  }
+
+  delete authorizer;  // deleting a null pointer is a no-op
+  authorizer = nullptr;
+}
+
+// Resets receive-side state: security handlers, pending read/write
+// callbacks, and any throttle budget still held for a message that was
+// being received (state at most READ_FOOTER_AND_DISPATCH).
+void ProtocolV1::reset_recv_state() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  // execute in the same thread that uses the `session_security`.
+  // We need to do the warp because holding `write_lock` is not
+  // enough as `write_event()` releases it just before calling
+  // `write_message()`. `submit_to()` here is NOT blocking.
+  if (connection->center->in_thread()) {
+    reset_security();
+  } else {
+    connection->center->submit_to(connection->center->get_id(), [this] {
+      ldout(cct, 5) << "reset_recv_state (warped) reseting security handlers"
+                    << dendl;
+      // Possibly unnecessary. See the comment in `deactivate_existing`.
+      std::lock_guard<std::mutex> l(connection->lock);
+      std::lock_guard<std::mutex> wl(connection->write_lock);
+      reset_security();
+    }, /* nowait = */true);
+  }
+
+  // clean read and write callbacks
+  connection->pendingReadLen.reset();
+  connection->writeCallback.reset();
+
+  // every release below requires state <= READ_FOOTER_AND_DISPATCH
+  if (state > READ_FOOTER_AND_DISPATCH) {
+    return;
+  }
+
+  // message-count throttle: held once we are past THROTTLE_MESSAGE
+  if (state > THROTTLE_MESSAGE && connection->policy.throttler_messages) {
+    ldout(cct, 10) << __func__ << " releasing " << 1
+                   << " message to policy throttler "
+                   << connection->policy.throttler_messages->get_current()
+                   << "/" << connection->policy.throttler_messages->get_max()
+                   << dendl;
+    connection->policy.throttler_messages->put();
+  }
+
+  // byte throttle: held once we are past THROTTLE_BYTES
+  if (state > THROTTLE_BYTES && connection->policy.throttler_bytes) {
+    ldout(cct, 10) << __func__ << " releasing " << cur_msg_size
+                   << " bytes to policy throttler "
+                   << connection->policy.throttler_bytes->get_current() << "/"
+                   << connection->policy.throttler_bytes->get_max() << dendl;
+    connection->policy.throttler_bytes->put(cur_msg_size);
+  }
+
+  // dispatch-queue throttle: held once we are past THROTTLE_DISPATCH_QUEUE
+  if (state > THROTTLE_DISPATCH_QUEUE) {
+    ldout(cct, 10)
+        << __func__ << " releasing " << cur_msg_size
+        << " bytes to dispatch_queue throttler "
+        << connection->dispatch_queue->dispatch_throttler.get_current() << "/"
+        << connection->dispatch_queue->dispatch_throttler.get_max() << dendl;
+    connection->dispatch_queue->dispatch_throttle_release(cur_msg_size);
+  }
+}
+
+// Pops the first message of the highest-keyed queue in out_q.
+//
+// @param bl  if non-null, receives (by swap) the bufferlist queued with the
+//            message
+// @return the dequeued message, or a null pointer when out_q is empty
+Message *ProtocolV1::_get_next_outgoing(bufferlist *bl) {
+  if (out_q.empty()) {
+    return 0;
+  }
+
+  // the map is ordered by priority, so the last entry is the highest
+  auto highest = out_q.rbegin();
+  auto &queue = highest->second;
+  ceph_assert(!queue.empty());
+
+  auto &entry = queue.front();
+  Message *next = entry.second;
+  if (bl) bl->swap(entry.first);
+  queue.pop_front();
+
+  // never leave an empty priority list behind
+  if (queue.empty()) out_q.erase(highest->first);
+  return next;
+}
+
+/**
+ * Client Protocol V1
+ **/
+
+// First step of the client handshake: enters CONNECTING and writes
+// CEPH_BANNER; handle_client_banner_write is the continuation.
+CtPtr ProtocolV1::send_client_banner() {
+  ldout(cct, 20) << __func__ << dendl;
+  state = CONNECTING;
+
+  const auto banner_len = strlen(CEPH_BANNER);
+  bufferlist banner_bl;
+  banner_bl.append(CEPH_BANNER, banner_len);
+
+  return WRITE(banner_bl, handle_client_banner_write);
+}
+
+// Continuation run when the client banner write completes.
+//
+// @param r  write result; negative means the write failed
+// @return _fault() on failure, otherwise the result of wait_server_banner()
+CtPtr ProtocolV1::handle_client_banner_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool write_ok = r >= 0;
+  if (!write_ok) {
+    ldout(cct, 1) << __func__ << " write client banner failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 10) << __func__ << " connect write banner done: "
+                 << connection->get_peer_addr() << dendl;
+  return wait_server_banner();
+}
+
+// Enters CONNECTING_WAIT_BANNER_AND_IDENTIFY and reads the server's reply
+// to our banner: the banner itself followed by two ceph_entity_addr
+// structures.  handle_server_banner_and_identify is the continuation.
+CtPtr ProtocolV1::wait_server_banner() {
+  state = CONNECTING_WAIT_BANNER_AND_IDENTIFY;
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  // (the unused local bufferlist that used to be constructed here is gone)
+  unsigned banner_len = strlen(CEPH_BANNER);
+  unsigned need_len = banner_len + sizeof(ceph_entity_addr) * 2;
+  return READ(need_len, handle_server_banner_and_identify);
+}
+
+// Continuation for the read issued by wait_server_banner().
+//
+// Validates the server banner, decodes the two addresses that follow it
+// (the server's own address and our address as the server sees it), checks
+// the server's claimed address against the one we connected to, learns our
+// own address if the messenger does not have one yet, and finally writes
+// our legacy address back (handle_my_addr_write is the continuation).
+//
+// @param buffer  banner followed by two encoded ceph_entity_addr
+// @param r       read result; negative means the read failed
+// @return the next continuation, _fault() on error, or nullptr if the
+//         connection state changed while connection->lock was released
+CtPtr ProtocolV1::handle_server_banner_and_identify(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read banner and identify addresses failed"
+                  << dendl;
+    return _fault();
+  }
+
+  unsigned banner_len = strlen(CEPH_BANNER);
+  if (memcmp(buffer, CEPH_BANNER, banner_len)) {
+    ldout(cct, 0) << __func__ << " connect protocol error (bad banner) on peer "
+                  << connection->get_peer_addr() << dendl;
+    return _fault();
+  }
+
+  bufferlist bl;
+  entity_addr_t paddr, peer_addr_for_me;
+
+  bl.append(buffer + banner_len, sizeof(ceph_entity_addr) * 2);
+  auto p = bl.cbegin();
+  try {
+    decode(paddr, p);
+    decode(peer_addr_for_me, p);
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << " decode peer addr failed " << dendl;
+    return _fault();
+  }
+  ldout(cct, 20) << __func__ << " connect read peer addr " << paddr
+                 << " on socket " << connection->cs.fd() << dendl;
+
+  // A mismatch is tolerated only when the peer reports a blank IP with the
+  // port and nonce we expected.
+  entity_addr_t peer_addr = connection->peer_addrs->legacy_addr();
+  if (peer_addr != paddr) {
+    if (paddr.is_blank_ip() && peer_addr.get_port() == paddr.get_port() &&
+        peer_addr.get_nonce() == paddr.get_nonce()) {
+      ldout(cct, 0) << __func__ << " connect claims to be " << paddr << " not "
+                    << peer_addr << " - presumably this is the same node!"
+                    << dendl;
+    } else {
+      ldout(cct, 10) << __func__ << " connect claims to be " << paddr << " not "
+                     << peer_addr << dendl;
+      return _fault();
+    }
+  }
+
+  ldout(cct, 20) << __func__ << " connect peer addr for me is "
+                 << peer_addr_for_me << dendl;
+  if (messenger->get_myaddrs().empty() ||
+      messenger->get_myaddrs().front().is_blank_ip()) {
+    // Zero-initialise so `ss` has a defined value even if getsockname()
+    // fails; previously the result was ignored and an indeterminate
+    // sockaddr could be logged or adopted as our address.
+    sockaddr_storage ss = {};
+    socklen_t len = sizeof(ss);
+    const bool have_sockname =
+        getsockname(connection->cs.fd(), (sockaddr *)&ss, &len) == 0;
+    if (!have_sockname) {
+      ldout(cct, 1) << __func__ << " getsockname failed on socket "
+                    << connection->cs.fd() << dendl;
+    }
+    entity_addr_t a;
+    if (cct->_conf->ms_learn_addr_from_peer) {
+      ldout(cct, 1) << __func__ << " peer " << connection->target_addr
+                    << " says I am " << peer_addr_for_me << " (socket says "
+                    << (sockaddr*)&ss << ")" << dendl;
+      a = peer_addr_for_me;
+    } else {
+      if (!have_sockname) {
+        // the socket is our only source for the address in this mode
+        return _fault();
+      }
+      ldout(cct, 1) << __func__ << " socket to  " << connection->target_addr
+                    << " says I am " << (sockaddr*)&ss
+                    << " (peer says " << peer_addr_for_me << ")" << dendl;
+      a.set_sockaddr((sockaddr *)&ss);
+    }
+    a.set_type(entity_addr_t::TYPE_LEGACY); // anything but NONE; learned_addr ignores this
+    a.set_port(0);
+    // connection->lock is dropped around learned_addr() (and the optional
+    // injected sleep); the state is re-validated once it is re-taken
+    connection->lock.unlock();
+    messenger->learned_addr(a);
+    if (cct->_conf->ms_inject_internal_delays &&
+        cct->_conf->ms_inject_socket_failures) {
+      if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+        ldout(cct, 10) << __func__ << " sleep for "
+                       << cct->_conf->ms_inject_internal_delays << dendl;
+        utime_t t;
+        t.set_from_double(cct->_conf->ms_inject_internal_delays);
+        t.sleep();
+      }
+    }
+    connection->lock.lock();
+    if (state != CONNECTING_WAIT_BANNER_AND_IDENTIFY) {
+      ldout(cct, 1) << __func__
+                  << " state changed while learned_addr, mark_down or "
+                    << " replacing must be happened just now" << dendl;
+      return nullptr;
+    }
+  }
+
+  bufferlist myaddrbl;
+  encode(messenger->get_myaddr_legacy(), myaddrbl, 0);  // legacy
+  return WRITE(myaddrbl, handle_my_addr_write);
+}
+
+// Continuation run when our legacy address has been written to the server.
+//
+// @param r  write result; negative means the write failed
+// @return _fault() on failure, otherwise continues with
+//         send_connect_message
+CtPtr ProtocolV1::handle_my_addr_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool write_ok = r >= 0;
+  if (!write_ok) {
+    ldout(cct, 2) << __func__ << " connect couldn't write my addr, "
+                  << cpp_strerror(r) << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 10) << __func__ << " connect sent my addr "
+                 << messenger->get_myaddr_legacy() << dendl;
+  return CONTINUE(send_connect_message);
+}
+
+// Builds and writes the ceph_msg_connect request, followed by the
+// authorizer payload when there is one.  Enters
+// CONNECTING_SEND_CONNECT_MSG; handle_connect_message_write is the
+// continuation.
+CtPtr ProtocolV1::send_connect_message() {
+  state = CONNECTING_SEND_CONNECT_MSG;
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  // keep an authorizer left over from an earlier attempt, otherwise ask
+  // the messenger for one (which may still yield none)
+  if (authorizer == nullptr) {
+    authorizer = messenger->ms_deliver_get_authorizer(connection->peer_type);
+  }
+
+  ceph_msg_connect connect;
+  connect.features = connection->policy.features_supported;
+  connect.host_type = messenger->get_myname().type();
+  connect.global_seq = global_seq;
+  connect.connect_seq = connect_seq;
+  connect.protocol_version =
+      messenger->get_proto_version(connection->peer_type, true);
+  // this is fyi, actually, server decides!
+  connect.flags = connection->policy.lossy ? CEPH_MSG_CONNECT_LOSSY : 0;
+
+  if (authorizer) {
+    connect.authorizer_protocol = authorizer->protocol;
+    connect.authorizer_len = authorizer->bl.length();
+    ldout(cct, 10) << __func__
+                   << " connect_msg.authorizer_len=" << connect.authorizer_len
+                   << " protocol=" << connect.authorizer_protocol << dendl;
+  } else {
+    connect.authorizer_protocol = 0;
+    connect.authorizer_len = 0;
+  }
+
+  // wire layout: fixed-size connect struct, then the raw authorizer bytes
+  bufferlist payload;
+  payload.append((char *)&connect, sizeof(connect));
+  if (authorizer) {
+    payload.append(authorizer->bl.c_str(), authorizer->bl.length());
+  }
+
+  ldout(cct, 10) << __func__ << " connect sending gseq=" << global_seq
+                 << " cseq=" << connect_seq
+                 << " proto=" << connect.protocol_version << dendl;
+
+  return WRITE(payload, handle_connect_message_write);
+}
+
+// Continuation run when the connect message has been written.
+//
+// @param r  write result; negative means the write failed
+// @return _fault() on failure, otherwise the result of wait_connect_reply()
+CtPtr ProtocolV1::handle_connect_message_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool write_ok = r >= 0;
+  if (!write_ok) {
+    ldout(cct, 2) << __func__ << " connect couldn't send reply "
+                  << cpp_strerror(r) << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__
+                 << " connect wrote (self +) cseq, waiting for reply" << dendl;
+  return wait_connect_reply();
+}
+
+// Clears connect_reply and reads a fresh ceph_msg_connect_reply-sized
+// chunk; handle_connect_reply_1 is the continuation.
+CtPtr ProtocolV1::wait_connect_reply() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const auto reply_len = sizeof(connect_reply);
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&connect_reply, 0, reply_len);
+  return READ(reply_len, handle_connect_reply_1);
+}
+
+// Continuation for the read issued by wait_connect_reply().
+//
+// Stores the reply and, if it announces an authorizer payload, reads that
+// next; otherwise goes straight to handle_connect_reply_2().
+//
+// @param buffer  bytes of the ceph_msg_connect_reply that was read
+// @param r       read result; negative means the read failed
+CtPtr ProtocolV1::handle_connect_reply_1(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool read_ok = r >= 0;
+  if (!read_ok) {
+    ldout(cct, 1) << __func__ << " read connect reply failed" << dendl;
+    return _fault();
+  }
+
+  connect_reply = *((ceph_msg_connect_reply *)buffer);
+
+  ldout(cct, 20) << __func__ << " connect got reply tag "
+                 << (int)connect_reply.tag << " connect_seq "
+                 << connect_reply.connect_seq << " global_seq "
+                 << connect_reply.global_seq << " proto "
+                 << connect_reply.protocol_version << " flags "
+                 << (int)connect_reply.flags << " features "
+                 << connect_reply.features << dendl;
+
+  // a non-zero authorizer_len means more bytes follow the fixed-size reply
+  return connect_reply.authorizer_len ? wait_connect_reply_auth()
+                                      : handle_connect_reply_2();
+}
+
+// Reads the authorizer payload announced by connect_reply.authorizer_len;
+// handle_connect_reply_auth is the continuation.
+//
+// The length comes from the peer, so an out-of-range value fails this
+// connection via _fault() instead of asserting (which would let a remote
+// peer abort the whole process).  The accepted range (< 4096) is unchanged.
+CtPtr ProtocolV1::wait_connect_reply_auth() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  ldout(cct, 10) << __func__
+                 << " reply.authorizer_len=" << connect_reply.authorizer_len
+                 << dendl;
+
+  if (!(connect_reply.authorizer_len < 4096)) {
+    ldout(cct, 0) << __func__ << " peer sent oversized authorizer_len "
+                  << connect_reply.authorizer_len << ", failing connection"
+                  << dendl;
+    return _fault();
+  }
+
+  return READ(connect_reply.authorizer_len, handle_connect_reply_auth);
+}
+
+// Continuation for the read issued by wait_connect_reply_auth().
+//
+// On CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER the challenge is added to our
+// authorizer and the connect message is re-sent.  Otherwise the payload is
+// verified as the authorizer reply (when we have an authorizer) before
+// moving on to handle_connect_reply_2().
+//
+// @param buffer  authorizer payload of connect_reply.authorizer_len bytes
+// @param r       read result; negative means the read failed
+// @return the next continuation, or _fault() on error
+CtPtr ProtocolV1::handle_connect_reply_auth(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect reply authorizer failed"
+                  << dendl;
+    return _fault();
+  }
+
+  bufferlist authorizer_reply;
+  authorizer_reply.append(buffer, connect_reply.authorizer_len);
+
+  if (connect_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+    ldout(cct, 10) << __func__ << " connect got auth challenge" << dendl;
+    // send_connect_message() may have been unable to obtain an authorizer;
+    // the tag is peer-controlled, so do not dereference a null pointer.
+    if (!authorizer) {
+      ldout(cct, 0) << __func__
+                    << " got auth challenge but have no authorizer" << dendl;
+      return _fault();
+    }
+    authorizer->add_challenge(cct, authorizer_reply);
+    return CONTINUE(send_connect_message);
+  }
+
+  auto iter = authorizer_reply.cbegin();
+  if (authorizer && !authorizer->verify_reply(iter,
+                                              nullptr /* connection_secret */)) {
+    ldout(cct, 0) << __func__ << " failed verifying authorize reply" << dendl;
+    return _fault();
+  }
+
+  return handle_connect_reply_2();
+}
+
+// Acts on the tag of the connect reply stored in connect_reply.
+//
+// Rejections (FEATURES, BADPROTOVER, BADAUTHORIZER, WAIT) fault the
+// connection; RESETSESSION, RETRY_GLOBAL and RETRY_SESSION adjust local
+// state and re-send the connect message.  For any other tag the peer's
+// features are checked against policy.features_required, after which SEQ
+// leads to the ack-seq exchange and everything else to client_ready().
+CtPtr ProtocolV1::handle_connect_reply_2() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  switch (connect_reply.tag) {
+    case CEPH_MSGR_TAG_FEATURES: {
+      ldout(cct, 0) << __func__ << " connect protocol feature mismatch, my "
+                    << std::hex << connection->policy.features_supported
+                    << " < peer " << connect_reply.features << " missing "
+                    << (connect_reply.features &
+                        ~connection->policy.features_supported)
+                    << std::dec << dendl;
+      return _fault();
+    }
+
+    case CEPH_MSGR_TAG_BADPROTOVER: {
+      ldout(cct, 0) << __func__ << " connect protocol version mismatch, my "
+                    << messenger->get_proto_version(connection->peer_type, true)
+                    << " != " << connect_reply.protocol_version << dendl;
+      return _fault();
+    }
+
+    case CEPH_MSGR_TAG_BADAUTHORIZER: {
+      ldout(cct, 0) << __func__ << " connect got BADAUTHORIZER" << dendl;
+      return _fault();
+    }
+
+    case CEPH_MSGR_TAG_RESETSESSION: {
+      ldout(cct, 0) << __func__ << " connect got RESETSESSION" << dendl;
+      session_reset();
+      connect_seq = 0;
+
+      // see session_reset
+      connection->outgoing_bl.clear();
+
+      return CONTINUE(send_connect_message);
+    }
+
+    case CEPH_MSGR_TAG_RETRY_GLOBAL: {
+      global_seq = messenger->get_global_seq(connect_reply.global_seq);
+      ldout(cct, 5) << __func__ << " connect got RETRY_GLOBAL "
+                    << connect_reply.global_seq << " chose new " << global_seq
+                    << dendl;
+      return CONTINUE(send_connect_message);
+    }
+
+    case CEPH_MSGR_TAG_RETRY_SESSION: {
+      // NOTE(review): this asserts on a peer-supplied value -- confirm that
+      // aborting is the intended reaction to a misbehaving peer.
+      ceph_assert(connect_reply.connect_seq > connect_seq);
+      ldout(cct, 5) << __func__ << " connect got RETRY_SESSION " << connect_seq
+                    << " -> " << connect_reply.connect_seq << dendl;
+      connect_seq = connect_reply.connect_seq;
+      return CONTINUE(send_connect_message);
+    }
+
+    case CEPH_MSGR_TAG_WAIT: {
+      ldout(cct, 1) << __func__ << " connect got WAIT (connection race)"
+                    << dendl;
+      state = WAIT;
+      return _fault();
+    }
+
+    default:
+      break;
+  }
+
+  // the peer must offer every feature our policy requires
+  const uint64_t feat_missing =
+      connection->policy.features_required & ~(uint64_t)connect_reply.features;
+  if (feat_missing) {
+    ldout(cct, 1) << __func__ << " missing required features " << std::hex
+                  << feat_missing << std::dec << dendl;
+    return _fault();
+  }
+
+  if (connect_reply.tag == CEPH_MSGR_TAG_SEQ) {
+    ldout(cct, 10)
+        << __func__
+        << " got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq"
+        << dendl;
+
+    return wait_ack_seq();
+  }
+
+  if (connect_reply.tag == CEPH_MSGR_TAG_READY) {
+    ldout(cct, 10) << __func__ << " got CEPH_MSGR_TAG_READY " << dendl;
+  }
+
+  return client_ready();
+}
+
+// Reads the 64-bit sequence number the peer sends after
+// CEPH_MSGR_TAG_SEQ; handle_ack_seq is the continuation.
+CtPtr ProtocolV1::wait_ack_seq() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const auto seq_len = sizeof(uint64_t);
+  return READ(seq_len, handle_ack_seq);
+}
+
+// Continuation for the read issued by wait_ack_seq().
+//
+// Takes the sequence number the peer has acknowledged, discards requeued
+// messages up to it (which also yields the new out_seq), then writes our
+// own in_seq back; handle_in_seq_write is the continuation.
+//
+// @param buffer  8 bytes holding the peer's acked sequence number
+// @param r       read result; negative means the read failed
+CtPtr ProtocolV1::handle_ack_seq(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect ack seq failed" << dendl;
+    return _fault();
+  }
+
+  // Copy the bytes out rather than dereferencing a casted pointer: `buffer`
+  // is a char buffer with no alignment guarantee for uint64_t.
+  uint64_t newly_acked_seq = 0;
+  memcpy(&newly_acked_seq, buffer, sizeof(newly_acked_seq));
+  ldout(cct, 2) << __func__ << " got newly_acked_seq " << newly_acked_seq
+                << " vs out_seq " << out_seq << dendl;
+  out_seq = discard_requeued_up_to(out_seq, newly_acked_seq);
+
+  bufferlist bl;
+  uint64_t s = in_seq;
+  bl.append((char *)&s, sizeof(s));
+
+  return WRITE(bl, handle_in_seq_write);
+}
+
+// Continuation run when our in_seq has been written to the peer.
+//
+// @param r  write result; negative means the write failed
+// @return _fault() on failure, otherwise the result of client_ready()
+CtPtr ProtocolV1::handle_in_seq_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool write_ok = r >= 0;
+  if (!write_ok) {
+    ldout(cct, 10) << __func__ << " failed to send in_seq " << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 10) << __func__ << " send in_seq done " << dendl;
+  return client_ready();
+}
+
+// Final step of the client handshake.
+//
+// Adopts the negotiated parameters from connect_reply (peer global seq,
+// lossy flag, feature intersection), bumps connect_seq, installs or clears
+// the session security handler depending on whether we have an authorizer,
+// notifies the dispatch queue and the messenger, and finishes in ready().
+CtPtr ProtocolV1::client_ready() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // hooray!
+  peer_global_seq = connect_reply.global_seq;
+  connection->policy.lossy = connect_reply.flags & CEPH_MSG_CONNECT_LOSSY;
+
+  once_ready = true;
+  ++connect_seq;
+  ceph_assert(connect_seq == connect_reply.connect_seq);
+  backoff = utime_t();
+
+  // only features both sides support are used on this connection
+  const uint64_t negotiated = (uint64_t)connect_reply.features &
+                              (uint64_t)connection->policy.features_supported;
+  connection->set_features(negotiated);
+  ldout(cct, 10) << __func__ << " connect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy << ", features "
+                 << connection->get_features() << dendl;
+
+  if (!authorizer) {
+    // We have no authorizer, so we shouldn't be applying security to messages
+    // in this AsyncConnection.  PLR
+    ldout(cct, 10) << __func__ << " no authorizer, clearing session_security"
+                   << dendl;
+    session_security.reset();
+  } else {
+    // If we have an authorizer, get a new AuthSessionHandler to deal with
+    // ongoing security of the connection.  PLR
+    ldout(cct, 10) << __func__ << " setting up session_security with auth "
+                   << authorizer << dendl;
+    session_security.reset(get_auth_session_handler(
+        cct, authorizer->protocol, authorizer->session_key,
+        connection->get_features()));
+  }
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+/**
+ * Server Protocol V1
+ **/
+
+// First step of the server handshake: enters ACCEPTING and writes the
+// banner followed by our legacy address and the peer's address as we see
+// it.  handle_server_banner_write is the continuation.
+CtPtr ProtocolV1::send_server_banner() {
+  ldout(cct, 20) << __func__ << dendl;
+  state = ACCEPTING;
+
+  // as a server, we should have a legacy addr if we accepted this connection.
+  auto legacy = messenger->get_myaddrs().legacy_addr();
+  connection->port = legacy.get_port();
+
+  bufferlist banner_bl;
+  banner_bl.append(CEPH_BANNER, strlen(CEPH_BANNER));
+  encode(legacy, banner_bl, 0);                   // legacy
+  encode(connection->target_addr, banner_bl, 0);  // legacy
+
+  ldout(cct, 1) << __func__ << " sd=" << connection->cs.fd()
+                << " legacy " << legacy
+                << " socket_addr " << connection->socket_addr
+                << " target_addr " << connection->target_addr
+                << dendl;
+
+  return WRITE(banner_bl, handle_server_banner_write);
+}
+
+// Continuation run when the server banner and addresses have been written.
+//
+// @param r  write result; negative means the write failed
+// @return _fault() on failure, otherwise the result of wait_client_banner()
+CtPtr ProtocolV1::handle_server_banner_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool write_ok = r >= 0;
+  if (!write_ok) {
+    ldout(cct, 1) << " write server banner failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 10) << __func__ << " write banner and addr done: "
+                 << connection->get_peer_addr() << dendl;
+  return wait_client_banner();
+}
+
+// Reads the client's banner plus one ceph_entity_addr;
+// handle_client_banner is the continuation.
+CtPtr ProtocolV1::wait_client_banner() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const auto need_len = strlen(CEPH_BANNER) + sizeof(ceph_entity_addr);
+  return READ(need_len, handle_client_banner);
+}
+
+/**
+ * Read continuation for wait_client_banner(): validate the client's banner
+ * and decode the address that follows it.
+ *
+ * @param buffer  the strlen(CEPH_BANNER) + sizeof(ceph_entity_addr) bytes
+ *                requested by wait_client_banner(); raw wire bytes with no
+ *                terminating NUL
+ * @param r       read result; negative on failure
+ * @return        the wait_connect_message continuation on success, or the
+ *                result of _fault() on read error, banner mismatch or an
+ *                undecodable address
+ */
+CtPtr ProtocolV1::handle_client_banner(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner and addr failed" << dendl;
+    return _fault();
+  }
+
+  const size_t banner_len = strlen(CEPH_BANNER);
+  if (memcmp(buffer, CEPH_BANNER, banner_len)) {
+    // buffer is not NUL-terminated, so streaming it as a C string could read
+    // past the bytes we received; print exactly the banner-sized prefix.
+    ldout(cct, 1) << __func__ << " accept peer sent bad banner '"
+                  << std::string(buffer, banner_len)
+                  << "' (should be '" << CEPH_BANNER << "')" << dendl;
+    return _fault();
+  }
+
+  bufferlist addr_bl;
+  entity_addr_t peer_addr;
+
+  // the peer's address immediately follows the banner
+  addr_bl.append(buffer + banner_len, sizeof(ceph_entity_addr));
+  try {
+    auto ti = addr_bl.cbegin();
+    decode(peer_addr, ti);
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << " decode peer_addr failed " << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 10) << __func__ << " accept peer addr is " << peer_addr << dendl;
+  if (peer_addr.is_blank_ip()) {
+    // peer apparently doesn't know what ip they have; figure it out for them:
+    // take the socket-level address but keep the port the peer advertised.
+    int port = peer_addr.get_port();
+    peer_addr.set_sockaddr(connection->target_addr.get_sockaddr());
+    peer_addr.set_port(port);
+
+    ldout(cct, 0) << __func__ << " accept peer addr is really " << peer_addr
+                  << " (socket is " << connection->target_addr << ")" << dendl;
+  }
+  connection->set_peer_addr(peer_addr);  // so that connection_state gets set up
+  connection->target_addr = peer_addr;
+
+  return CONTINUE(wait_connect_message);
+}
+
+// Clear any previous connect message and request a read of the next one;
+// the bytes are delivered to handle_connect_message_1().
+CtPtr ProtocolV1::wait_connect_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t connect_msg_len = sizeof(connect_msg);
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&connect_msg, 0, connect_msg_len);
+  return READ(connect_msg_len, handle_connect_message_1);
+}
+
+// Read continuation for wait_connect_message(): store the fixed-size connect
+// message, then either read the authorizer payload it announces or go
+// straight to handle_connect_message_2() when there is none.
+CtPtr ProtocolV1::handle_connect_message_1(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect msg failed" << dendl;
+    return _fault();
+  }
+
+  const auto *wire_msg = reinterpret_cast<const ceph_msg_connect *>(buffer);
+  connect_msg = *wire_msg;
+  state = ACCEPTING_WAIT_CONNECT_MSG_AUTH;
+
+  // a non-zero authorizer_len means an authorizer blob follows on the wire
+  return connect_msg.authorizer_len ? wait_connect_message_auth()
+                                    : handle_connect_message_2();
+}
+
+// Size authorizer_buf to the length announced in the connect message and
+// request a read of the authorizer directly into it; completion is delivered
+// to handle_connect_message_auth().
+CtPtr ProtocolV1::wait_connect_message_auth() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const unsigned auth_len = connect_msg.authorizer_len;
+  authorizer_buf.clear();
+  authorizer_buf.push_back(buffer::create(auth_len));
+
+  return READB(auth_len, authorizer_buf.c_str(), handle_connect_message_auth);
+}
+
+// Read continuation for wait_connect_message_auth(): the authorizer has been
+// read into authorizer_buf (the buffer argument is not used here), so carry
+// on with handle_connect_message_2() unless the read failed.
+CtPtr ProtocolV1::handle_connect_message_auth(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool read_ok = !(r < 0);
+  if (read_ok) {
+    return handle_connect_message_2();
+  }
+
+  ldout(cct, 1) << __func__ << " read connect authorizer failed" << dendl;
+  return _fault();
+}
+
+/**
+ * Server-side processing of a fully received connect message (and its
+ * authorizer, if any, in authorizer_buf).
+ *
+ * In order, this:
+ *   1. records the peer type and looks up the messenger policy for it;
+ *   2. rejects a protocol version mismatch (BADPROTOVER);
+ *   3. for cephx peers, adds MSG_AUTH / CEPHX_V2 to the required features
+ *      according to configuration, then rejects peers missing any required
+ *      feature (FEATURES);
+ *   4. verifies the authorizer with connection->lock released, replying
+ *      CHALLENGE_AUTHORIZER or BADAUTHORIZER on failure;
+ *   5. looks up an existing connection for the same peer addresses and
+ *      resolves the race/reconnect/reset cases by replying RETRY_GLOBAL,
+ *      RETRY_SESSION, WAIT or RESETSESSION, or by calling replace();
+ *   6. otherwise accepts the new session via open().
+ *
+ * NOTE(review): connection->lock appears to be held on entry (it is unlocked
+ * and re-locked below) -- confirm against the callers.
+ *
+ * @return the next continuation, as returned by send_connect_message_reply(),
+ *         replace(), open() or _fault()
+ */
+CtPtr ProtocolV1::handle_connect_message_2() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  ldout(cct, 20) << __func__ << " accept got peer connect_seq "
+                 << connect_msg.connect_seq << " global_seq "
+                 << connect_msg.global_seq << dendl;
+
+  // the policy is selected by the host type the peer claims in its message
+  connection->set_peer_type(connect_msg.host_type);
+  connection->policy = messenger->get_policy(connect_msg.host_type);
+
+  ldout(cct, 10) << __func__ << " accept of host_type " << connect_msg.host_type
+                 << ", policy.lossy=" << connection->policy.lossy
+                 << " policy.server=" << connection->policy.server
+                 << " policy.standby=" << connection->policy.standby
+                 << " policy.resetcheck=" << connection->policy.resetcheck
+		 << " features 0x" << std::hex << (uint64_t)connect_msg.features
+		 << std::dec
+                 << dendl;
+
+  ceph_msg_connect_reply reply;
+  bufferlist authorizer_reply;
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&reply, 0, sizeof(reply));
+  reply.protocol_version =
+      messenger->get_proto_version(connection->peer_type, false);
+
+  // mismatch?
+  ldout(cct, 10) << __func__ << " accept my proto " << reply.protocol_version
+                 << ", their proto " << connect_msg.protocol_version << dendl;
+
+  if (connect_msg.protocol_version != reply.protocol_version) {
+    return send_connect_message_reply(CEPH_MSGR_TAG_BADPROTOVER, reply,
+                                      authorizer_reply);
+  }
+
+  // require signatures for cephx?
+  // OSD/MDS/MGR peers are governed by the "cluster" options, everything else
+  // by the "service" options; the global options apply to both.
+  if (connect_msg.authorizer_protocol == CEPH_AUTH_CEPHX) {
+    if (connection->peer_type == CEPH_ENTITY_TYPE_OSD ||
+        connection->peer_type == CEPH_ENTITY_TYPE_MDS ||
+        connection->peer_type == CEPH_ENTITY_TYPE_MGR) {
+      if (cct->_conf->cephx_require_signatures ||
+          cct->_conf->cephx_cluster_require_signatures) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring MSG_AUTH feature bit for cluster"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+      }
+      if (cct->_conf->cephx_require_version >= 2 ||
+          cct->_conf->cephx_cluster_require_version >= 2) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring cephx v2 feature bit for cluster"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+      }
+    } else {
+      if (cct->_conf->cephx_require_signatures ||
+          cct->_conf->cephx_service_require_signatures) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring MSG_AUTH feature bit for service"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+      }
+      if (cct->_conf->cephx_require_version >= 2 ||
+          cct->_conf->cephx_service_require_version >= 2) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring cephx v2 feature bit for service"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+      }
+    }
+  }
+
+  // required features that the peer did not advertise
+  uint64_t feat_missing =
+      connection->policy.features_required & ~(uint64_t)connect_msg.features;
+  if (feat_missing) {
+    ldout(cct, 1) << __func__ << " peer missing required features " << std::hex
+                  << feat_missing << std::dec << dendl;
+    return send_connect_message_reply(CEPH_MSGR_TAG_FEATURES, reply,
+                                      authorizer_reply);
+  }
+
+  // Work on a copy of the authorizer and drop connection->lock while it is
+  // verified; `state` is re-checked after the lock is taken again because it
+  // may have changed in the meantime.
+  bufferlist auth_bl_copy = authorizer_buf;
+  connection->lock.unlock();
+  ldout(cct,10) << __func__ << " authorizor_protocol "
+		<< connect_msg.authorizer_protocol
+		<< " len " << auth_bl_copy.length()
+		<< dendl;
+  bool authorizer_valid;
+  // a challenge slot is only offered to peers that advertise CEPHX_V2
+  bool need_challenge = HAVE_FEATURE(connect_msg.features, CEPHX_V2);
+  // whether a challenge already existed before this verification attempt
+  bool had_challenge = (bool)authorizer_challenge;
+  if (!messenger->ms_deliver_verify_authorizer(
+          connection, connection->peer_type, connect_msg.authorizer_protocol,
+          auth_bl_copy, authorizer_reply, authorizer_valid, session_key,
+	  nullptr /* connection_secret */,
+          need_challenge ? &authorizer_challenge : nullptr) ||
+      !authorizer_valid) {
+    connection->lock.lock();
+    if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+      ldout(cct, 1) << __func__
+		    << " state changed while accept, it must be mark_down"
+		    << dendl;
+      ceph_assert(state == CLOSED);
+      return _fault();
+    }
+
+    // A challenge that was created by this very attempt means the peer has
+    // not had a chance to answer it yet: send it instead of rejecting.
+    if (need_challenge && !had_challenge && authorizer_challenge) {
+      ldout(cct, 10) << __func__ << ": challenging authorizer" << dendl;
+      ceph_assert(authorizer_reply.length());
+      return send_connect_message_reply(CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER,
+                                        reply, authorizer_reply);
+    } else {
+      ldout(cct, 0) << __func__ << ": got bad authorizer, auth_reply_len="
+                    << authorizer_reply.length() << dendl;
+      session_security.reset();
+      return send_connect_message_reply(CEPH_MSGR_TAG_BADAUTHORIZER, reply,
+                                        authorizer_reply);
+    }
+  }
+
+  // We've verified the authorizer for this AsyncConnection, so set up the
+  // session security structure.  PLR
+  // (session_security itself is not assigned in this function; open() does
+  // that from session_key.)
+  ldout(cct, 10) << __func__ << " accept setting up session_security." << dendl;
+
+  // existing?
+  // (looked up while connection->lock is still released)
+  AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  // finding ourselves is the same as finding nothing
+  if (existing == connection) {
+    existing = nullptr;
+  }
+  // an existing connection that is not speaking protocol v1 cannot be
+  // replaced in place: mark it down and treat this as a fresh connection
+  if (existing && existing->protocol->proto_type != 1) {
+    ldout(cct,1) << __func__ << " existing " << existing << " proto "
+		 << existing->protocol.get() << " version is "
+		 << existing->protocol->proto_type << ", marking down" << dendl;
+    existing->mark_down();
+    existing = nullptr;
+  }
+
+  if (existing) {
+    // It is not possible for the existing connection to acquire this
+    // connection's lock, so holding both locks here is safe.
+    existing->lock.lock();  // skip lockdep check (we are locking a second
+                            // AsyncConnection here)
+
+    // From here on existing->lock is held: every return path below either
+    // unlocks it explicitly or hands `existing` to replace(), which unlocks it.
+
+    ldout(cct,10) << __func__ << " existing=" << existing << " exproto="
+		  << existing->protocol.get() << dendl;
+    ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get());
+    ceph_assert(exproto);
+    ceph_assert(exproto->proto_type == 1);
+
+    // existing connection is already dead: behave as if there was none
+    if (exproto->state == CLOSED) {
+      ldout(cct, 1) << __func__ << " existing " << existing
+		    << " already closed." << dendl;
+      existing->lock.unlock();
+      existing = nullptr;
+
+      return open(reply, authorizer_reply);
+    }
+
+    // another accept is already in the middle of replacing `existing`;
+    // tell the peer to retry with the global_seq we know about
+    if (exproto->replacing) {
+      ldout(cct, 1) << __func__
+                    << " existing racing replace happened while replacing."
+                    << " existing_state="
+                    << connection->get_state_name(existing->state) << dendl;
+      reply.global_seq = exproto->peer_global_seq;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply,
+                                        authorizer_reply);
+    }
+
+    // the peer's global_seq is older than the one recorded on `existing`
+    if (connect_msg.global_seq < exproto->peer_global_seq) {
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq "
+                     << exproto->peer_global_seq << " > "
+                     << connect_msg.global_seq << ", RETRY_GLOBAL" << dendl;
+      reply.global_seq = exproto->peer_global_seq;  // so we can send it below..
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply,
+                                        authorizer_reply);
+    } else {
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq "
+                     << exproto->peer_global_seq
+                     << " <= " << connect_msg.global_seq << ", looks ok"
+                     << dendl;
+    }
+
+    // lossy channels keep no session state worth preserving: reset and replace
+    if (existing->policy.lossy) {
+      ldout(cct, 0)
+          << __func__
+          << " accept replacing existing (lossy) channel (new one lossy="
+          << connection->policy.lossy << ")" << dendl;
+      exproto->session_reset();
+      return replace(existing, reply, authorizer_reply);
+    }
+
+    ldout(cct, 1) << __func__ << " accept connect_seq "
+                  << connect_msg.connect_seq
+                  << " vs existing csq=" << exproto->connect_seq
+                  << " existing_state="
+                  << connection->get_state_name(existing->state) << dendl;
+
+    // peer restarted its session (cseq 0) while we still hold an older one
+    if (connect_msg.connect_seq == 0 && exproto->connect_seq > 0) {
+      ldout(cct, 0)
+          << __func__
+          << " accept peer reset, then tried to connect to us, replacing"
+          << dendl;
+      // this is a hard reset from peer
+      is_reset_from_peer = true;
+      if (connection->policy.resetcheck) {
+        exproto->session_reset();  // this resets out_queue, msg_ and
+                                   // connect_seq #'s
+      }
+      return replace(existing, reply, authorizer_reply);
+    }
+
+    if (connect_msg.connect_seq < exproto->connect_seq) {
+      // old attempt, or we sent READY but they didn't get it.
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".cseq "
+                     << exproto->connect_seq << " > " << connect_msg.connect_seq
+                     << ", RETRY_SESSION" << dendl;
+      reply.connect_seq = exproto->connect_seq + 1;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply,
+                                        authorizer_reply);
+    }
+
+    if (connect_msg.connect_seq == exproto->connect_seq) {
+      // if the existing connection successfully opened, and/or
+      // subsequently went to standby, then the peer should bump
+      // their connect_seq and retry: this is not a connection race
+      // we need to resolve here.
+      if (exproto->state == OPENED || exproto->state == STANDBY) {
+        ldout(cct, 10) << __func__ << " accept connection race, existing "
+                       << existing << ".cseq " << exproto->connect_seq
+                       << " == " << connect_msg.connect_seq
+                       << ", OPEN|STANDBY, RETRY_SESSION " << dendl;
+        // if both connect_seqs are zero, don't get stuck in a deadlock; it
+        // is ok to replace
+        if (connection->policy.resetcheck && exproto->connect_seq == 0) {
+          return replace(existing, reply, authorizer_reply);
+        }
+
+        reply.connect_seq = exproto->connect_seq + 1;
+        existing->lock.unlock();
+        return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply,
+                                          authorizer_reply);
+      }
+
+      // connection race?
+      // Tie-break on legacy address: the incoming attempt wins when the
+      // peer's address is lower than ours, or when we are the server side.
+      if (connection->peer_addrs->legacy_addr() < messenger->get_myaddr_legacy() ||
+          existing->policy.server) {
+        // incoming wins
+        ldout(cct, 10) << __func__ << " accept connection race, existing "
+                       << existing << ".cseq " << exproto->connect_seq
+                       << " == " << connect_msg.connect_seq
+                       << ", or we are server, replacing my attempt" << dendl;
+        return replace(existing, reply, authorizer_reply);
+      } else {
+        // our existing outgoing wins
+        ldout(messenger->cct, 10)
+            << __func__ << " accept connection race, existing " << existing
+            << ".cseq " << exproto->connect_seq
+            << " == " << connect_msg.connect_seq << ", sending WAIT" << dendl;
+        ceph_assert(connection->peer_addrs->legacy_addr() >
+                    messenger->get_myaddr_legacy());
+        existing->lock.unlock();
+	// make sure we follow through with opening the existing
+	// connection (if it isn't yet open) since we know the peer
+	// has something to send to us.
+	existing->send_keepalive();
+        return send_connect_message_reply(CEPH_MSGR_TAG_WAIT, reply,
+                                          authorizer_reply);
+      }
+    }
+
+    // all cseq <= existing cases returned above, so the peer is ahead of us
+    ceph_assert(connect_msg.connect_seq > exproto->connect_seq);
+    ceph_assert(connect_msg.global_seq >= exproto->peer_global_seq);
+    if (connection->policy.resetcheck &&  // RESETSESSION only used by servers;
+                                          // peers do not reset each other
+        exproto->connect_seq == 0) {
+      ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq "
+                    << connect_msg.connect_seq << ", " << existing
+                    << ".cseq = " << exproto->connect_seq
+                    << "), sending RESETSESSION " << dendl;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply,
+                                        authorizer_reply);
+    }
+
+    // reconnect
+    ldout(cct, 10) << __func__ << " accept peer sent cseq "
+                   << connect_msg.connect_seq << " > " << exproto->connect_seq
+                   << dendl;
+    return replace(existing, reply, authorizer_reply);
+  }  // existing
+  else if (!replacing && connect_msg.connect_seq > 0) {
+    // we reset, and they are opening a new session
+    // (no existing connection, yet the peer believes a session exists)
+    ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq "
+                  << connect_msg.connect_seq << "), sending RESETSESSION"
+                  << dendl;
+    return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply,
+                                      authorizer_reply);
+  } else {
+    // new session
+    ldout(cct, 10) << __func__ << " accept new session" << dendl;
+    existing = nullptr;
+    return open(reply, authorizer_reply);
+  }
+}
+
+// Complete `reply` with the given tag, the feature set and the authorizer
+// length, then write it to the peer followed by the authorizer payload (if
+// any).  authorizer_reply is cleared once its contents have been copied.
+// Continues in handle_connect_message_reply_write().
+CtPtr ProtocolV1::send_connect_message_reply(char tag,
+                                             ceph_msg_connect_reply &reply,
+                                             bufferlist &authorizer_reply) {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // features = (what the peer offers AND what we support) OR what we require
+  const uint64_t peer_features = (uint64_t)connect_msg.features;
+  reply.tag = tag;
+  reply.features = (peer_features & connection->policy.features_supported) |
+                   connection->policy.features_required;
+  reply.authorizer_len = authorizer_reply.length();
+
+  ldout(cct, 10) << __func__ << " reply features 0x" << std::hex
+		 << reply.features << " = (policy sup 0x"
+		 << connection->policy.features_supported
+		 << " & connect 0x" << peer_features
+		 << ") | policy req 0x"
+		 << connection->policy.features_required
+		 << dendl;
+
+  // wire layout: fixed-size reply struct, then the optional authorizer blob
+  bufferlist out_bl;
+  out_bl.append(reinterpret_cast<char *>(&reply), sizeof(reply));
+  if (reply.authorizer_len) {
+    out_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
+    authorizer_reply.clear();
+  }
+
+  return WRITE(out_bl, handle_connect_message_reply_write);
+}
+
+// Write-completion continuation for send_connect_message_reply(): the reply
+// was not an accept, so go back to waiting for the peer's next connect
+// message; on write failure fault the connection.
+CtPtr ProtocolV1::handle_connect_message_reply_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool write_ok = !(r < 0);
+  if (write_ok) {
+    return CONTINUE(wait_connect_message);
+  }
+
+  ldout(cct, 1) << " write connect message reply failed" << dendl;
+  connection->inject_delay();
+  return _fault();
+}
+
+/**
+ * Resolve an accepted connection against an `existing` connection to the
+ * same peer.
+ *
+ * The caller must hold existing->lock; it is released on every path here.
+ *
+ *  - Lossy existing connection: stop its protocol, queue a reset for it and
+ *    accept this connection as the session via open().
+ *  - Otherwise: keep `existing` as the session object and move this
+ *    connection's socket into it.  This connection is stopped, `existing`
+ *    is marked as replacing, and the hand-off is finished asynchronously on
+ *    the event centers.  Once done, `existing` answers the peer with
+ *    RETRY_GLOBAL so that the handshake restarts on the transferred socket.
+ *
+ * @param existing          connection currently registered for the peer
+ * @param reply             partially filled connect reply (captured by copy
+ *                          for the asynchronous path)
+ * @param authorizer_reply  authorizer payload to send with the reply
+ * @return the result of open() on the lossy path; nullptr on the hand-off
+ *         path (this connection has no further continuation)
+ */
+CtPtr ProtocolV1::replace(AsyncConnectionRef existing,
+                          ceph_msg_connect_reply &reply,
+                          bufferlist &authorizer_reply) {
+  ldout(cct, 10) << __func__ << " accept replacing " << existing << dendl;
+
+  connection->inject_delay();
+  if (existing->policy.lossy) {
+    // disconnect from the Connection
+    ldout(cct, 1) << __func__ << " replacing on lossy channel, failing existing"
+                  << dendl;
+    existing->protocol->stop();
+    existing->dispatch_queue->queue_reset(existing.get());
+  } else {
+    ceph_assert(can_write == WriteStatus::NOWRITE);
+    existing->write_lock.lock();
+
+    // NOTE(review): unchecked cast; the caller in this file asserts that
+    // existing->protocol is a ProtocolV1 before calling -- confirm for any
+    // other callers.
+    ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get());
+
+    // reset the in_seq if this is a hard reset from peer,
+    // otherwise we respect our original connection's value
+    if (is_reset_from_peer) {
+      exproto->is_reset_from_peer = true;
+    }
+
+    // stop watching our socket; it is about to change owner
+    connection->center->delete_file_event(connection->cs.fd(),
+                                          EVENT_READABLE | EVENT_WRITABLE);
+
+    if (existing->delay_state) {
+      existing->delay_state->flush();
+      ceph_assert(!connection->delay_state);
+    }
+    exproto->reset_recv_state();
+
+    exproto->connect_msg.features = connect_msg.features;
+
+    // Take the socket (and remember our center/worker) before stop(), so
+    // that stopping this protocol does not shut down the socket that is
+    // being handed to `existing`.
+    auto temp_cs = std::move(connection->cs);
+    EventCenter *new_center = connection->center;
+    Worker *new_worker = connection->worker;
+    // avoid _stop shutdown replacing socket
+    // queue a reset on the new connection, which we're dumping for the old
+    stop();
+
+    connection->dispatch_queue->queue_reset(connection);
+    ldout(messenger->cct, 1)
+        << __func__ << " stop myself to swap existing" << dendl;
+    exproto->can_write = WriteStatus::REPLACING;
+    exproto->replacing = true;
+    exproto->write_in_progress = false;
+    existing->state_offset = 0;
+    // avoid previous thread modify event
+    exproto->state = NONE;
+    existing->state = AsyncConnection::STATE_NONE;
+    // Discard existing prefetch buffer in `recv_buf`
+    existing->recv_start = existing->recv_end = 0;
+    // there shouldn't exist any buffer
+    ceph_assert(connection->recv_start == connection->recv_end);
+
+    exproto->authorizer_challenge.reset();
+
+    // Step 1, submitted to existing's current center below: tear down the
+    // old socket and adopt the new one together with its worker/center.
+    // The moved socket is bound as the callable's argument.
+    auto deactivate_existing = std::bind(
+        [existing, new_worker, new_center, exproto, reply,
+         authorizer_reply](ConnectedSocket &cs) mutable {
+          // we need to delete time event in original thread
+          {
+            std::lock_guard<std::mutex> l(existing->lock);
+            existing->write_lock.lock();
+            exproto->requeue_sent();
+            existing->outgoing_bl.clear();
+            existing->open_write = false;
+            existing->write_lock.unlock();
+            if (exproto->state == NONE) {
+              // still mid-replace: swap in the new socket and move the
+              // connection over to the new worker and event center
+              existing->shutdown_socket();
+              existing->cs = std::move(cs);
+              existing->worker->references--;
+              new_worker->references++;
+              existing->logger = new_worker->get_perf_counter();
+              existing->worker = new_worker;
+              existing->center = new_center;
+              if (existing->delay_state)
+                existing->delay_state->set_center(new_center);
+            } else if (exproto->state == CLOSED) {
+              // existing was closed in the meantime: just close the new
+              // socket, on the center it was created on
+              auto back_to_close =
+                  std::bind([](ConnectedSocket &cs) mutable { cs.close(); },
+                            std::move(cs));
+              new_center->submit_to(new_center->get_id(),
+                                    std::move(back_to_close), true);
+              return;
+            } else {
+              ceph_abort();
+            }
+          }
+
+          // Before changing existing->center, it may already exists some
+          // events in existing->center's queue. Then if we mark down
+          // `existing`, it will execute in another thread and clean up
+          // connection. Previous event will result in segment fault
+          //
+          // Step 2, run on existing's new center: restart the timer,
+          // re-register for reads and ask the peer to retry.
+          auto transfer_existing = [existing, exproto, reply,
+                                    authorizer_reply]() mutable {
+            std::lock_guard<std::mutex> l(existing->lock);
+            if (exproto->state == CLOSED) return;
+            ceph_assert(exproto->state == NONE);
+
+            // we have called shutdown_socket above
+            ceph_assert(existing->last_tick_id == 0);
+            // restart timer since we are going to re-build connection
+            existing->last_connect_started = ceph::coarse_mono_clock::now();
+            existing->last_tick_id = existing->center->create_time_event(
+              existing->connect_timeout_us, existing->tick_handler);
+            existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED;
+            exproto->state = ACCEPTING;
+
+            existing->center->create_file_event(
+                existing->cs.fd(), EVENT_READABLE, existing->read_handler);
+            // the peer is told to retry with the global_seq recorded on
+            // existing
+            reply.global_seq = exproto->peer_global_seq;
+            exproto->run_continuation(exproto->send_connect_message_reply(
+                CEPH_MSGR_TAG_RETRY_GLOBAL, reply, authorizer_reply));
+          };
+          if (existing->center->in_thread())
+            transfer_existing();
+          else
+            existing->center->submit_to(existing->center->get_id(),
+                                        std::move(transfer_existing), true);
+        },
+        std::move(temp_cs));
+
+    existing->center->submit_to(existing->center->get_id(),
+                                std::move(deactivate_existing), true);
+    existing->write_lock.unlock();
+    existing->lock.unlock();
+    // nothing more to run on this (stopped) connection
+    return nullptr;
+  }
+  // lossy path: existing has been failed, release its lock and accept this
+  // connection as the session
+  existing->lock.unlock();
+
+  return open(reply, authorizer_reply);
+}
+
+/**
+ * Accept the peer's connect request as the session for this connection.
+ *
+ * Records connect_seq/peer_global_seq, picks the reply tag (SEQ when the
+ * peer supports RECONNECT_SEQ and this is not a hard reset from the peer,
+ * READY otherwise), fills in the reply, sets the negotiated features and
+ * session security, registers the connection with the messenger and finally
+ * writes the reply.
+ *
+ * connection->lock is released around messenger->accept_conn() and taken
+ * again afterwards; the state is re-validated once the lock is held again.
+ *
+ * @param reply             connect reply to complete and send
+ * @param authorizer_reply  authorizer payload appended after the reply
+ * @return the write continuation (handle_ready_connect_message_reply_write),
+ *         or the result of _fault() if registration failed or the state
+ *         changed while the lock was released
+ */
+CtPtr ProtocolV1::open(ceph_msg_connect_reply &reply,
+                       bufferlist &authorizer_reply) {
+  ldout(cct, 20) << __func__ << dendl;
+
+  connect_seq = connect_msg.connect_seq + 1;
+  peer_global_seq = connect_msg.global_seq;
+  ldout(cct, 10) << __func__ << " accept success, connect_seq = " << connect_seq
+                 << " in_seq=" << in_seq << ", sending READY" << dendl;
+
+  // if it is a hard reset from peer, we don't need a round-trip to negotiate
+  // in/out sequence
+  if ((connect_msg.features & CEPH_FEATURE_RECONNECT_SEQ) &&
+      !is_reset_from_peer) {
+    reply.tag = CEPH_MSGR_TAG_SEQ;
+    wait_for_seq = true;
+  } else {
+    // no seq negotiation: start the session with in_seq reset to 0
+    reply.tag = CEPH_MSGR_TAG_READY;
+    wait_for_seq = false;
+    out_seq = discard_requeued_up_to(out_seq, 0);
+    is_reset_from_peer = false;
+    in_seq = 0;
+  }
+
+  // send READY reply
+  reply.features = connection->policy.features_supported;
+  reply.global_seq = messenger->get_global_seq();
+  reply.connect_seq = connect_seq;
+  reply.flags = 0;
+  reply.authorizer_len = authorizer_reply.length();
+  if (connection->policy.lossy) {
+    reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  // the connection's feature set is what both sides support
+  connection->set_features((uint64_t)reply.features &
+                           (uint64_t)connect_msg.features);
+  ldout(cct, 10) << __func__ << " accept features "
+                 << connection->get_features()
+		 << " authorizer_protocol "
+		 << connect_msg.authorizer_protocol << dendl;
+
+  session_security.reset(
+      get_auth_session_handler(cct, connect_msg.authorizer_protocol,
+                               session_key,
+			       connection->get_features()));
+
+  // wire layout: reply struct, optional authorizer, then (for TAG_SEQ) our
+  // in_seq as a raw 64-bit value
+  bufferlist reply_bl;
+  reply_bl.append((char *)&reply, sizeof(reply));
+
+  if (reply.authorizer_len) {
+    reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
+  }
+
+  if (reply.tag == CEPH_MSGR_TAG_SEQ) {
+    uint64_t s = in_seq;
+    reply_bl.append((char *)&s, sizeof(s));
+  }
+
+  connection->lock.unlock();
+  // Because "replacing" prevents other connections from preempting this
+  // addr, it is safe not to hold the Connection's lock here
+  ssize_t r = messenger->accept_conn(connection);
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  replacing = false;
+  // registration failed: fault this (later) connection
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+                  << connection->peer_addrs->legacy_addr()
+                  << " just fail later one(this)" << dendl;
+    ldout(cct, 10) << "accept fault after register" << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+  // the state may have changed while the lock was released; if so undo the
+  // registration before faulting
+  if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept_conn, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED || state == NONE);
+    ldout(cct, 10) << "accept fault after register" << dendl;
+    messenger->unregister_conn(connection);
+    connection->inject_delay();
+    return _fault();
+  }
+
+  return WRITE(reply_bl, handle_ready_connect_message_reply_write);
+}
+
+// Write-completion continuation for open(): announce the accepted connection
+// to the dispatch queue and the fast dispatchers, move to
+// ACCEPTING_HANDLED_CONNECT_MSG, then either wait for the peer's acked seq or
+// finish the accept.
+CtPtr ProtocolV1::handle_ready_connect_message_reply_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool write_failed = (r < 0);
+  if (write_failed) {
+    ldout(cct, 1) << __func__ << " write ready connect message reply failed"
+                  << dendl;
+    return _fault();
+  }
+
+  // notify
+  connection->dispatch_queue->queue_accept(connection);
+  messenger->ms_deliver_handle_fast_accept(connection);
+  once_ready = true;
+
+  state = ACCEPTING_HANDLED_CONNECT_MSG;
+
+  // wait_for_seq was decided in open() together with the reply tag
+  return wait_for_seq ? wait_seq() : server_ready();
+}
+
+// Request a read of the 64-bit sequence number the peer sends after a
+// TAG_SEQ reply; the bytes are delivered to handle_seq().
+CtPtr ProtocolV1::wait_seq() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t seq_len = sizeof(uint64_t);
+  return READ(seq_len, handle_seq);
+}
+
+/**
+ * Read continuation for wait_seq(): take the sequence number the peer has
+ * acknowledged, drop requeued messages up to it and finish the accept.
+ *
+ * @param buffer  sizeof(uint64_t) raw bytes holding the peer's acked seq
+ * @param r       read result; negative on failure
+ * @return        the result of server_ready(), or of _fault() on read error
+ */
+CtPtr ProtocolV1::handle_seq(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read ack seq failed" << dendl;
+    return _fault();
+  }
+
+  // Copy the bytes out instead of dereferencing buffer as a uint64_t*: the
+  // read buffer has no alignment guarantee, and the cast also breaks the
+  // strict-aliasing rules.  The value is used as-is, with no byte swapping,
+  // exactly as before.
+  uint64_t newly_acked_seq = 0;
+  memcpy(&newly_acked_seq, buffer, sizeof(newly_acked_seq));
+  ldout(cct, 2) << __func__ << " accept get newly_acked_seq " << newly_acked_seq
+                << dendl;
+  out_seq = discard_requeued_up_to(out_seq, newly_acked_seq);
+
+  return server_ready();
+}
+
+// Last step of the server-side accept: log the session security handler,
+// wipe the stored connect message, check that any delay queue is ready and
+// hand over to ready().
+CtPtr ProtocolV1::server_ready() {
+  ldout(cct, 20) << __func__ << " session_security is "
+		 << session_security
+		 << dendl;
+  ldout(cct, 20) << __func__ << " accept done" << dendl;
+
+  // the handshake is over, the connect message is no longer needed
+  const size_t connect_msg_len = sizeof(connect_msg);
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&connect_msg, 0, connect_msg_len);
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+
+  return ready();
+}
diff --git a/src/msg/async/ProtocolV1.h b/src/msg/async/ProtocolV1.h
new file mode 100644
index 00000000..070ce73f
--- /dev/null
+++ b/src/msg/async/ProtocolV1.h
@@ -0,0 +1,305 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_V1_
+#define _MSG_ASYNC_PROTOCOL_V1_
+
+#include "Protocol.h"
+
+class ProtocolV1;
+using CtPtr = Ct<ProtocolV1>*;
+
+class ProtocolV1 : public Protocol {
+/*
+ *  ProtocolV1 State Machine
+ *
+
+    send_server_banner                             send_client_banner
+            |                                              |
+            v                                              v
+    wait_client_banner                              wait_server_banner
+            |                                              |
+            |                                              v
+            v                                 handle_server_banner_and_identify
+    wait_connect_message <---------\                       |
+      |     |                      |                       v
+      |  wait_connect_message_auth |           send_connect_message <----------\
+      |     |                      |                       |                   |
+      v     v                      |                       |                   |
+handle_connect_message_2           |                       v                   |
+        |           |              |            wait_connect_reply             |
+        v           v              |              |        |                   |
+     replace -> send_connect_message_reply        |        V                   |
+        |                                         |   wait_connect_reply_auth  |
+        |                                         |        |                   |
+        v                                         v        v                   |
+      open ---\                                 handle_connect_reply_2 --------/
+        |     |                                            |
+        |     v                                            v
+        |   wait_seq                                  wait_ack_seq
+        |     |                                            |
+        v     v                                            v
+    server_ready                                      client_ready
+            |                                              |
+            \------------------> wait_message <------------/
+                                 |  ^   |  ^
+        /------------------------/  |   |  |
+        |                           |   |  \----------------- ------------\
+        v                /----------/   v                                 |
+handle_keepalive2        |        handle_message_header      read_message_footer
+handle_keepalive2_ack    |              |                                 ^
+handle_tag_ack           |              v                                 |
+        |                |        throttle_message             read_message_data
+        \----------------/              |                                 ^
+                                        v                                 |
+                             read_message_front --> read_message_middle --/
+*/
+
+protected:
+
+  // Protocol states. get_state_name() indexes its name table with these
+  // values, so the enumerators and that table must stay in the same order.
+  enum State {
+    NONE = 0,
+    START_CONNECT,
+    CONNECTING,
+    CONNECTING_WAIT_BANNER_AND_IDENTIFY,
+    CONNECTING_SEND_CONNECT_MSG,
+    START_ACCEPT,
+    ACCEPTING,
+    ACCEPTING_WAIT_CONNECT_MSG_AUTH,
+    ACCEPTING_HANDLED_CONNECT_MSG,
+    OPENED,
+    THROTTLE_MESSAGE,
+    THROTTLE_BYTES,
+    THROTTLE_DISPATCH_QUEUE,
+    READ_MESSAGE_FRONT,
+    READ_FOOTER_AND_DISPATCH,
+    CLOSED,
+    WAIT,
+    STANDBY
+  };
+
+  /**
+   * Map a State value to its printable name.
+   *
+   * @param state numeric State value; the table below is ordered like
+   *              enum State
+   * @return the matching name, or "UNKNOWN" if state is outside the table
+   */
+  static const char *get_state_name(int state) {
+    // static: the table is constant, so build it once instead of per call.
+    static const char *const statenames[] = {"NONE",
+                                             "START_CONNECT",
+                                             "CONNECTING",
+                                             "CONNECTING_WAIT_BANNER_AND_IDENTIFY",
+                                             "CONNECTING_SEND_CONNECT_MSG",
+                                             "START_ACCEPT",
+                                             "ACCEPTING",
+                                             "ACCEPTING_WAIT_CONNECT_MSG_AUTH",
+                                             "ACCEPTING_HANDLED_CONNECT_MSG",
+                                             "OPENED",
+                                             "THROTTLE_MESSAGE",
+                                             "THROTTLE_BYTES",
+                                             "THROTTLE_DISPATCH_QUEUE",
+                                             "READ_MESSAGE_FRONT",
+                                             "READ_FOOTER_AND_DISPATCH",
+                                             "CLOSED",
+                                             "WAIT",
+                                             "STANDBY"};
+    constexpr int num_states = sizeof(statenames) / sizeof(statenames[0]);
+    // Never index outside the table, whatever value the caller passes in.
+    if (state < 0 || state >= num_states) {
+      return "UNKNOWN";
+    }
+    return statenames[state];
+  }
+
+  char *temp_buffer;
+
+  enum class WriteStatus { NOWRITE, REPLACING, CANWRITE, CLOSED };
+  std::atomic<WriteStatus> can_write;
+  std::list<Message *> sent;  // the first bufferlist needs to inject seq
+  // priority queue for outbound msgs, keyed by an int
+  // (presumably the message priority -- confirm against the .cc file)
+  std::map<int, std::list<std::pair<bufferlist, Message *>>> out_q;
+  bool keepalive;
+  bool write_in_progress = false;
+
+  __u32 connect_seq, peer_global_seq;
+  std::atomic<uint64_t> in_seq{0};
+  std::atomic<uint64_t> out_seq{0};
+  std::atomic<uint64_t> ack_left{0};
+
+  CryptoKey session_key;
+  std::shared_ptr<AuthSessionHandler> session_security;
+  std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge;  // accept side
+
+  // Open state
+  ceph_msg_connect connect_msg;
+  ceph_msg_connect_reply connect_reply;
+  bufferlist authorizer_buf;
+
+  utime_t backoff;  // backoff time
+  utime_t recv_stamp;
+  utime_t throttle_stamp;
+  unsigned msg_left;
+  uint64_t cur_msg_size;
+  ceph_msg_header current_header;
+  bufferlist data_buf;
+  bufferlist::iterator data_blp;
+  bufferlist front, middle, data;
+
+  bool replacing;  // While a replace is in progress we reply to the connect
+                   // side with a RETRY tag and the accept side clears the
+                   // replaced connection. When the connect side then reissues
+                   // connect_msg there is no conflicting connection any more,
+                   // so "replacing" is used to skip RESETSESSION and avoid
+                   // detecting a wrong presentation.
+  bool is_reset_from_peer;
+  bool once_ready;
+
+  State state;
+
+  void run_continuation(CtPtr pcontinuation);
+  CtPtr read(CONTINUATION_RX_TYPE<ProtocolV1> &next, int len,
+             char *buffer = nullptr);
+  CtPtr write(CONTINUATION_TX_TYPE<ProtocolV1> &next,bufferlist &bl);
+  // Calls fault() and returns nullptr, so that a handler returning CtPtr can
+  // simply write `return _fault();`.
+  inline CtPtr _fault() {  // helper fault method that stops continuation
+    fault();
+    return nullptr;
+  }
+
+  CONTINUATION_DECL(ProtocolV1, wait_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2_ack);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_tag_ack);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_header);
+  CONTINUATION_DECL(ProtocolV1, throttle_message);
+  CONTINUATION_DECL(ProtocolV1, throttle_bytes);
+  CONTINUATION_DECL(ProtocolV1, throttle_dispatch_queue);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_front);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_middle);
+  CONTINUATION_DECL(ProtocolV1, read_message_data);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_data);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_footer);
+
+  CtPtr ready();
+  CtPtr wait_message();
+  CtPtr handle_message(char *buffer, int r);
+
+  CtPtr handle_keepalive2(char *buffer, int r);
+  void append_keepalive_or_ack(bool ack = false, utime_t *t = nullptr);
+  CtPtr handle_keepalive2_ack(char *buffer, int r);
+  CtPtr handle_tag_ack(char *buffer, int r);
+
+  CtPtr handle_message_header(char *buffer, int r);
+  CtPtr throttle_message();
+  CtPtr throttle_bytes();
+  CtPtr throttle_dispatch_queue();
+  CtPtr read_message_front();
+  CtPtr handle_message_front(char *buffer, int r);
+  CtPtr read_message_middle();
+  CtPtr handle_message_middle(char *buffer, int r);
+  CtPtr read_message_data_prepare();
+  CtPtr read_message_data();
+  CtPtr handle_message_data(char *buffer, int r);
+  CtPtr read_message_footer();
+  CtPtr handle_message_footer(char *buffer, int r);
+
+  void session_reset();
+  void randomize_out_seq();
+
+  Message *_get_next_outgoing(bufferlist *bl);
+
+  void prepare_send_message(uint64_t features, Message *m, bufferlist &bl);
+  ssize_t write_message(Message *m, bufferlist &bl, bool more);
+
+  void requeue_sent();
+  uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq);
+  void discard_out_queue();
+
+  void reset_recv_state();
+  void reset_security();
+
+  ostream &_conn_prefix(std::ostream *_dout);
+
+public:
+  ProtocolV1(AsyncConnection *connection);
+  virtual ~ProtocolV1();
+
+  virtual void connect() override;
+  virtual void accept() override;
+  virtual bool is_connected() override;
+  virtual void stop() override;
+  virtual void fault() override;
+  virtual void send_message(Message *m) override;
+  virtual void send_keepalive() override;
+
+  virtual void read_event() override;
+  virtual void write_event() override;
+  virtual bool is_queued() override;
+
+  // Client Protocol
+private:
+  int global_seq;
+  AuthAuthorizer *authorizer;
+
+  CONTINUATION_DECL(ProtocolV1, send_client_banner);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_and_identify);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_my_addr_write);
+  CONTINUATION_DECL(ProtocolV1, send_connect_message);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_1);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_auth);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_ack_seq);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_in_seq_write);
+
+  CtPtr send_client_banner();
+  CtPtr handle_client_banner_write(int r);
+  CtPtr wait_server_banner();
+  CtPtr handle_server_banner_and_identify(char *buffer, int r);
+  CtPtr handle_my_addr_write(int r);
+  CtPtr send_connect_message();
+  CtPtr handle_connect_message_write(int r);
+  CtPtr wait_connect_reply();
+  CtPtr handle_connect_reply_1(char *buffer, int r);
+  CtPtr wait_connect_reply_auth();
+  CtPtr handle_connect_reply_auth(char *buffer, int r);
+  CtPtr handle_connect_reply_2();
+  CtPtr wait_ack_seq();
+  CtPtr handle_ack_seq(char *buffer, int r);
+  CtPtr handle_in_seq_write(int r);
+  CtPtr client_ready();
+
+  // Server Protocol
+protected:
+  bool wait_for_seq;
+
+  CONTINUATION_DECL(ProtocolV1, send_server_banner);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner);
+  CONTINUATION_DECL(ProtocolV1, wait_connect_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_1);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_auth);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1,
+                                  handle_connect_message_reply_write);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1,
+                                  handle_ready_connect_message_reply_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_seq);
+
+  CtPtr send_server_banner();
+  CtPtr handle_server_banner_write(int r);
+  CtPtr wait_client_banner();
+  CtPtr handle_client_banner(char *buffer, int r);
+  CtPtr wait_connect_message();
+  CtPtr handle_connect_message_1(char *buffer, int r);
+  CtPtr wait_connect_message_auth();
+  CtPtr handle_connect_message_auth(char *buffer, int r);
+  CtPtr handle_connect_message_2();
+  CtPtr send_connect_message_reply(char tag, ceph_msg_connect_reply &reply,
+                                   bufferlist &authorizer_reply);
+  CtPtr handle_connect_message_reply_write(int r);
+  CtPtr replace(AsyncConnectionRef existing, ceph_msg_connect_reply &reply,
+                bufferlist &authorizer_reply);
+  CtPtr open(ceph_msg_connect_reply &reply, bufferlist &authorizer_reply);
+  CtPtr handle_ready_connect_message_reply_write(int r);
+  CtPtr wait_seq();
+  CtPtr handle_seq(char *buffer, int r);
+  CtPtr server_ready();
+};
+
+// ProtocolV1 variant whose write status is set to CANWRITE as soon as it is
+// constructed.
+class LoopbackProtocolV1 : public ProtocolV1 {
+public:
+  LoopbackProtocolV1(AsyncConnection *connection) : ProtocolV1(connection) {
+    can_write.store(WriteStatus::CANWRITE);
+  }
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_V1_ */
diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc
new file mode 100644
index 00000000..381d42c3
--- /dev/null
+++ b/src/msg/async/ProtocolV2.cc
@@ -0,0 +1,2870 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <type_traits>
+
+#include "ProtocolV2.h"
+#include "AsyncMessenger.h"
+
+#include "common/EventTrace.h"
+#include "common/ceph_crypto.h"
+#include "common/errno.h"
+#include "include/random.h"
+#include "auth/AuthClient.h"
+#include "auth/AuthServer.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+// Writes the per-connection log prefix (addresses, connection and protocol
+// pointers, connection mode, port, state, sequence numbers, lossy flag,
+// REVISION_1 support and the rx/tx stream handlers) to *_dout and returns
+// that stream.
+ostream &ProtocolV2::_conn_prefix(std::ostream *_dout) {
+  std::ostream &out = *_dout;
+
+  out << "--2- " << messenger->get_myaddrs() << " >> "
+      << *connection->peer_addrs;
+  out << " conn(" << connection << " " << this;
+  out << " " << ceph_con_mode_name(auth_meta->con_mode);
+  out << " :" << connection->port;
+  out << " s=" << get_state_name(state) << " pgs=" << peer_global_seq;
+  out << " cs=" << connect_seq << " l=" << connection->policy.lossy;
+  out << " rev1=" << HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+  out << " rx=" << session_stream_handlers.rx.get();
+  out << " tx=" << session_stream_handlers.tx.get();
+  out << ").";
+
+  return out;
+}
+
+using namespace ceph::msgr::v2;
+
+using CtPtr = Ct<ProtocolV2> *;
+using CtRef = Ct<ProtocolV2> &;
+
+// Pointer overload: a null continuation means there is nothing left to run.
+void ProtocolV2::run_continuation(CtPtr pcontinuation) {
+  if (pcontinuation == nullptr) {
+    return;
+  }
+  run_continuation(*pcontinuation);
+}
+
+// Runs the given continuation through CONTINUATION_RUN and handles the
+// exceptions that may escape from it. Buffer decoding errors and message
+// authentication errors are logged and followed by _fault().
+void ProtocolV2::run_continuation(CtRef continuation) {
+  try {
+    CONTINUATION_RUN(continuation)
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << " failed decoding of frame header: " << e
+               << dendl;
+    _fault();
+  } catch (const ceph::crypto::onwire::MsgAuthError &e) {
+    lderr(cct) << __func__ << " " << e.what() << dendl;
+    _fault();
+  } catch (const DecryptionError &) {
+    // NOTE(review): unlike the two handlers above, this one only logs and
+    // does not call _fault() -- confirm whether that is intentional.
+    lderr(cct) << __func__ << " failed to decrypt frame payload" << dendl;
+  }
+}
+
+// Shorthand for write(): B is the buffer argument, D is forwarded as the
+// first argument and C names the continuation to run afterwards.
+#define WRITE(B, D, C) write(D, CONTINUATION(C), B)
+
+// Shorthand for read() into a freshly created buffer of length L; C names the
+// continuation to run afterwards.
+#define READ(L, C) read(CONTINUATION(C), buffer::ptr_node::create(buffer::create(L)))
+
+// Shorthand for read() into the caller-supplied buffer B; C names the
+// continuation to run afterwards.
+#define READ_RXBUF(B, C) read(CONTINUATION(C), B)
+
+#ifdef UNIT_TESTS_BUILT
+
+// Test hook: if the connection has an interceptor, ask it what to do at step
+// S. ACTION::FAIL makes the enclosing function return _fault(); ACTION::STOP
+// stops the protocol, queues a reset and returns nullptr. Any other action
+// falls through. Must be used inside a function returning CtPtr.
+#define INTERCEPT(S) { \
+if(connection->interceptor) { \
+  auto a = connection->interceptor->intercept(connection, (S)); \
+  if (a == Interceptor::ACTION::FAIL) { \
+    return _fault(); \
+  } else if (a == Interceptor::ACTION::STOP) { \
+    stop(); \
+    connection->dispatch_queue->queue_reset(connection); \
+    return nullptr; \
+  }}}
+  
+#else
+// Without UNIT_TESTS_BUILT the hook expands to nothing.
+#define INTERCEPT(S)
+#endif
+
+// Constructs the protocol object for `connection`, passing 2 to the Protocol
+// base. It starts in state NONE with cookies, sequence numbers and peer
+// features zeroed, the reconnecting/replacing/can_write/keepalive flags
+// cleared, and both frame assemblers bound to session_stream_handlers.
+ProtocolV2::ProtocolV2(AsyncConnection *connection)
+    : Protocol(2, connection),
+      state(NONE),
+      peer_supported_features(0),
+      client_cookie(0),
+      server_cookie(0),
+      global_seq(0),
+      connect_seq(0),
+      peer_global_seq(0),
+      message_seq(0),
+      reconnecting(false),
+      replacing(false),
+      can_write(false),
+      bannerExchangeCallback(nullptr),
+      tx_frame_asm(&session_stream_handlers, false),
+      rx_frame_asm(&session_stream_handlers, false),
+      next_tag(static_cast<Tag>(0)),
+      keepalive(false) {
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+ProtocolV2::~ProtocolV2() = default;
+
+// Prepares the protocol for an outgoing connection: moves to START_CONNECT
+// (from which read_event() starts the client banner exchange) and enables
+// pre_auth.
+void ProtocolV2::connect() {
+  ldout(cct, 1) << __func__ << dendl;
+  state = START_CONNECT;
+  pre_auth.enabled = true;
+}
+
+// Prepares the protocol for an incoming connection: moves to START_ACCEPT,
+// from which read_event() starts the server banner exchange.
+void ProtocolV2::accept() {
+  ldout(cct, 1) << __func__ << dendl;
+  state = START_ACCEPT;
+}
+
+// Reports the connection as connected exactly when writing is enabled.
+bool ProtocolV2::is_connected() { return can_write; }
+
+/*
+ * Drops every message that is still held for sending: the reference on each
+ * entry of `sent` and of `out_queue` is released, both containers are
+ * emptied and write_in_progress is cleared.
+ * Must hold write_lock prior to calling.
+ */
+void ProtocolV2::discard_out_queue() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  // messages already written but not yet acked
+  for (Message *msg : sent) {
+    ldout(cct, 20) << __func__ << " discard " << msg << dendl;
+    msg->put();
+  }
+  sent.clear();
+
+  // messages still waiting in the per-priority queues
+  for (auto &prio_and_entries : out_queue) {
+    for (auto &queued : prio_and_entries.second) {
+      ldout(cct, 20) << __func__ << " discard " << *queued.m << dendl;
+      queued.m->put();
+    }
+  }
+  out_queue.clear();
+
+  write_in_progress = false;
+}
+
+// Resets the session state while holding write_lock: discards delayed and
+// queued incoming messages, drops all outgoing messages and buffered output,
+// queues a remote-reset notification, then zeroes the sequence numbers and
+// cookies and disables writing.
+void ProtocolV2::reset_session() {
+  ldout(cct, 1) << __func__ << dendl;
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (connection->delay_state) {
+    connection->delay_state->discard();
+  }
+
+  connection->dispatch_queue->discard_queue(connection->conn_id);
+  discard_out_queue();
+  connection->outgoing_bl.clear();
+
+  connection->dispatch_queue->queue_remote_reset(connection);
+
+  out_seq = 0;
+  in_seq = 0;
+  client_cookie = 0;
+  server_cookie = 0;
+  connect_seq = 0;
+  peer_global_seq = 0;
+  message_seq = 0;
+  ack_left = 0;
+  can_write = false;
+}
+
+// Closes the protocol. Does nothing if the state is already CLOSED.
+// Otherwise flushes the delay state (before taking write_lock), then, under
+// write_lock, resets the receive state, drops outgoing messages, stops the
+// connection and moves to CLOSED with writing disabled.
+void ProtocolV2::stop() {
+  ldout(cct, 1) << __func__ << dendl;
+  if (state == CLOSED) {
+    return;
+  }
+
+  if (connection->delay_state) connection->delay_state->flush();
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  reset_recv_state();
+  discard_out_queue();
+
+  connection->_stop();
+
+  can_write = false;
+  state = CLOSED;
+}
+
+// Public entry point for fault handling; forwards to _fault() and ignores its
+// return value.
+void ProtocolV2::fault() { _fault(); }
+
+// Moves every message from `sent` back to the front of the highest-priority
+// out queue, keeping the original order, and rewinds out_seq by the number
+// of messages moved. Always clears write_in_progress.
+void ProtocolV2::requeue_sent() {
+  write_in_progress = false;
+  if (sent.empty()) {
+    return;
+  }
+
+  auto &requeue = out_queue[CEPH_MSG_PRIO_HIGHEST];
+  out_seq -= sent.size();
+
+  // Walk from newest to oldest and push to the front, so the oldest message
+  // ends up first in the queue.
+  for (auto rit = sent.rbegin(); rit != sent.rend(); ++rit) {
+    Message *msg = *rit;
+    ldout(cct, 5) << __func__ << " requeueing message m=" << msg
+                  << " seq=" << msg->get_seq() << " type=" << msg->get_type()
+                  << " " << *msg << dendl;
+    requeue.emplace_front(out_queue_entry_t{false, msg});
+  }
+  sent.clear();
+}
+
+// Drops requeued messages (highest-priority queue) whose sequence number is
+// non-zero and not greater than `seq`.
+//
+// Returns `seq` if there is no highest-priority queue at all; otherwise
+// returns `out_seq` plus the number of messages that were dropped.
+uint64_t ProtocolV2::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
+  ldout(cct, 10) << __func__ << " " << seq << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  const auto highest = out_queue.find(CEPH_MSG_PRIO_HIGHEST);
+  if (highest == out_queue.end()) {
+    return seq;
+  }
+
+  auto &requeued = highest->second;
+  uint64_t next_out_seq = out_seq;
+  while (!requeued.empty()) {
+    Message *const m = requeued.front().m;
+    const bool never_sent = (m->get_seq() == 0);
+    if (never_sent || m->get_seq() > seq) {
+      break;
+    }
+    ldout(cct, 5) << __func__ << " discarding message m=" << m
+                  << " seq=" << m->get_seq() << " ack_seq=" << seq << " "
+                  << *m << dendl;
+    m->put();
+    requeued.pop_front();
+    ++next_out_seq;
+  }
+
+  if (requeued.empty()) {
+    out_queue.erase(highest);
+  }
+  return next_out_seq;
+}
+
+// Discards the current security context: installs a fresh AuthConnectionMeta,
+// drops the rx/tx session stream handlers and clears the pre-auth buffers.
+void ProtocolV2::reset_security() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  auth_meta.reset(new AuthConnectionMeta);
+  session_stream_handlers.rx.reset(nullptr);
+  session_stream_handlers.tx.reset(nullptr);
+  pre_auth.rxbuf.clear();
+  pre_auth.txbuf.clear();
+}
+
+// Resets the security context, the pending read/write callbacks, next_tag and
+// the throttles.
+// It is expected that `write_lock` is held while calling this method.
+void ProtocolV2::reset_recv_state() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  if (!connection->center->in_thread()) {
+    // execute in the same thread that uses the rx/tx handlers. We need
+    // to do the warp because holding `write_lock` is not enough as
+    // `write_event()` unlocks it just before calling `write_message()`.
+    // `submit_to()` here is NOT blocking.
+    connection->center->submit_to(connection->center->get_id(), [this] {
+      ldout(cct, 5) << "reset_recv_state (warped) reseting crypto handlers"
+                    << dendl;
+      // Possibly unnecessary. See the comment in `deactivate_existing`.
+      std::lock_guard<std::mutex> l(connection->lock);
+      std::lock_guard<std::mutex> wl(connection->write_lock);
+      reset_security();
+    }, /* nowait = */true);
+  } else {
+    // already on the connection's event-center thread: reset directly
+    reset_security();
+  }
+
+  // clean read and write callbacks
+  connection->pendingReadLen.reset();
+  connection->writeCallback.reset();
+
+  next_tag = static_cast<Tag>(0);
+
+  reset_throttle();
+}
+
+// Returns the summed logical length of all segments of the frame currently
+// held by rx_frame_asm, excluding segment 0 (SegmentIndex::Msg::HEADER).
+// Asserts that the frame has at least one segment.
+size_t ProtocolV2::get_current_msg_size() const {
+  const size_t num_segments = rx_frame_asm.get_num_segments();
+  ceph_assert(num_segments > 0);
+
+  size_t total = 0;
+  size_t idx = 1;  // skip the header segment
+  while (idx < num_segments) {
+    total += rx_frame_asm.get_segment_logical_len(idx);
+    ++idx;
+  }
+  return total;
+}
+
+// Gives back whatever throttle budget the message currently being received
+// has already taken. Which throttles are released depends on how far `state`
+// has progressed past THROTTLE_MESSAGE / THROTTLE_BYTES /
+// THROTTLE_DISPATCH_QUEUE; nothing is released once state is beyond
+// THROTTLE_DONE.
+void ProtocolV2::reset_throttle() {
+  if (state > THROTTLE_DONE) {
+    return;
+  }
+
+  // message-count throttle
+  if (state > THROTTLE_MESSAGE && connection->policy.throttler_messages) {
+    ldout(cct, 10) << __func__ << " releasing " << 1
+                   << " message to policy throttler "
+                   << connection->policy.throttler_messages->get_current()
+                   << "/" << connection->policy.throttler_messages->get_max()
+                   << dendl;
+    connection->policy.throttler_messages->put();
+  }
+
+  // byte throttle
+  if (state > THROTTLE_BYTES && connection->policy.throttler_bytes) {
+    const size_t cur_msg_size = get_current_msg_size();
+    ldout(cct, 10) << __func__ << " releasing " << cur_msg_size
+                   << " bytes to policy throttler "
+                   << connection->policy.throttler_bytes->get_current() << "/"
+                   << connection->policy.throttler_bytes->get_max() << dendl;
+    connection->policy.throttler_bytes->put(cur_msg_size);
+  }
+
+  // dispatch-queue throttle
+  if (state > THROTTLE_DISPATCH_QUEUE) {
+    const size_t cur_msg_size = get_current_msg_size();
+    ldout(cct, 10)
+        << __func__ << " releasing " << cur_msg_size
+        << " bytes to dispatch_queue throttler "
+        << connection->dispatch_queue->dispatch_throttler.get_current() << "/"
+        << connection->dispatch_queue->dispatch_throttler.get_max() << dendl;
+    connection->dispatch_queue->dispatch_throttle_release(cur_msg_size);
+  }
+}
+
+// Central failure handler. Always returns nullptr so callers can write
+// `return _fault();` to stop the current continuation chain.
+//
+// Depending on state and connection policy it does one of the following:
+//  - nothing, if the state is CLOSED or NONE;
+//  - stop and queue a reset, for a lossy connection outside the
+//    START_CONNECT..SESSION_RECONNECTING range, or for a half-accepted
+//    connection with nothing to send that is not being replaced;
+//  - go to STANDBY (standby policy with nothing to send, or server policy);
+//  - otherwise move to START_CONNECT and schedule a reconnect, either
+//    immediately or after a backoff delay.
+CtPtr ProtocolV2::_fault() {
+  ldout(cct, 10) << __func__ << dendl;
+
+  if (state == CLOSED || state == NONE) {
+    ldout(cct, 10) << __func__ << " connection is already closed" << dendl;
+    return nullptr;
+  }
+
+  if (connection->policy.lossy &&
+      !(state >= START_CONNECT && state <= SESSION_RECONNECTING)) {
+    ldout(cct, 2) << __func__ << " on lossy channel, failing" << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  connection->write_lock.lock();
+
+  can_write = false;
+  // requeue sent items
+  requeue_sent();
+
+  if (out_queue.empty() && state >= START_ACCEPT &&
+      state <= SESSION_ACCEPTING && !replacing) {
+    ldout(cct, 2) << __func__ << " with nothing to send and in the half "
+                   << " accept state just closed" << dendl;
+    // stop() takes write_lock itself, so release it first
+    connection->write_lock.unlock();
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  replacing = false;
+  connection->fault();
+  reset_recv_state();
+
+  reconnecting = false;
+
+  if (connection->policy.standby && out_queue.empty() && !keepalive &&
+      state != WAIT) {
+    ldout(cct, 1) << __func__ << " with nothing to send, going to standby"
+                  << dendl;
+    state = STANDBY;
+    connection->write_lock.unlock();
+    return nullptr;
+  }
+  if (connection->policy.server) {
+    ldout(cct, 1) << __func__ << " server, going to standby, even though i have stuff queued" << dendl;
+    state = STANDBY;
+    connection->write_lock.unlock();
+    return nullptr;
+  }
+
+  connection->write_lock.unlock();
+
+  if (!(state >= START_CONNECT && state <= SESSION_RECONNECTING) &&
+      state != WAIT &&
+      state != SESSION_ACCEPTING /* due to connection race */) {
+    // policy maybe empty when state is in accept
+    // NOTE(review): policy.server already caused an early return above, so
+    // this branch looks unreachable -- confirm before relying on it.
+    if (connection->policy.server) {
+      ldout(cct, 1) << __func__ << " server, going to standby" << dendl;
+      state = STANDBY;
+    } else {
+      ldout(cct, 1) << __func__ << " initiating reconnect" << dendl;
+      connect_seq++;
+      global_seq = messenger->get_global_seq();
+      state = START_CONNECT;
+      pre_auth.enabled = true;
+      connection->state = AsyncConnection::STATE_CONNECTING;
+    }
+    // reconnect right away: clear the backoff and trigger the read handler
+    backoff = utime_t();
+    connection->center->dispatch_event_external(connection->read_handler);
+  } else {
+    // Pick the backoff: the maximum when in WAIT, the initial value on the
+    // first failure, otherwise double the previous one, capped at the maximum.
+    if (state == WAIT) {
+      backoff.set_from_double(cct->_conf->ms_max_backoff);
+    } else if (backoff == utime_t()) {
+      backoff.set_from_double(cct->_conf->ms_initial_backoff);
+    } else {
+      backoff += backoff;
+      if (backoff > cct->_conf->ms_max_backoff)
+        backoff.set_from_double(cct->_conf->ms_max_backoff);
+    }
+
+    if (server_cookie) {
+      connect_seq++;
+    }
+
+    global_seq = messenger->get_global_seq();
+    state = START_CONNECT;
+    pre_auth.enabled = true;
+    connection->state = AsyncConnection::STATE_CONNECTING;
+    ldout(cct, 1) << __func__ << " waiting " << backoff << dendl;
+    // schedule a wakeup after the backoff interval
+    connection->register_time_events.insert(
+        connection->center->create_time_event(backoff.to_nsec() / 1000,
+                                              connection->wakeup_handler));
+  }
+  return nullptr;
+}
+
+// Encodes message `m` for the given feature set. The only difference between
+// a message with and without an existing payload is the log line emitted.
+void ProtocolV2::prepare_send_message(uint64_t features, Message *m) {
+  ldout(cct, 20) << __func__ << " m=" << *m << dendl;
+
+  // associate message with Connection (for benefit of encode_payload)
+  const bool already_has_payload = !m->empty_payload();
+  if (already_has_payload) {
+    ldout(cct, 20) << __func__ << " half-reencoding features " << features
+                   << " " << m << " " << *m << dendl;
+  } else {
+    ldout(cct, 20) << __func__ << " encoding features " << features << " " << m
+                   << " " << *m << dendl;
+  }
+
+  // encode and copy out of *m
+  m->encode(features, 0);
+}
+
+// Queues message `m` for sending.
+//
+// Messages that the messenger can fast-dispatch are encoded up front, outside
+// write_lock. Under the lock that encoding is thrown away again if writing is
+// not currently possible or the connection features changed in the meantime.
+// If the state is CLOSED the message is dropped (its reference is released);
+// otherwise it is appended to the out queue for its priority and, when
+// writing is possible (or the state is STANDBY) and no write is already in
+// progress, a write event is dispatched.
+void ProtocolV2::send_message(Message *m) {
+  uint64_t f = connection->get_features();
+
+  // TODO: Currently not all messages supports reencode like MOSDMap, so here
+  // only let fast dispatch support messages prepare message
+  const bool can_fast_prepare = messenger->ms_can_fast_dispatch(m);
+  if (can_fast_prepare) {
+    prepare_send_message(f, m);
+  }
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  bool is_prepared = can_fast_prepare;
+  // "features" changes will change the payload encoding
+  if (can_fast_prepare && (!can_write || connection->get_features() != f)) {
+    // ensure the correctness of message encoding
+    m->clear_payload();
+    is_prepared = false;
+    ldout(cct, 10) << __func__ << " clear encoded buffer previous " << f
+                   << " != " << connection->get_features() << dendl;
+  }
+  if (state == CLOSED) {
+    ldout(cct, 10) << __func__ << " connection closed."
+                   << " Drop message " << m << dendl;
+    m->put();
+  } else {
+    ldout(cct, 5) << __func__ << " enqueueing message m=" << m
+                  << " type=" << m->get_type() << " " << *m << dendl;
+    m->trace.event("async enqueueing message");
+    out_queue[m->get_priority()].emplace_back(
+      out_queue_entry_t{is_prepared, m});
+    // NOTE(review): this line is logged for every enqueued message,
+    // whether or not a write event is dispatched below.
+    ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m
+                   << dendl;
+    if (((!replacing && can_write) || state == STANDBY) && !write_in_progress) {
+      write_in_progress = true;
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+}
+
+// Requests a keepalive: unless the state is CLOSED, sets the keepalive flag
+// under write_lock and dispatches a write event.
+void ProtocolV2::send_keepalive() {
+  ldout(cct, 10) << __func__ << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (state == CLOSED) {
+    return;
+  }
+  keepalive = true;
+  connection->center->dispatch_event_external(connection->write_handler);
+}
+
+// Resumes the protocol according to the current state: starts the client or
+// server banner exchange, reads the next frame when READY, or re-enters the
+// throttle step that was pending. All other states are ignored.
+void ProtocolV2::read_event() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  switch (state) {
+    case START_CONNECT:
+      run_continuation(CONTINUATION(start_client_banner_exchange));
+      break;
+    case START_ACCEPT:
+      run_continuation(CONTINUATION(start_server_banner_exchange));
+      break;
+    case READY:
+      run_continuation(CONTINUATION(read_frame));
+      break;
+    case THROTTLE_MESSAGE:
+      run_continuation(CONTINUATION(throttle_message));
+      break;
+    case THROTTLE_BYTES:
+      run_continuation(CONTINUATION(throttle_bytes));
+      break;
+    case THROTTLE_DISPATCH_QUEUE:
+      run_continuation(CONTINUATION(throttle_dispatch_queue));
+      break;
+    default:
+      break;
+  }
+}
+
+// Removes and returns the first entry of the out queue with the largest key.
+// A key whose list becomes empty is erased from the map. Returns a
+// default-constructed entry if nothing is queued.
+ProtocolV2::out_queue_entry_t ProtocolV2::_get_next_outgoing() {
+  out_queue_entry_t next_entry;
+  if (out_queue.empty()) {
+    return next_entry;
+  }
+
+  auto highest = out_queue.rbegin();
+  auto &pending = highest->second;
+  ceph_assert(!pending.empty());
+
+  next_entry = pending.front();
+  pending.pop_front();
+  if (pending.empty()) {
+    out_queue.erase(highest->first);
+  }
+  return next_entry;
+}
+
+// Encodes message `m` into a frame and tries to send it.
+//
+// Must run on the connection's event-center thread (asserted). Assigns the
+// next out_seq to the message, takes the current in_seq as the ack value
+// (resetting ack_left) under connection->lock, and appends the encoded
+// MessageFrame to the outgoing buffer. `more` is passed on to _try_send().
+//
+// Releases one reference on `m` before returning. Returns -EILSEQ if the
+// frame could not be assembled, otherwise the result of _try_send().
+ssize_t ProtocolV2::write_message(Message *m, bool more) {
+  FUNCTRACE(cct);
+  ceph_assert(connection->center->in_thread());
+  m->set_seq(++out_seq);
+
+  connection->lock.lock();
+  uint64_t ack_seq = in_seq;
+  ack_left = 0;
+  connection->lock.unlock();
+
+  ceph_msg_header &header = m->get_header();
+  ceph_msg_footer &footer = m->get_footer();
+
+  ceph_msg_header2 header2{header.seq,        header.tid,
+                           header.type,       header.priority,
+                           header.version,
+                           init_le32(0),      header.data_off,
+                           init_le64(ack_seq),
+                           footer.flags,      header.compat_version,
+                           header.reserved};
+
+  auto message = MessageFrame::Encode(
+			     header2,
+			     m->get_payload(),
+			     m->get_middle(),
+			     m->get_data());
+  if (!append_frame(message)) {
+    m->put();
+    return -EILSEQ;
+  }
+
+  ldout(cct, 5) << __func__ << " sending message m=" << m
+                << " seq=" << m->get_seq() << " " << *m << dendl;
+
+  m->trace.event("async writing message");
+  ldout(cct, 20) << __func__ << " sending m=" << m << " seq=" << m->get_seq()
+                 << " src=" << entity_name_t(messenger->get_myname())
+                 << " off=" << header2.data_off
+                 << dendl;
+  // remember the buffered length so the bytes actually sent can be counted
+  ssize_t total_send_size = connection->outgoing_bl.length();
+  ssize_t rc = connection->_try_send(more);
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " error sending " << m << ", "
+                  << cpp_strerror(rc) << dendl;
+  } else {
+    connection->logger->inc(
+        l_msgr_send_bytes, total_send_size - connection->outgoing_bl.length());
+    ldout(cct, 10) << __func__ << " sending " << m
+                   << (rc ? " continuely." : " done.") << dendl;
+  }
+  if (m->get_type() == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false);
+  else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false);
+  m->put();
+
+  return rc;
+}
+
+// Assembles `frame` with tx_frame_asm and appends the resulting bytes to the
+// connection's outgoing buffer.
+// Returns false (after logging) if assembling throws TxHandlerError, true
+// otherwise.
+template <class F>
+bool ProtocolV2::append_frame(F& frame) {
+  ceph::bufferlist assembled;
+  try {
+    assembled = frame.get_buffer(tx_frame_asm);
+  } catch (const ceph::crypto::onwire::TxHandlerError &err) {
+    ldout(cct, 1) << __func__ << " " << err.what() << dendl;
+    return false;
+  }
+
+  ldout(cct, 25) << __func__ << " assembled frame " << assembled.length()
+                 << " bytes " << tx_frame_asm << dendl;
+  connection->outgoing_bl.append(assembled);
+  return true;
+}
+
+// Trims the `sent` list after the peer acknowledged everything up to `seq`.
+// At most max_pending messages are removed per call. They are unlinked while
+// holding write_lock, and their references are released only after the lock
+// has been dropped.
+void ProtocolV2::handle_message_ack(uint64_t seq) {
+  // lossy connections don't keep sent messages
+  if (connection->policy.lossy) {
+    return;
+  }
+
+  ldout(cct, 15) << __func__ << " seq=" << seq << dendl;
+
+  static const int max_pending = 128;
+  Message *acked[max_pending];
+  int num_acked = 0;
+  {
+    std::lock_guard<std::mutex> wl(connection->write_lock);
+    while (num_acked < max_pending && !sent.empty()) {
+      Message *head = sent.front();
+      if (head->get_seq() > seq) {
+        break;
+      }
+      sent.pop_front();
+      acked[num_acked++] = head;
+      ldout(cct, 10) << __func__ << " got ack seq " << seq
+                     << " >= " << head->get_seq() << " on " << head << " "
+                     << *head << dendl;
+    }
+  }
+
+  for (int idx = 0; idx < num_acked; ++idx) {
+    acked[idx]->put();
+  }
+}
+
+// Writer-side event handler.
+//
+// When can_write is set: appends a pending keepalive frame, drains the
+// outgoing queue through write_message(), and, if everything was flushed
+// (r == 0), either sends an ACK frame for received messages or flushes bytes
+// that are still queued. Otherwise: calls connection->_connect() when the
+// protocol sits in STANDBY on a non-server policy with queued data, or tries
+// to flush the outgoing buffer on a live socket. Any send failure ends in
+// fault(), which is always called with connection->lock held.
+void ProtocolV2::write_event() {
+  ldout(cct, 10) << __func__ << dendl;
+  ssize_t r = 0;
+
+  connection->write_lock.lock();
+  if (can_write) {
+    if (keepalive) {
+      ldout(cct, 10) << __func__ << " appending keepalive" << dendl;
+      auto keepalive_frame = KeepAliveFrame::Encode();
+      if (!append_frame(keepalive_frame)) {
+        // fault() is called under connection->lock, not write_lock
+        connection->write_lock.unlock();
+        connection->lock.lock();
+        fault();
+        connection->lock.unlock();
+        return;
+      }
+      keepalive = false;
+    }
+
+    auto start = ceph::mono_clock::now();
+    bool more;
+    do {
+      const auto out_entry = _get_next_outgoing();
+      if (!out_entry.m) {
+        break;
+      }
+
+      if (!connection->policy.lossy) {
+        // put on sent list
+        sent.push_back(out_entry.m);
+        out_entry.m->get();
+      }
+      more = !out_queue.empty();
+      // encoding and writing happen without write_lock held
+      connection->write_lock.unlock();
+
+      // send_message or requeue messages may not encode message
+      if (!out_entry.is_prepared) {
+        prepare_send_message(connection->get_features(), out_entry.m);
+      }
+
+      r = write_message(out_entry.m, more);
+
+      connection->write_lock.lock();
+      if (r == 0) {
+        ;
+      } else if (r < 0) {
+        ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+        break;
+      } else if (r > 0)
+        break;
+    } while (can_write);
+    write_in_progress = false;
+
+    // if r > 0 mean data still lefted, so no need _try_send.
+    if (r == 0) {
+      uint64_t left = ack_left;
+      if (left) {
+        ldout(cct, 10) << __func__ << " try send msg ack, acked " << left
+                       << " messages" << dendl;
+        auto ack_frame = AckFrame::Encode(in_seq);
+        if (append_frame(ack_frame)) {
+          // only the acks counted above are consumed; more may have arrived
+          ack_left -= left;
+          left = ack_left;
+          r = connection->_try_send(left);
+        } else {
+          r = -EILSEQ;
+        }
+      } else if (is_queued()) {
+        r = connection->_try_send();
+      }
+    }
+    connection->write_lock.unlock();
+
+    connection->logger->tinc(l_msgr_running_send_time,
+                             ceph::mono_clock::now() - start);
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+      connection->lock.lock();
+      fault();
+      connection->lock.unlock();
+      return;
+    }
+  } else {
+    write_in_progress = false;
+    // re-acquire in the order connection->lock, then write_lock
+    connection->write_lock.unlock();
+    connection->lock.lock();
+    connection->write_lock.lock();
+    if (state == STANDBY && !connection->policy.server && is_queued()) {
+      ldout(cct, 10) << __func__ << " policy.server is false" << dendl;
+      if (server_cookie) {  // only increment connect_seq if there is a session
+        connect_seq++;
+      }
+      connection->_connect();
+    } else if (connection->cs && state != NONE && state != CLOSED &&
+               state != START_CONNECT) {
+      r = connection->_try_send();
+      if (r < 0) {
+        ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl;
+        connection->write_lock.unlock();
+        fault();
+        connection->lock.unlock();
+        return;
+      }
+    }
+    connection->write_lock.unlock();
+    connection->lock.unlock();
+  }
+}
+
+// True when there is anything left to send: either messages in our own
+// out_queue or whatever connection->is_queued() reports.
+bool ProtocolV2::is_queued() {
+  if (!out_queue.empty()) {
+    return true;
+  }
+  return connection->is_queued();
+}
+
+// Read buffer->length() bytes into `buffer` and hand the result to `next`.
+// Returns &next when the read finished (or failed) synchronously, so the
+// caller can run it; returns nullptr when the read is pending, in which case
+// the callback runs the continuation later. While pre_auth recording is
+// enabled, successfully read bytes are also appended to pre_auth.rxbuf.
+CtPtr ProtocolV2::read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next,
+                       rx_buffer_t &&buffer) {
+  // take length and data pointer before ownership moves into the continuation
+  const auto len = buffer->length();
+  const auto buf = buffer->c_str();
+  next.node = std::move(buffer);
+
+  ssize_t rc = connection->read(len, buf,
+    [&next, this](char *, int result) {
+      if (unlikely(pre_auth.enabled) && result >= 0) {
+        pre_auth.rxbuf.append(*next.node);
+        ceph_assert(!cct->_conf->ms_die_on_bug ||
+                    pre_auth.rxbuf.length() < 20000000);
+      }
+      next.r = result;
+      run_continuation(next);
+    });
+
+  if (rc > 0) {
+    // pending: the callback above will drive the continuation
+    return nullptr;
+  }
+
+  // error or done synchronously
+  if (unlikely(pre_auth.enabled) && rc >= 0) {
+    pre_auth.rxbuf.append(*next.node);
+    ceph_assert(!cct->_conf->ms_die_on_bug ||
+                pre_auth.rxbuf.length() < 20000000);
+  }
+  next.r = rc;
+  return &next;
+}
+
+// Frame flavour of write(): assemble `frame` with the tx frame assembler and
+// pass the resulting bytes to the bufferlist overload. A TxHandlerError
+// during assembly results in _fault().
+template <class F>
+CtPtr ProtocolV2::write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        F &frame) {
+  ceph::bufferlist assembled;
+  try {
+    assembled = frame.get_buffer(tx_frame_asm);
+  } catch (ceph::crypto::onwire::TxHandlerError &err) {
+    ldout(cct, 1) << __func__ << " " << err.what() << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 25) << __func__ << " assembled frame " << assembled.length()
+                 << " bytes " << tx_frame_asm << dendl;
+  return write(desc, next, assembled);
+}
+
+// Write `buffer` on the connection and continue with `next`.
+// Returns &next when the write completed synchronously, nullptr when it is
+// still pending (the completion callback then runs `next`), and the result of
+// _fault() on a synchronous error. `desc` is used only in log messages. While
+// pre_auth recording is enabled the bytes are also appended to pre_auth.txbuf.
+CtPtr ProtocolV2::write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        bufferlist &buffer) {
+  if (unlikely(pre_auth.enabled)) {
+    pre_auth.txbuf.append(buffer);
+    ceph_assert(!cct->_conf->ms_die_on_bug ||
+                pre_auth.txbuf.length() < 20000000);
+  }
+
+  auto on_complete = [&next, desc, this](int r) {
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " " << desc << " write failed r=" << r
+                    << " (" << cpp_strerror(r) << ")" << dendl;
+      connection->inject_delay();
+      _fault();
+    }
+    run_continuation(next);
+  };
+  const ssize_t rc = connection->write(buffer, on_complete);
+
+  if (rc > 0) {
+    // still in flight; on_complete takes over
+    return nullptr;
+  }
+  if (rc == 0) {
+    next.setParams();
+    return &next;
+  }
+
+  ldout(cct, 1) << __func__ << " " << desc << " write failed r=" << rc
+                << " (" << cpp_strerror(rc) << ")" << dendl;
+  return _fault();
+}
+
+// Send our banner and then wait for the peer's. `callback` is stored in
+// bannerExchangeCallback; handle_hello() later returns it as the next step.
+CtPtr ProtocolV2::_banner_exchange(CtRef callback) {
+  ldout(cct, 20) << __func__ << dendl;
+  bannerExchangeCallback = &callback;
+
+  // payload: our supported feature mask followed by our required one
+  bufferlist features_bl;
+  encode((uint64_t)CEPH_MSGR2_SUPPORTED_FEATURES, features_bl, 0);
+  encode((uint64_t)CEPH_MSGR2_REQUIRED_FEATURES, features_bl, 0);
+
+  // banner: fixed prefix, 16-bit payload length, payload
+  bufferlist banner_bl;
+  banner_bl.append(CEPH_BANNER_V2_PREFIX, strlen(CEPH_BANNER_V2_PREFIX));
+  encode((uint16_t)features_bl.length(), banner_bl, 0);
+  banner_bl.claim_append(features_bl);
+
+  INTERCEPT(state == BANNER_CONNECTING ? 3 : 4);
+
+  return WRITE(banner_bl, "banner", _wait_for_peer_banner);
+}
+
+// Read the fixed-size head of the peer banner: the banner prefix plus the
+// 16-bit payload length that follows it.
+CtPtr ProtocolV2::_wait_for_peer_banner() {
+  const unsigned prefix_len = strlen(CEPH_BANNER_V2_PREFIX);
+  unsigned head_len = prefix_len + sizeof(__le16);
+  return READ(head_len, _handle_peer_banner);
+}
+
+// Continuation for the banner head read. Checks the banner prefix, decodes
+// the 16-bit payload length and issues the read for the banner payload.
+// Read errors, a bad prefix and decode errors all end in _fault().
+CtPtr ProtocolV2::_handle_peer_banner(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner failed r=" << r << " ("
+                  << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  const unsigned banner_prefix_len = strlen(CEPH_BANNER_V2_PREFIX);
+  const char *raw = buffer->c_str();
+
+  if (memcmp(raw, CEPH_BANNER_V2_PREFIX, banner_prefix_len) != 0) {
+    // tell a msgr V1 peer apart from plain garbage, for the log only
+    const bool peer_is_v1 =
+        memcmp(raw, CEPH_BANNER, strlen(CEPH_BANNER)) == 0;
+    if (peer_is_v1) {
+      lderr(cct) << __func__ << " peer " << *connection->peer_addrs
+                 << " is using msgr V1 protocol" << dendl;
+    } else {
+      ldout(cct, 1) << __func__ << " accept peer sent bad banner" << dendl;
+    }
+    return _fault();
+  }
+
+  // narrow the buffer to the length field that follows the prefix
+  buffer->set_offset(banner_prefix_len);
+  buffer->set_length(sizeof(__le16));
+
+  bufferlist len_bl;
+  len_bl.push_back(std::move(buffer));
+
+  uint16_t payload_len;
+  auto it = len_bl.cbegin();
+  try {
+    decode(payload_len, it);
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << " decode banner payload len failed " << dendl;
+    return _fault();
+  }
+
+  INTERCEPT(state == BANNER_CONNECTING ? 5 : 6);
+
+  return READ(payload_len, _handle_peer_banner_payload);
+}
+
+// Continuation for the banner payload read. Decodes the peer's supported and
+// required feature masks and checks them against ours in both directions.
+// On a mismatch the connection is stopped and a reset is queued. Otherwise
+// the frame assemblers are switched to revision 1 if the peer supports it,
+// the state advances to the matching HELLO_* state and our hello frame is
+// written.
+CtPtr ProtocolV2::_handle_peer_banner_payload(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner payload failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  uint64_t peer_supported;
+  uint64_t peer_required;
+
+  bufferlist payload_bl;
+  payload_bl.push_back(std::move(buffer));
+  auto it = payload_bl.cbegin();
+  try {
+    decode(peer_supported, it);
+    decode(peer_required, it);
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << " decode banner payload failed " << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 1) << __func__ << " supported=" << std::hex
+                << peer_supported << " required=" << std::hex
+                << peer_required << std::dec << dendl;
+
+  // Check feature bit compatibility
+
+  const uint64_t our_supported = CEPH_MSGR2_SUPPORTED_FEATURES;
+  const uint64_t our_required = CEPH_MSGR2_REQUIRED_FEATURES;
+
+  // any bit we require that the peer does not offer?
+  if ((our_required & ~peer_supported) != 0) {
+    ldout(cct, 1) << __func__ << " peer does not support all required features"
+                  << " required=" << std::hex << our_required
+                  << " supported=" << std::hex << peer_supported
+                  << std::dec << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+  // any bit the peer requires that we do not offer?
+  if ((peer_required & ~our_supported) != 0) {
+    ldout(cct, 1) << __func__ << " we do not support all peer required features"
+                  << " required=" << std::hex << peer_required
+                  << " supported=" << our_supported << std::dec << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  this->peer_supported_features = peer_supported;
+  if (peer_required == 0) {
+    this->connection_features = msgr2_required;
+  }
+
+  // if the peer supports msgr2.1, switch to it
+  const bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported, REVISION_1);
+  tx_frame_asm.set_is_rev1(is_rev1);
+  rx_frame_asm.set_is_rev1(is_rev1);
+
+  if (state != BANNER_CONNECTING) {
+    ceph_assert(state == BANNER_ACCEPTING);
+    state = HELLO_ACCEPTING;
+  } else {
+    state = HELLO_CONNECTING;
+  }
+
+  auto hello = HelloFrame::Encode(messenger->get_mytype(),
+                                  connection->target_addr);
+
+  INTERCEPT(state == HELLO_CONNECTING ? 7 : 8);
+
+  return WRITE(hello, "hello frame", read_frame);
+}
+
+// Handle the peer's HELLO frame.
+//
+// Valid only in HELLO_CONNECTING / HELLO_ACCEPTING. On the accepting side
+// (peer type still -1) the peer type and policy are taken from the frame; on
+// the connecting side the advertised type must match the expected one,
+// otherwise the connection is stopped and a reset is queued. If our own
+// address is still unknown/blank it is learned either from what the peer says
+// or from getsockname(), depending on ms_learn_addr_from_peer. Finally the
+// callback stored by _banner_exchange() is returned as the next continuation.
+CtPtr ProtocolV2::handle_hello(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != HELLO_CONNECTING && state != HELLO_ACCEPTING) {
+    lderr(cct) << __func__ << " not in hello exchange state!" << dendl;
+    return _fault();
+  }
+
+  auto hello = HelloFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__ << " received hello:"
+                << " peer_type=" << (int)hello.entity_type()
+                << " peer_addr_for_me=" << hello.peer_addr() << dendl;
+
+  // Zero-initialize so that a failed getsockname() leaves a well-defined
+  // all-zero value instead of stack garbage that is logged below and may be
+  // handed to set_sockaddr().
+  sockaddr_storage ss{};
+  socklen_t len = sizeof(ss);
+  if (getsockname(connection->cs.fd(), (sockaddr *)&ss, &len) < 0) {
+    ldout(cct, 1) << __func__ << " getsockname failed" << dendl;
+  }
+  ldout(cct, 5) << __func__ << " getsockname says I am " << (sockaddr *)&ss
+                << " when talking to " << connection->target_addr << dendl;
+
+  if (connection->get_peer_type() == -1) {
+    connection->set_peer_type(hello.entity_type());
+
+    ceph_assert(state == HELLO_ACCEPTING);
+    connection->policy = messenger->get_policy(hello.entity_type());
+    ldout(cct, 10) << __func__ << " accept of host_type "
+                   << (int)hello.entity_type()
+                   << ", policy.lossy=" << connection->policy.lossy
+                   << " policy.server=" << connection->policy.server
+                   << " policy.standby=" << connection->policy.standby
+                   << " policy.resetcheck=" << connection->policy.resetcheck
+                   << dendl;
+  } else {
+    ceph_assert(state == HELLO_CONNECTING);
+    if (connection->get_peer_type() != hello.entity_type()) {
+      ldout(cct, 1) << __func__ << " connection peer type does not match what"
+                    << " peer advertises " << connection->get_peer_type()
+                    << " != " << (int)hello.entity_type() << dendl;
+      stop();
+      connection->dispatch_queue->queue_reset(connection);
+      return nullptr;
+    }
+  }
+
+  if (messenger->get_myaddrs().empty() ||
+      messenger->get_myaddrs().front().is_blank_ip()) {
+    entity_addr_t a;
+    if (cct->_conf->ms_learn_addr_from_peer) {
+      ldout(cct, 1) << __func__ << " peer " << connection->target_addr
+                    << " says I am " << hello.peer_addr() << " (socket says "
+                    << (sockaddr*)&ss << ")" << dendl;
+      a = hello.peer_addr();
+    } else {
+      ldout(cct, 1) << __func__ << " socket to  " << connection->target_addr
+                    << " says I am " << (sockaddr*)&ss
+                    << " (peer says " << hello.peer_addr() << ")" << dendl;
+      a.set_sockaddr((sockaddr *)&ss);
+    }
+    a.set_type(entity_addr_t::TYPE_MSGR2); // anything but NONE; learned_addr ignores this
+    a.set_port(0);
+    // learned_addr() is called without connection->lock held
+    connection->lock.unlock();
+    messenger->learned_addr(a);
+    if (cct->_conf->ms_inject_internal_delays &&
+        cct->_conf->ms_inject_socket_failures) {
+      if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+        ldout(cct, 10) << __func__ << " sleep for "
+                       << cct->_conf->ms_inject_internal_delays << dendl;
+        utime_t t;
+        t.set_from_double(cct->_conf->ms_inject_internal_delays);
+        t.sleep();
+      }
+    }
+    connection->lock.lock();
+    // the state may have moved on while the lock was dropped
+    if (state != HELLO_CONNECTING) {
+      ldout(cct, 1) << __func__
+                    << " state changed while learned_addr, mark_down or "
+                    << " replacing must be happened just now" << dendl;
+      return nullptr;
+    }
+  }
+
+
+
+  // hand control back to whoever started the banner exchange
+  CtPtr callback;
+  callback = bannerExchangeCallback;
+  bannerExchangeCallback = nullptr;
+  ceph_assert(callback);
+  return callback;
+}
+
+// Start reading the next frame: reset the per-frame receive buffers and issue
+// a read for the preamble. Does nothing once the protocol is CLOSED.
+CtPtr ProtocolV2::read_frame() {
+  if (state == CLOSED) {
+    return nullptr;
+  }
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  // drop whatever is left over from the previous frame
+  rx_preamble.clear();
+  rx_epilogue.clear();
+  rx_segments_data.clear();
+
+  const auto preamble_len = rx_frame_asm.get_preamble_onwire_len();
+  return READ(preamble_len, handle_read_frame_preamble_main);
+}
+
+// Continuation invoked once the frame preamble has been read. Disassembles
+// the preamble to learn the frame tag, then either enters the throttling
+// chain (MESSAGE frames, which are only accepted in READY state) or starts
+// reading the frame's segments. Read errors, FrameError and MsgAuthError end
+// in _fault().
+CtPtr ProtocolV2::handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read frame preamble failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  rx_preamble.push_back(std::move(buffer));
+
+  ldout(cct, 30) << __func__ << " preamble\n";
+  rx_preamble.hexdump(*_dout);
+  *_dout << dendl;
+
+  try {
+    next_tag = rx_frame_asm.disassemble_preamble(rx_preamble);
+  } catch (FrameError& e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  } catch (ceph::crypto::onwire::MsgAuthError&) {
+    // leading space keeps the function name and the text apart in the log
+    ldout(cct, 1) << __func__ << " bad auth tag" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 25) << __func__ << " disassembled preamble " << rx_frame_asm
+                 << dendl;
+
+  if (session_stream_handlers.rx) {
+    ldout(cct, 30) << __func__ << " preamble after decrypt\n";
+    rx_preamble.hexdump(*_dout);
+    *_dout << dendl;
+  }
+
+  // does it need throttle?
+  if (next_tag == Tag::MESSAGE) {
+    if (state != READY) {
+      lderr(cct) << __func__ << " not in ready state!" << dendl;
+      return _fault();
+    }
+    state = THROTTLE_MESSAGE;
+    return CONTINUE(throttle_message);
+  } else {
+    return read_frame_segment();
+  }
+}
+
+// Route a fully received frame by its tag: MESSAGE frames go to
+// handle_message(), every other known tag to handle_frame_payload(), and an
+// unknown tag results in _fault().
+CtPtr ProtocolV2::handle_read_frame_dispatch() {
+  ldout(cct, 10) << __func__
+                 << " tag=" << static_cast<uint32_t>(next_tag) << dendl;
+
+  if (next_tag == Tag::MESSAGE) {
+    return handle_message();
+  }
+
+  switch (next_tag) {
+    // handshake
+    case Tag::HELLO:
+    case Tag::AUTH_REQUEST:
+    case Tag::AUTH_BAD_METHOD:
+    case Tag::AUTH_REPLY_MORE:
+    case Tag::AUTH_REQUEST_MORE:
+    case Tag::AUTH_DONE:
+    case Tag::AUTH_SIGNATURE:
+    case Tag::CLIENT_IDENT:
+    case Tag::SERVER_IDENT:
+    case Tag::IDENT_MISSING_FEATURES:
+    // session management
+    case Tag::SESSION_RECONNECT:
+    case Tag::SESSION_RESET:
+    case Tag::SESSION_RETRY:
+    case Tag::SESSION_RETRY_GLOBAL:
+    case Tag::SESSION_RECONNECT_OK:
+    // control frames
+    case Tag::KEEPALIVE2:
+    case Tag::KEEPALIVE2_ACK:
+    case Tag::ACK:
+    case Tag::WAIT:
+      return handle_frame_payload();
+    default:
+      break;
+  }
+
+  lderr(cct) << __func__
+             << " received unknown tag=" << static_cast<uint32_t>(next_tag)
+             << dendl;
+  return _fault();
+}
+
+// Read the next frame segment. A slot for it is appended to rx_segments_data
+// first; a zero-length segment is completed immediately without any I/O,
+// otherwise an aligned buffer is allocated and a read is issued into it.
+CtPtr ProtocolV2::read_frame_segment() {
+  // index of the segment about to be read == number of segments so far
+  const size_t seg_idx = rx_segments_data.size();
+  ldout(cct, 20) << __func__ << " seg_idx=" << seg_idx << dendl;
+  rx_segments_data.emplace_back();
+
+  const uint32_t onwire_len = rx_frame_asm.get_segment_onwire_len(seg_idx);
+  if (onwire_len == 0) {
+    return _handle_read_frame_segment();
+  }
+
+  const uint16_t align = rx_frame_asm.get_segment_align(seg_idx);
+  rx_buffer_t rx_buffer;
+  try {
+    rx_buffer = buffer::ptr_node::create(
+        buffer::create_aligned(onwire_len, align));
+  } catch (std::bad_alloc&) {
+    // Catching because of potential issues with satisfying alignment.
+    ldout(cct, 1) << __func__ << " can't allocate aligned rx_buffer"
+                  << " len=" << onwire_len
+                  << " align=" << align
+                  << dendl;
+    return _fault();
+  }
+
+  return READ_RXBUF(std::move(rx_buffer), handle_read_frame_segment);
+}
+
+// Continuation for a segment read: stores the received buffer in the most
+// recently added segment slot and moves on; a read error ends in _fault().
+CtPtr ProtocolV2::handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    rx_segments_data.back().push_back(std::move(rx_buffer));
+    return _handle_read_frame_segment();
+  }
+
+  ldout(cct, 1) << __func__ << " read frame segment failed r=" << r << " ("
+                << cpp_strerror(r) << ")" << dendl;
+  return _fault();
+}
+
+// Decide what follows a completed segment: another segment while some are
+// still missing, otherwise the epilogue (processed right away when it has no
+// on-wire bytes).
+CtPtr ProtocolV2::_handle_read_frame_segment() {
+  if (rx_segments_data.size() != rx_frame_asm.get_num_segments()) {
+    // TODO: for makeshift only. This will be more generic and throttled
+    return read_frame_segment();
+  }
+
+  // OK, all segments planned to read are read. Can go with epilogue.
+  const uint32_t epilogue_onwire_len = rx_frame_asm.get_epilogue_onwire_len();
+  if (epilogue_onwire_len != 0) {
+    return READ(epilogue_onwire_len, handle_read_frame_epilogue_main);
+  }
+  return _handle_read_frame_epilogue_main();
+}
+
+// Dispatch a non-message frame to its tag-specific handler, passing the last
+// received segment as the payload. Aborts on a tag that has no handler here.
+CtPtr ProtocolV2::handle_frame_payload() {
+  ceph_assert(!rx_segments_data.empty());
+  auto& payload = rx_segments_data.back();
+
+  ldout(cct, 30) << __func__ << "\n";
+  payload.hexdump(*_dout);
+  *_dout << dendl;
+
+  switch (next_tag) {
+    case Tag::HELLO:                  return handle_hello(payload);
+    case Tag::AUTH_REQUEST:           return handle_auth_request(payload);
+    case Tag::AUTH_BAD_METHOD:        return handle_auth_bad_method(payload);
+    case Tag::AUTH_REPLY_MORE:        return handle_auth_reply_more(payload);
+    case Tag::AUTH_REQUEST_MORE:      return handle_auth_request_more(payload);
+    case Tag::AUTH_DONE:              return handle_auth_done(payload);
+    case Tag::AUTH_SIGNATURE:         return handle_auth_signature(payload);
+    case Tag::CLIENT_IDENT:           return handle_client_ident(payload);
+    case Tag::SERVER_IDENT:           return handle_server_ident(payload);
+    case Tag::IDENT_MISSING_FEATURES: return handle_ident_missing_features(payload);
+    case Tag::SESSION_RECONNECT:      return handle_reconnect(payload);
+    case Tag::SESSION_RESET:          return handle_session_reset(payload);
+    case Tag::SESSION_RETRY:          return handle_session_retry(payload);
+    case Tag::SESSION_RETRY_GLOBAL:   return handle_session_retry_global(payload);
+    case Tag::SESSION_RECONNECT_OK:   return handle_reconnect_ok(payload);
+    case Tag::KEEPALIVE2:             return handle_keepalive2(payload);
+    case Tag::KEEPALIVE2_ACK:         return handle_keepalive2_ack(payload);
+    case Tag::ACK:                    return handle_message_ack(payload);
+    case Tag::WAIT:                   return handle_wait(payload);
+    default:
+      break;
+  }
+
+  ceph_abort();
+  return nullptr;
+}
+
+// Switch the protocol to READY: clear the reconnect/replace flags, re-arm the
+// tick timer, enable writing (scheduling the write handler if messages are
+// already queued) and continue with reading frames.
+CtPtr ProtocolV2::ready() {
+  ldout(cct, 25) << __func__ << dendl;
+
+  replacing = false;
+  reconnecting = false;
+
+  // make sure no pending tick timer
+  auto &tick_id = connection->last_tick_id;
+  if (tick_id) {
+    connection->center->delete_time_event(tick_id);
+  }
+  tick_id = connection->center->create_time_event(
+      connection->inactive_timeout_us, connection->tick_handler);
+
+  {
+    // can_write is flipped under write_lock
+    std::lock_guard<std::mutex> guard(connection->write_lock);
+    can_write = true;
+    const bool have_queued = !out_queue.empty();
+    if (have_queued) {
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+
+  connection->maybe_start_delay_thread();
+
+  state = READY;
+  ldout(cct, 1) << __func__ << " entity=" << peer_name << " client_cookie="
+                << std::hex << client_cookie << " server_cookie="
+                << server_cookie << std::dec << " in_seq=" << in_seq
+                << " out_seq=" << out_seq << dendl;
+
+  INTERCEPT(15);
+
+  return CONTINUE(read_frame);
+}
+
+// Continuation for the epilogue read: keeps the received bytes in rx_epilogue
+// and proceeds with epilogue processing; a read error ends in _fault().
+CtPtr ProtocolV2::handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r)
+{
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    rx_epilogue.push_back(std::move(buffer));
+    return _handle_read_frame_epilogue_main();
+  }
+
+  ldout(cct, 1) << __func__ << " read frame epilogue failed r=" << r
+                << " (" << cpp_strerror(r) << ")" << dendl;
+  return _fault();
+}
+
+// Finish receiving a frame: disassemble the first and the remaining segments
+// (using the preamble and epilogue), then dispatch the frame by tag. If the
+// disassembly reports the frame as aborted by the sender, the throttle is
+// reset, the state returns to READY and the next frame is read instead.
+// FrameError and MsgAuthError end in _fault().
+CtPtr ProtocolV2::_handle_read_frame_epilogue_main() {
+  bool aborted;
+  try {
+    rx_frame_asm.disassemble_first_segment(rx_preamble, rx_segments_data[0]);
+    aborted = !rx_frame_asm.disassemble_remaining_segments(
+        rx_segments_data.data(), rx_epilogue);
+  } catch (FrameError& e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  } catch (ceph::crypto::onwire::MsgAuthError&) {
+    // leading space keeps the function name and the text apart in the log
+    ldout(cct, 1) << __func__ << " bad auth tag" << dendl;
+    return _fault();
+  }
+
+  // we do have a mechanism that allows transmitter to start sending message
+  // and abort after putting entire data field on wire. This will be used by
+  // the kernel client to avoid unnecessary buffering.
+  if (aborted) {
+    reset_throttle();
+    state = READY;
+    return CONTINUE(read_frame);
+  }
+  return handle_read_frame_dispatch();
+}
+
+// Turn a fully received MESSAGE frame into a Message and deliver it.
+//
+// Must be entered in THROTTLE_DONE state. Decodes the frame, builds a
+// ceph_msg_header/ceph_msg_footer from the frame's header, decodes the
+// Message, drops it if its sequence number is not newer than in_seq, and
+// otherwise hands it to the delay queue, fast dispatch or the dispatch queue.
+// Afterwards the ack carried in the frame header is processed and, for
+// non-lossy connections, the write handler is scheduled.
+// Returns the read_frame continuation on success, nullptr when an old message
+// was discarded or the state changed during dispatch, and _fault() when the
+// message could not be decoded.
+CtPtr ProtocolV2::handle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+  ceph_assert(state == THROTTLE_DONE);
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+  ltt_recv_stamp = ceph_clock_now();
+#endif
+  recv_stamp = ceph_clock_now();
+
+  const size_t cur_msg_size = get_current_msg_size();
+  auto msg_frame = MessageFrame::Decode(rx_segments_data);
+
+  // XXX: paranoid copy just to avoid oops
+  ceph_msg_header2 current_header = msg_frame.header();
+
+  ldout(cct, 5) << __func__
+		<< " got " << msg_frame.front_len()
+		<< " + " << msg_frame.middle_len()
+		<< " + " << msg_frame.data_len()
+		<< " byte message."
+		<< " envelope type=" << current_header.type
+		<< " src " << peer_name
+		<< " off " << current_header.data_off
+                << dendl;
+
+  INTERCEPT(16);
+  // decode_message() takes a ceph_msg_header/ceph_msg_footer pair; fill them
+  // from the frame header, the segment lengths and peer_name, zeroing the
+  // remaining fields.
+  ceph_msg_header header{current_header.seq,
+                         current_header.tid,
+                         current_header.type,
+                         current_header.priority,
+                         current_header.version,
+                         init_le32(msg_frame.front_len()),
+                         init_le32(msg_frame.middle_len()),
+                         init_le32(msg_frame.data_len()),
+                         current_header.data_off,
+                         peer_name,
+                         current_header.compat_version,
+                         current_header.reserved,
+                         init_le32(0)};
+  ceph_msg_footer footer{init_le32(0), init_le32(0),
+	                 init_le32(0), init_le64(0), current_header.flags};
+
+  Message *message = decode_message(cct, 0, header, footer,
+      msg_frame.front(),
+      msg_frame.middle(),
+      msg_frame.data(),
+      connection);
+  if (!message) {
+    ldout(cct, 1) << __func__ << " decode message failed " << dendl;
+    return _fault();
+  } else {
+    state = READ_MESSAGE_COMPLETE;
+  }
+
+  INTERCEPT(17);
+
+  message->set_byte_throttler(connection->policy.throttler_bytes);
+  message->set_message_throttler(connection->policy.throttler_messages);
+
+  // store reservation size in message, so we don't get confused
+  // by messages entering the dispatch queue through other paths.
+  message->set_dispatch_throttle_size(cur_msg_size);
+
+  message->set_recv_stamp(recv_stamp);
+  message->set_throttle_stamp(throttle_stamp);
+  message->set_recv_complete_stamp(ceph_clock_now());
+
+  // check received seq#.  if it is old, drop the message.
+  // note that incoming messages may skip ahead.  this is convenient for the
+  // client side queueing because messages can't be renumbered, but the (kernel)
+  // client will occasionally pull a message out of the sent queue to send
+  // elsewhere.  in that case it doesn't matter if we "got" it or not.
+  uint64_t cur_seq = in_seq;
+  if (message->get_seq() <= cur_seq) {
+    ldout(cct, 0) << __func__ << " got old message " << message->get_seq()
+                  << " <= " << cur_seq << " " << message << " " << *message
+                  << ", discarding" << dendl;
+    message->put();
+    if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
+        cct->_conf->ms_die_on_old_message) {
+      ceph_assert(0 == "old msgs despite reconnect_seq feature");
+    }
+    // NOTE(review): this path returns without a continuation and leaves state
+    // at READ_MESSAGE_COMPLETE -- confirm that this is intended.
+    return nullptr;
+  }
+  // a gap in sequence numbers is only logged (and optionally asserted on)
+  if (message->get_seq() > cur_seq + 1) {
+    ldout(cct, 0) << __func__ << " missed message?  skipped from seq "
+                  << cur_seq << " to " << message->get_seq() << dendl;
+    if (cct->_conf->ms_die_on_skipped_message) {
+      ceph_assert(0 == "skipped incoming seq");
+    }
+  }
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+  if (message->get_type() == CEPH_MSG_OSD_OP ||
+      message->get_type() == CEPH_MSG_OSD_OPREPLY) {
+    utime_t ltt_processed_stamp = ceph_clock_now();
+    double usecs_elapsed =
+        (ltt_processed_stamp.to_nsec() - ltt_recv_stamp.to_nsec()) / 1000;
+    ostringstream buf;
+    if (message->get_type() == CEPH_MSG_OSD_OP)
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP",
+                           false);
+    else
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY",
+                           false);
+  }
+#endif
+
+  // note last received message.
+  in_seq = message->get_seq();
+  ldout(cct, 5) << __func__ << " received message m=" << message
+                << " seq=" << message->get_seq()
+                << " from=" << message->get_source() << " type=" << header.type
+                << " " << *message << dendl;
+
+  // non-lossy connections count the message as awaiting an ack and need the
+  // writer to run afterwards
+  bool need_dispatch_writer = false;
+  if (!connection->policy.lossy) {
+    ack_left++;
+    need_dispatch_writer = true;
+  }
+
+  state = READY;
+
+  connection->logger->inc(l_msgr_recv_messages);
+  connection->logger->inc(l_msgr_recv_bytes,
+                          rx_frame_asm.get_frame_onwire_len());
+
+  messenger->ms_fast_preprocess(message);
+  auto fast_dispatch_time = ceph::mono_clock::now();
+  connection->logger->tinc(l_msgr_running_recv_time,
+                           fast_dispatch_time - connection->recv_start_time);
+  // delivery: delay queue if one is configured, else fast dispatch when the
+  // messenger allows it, else the regular dispatch queue
+  if (connection->delay_state) {
+    double delay_period = 0;
+    if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) {
+      delay_period =
+          cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
+      ldout(cct, 1) << "queue_received will delay after "
+                    << (ceph_clock_now() + delay_period) << " on " << message
+                    << " " << *message << dendl;
+    }
+    connection->delay_state->queue(delay_period, message);
+  } else if (messenger->ms_can_fast_dispatch(message)) {
+    // fast dispatch runs with connection->lock released
+    connection->lock.unlock();
+    connection->dispatch_queue->fast_dispatch(message);
+    connection->recv_start_time = ceph::mono_clock::now();
+    connection->logger->tinc(l_msgr_running_fast_dispatch_time,
+                             connection->recv_start_time - fast_dispatch_time);
+    connection->lock.lock();
+  } else {
+    connection->dispatch_queue->enqueue(message, message->get_priority(),
+                                        connection->conn_id);
+  }
+
+  // the frame header also carries the peer's ack of our sent messages
+  handle_message_ack(current_header.ack_seq);
+
+  // we might have been reused by another connection
+  // let's check if that is the case
+  if (state != READY) {
+    // yes, that was the case, let's do nothing
+    return nullptr;
+  }
+
+  if (need_dispatch_writer && connection->is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(read_frame);
+}
+
+
+// First throttling stage: take one slot from the policy's message throttler,
+// if there is one. When the slot cannot be obtained a wakeup time event is
+// registered (unless one already is) and nullptr is returned; otherwise the
+// state advances to THROTTLE_BYTES.
+CtPtr ProtocolV2::throttle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto &throttler = connection->policy.throttler_messages;
+  if (throttler) {
+    ldout(cct, 10) << __func__ << " wants " << 1
+                   << " message from policy throttler "
+                   << throttler->get_current()
+                   << "/" << throttler->get_max()
+                   << dendl;
+    const bool granted = throttler->get_or_fail();
+    if (!granted) {
+      ldout(cct, 10) << __func__ << " wants 1 message from policy throttle "
+                     << throttler->get_current()
+                     << "/" << throttler->get_max()
+                     << " failed, just wait." << dendl;
+      // following thread pool deal with th full message queue isn't a
+      // short time, so we can wait a ms.
+      auto &pending_events = connection->register_time_events;
+      if (pending_events.empty()) {
+        pending_events.insert(connection->center->create_time_event(
+            1000, connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_BYTES;
+  return CONTINUE(throttle_bytes);
+}
+
+// Second throttling stage: reserve the current message size from the policy's
+// byte throttler, if there is one and the size is non-zero. When the
+// reservation fails a wakeup time event is registered (unless one already is)
+// and nullptr is returned; otherwise the state advances to
+// THROTTLE_DISPATCH_QUEUE.
+CtPtr ProtocolV2::throttle_bytes() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t cur_msg_size = get_current_msg_size();
+  if (cur_msg_size && connection->policy.throttler_bytes) {
+    auto &throttler = connection->policy.throttler_bytes;
+    ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                   << " bytes from policy throttler "
+                   << throttler->get_current() << "/"
+                   << throttler->get_max() << dendl;
+    const bool granted = throttler->get_or_fail(cur_msg_size);
+    if (!granted) {
+      ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                     << " bytes from policy throttler "
+                     << throttler->get_current()
+                     << "/" << throttler->get_max()
+                     << " failed, just wait." << dendl;
+      // following thread pool deal with th full message queue isn't a
+      // short time, so we can wait a ms.
+      auto &pending_events = connection->register_time_events;
+      if (pending_events.empty()) {
+        pending_events.insert(connection->center->create_time_event(
+            1000, connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_DISPATCH_QUEUE;
+  return CONTINUE(throttle_dispatch_queue);
+}
+
+// Dispatch-queue throttle stage of message reception.
+//
+// For a non-zero message size, tries to reserve that many bytes from the
+// dispatch queue's dispatch_throttler.  If the reservation fails, a time
+// event for connection->wakeup_handler is created (only when
+// register_time_events is empty) and nullptr is returned.  Otherwise the
+// throttle timestamp is recorded, the state becomes THROTTLE_DONE and the
+// result of read_frame_segment() is returned.
+CtPtr ProtocolV2::throttle_dispatch_queue() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t msg_bytes = get_current_msg_size();
+  if (msg_bytes != 0) {
+    auto &queue_throttler = connection->dispatch_queue->dispatch_throttler;
+    if (!queue_throttler.get_or_fail(msg_bytes)) {
+      ldout(cct, 10) << __func__ << " wants " << msg_bytes
+                     << " bytes from dispatch throttle "
+                     << queue_throttler.get_current() << "/"
+                     << queue_throttler.get_max() << " failed, just wait."
+                     << dendl;
+      // The thread pool does not drain a full message queue quickly, so
+      // retry from a timer instead (the original note says "wait a ms").
+      if (connection->register_time_events.empty()) {
+        const auto retry_event = connection->center->create_time_event(
+            1000, connection->wakeup_handler);
+        connection->register_time_events.insert(retry_event);
+      }
+      return nullptr;
+    }
+  }
+
+  throttle_stamp = ceph_clock_now();
+  state = THROTTLE_DONE;
+
+  return read_frame_segment();
+}
+
+// Handles a keepalive frame received from the peer.
+//
+// Only valid in READY state (anything else faults the connection).  Queues a
+// KeepAliveFrameAck echoing the received timestamp, records the local receive
+// time via set_last_keepalive(), dispatches the write handler when the
+// connection is connected, and then continues with read_frame.
+//
+// @param payload  buffer holding the encoded keepalive frame
+// @return the next continuation, or whatever _fault() returns on failure
+CtPtr ProtocolV2::handle_keepalive2(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != READY) {
+    lderr(cct) << __func__ << " not in ready state!" << dendl;
+    return _fault();
+  }
+
+  auto keepalive_frame = KeepAliveFrame::Decode(payload);
+
+  ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+
+  // Encode and queue the ack while holding write_lock.  A scoped guard is
+  // used so the lock is released on every path out of the block (including
+  // an exception from encoding); _fault() is still called without the lock.
+  bool ack_appended = false;
+  {
+    std::lock_guard<decltype(connection->write_lock)> write_guard(
+        connection->write_lock);
+    auto keepalive_ack_frame =
+        KeepAliveFrameAck::Encode(keepalive_frame.timestamp());
+    ack_appended = append_frame(keepalive_ack_frame);
+  }
+  if (!ack_appended) {
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got KEEPALIVE2 "
+                 << keepalive_frame.timestamp() << dendl;
+  connection->set_last_keepalive(ceph_clock_now());
+
+  // Kick the write handler so the queued ack gets flushed.
+  if (is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(read_frame);
+}
+
+// Handles a keepalive ack frame: in READY state the acked timestamp is stored
+// via set_last_keepalive_ack() and frame reading continues; in any other
+// state the connection is faulted.
+CtPtr ProtocolV2::handle_keepalive2_ack(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state == READY) {
+    auto ack_frame = KeepAliveFrameAck::Decode(payload);
+    connection->set_last_keepalive_ack(ack_frame.timestamp());
+    ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
+
+    return CONTINUE(read_frame);
+  }
+
+  lderr(cct) << __func__ << " not in ready state!" << dendl;
+  return _fault();
+}
+
+// Handles a message ACK frame: in READY state the acked sequence number is
+// forwarded to the sequence-taking handle_message_ack() overload and frame
+// reading continues; in any other state the connection is faulted.
+CtPtr ProtocolV2::handle_message_ack(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state == READY) {
+    auto ack_frame = AckFrame::Decode(payload);
+    handle_message_ack(ack_frame.seq());
+    return CONTINUE(read_frame);
+  }
+
+  lderr(cct) << __func__ << " not in ready state!" << dendl;
+  return _fault();
+}
+
+/* Client Protocol Methods */
+
+// Starts the client side of the handshake: switches to BANNER_CONNECTING,
+// stores the messenger's global sequence number in global_seq and runs the
+// banner exchange with post_client_banner_exchange as its continuation.
+CtPtr ProtocolV2::start_client_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  INTERCEPT(1);
+
+  state = BANNER_CONNECTING;
+  global_seq = messenger->get_global_seq();
+
+  return _banner_exchange(CONTINUATION(post_client_banner_exchange));
+}
+
+// Continuation run after the client banner exchange: enters AUTH_CONNECTING
+// and sends the first auth request.
+CtPtr ProtocolV2::post_client_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  state = AUTH_CONNECTING;
+  return send_auth_request();
+}
+
+// Obtains an auth request from the messenger's auth client and writes it to
+// the peer as an AuthRequestFrame, then continues with read_frame.
+//
+// connection->lock is released around the auth_client->get_auth_request()
+// call and re-acquired afterwards, so the state is re-checked: if it is no
+// longer AUTH_CONNECTING the connection is faulted.  A negative result from
+// the auth client stops the protocol and queues a reset for the connection.
+//
+// @param allowed_methods  not consulted by this function
+// @return the continuation returned by the write, _fault()'s result, or
+//         nullptr after stop()
+CtPtr ProtocolV2::send_auth_request(std::vector<uint32_t> &allowed_methods) {
+  ldout(cct, 20) << __func__ << " peer_type " << (int)connection->peer_type
+                 << " auth_client " << messenger->auth_client << dendl;
+  ceph_assert(messenger->auth_client);
+
+  bufferlist bl;
+  vector<uint32_t> preferred_modes;
+  // Local copy of auth_meta for use while connection->lock is not held.
+  auto am = auth_meta;
+  connection->lock.unlock();
+  int r = messenger->auth_client->get_auth_request(
+    connection, am.get(),
+    &am->auth_method, &preferred_modes, &bl);
+  connection->lock.lock();
+  if (state != AUTH_CONNECTING) {
+    ldout(cct, 1) << __func__ << " state changed!" << dendl;
+    return _fault();
+  }
+  if (r < 0) {
+    // Name the call that actually failed (get_auth_request).
+    ldout(cct, 0) << __func__ << " get_auth_request returned " << r
+                  << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  INTERCEPT(9);
+
+  auto frame = AuthRequestFrame::Encode(auth_meta->auth_method, preferred_modes,
+                                        bl);
+  return WRITE(frame, "auth request", read_frame);
+}
+
+// Handles an AuthBadMethod frame while in AUTH_CONNECTING state.
+//
+// The rejected method, result and the peer's allowed methods/modes are handed
+// to the auth client with connection->lock released.  If the state is still
+// AUTH_CONNECTING afterwards and the auth client did not report an error, a
+// new auth request is sent; otherwise the connection is faulted.
+CtPtr ProtocolV2::handle_auth_bad_method(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != AUTH_CONNECTING) {
+    lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+    return _fault();
+  }
+
+  auto rejection = AuthBadMethodFrame::Decode(payload);
+  ldout(cct, 1) << __func__ << " method=" << rejection.method()
+                << " result " << cpp_strerror(rejection.result())
+                << ", allowed methods=" << rejection.allowed_methods()
+                << ", allowed modes=" << rejection.allowed_modes() << dendl;
+  ceph_assert(messenger->auth_client);
+
+  // Local copy of auth_meta for use while connection->lock is not held.
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  const int rc = messenger->auth_client->handle_auth_bad_method(
+      connection, meta.get(), rejection.method(), rejection.result(),
+      rejection.allowed_methods(), rejection.allowed_modes());
+  connection->lock.lock();
+
+  if (state == AUTH_CONNECTING && rc >= 0) {
+    return send_auth_request(rejection.allowed_methods());
+  }
+  return _fault();
+}
+
+// Handles an AuthReplyMore frame while in AUTH_CONNECTING state.
+//
+// The frame's auth payload is passed to the auth client (with
+// connection->lock released) which fills in a reply; that reply is written
+// back as an AuthRequestMoreFrame and frame reading continues.  A state
+// change during the unlocked call, or a negative auth client result, faults
+// the connection.
+CtPtr ProtocolV2::handle_auth_reply_more(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != AUTH_CONNECTING) {
+    lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+    return _fault();
+  }
+
+  auto more_frame = AuthReplyMoreFrame::Decode(payload);
+  ldout(cct, 5) << __func__ << " auth reply more len="
+                << more_frame.auth_payload().length() << dendl;
+  ceph_assert(messenger->auth_client);
+
+  ceph::bufferlist response;
+  // Local copy of auth_meta for use while connection->lock is not held.
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  const int rc = messenger->auth_client->handle_auth_reply_more(
+      connection, meta.get(), more_frame.auth_payload(), &response);
+  connection->lock.lock();
+
+  if (state != AUTH_CONNECTING) {
+    ldout(cct, 1) << __func__ << " state changed!" << dendl;
+    return _fault();
+  }
+  if (rc < 0) {
+    lderr(cct) << __func__ << " auth_client handle_auth_reply_more returned "
+               << rc << dendl;
+    return _fault();
+  }
+
+  auto request_more = AuthRequestMoreFrame::Encode(response);
+  return WRITE(request_more, "auth request more", read_frame);
+}
+
+// Handles an AuthDone frame while in AUTH_CONNECTING state.
+//
+// The auth client processes the frame (with connection->lock released) and
+// fills in the session key and connection secret.  On success the connection
+// mode is stored, the on-wire stream handlers are created, the state becomes
+// AUTH_CONNECTING_SIGN and an AuthSignatureFrame computed over pre_auth.rxbuf
+// is written (an empty session key yields a default-constructed digest).
+// Pre-auth recording is then disabled and its receive buffer cleared.
+CtPtr ProtocolV2::handle_auth_done(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != AUTH_CONNECTING) {
+    lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+    return _fault();
+  }
+
+  auto done_frame = AuthDoneFrame::Decode(payload);
+
+  ceph_assert(messenger->auth_client);
+  // Local copy of auth_meta for use while connection->lock is not held.
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  const int rc = messenger->auth_client->handle_auth_done(
+      connection, meta.get(), done_frame.global_id(), done_frame.con_mode(),
+      done_frame.auth_payload(), &meta->session_key,
+      &meta->connection_secret);
+  connection->lock.lock();
+
+  if (state != AUTH_CONNECTING) {
+    ldout(cct, 1) << __func__ << " state changed!" << dendl;
+    return _fault();
+  }
+  if (rc < 0) {
+    return _fault();
+  }
+
+  auth_meta->con_mode = done_frame.con_mode();
+  const bool peer_has_rev1 =
+      HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+  session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+      cct, *auth_meta, /*new_nonce_format=*/peer_has_rev1, /*crossed=*/false);
+
+  state = AUTH_CONNECTING_SIGN;
+
+  const auto signature = !auth_meta->session_key.empty()
+      ? auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf)
+      : sha256_digest_t();
+  auto signature_frame = AuthSignatureFrame::Encode(signature);
+  pre_auth.enabled = false;
+  pre_auth.rxbuf.clear();
+  return WRITE(signature_frame, "auth signature", read_frame);
+}
+
+// Chooses what follows client authentication: with a non-zero server_cookie
+// a reconnect to the previous session is sent (SESSION_RECONNECTING),
+// otherwise a client ident for a new session (SESSION_CONNECTING).
+CtPtr ProtocolV2::finish_client_auth() {
+  if (server_cookie) {
+    // reconnecting to previous session
+    state = SESSION_RECONNECTING;
+    ceph_assert(connect_seq > 0);
+    return send_reconnect();
+  }
+
+  ceph_assert(connect_seq == 0);
+  state = SESSION_CONNECTING;
+  return send_client_ident();
+}
+
+// Builds and writes the ClientIdent frame, then continues with read_frame.
+//
+// For non-lossy policies a random non-zero client_cookie is generated if none
+// is set yet.  The frame carries our addresses, the target address, our gid,
+// global_seq, supported/required features (required always includes
+// msgr2_required), the lossy flag and the client cookie.
+CtPtr ProtocolV2::send_client_ident() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const bool lossy = connection->policy.lossy;
+  if (!lossy && !client_cookie) {
+    client_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);
+  }
+
+  uint64_t flags = 0;
+  if (lossy) {
+    flags |= CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  const auto required_features =
+      connection->policy.features_required | msgr2_required;
+
+  auto ident_frame = ClientIdentFrame::Encode(
+      messenger->get_myaddrs(), connection->target_addr,
+      messenger->get_myname().num(), global_seq,
+      connection->policy.features_supported, required_features, flags,
+      client_cookie);
+
+  ldout(cct, 5) << __func__ << " sending identification: "
+                << "addrs=" << messenger->get_myaddrs()
+                << " target=" << connection->target_addr
+                << " gid=" << messenger->get_myname().num()
+                << " global_seq=" << global_seq
+                << " features_supported=" << std::hex
+                << connection->policy.features_supported
+                << " features_required=" << required_features
+                << " flags=" << flags
+                << " cookie=" << client_cookie << std::dec << dendl;
+
+  INTERCEPT(11);
+
+  return WRITE(ident_frame, "client ident", read_frame);
+}
+
+// Builds and writes a Reconnect frame carrying our addresses, both session
+// cookies, global_seq, connect_seq and in_seq, then continues with
+// read_frame.
+CtPtr ProtocolV2::send_reconnect() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto reconnect_frame = ReconnectFrame::Encode(
+      messenger->get_myaddrs(), client_cookie, server_cookie, global_seq,
+      connect_seq, in_seq);
+
+  ldout(cct, 5) << __func__ << " reconnect to session: client_cookie="
+                << std::hex << client_cookie
+                << " server_cookie=" << server_cookie << std::dec
+                << " gs=" << global_seq
+                << " cs=" << connect_seq
+                << " ms=" << in_seq << dendl;
+
+  INTERCEPT(13);
+
+  return WRITE(reconnect_frame, "reconnect", read_frame);
+}
+
+// Handles an IdentMissingFeatures frame.  The reported feature bits are
+// logged as an error and the connection is faulted; a frame arriving outside
+// SESSION_CONNECTING state is also answered with a fault.
+CtPtr ProtocolV2::handle_ident_missing_features(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state == SESSION_CONNECTING) {
+    auto missing_frame = IdentMissingFeaturesFrame::Decode(payload);
+    lderr(cct) << __func__
+               << " client does not support all server features: " << std::hex
+               << missing_frame.features() << std::dec << dendl;
+  } else {
+    lderr(cct) << __func__ << " not in session connect state!" << dendl;
+  }
+
+  return _fault();
+}
+
+// Handles a Reset frame received while in SESSION_RECONNECTING state.
+//
+// A full reset calls reset_session(); a non-full reset only zeroes
+// server_cookie, connect_seq and in_seq.  Either way the state becomes
+// SESSION_CONNECTING and a client ident is sent.
+CtPtr ProtocolV2::handle_session_reset(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto reset_frame = ResetFrame::Decode(payload);
+
+  ldout(cct, 1) << __func__
+                << " received session reset full=" << reset_frame.full()
+                << dendl;
+  if (!reset_frame.full()) {
+    server_cookie = 0;
+    connect_seq = 0;
+    in_seq = 0;
+  } else {
+    reset_session();
+  }
+
+  state = SESSION_CONNECTING;
+  return send_client_ident();
+}
+
+// Handles a Retry frame while in SESSION_RECONNECTING state: connect_seq is
+// set to one more than the value carried by the frame and the reconnect is
+// sent again.
+CtPtr ProtocolV2::handle_session_retry(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto retry_frame = RetryFrame::Decode(payload);
+  connect_seq = retry_frame.connect_seq() + 1;
+
+  ldout(cct, 1) << __func__ << " received session retry connect_seq="
+                << retry_frame.connect_seq()
+                << ", inc to cs=" << connect_seq << dendl;
+
+  return send_reconnect();
+}
+
+// Handles a RetryGlobal frame while in SESSION_RECONNECTING state: a new
+// global_seq is obtained from the messenger (passing the peer-supplied value)
+// and the reconnect is sent again.
+CtPtr ProtocolV2::handle_session_retry_global(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto retry_frame = RetryGlobalFrame::Decode(payload);
+  global_seq = messenger->get_global_seq(retry_frame.global_seq());
+
+  ldout(cct, 1) << __func__ << " received session retry global global_seq="
+                << retry_frame.global_seq()
+                << ", choose new gs=" << global_seq << dendl;
+
+  return send_reconnect();
+}
+
+// Handles a Wait frame, accepted only in SESSION_CONNECTING or
+// SESSION_RECONNECTING state.  The state is switched to WAIT before the
+// frame is decoded, and the connection is then faulted in every case.
+CtPtr ProtocolV2::handle_wait(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__ << " received WAIT (connection race)"
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool session_pending =
+      state == SESSION_CONNECTING || state == SESSION_RECONNECTING;
+  if (!session_pending) {
+    lderr(cct) << __func__ << " not in session (re)connect state!" << dendl;
+    return _fault();
+  }
+
+  state = WAIT;
+  WaitFrame::Decode(payload);
+  return _fault();
+}
+
+// Handles a ReconnectOk frame while in SESSION_RECONNECTING state.
+//
+// out_seq is updated from discard_requeued_up_to() using the message sequence
+// reported by the peer, the backoff is reset, the connect event is queued
+// and delivered, and the protocol moves on via ready().
+CtPtr ProtocolV2::handle_reconnect_ok(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto ok_frame = ReconnectOkFrame::Decode(payload);
+  ldout(cct, 5) << __func__
+                << " reconnect accepted: sms=" << ok_frame.msg_seq() << dendl;
+
+  out_seq = discard_requeued_up_to(out_seq, ok_frame.msg_seq());
+
+  backoff = utime_t();
+  ldout(cct, 10) << __func__ << " reconnect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy
+                 << ", features " << connection->get_features() << dendl;
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+// Handles a ServerIdent frame while in SESSION_CONNECTING state.
+//
+// The peer's advertised addresses must contain the address we targeted,
+// otherwise the connection is faulted.  On success the server cookie, peer
+// addresses, peer name, negotiated features, peer global_seq and the lossy
+// policy flag are recorded, the backoff is reset, the connect event is
+// queued and delivered, and the protocol moves on via ready().
+CtPtr ProtocolV2::handle_server_ident(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != SESSION_CONNECTING) {
+    lderr(cct) << __func__ << " not in session connect state!" << dendl;
+    return _fault();
+  }
+
+  auto ident = ServerIdentFrame::Decode(payload);
+  ldout(cct, 5) << __func__ << " received server identification:"
+                << " addrs=" << ident.addrs()
+                << " gid=" << ident.gid()
+                << " global_seq=" << ident.global_seq()
+                << " features_supported=" << std::hex
+                << ident.supported_features()
+                << " features_required=" << ident.required_features()
+                << " flags=" << ident.flags()
+                << " cookie=" << ident.cookie() << std::dec << dendl;
+
+  // Is this who we intended to talk to?  The check is by containment rather
+  // than equality since we may be connecting based on addresses parsed out
+  // of mon_host or something.
+  const bool is_intended_peer =
+      ident.addrs().contains(connection->target_addr);
+  if (!is_intended_peer) {
+    ldout(cct,1) << __func__ << " peer identifies as " << ident.addrs()
+                 << ", does not include " << connection->target_addr << dendl;
+    return _fault();
+  }
+
+  server_cookie = ident.cookie();
+
+  connection->set_peer_addrs(ident.addrs());
+  peer_name = entity_name_t(connection->get_peer_type(), ident.gid());
+  connection->set_features(ident.supported_features() &
+                           connection->policy.features_supported);
+  peer_global_seq = ident.global_seq();
+
+  connection->policy.lossy = ident.flags() & CEPH_MSG_CONNECT_LOSSY;
+
+  backoff = utime_t();
+  ldout(cct, 10) << __func__ << " connect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy
+                 << ", features " << connection->get_features() << dendl;
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+/* Server Protocol Methods */
+
+// Starts the server side of the handshake: switches to BANNER_ACCEPTING and
+// runs the banner exchange with post_server_banner_exchange as its
+// continuation.
+CtPtr ProtocolV2::start_server_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  INTERCEPT(2);
+
+  state = BANNER_ACCEPTING;
+  return _banner_exchange(CONTINUATION(post_server_banner_exchange));
+}
+
+// Continuation run after the server banner exchange: enters AUTH_ACCEPTING
+// and continues with read_frame.
+CtPtr ProtocolV2::post_server_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  state = AUTH_ACCEPTING;
+  return CONTINUE(read_frame);
+}
+
+// Handles the peer's initial AuthRequest frame while in AUTH_ACCEPTING state.
+//
+// Records the requested auth method, lets the auth server pick a connection
+// mode from the peer's preferred modes, and either rejects the method
+// (mode unknown -> _auth_bad_method(-EOPNOTSUPP)) or forwards the auth
+// payload to _handle_auth_request().
+//
+// @param payload  buffer holding the encoded AuthRequest frame
+// @return the next continuation, or whatever _fault() returns on failure
+CtPtr ProtocolV2::handle_auth_request(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != AUTH_ACCEPTING) {
+    lderr(cct) << __func__ << " not in auth accept state!" << dendl;
+    return _fault();
+  }
+
+  // auth_server is dereferenced below (and in _auth_bad_method), so apply
+  // the same null guard that _handle_auth_request uses instead of crashing.
+  if (!messenger->auth_server) {
+    ldout(cct, 1) << __func__ << " no auth_server available" << dendl;
+    return _fault();
+  }
+
+  auto request = AuthRequestFrame::Decode(payload);
+  ldout(cct, 10) << __func__ << " AuthRequest(method=" << request.method()
+                 << ", preferred_modes=" << request.preferred_modes()
+                 << ", payload_len=" << request.auth_payload().length() << ")"
+                 << dendl;
+  auth_meta->auth_method = request.method();
+  auth_meta->con_mode = messenger->auth_server->pick_con_mode(
+    connection->get_peer_type(), auth_meta->auth_method,
+    request.preferred_modes());
+  if (auth_meta->con_mode == CEPH_CON_MODE_UNKNOWN) {
+    return _auth_bad_method(-EOPNOTSUPP);
+  }
+  return _handle_auth_request(request.auth_payload(), false);
+}
+
+// Replies to the peer with an AuthBadMethod frame carrying the current auth
+// method, the (negative) error code r and the methods/modes the auth server
+// reports as supported for this peer type; then continues with read_frame.
+CtPtr ProtocolV2::_auth_bad_method(int r)
+{
+  ceph_assert(r < 0);
+
+  std::vector<uint32_t> supported_methods;
+  std::vector<uint32_t> supported_modes;
+  messenger->auth_server->get_supported_auth_methods(
+      connection->get_peer_type(), &supported_methods, &supported_modes);
+
+  ldout(cct, 1) << __func__ << " auth_method " << auth_meta->auth_method
+                << " r " << cpp_strerror(r)
+                << ", allowed_methods " << supported_methods
+                << ", allowed_modes " << supported_modes << dendl;
+
+  auto rejection = AuthBadMethodFrame::Encode(
+      auth_meta->auth_method, r, supported_methods, supported_modes);
+  return WRITE(rejection, "bad auth method", read_frame);
+}
+
+// Feeds an auth payload to the auth server and acts on its verdict.
+//
+// The auth server is called with connection->lock released; afterwards the
+// state must still be AUTH_ACCEPTING or AUTH_ACCEPTING_MORE (otherwise it is
+// asserted to be CLOSED and the connection is faulted).  Result handling:
+//   1       -> AUTH_ACCEPTING_SIGN, write AuthDone, continue with finish_auth
+//   0       -> AUTH_ACCEPTING_MORE, write AuthReplyMore, continue reading
+//   -EBUSY  -> fault the connection
+//   other   -> reject via _auth_bad_method()
+CtPtr ProtocolV2::_handle_auth_request(bufferlist& auth_payload, bool more)
+{
+  if (!messenger->auth_server) {
+    return _fault();
+  }
+
+  bufferlist auth_reply;
+  // Local copy of auth_meta for use while connection->lock is not held.
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  const int rc = messenger->auth_server->handle_auth_request(
+      connection, meta.get(), more, meta->auth_method, auth_payload,
+      &auth_reply);
+  connection->lock.lock();
+
+  const bool still_accepting =
+      state == AUTH_ACCEPTING || state == AUTH_ACCEPTING_MORE;
+  if (!still_accepting) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  switch (rc) {
+  case 1: {
+    INTERCEPT(10);
+    state = AUTH_ACCEPTING_SIGN;
+
+    auto done_frame = AuthDoneFrame::Encode(connection->peer_global_id,
+                                            auth_meta->con_mode, auth_reply);
+    return WRITE(done_frame, "auth done", finish_auth);
+  }
+  case 0: {
+    state = AUTH_ACCEPTING_MORE;
+
+    auto reply_more_frame = AuthReplyMoreFrame::Encode(auth_reply);
+    return WRITE(reply_more_frame, "auth reply more", read_frame);
+  }
+  case -EBUSY:
+    // kick the client and maybe they'll come back later
+    return _fault();
+  default:
+    return _auth_bad_method(rc);
+  }
+}
+
+// Continuation run after AuthDone has been written: creates the on-wire
+// stream handlers (crossed=true) and writes an AuthSignatureFrame computed
+// over pre_auth.rxbuf (an empty session key yields a default-constructed
+// digest).  Pre-auth recording is then disabled and its receive buffer
+// cleared; reading continues with read_frame.
+CtPtr ProtocolV2::finish_auth()
+{
+  ceph_assert(auth_meta);
+  // TODO: having a possibility to check whether we're server or client could
+  // allow reusing finish_auth().
+  const bool peer_has_rev1 =
+      HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+  session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+      cct, *auth_meta, /*new_nonce_format=*/peer_has_rev1, /*crossed=*/true);
+
+  const auto signature = !auth_meta->session_key.empty()
+      ? auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf)
+      : sha256_digest_t();
+  auto signature_frame = AuthSignatureFrame::Encode(signature);
+  pre_auth.enabled = false;
+  pre_auth.rxbuf.clear();
+  return WRITE(signature_frame, "auth signature", read_frame);
+}
+
+// Handles an AuthRequestMore frame: in AUTH_ACCEPTING_MORE state its auth
+// payload is forwarded to _handle_auth_request() with more=true; in any
+// other state the connection is faulted.
+CtPtr ProtocolV2::handle_auth_request_more(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state == AUTH_ACCEPTING_MORE) {
+    auto more_frame = AuthRequestMoreFrame::Decode(payload);
+    return _handle_auth_request(more_frame.auth_payload(), true);
+  }
+
+  lderr(cct) << __func__ << " not in auth accept more state!" << dendl;
+  return _fault();
+}
+
+// Handles an AuthSignature frame, valid in AUTH_ACCEPTING_SIGN and
+// AUTH_CONNECTING_SIGN states.
+//
+// The received signature is compared with the HMAC-SHA256 of pre_auth.txbuf
+// under the session key (a default-constructed digest when the key is
+// empty).  A mismatch faults the connection.  On a match the transmit buffer
+// is cleared; in AUTH_ACCEPTING_SIGN the state becomes SESSION_ACCEPTING and
+// reading continues, in AUTH_CONNECTING_SIGN finish_client_auth() runs.
+CtPtr ProtocolV2::handle_auth_signature(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != AUTH_ACCEPTING_SIGN && state != AUTH_CONNECTING_SIGN) {
+    lderr(cct) << __func__
+               << " pre-auth verification signature seen in wrong state!"
+               << dendl;
+    return _fault();
+  }
+
+  auto sig_frame = AuthSignatureFrame::Decode(payload);
+
+  const auto actual_tx_sig = !auth_meta->session_key.empty()
+      ? auth_meta->session_key.hmac_sha256(cct, pre_auth.txbuf)
+      : sha256_digest_t();
+  if (sig_frame.signature() != actual_tx_sig) {
+    ldout(cct, 2) << __func__ << " pre-auth signature mismatch"
+                  << " actual_tx_sig=" << actual_tx_sig
+                  << " sig_frame.signature()=" << sig_frame.signature()
+                  << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " pre-auth signature success"
+                 << " sig_frame.signature()=" << sig_frame.signature()
+                 << dendl;
+  pre_auth.txbuf.clear();
+
+  if (state == AUTH_ACCEPTING_SIGN) {
+    // server had sent AuthDone and client responded with correct pre-auth
+    // signature. we can start accepting new sessions/reconnects.
+    state = SESSION_ACCEPTING;
+    return CONTINUE(read_frame);
+  }
+  if (state == AUTH_CONNECTING_SIGN) {
+    // this happened at client side
+    return finish_client_auth();
+  }
+
+  ceph_assert_always("state corruption" == nullptr);
+}
+
+CtPtr ProtocolV2::handle_client_ident(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+		 << " payload.length()=" << payload.length() << dendl;
+  /* Handles a ClientIdent frame while in SESSION_ACCEPTING state.  Validates
+   * the claimed addresses and required features, records the peer identity,
+   * then either hands over to handle_existing_connection() (a connection to
+   * the same peer addrs was found) or replies via send_server_ident().
+   */
+  if (state != SESSION_ACCEPTING) {
+    lderr(cct) << __func__ << " not in session accept state!" << dendl;
+    return _fault();
+  }
+
+  auto client_ident = ClientIdentFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__ << " received client identification:"
+                << " addrs=" << client_ident.addrs()
+		            << " target=" << client_ident.target_addr()
+                << " gid=" << client_ident.gid()
+                << " global_seq=" << client_ident.global_seq()
+                << " features_supported=" << std::hex
+                << client_ident.supported_features()
+                << " features_required=" << client_ident.required_features()
+                << " flags=" << client_ident.flags()
+                << " cookie=" << client_ident.cookie() << std::dec << dendl;
+  // Fault on an empty/blank address list or a target address that is not ours.
+  if (client_ident.addrs().empty() ||
+      client_ident.addrs().front() == entity_addr_t()) {
+    ldout(cct,5) << __func__ << " oops, client_ident.addrs() is empty" << dendl;
+    return _fault();  // a v2 peer should never do this
+  }
+  if (!messenger->get_myaddrs().contains(client_ident.target_addr())) {
+    ldout(cct,5) << __func__ << " peer is trying to reach "
+		 << client_ident.target_addr()
+		 << " which is not us (" << messenger->get_myaddrs() << ")"
+		 << dendl;
+    return _fault();
+  }
+  // Record the peer's addresses, name, id and cookie from the frame.
+  connection->set_peer_addrs(client_ident.addrs());
+  connection->target_addr = connection->_infer_target_addr(client_ident.addrs());
+
+  peer_name = entity_name_t(connection->get_peer_type(), client_ident.gid());
+  connection->set_peer_id(client_ident.gid());
+
+  client_cookie = client_ident.cookie();
+  // Required features (policy plus msgr2_required) the client did not list.
+  uint64_t feat_missing =
+    (connection->policy.features_required | msgr2_required) &
+    ~(uint64_t)client_ident.supported_features();
+  if (feat_missing) {
+    ldout(cct, 1) << __func__ << " peer missing required features " << std::hex
+                  << feat_missing << std::dec << dendl;
+    auto ident_missing_features =
+        IdentMissingFeaturesFrame::Encode(feat_missing);
+
+    return WRITE(ident_missing_features, "ident missing features", read_frame);
+  }
+  // Usable features: those both the client and our policy support.
+  connection_features =
+      client_ident.supported_features() & connection->policy.features_supported;
+
+  peer_global_seq = client_ident.global_seq();
+
+  // Looks good so far, let's check if there is already an existing connection
+  // to this peer.
+  // connection->lock is released for the lookup and re-acquired further down.
+  connection->lock.unlock();
+  AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+  // An existing connection that is not protocol v2 is marked down and dropped.
+  if (existing &&
+      existing->protocol->proto_type != 2) {
+    ldout(cct,1) << __func__ << " existing " << existing << " proto "
+		 << existing->protocol.get() << " version is "
+		 << existing->protocol->proto_type << ", marking down" << dendl;
+    existing->mark_down();
+    existing = nullptr;
+  }
+
+  connection->inject_delay();
+  // Re-check the state after relocking; if it changed it must be CLOSED.
+  connection->lock.lock();
+  if (state != SESSION_ACCEPTING) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  if (existing) {
+    return handle_existing_connection(existing);
+  }
+
+  // if everything is OK reply with server identification
+  return send_server_ident();
+}
+
+CtPtr ProtocolV2::handle_reconnect(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+		 << " payload.length()=" << payload.length() << dendl;
+  /* Handles a Reconnect frame while in SESSION_ACCEPTING state.  Looks up an
+   * existing connection for the peer's addresses and, depending on its state,
+   * cookies and sequence numbers, answers with a Reset, RetryGlobal, Retry or
+   * Wait frame, or takes over the existing session via reuse_connection().
+   */
+  if (state != SESSION_ACCEPTING) {
+    lderr(cct) << __func__ << " not in session accept state!" << dendl;
+    return _fault();
+  }
+
+  auto reconnect = ReconnectFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__
+                << " received reconnect:" 
+                << " client_cookie=" << std::hex << reconnect.client_cookie()
+                << " server_cookie=" << reconnect.server_cookie() << std::dec
+                << " gs=" << reconnect.global_seq()
+                << " cs=" << reconnect.connect_seq()
+                << " ms=" << reconnect.msg_seq()
+		            << dendl;
+
+  // Should we check if one of the ident.addrs match connection->target_addr
+  // as we do in ProtocolV1?
+  connection->set_peer_addrs(reconnect.addrs());
+  connection->target_addr = connection->_infer_target_addr(reconnect.addrs());
+  peer_global_seq = reconnect.global_seq();
+  // connection->lock is released for the lookup and re-acquired further down.
+  connection->lock.unlock();
+  AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+  // An existing connection that is not protocol v2 is marked down and dropped.
+  if (existing &&
+      existing->protocol->proto_type != 2) {
+    ldout(cct,1) << __func__ << " existing " << existing << " proto "
+		 << existing->protocol.get() << " version is "
+		 << existing->protocol->proto_type << ", marking down" << dendl;
+    existing->mark_down();
+    existing = nullptr;
+  }
+
+  connection->inject_delay();
+  // Re-check the state after relocking; if it changed it must be CLOSED.
+  connection->lock.lock();
+  if (state != SESSION_ACCEPTING) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  if (!existing) {
+    // there is no existing connection therefore cannot reconnect to previous
+    // session
+    ldout(cct, 0) << __func__
+                  << " no existing connection exists, reseting client" << dendl;
+    auto reset = ResetFrame::Encode(true);
+    return WRITE(reset, "session reset", read_frame);
+  }
+  // existing->lock is held from here until this function returns.
+  std::lock_guard<std::mutex> l(existing->lock);
+
+  ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get());
+  if (!exproto) {
+    ldout(cct, 1) << __func__ << " existing=" << existing << dendl;
+    ceph_assert(false);
+  }
+
+  if (exproto->state == CLOSED) {
+    ldout(cct, 5) << __func__ << " existing " << existing
+                  << " already closed. Reseting client" << dendl;
+    auto reset = ResetFrame::Encode(true);
+    return WRITE(reset, "session reset", read_frame);
+  }
+  // Existing connection is being replaced: reply RetryGlobal with its seq.
+  if (exproto->replacing) {
+    ldout(cct, 1) << __func__
+                  << " existing racing replace happened while replacing."
+                  << " existing=" << existing << dendl;
+    auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq);
+    return WRITE(retry, "session retry", read_frame);
+  }
+  // Cookie checks: a mismatch or a zero server cookie is answered with Reset.
+  if (exproto->client_cookie != reconnect.client_cookie()) {
+    ldout(cct, 1) << __func__ << " existing=" << existing
+                  << " client cookie mismatch, I must have reseted:"
+                  << " cc=" << std::hex << exproto->client_cookie
+                  << " rcc=" << reconnect.client_cookie()
+                  << ", reseting client." << std::dec
+                  << dendl;
+    auto reset = ResetFrame::Encode(connection->policy.resetcheck);
+    return WRITE(reset, "session reset", read_frame);
+  } else if (exproto->server_cookie == 0) {
+    // this happens when:
+    //   - a connects to b
+    //   - a sends client_ident
+    //   - b gets client_ident, sends server_ident and sets cookie X
+    //   - connection fault
+    //   - b reconnects to a with cookie X, connect_seq=1
+    //   - a has cookie==0
+    ldout(cct, 1) << __func__ << " I was a client and didn't received the"
+                  << " server_ident. Asking peer to resume session"
+                  << " establishment" << dendl;
+    auto reset = ResetFrame::Encode(false);
+    return WRITE(reset, "session reset", read_frame);
+  }
+  // Sequence checks: a stale global_seq or connect_seq gets a retry frame.
+  if (exproto->peer_global_seq > reconnect.global_seq()) {
+    ldout(cct, 5) << __func__
+                  << " stale global_seq: sgs=" << exproto->peer_global_seq
+                  << " cgs=" << reconnect.global_seq()
+                  << ", ask client to retry global" << dendl;
+    auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq);
+
+    INTERCEPT(18);
+
+    return WRITE(retry, "session retry", read_frame);
+  }
+
+  if (exproto->connect_seq > reconnect.connect_seq()) {
+    ldout(cct, 5) << __func__
+                  << " stale connect_seq scs=" << exproto->connect_seq
+                  << " ccs=" << reconnect.connect_seq()
+                  << " , ask client to retry" << dendl;
+    auto retry = RetryFrame::Encode(exproto->connect_seq);
+    return WRITE(retry, "session retry", read_frame);
+  }
+  // Equal connect_seq: msgr2 address order and policy.server pick the winner.
+  if (exproto->connect_seq == reconnect.connect_seq()) {
+    // reconnect race: both peers are sending reconnect messages
+    if (existing->peer_addrs->msgr2_addr() >
+            messenger->get_myaddrs().msgr2_addr() &&
+        !existing->policy.server) {
+      // the existing connection wins
+      ldout(cct, 1)
+          << __func__
+          << " reconnect race detected, this connection loses to existing="
+          << existing << dendl;
+
+      auto wait = WaitFrame::Encode();
+      return WRITE(wait, "wait", read_frame);
+    } else {
+      // this connection wins
+      ldout(cct, 1) << __func__
+                    << " reconnect race detected, replacing existing="
+                    << existing << " socket by this connection's socket"
+                    << dendl;
+    }
+  }
+
+  ldout(cct, 1) << __func__ << " reconnect to existing=" << existing << dendl;
+
+  reconnecting = true;
+
+  // everything looks good
+  exproto->connect_seq = reconnect.connect_seq();
+  exproto->message_seq = reconnect.msg_seq();
+  // Copy the peer's seqs into the existing protocol, then reuse it.
+  return reuse_connection(existing, exproto);
+}
+
+// Decide how this (newly accepted) connection and `existing` -- another
+// connection object handed in by the caller, presumably the one already
+// registered for this peer (caller not visible here) -- should be
+// reconciled.  The checks are evaluated in order and the first match wins:
+//   - existing is CLOSED            -> carry on here (send_server_ident())
+//   - existing is being replaced    -> ask the peer to wait (WaitFrame)
+//   - existing has a newer peer_global_seq
+//                                   -> this connection is stale: stop it,
+//                                      queue a reset and return nullptr
+//   - existing is lossy             -> stop existing, carry on here
+//   - cookies identify a previous / interrupted session, or existing is
+//     READY/STANDBY                 -> reuse_connection()
+//   - otherwise it is a connection race, resolved by comparing addresses
+//     and the existing connection's policy.
+// existing->lock is held for the whole function.
+CtPtr ProtocolV2::handle_existing_connection(AsyncConnectionRef existing) {
+  ldout(cct, 20) << __func__ << " existing=" << existing << dendl;
+
+  std::lock_guard<std::mutex> l(existing->lock);
+
+  // Only another ProtocolV2 instance can be handled here; any other
+  // protocol object behind `existing` is treated as a fatal error.
+  ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get());
+  if (!exproto) {
+    ldout(cct, 1) << __func__ << " existing=" << existing << dendl;
+    ceph_assert(false);
+  }
+
+  if (exproto->state == CLOSED) {
+    ldout(cct, 1) << __func__ << " existing " << existing << " already closed."
+                  << dendl;
+    return send_server_ident();
+  }
+
+  // A replacement of `existing` is already in flight (the flag is set by
+  // reuse_connection()); do not start a second one.
+  if (exproto->replacing) {
+    ldout(cct, 1) << __func__
+                  << " existing racing replace happened while replacing."
+                  << " existing=" << existing << dendl;
+    auto wait = WaitFrame::Encode();
+    return WRITE(wait, "wait", read_frame);
+  }
+
+  // The existing connection has seen a newer global_seq from the peer than
+  // the one presented on this connection, so this attempt is the stale one.
+  if (exproto->peer_global_seq > peer_global_seq) {
+    ldout(cct, 1) << __func__ << " this is a stale connection, peer_global_seq="
+                  << peer_global_seq
+                  << " existing->peer_global_seq=" << exproto->peer_global_seq
+                  << ", stopping this connection." << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  if (existing->policy.lossy) {
+    // existing connection can be thrown out in favor of this one
+    ldout(cct, 1)
+        << __func__ << " existing=" << existing
+        << " is a lossy channel. Stopping existing in favor of this connection"
+        << dendl;
+    existing->protocol->stop();
+    existing->dispatch_queue->queue_reset(existing.get());
+    return send_server_ident();
+  }
+
+  if (exproto->server_cookie && exproto->client_cookie &&
+      exproto->client_cookie != client_cookie) {
+    // Found previous session
+    // peer has reseted and we're going to reuse the existing connection
+    // by replacing the communication socket
+    ldout(cct, 1) << __func__ << " found previous session existing=" << existing
+                  << ", peer must have reseted." << dendl;
+    // The old session state is only discarded when this connection's
+    // policy asks for reset checking.
+    if (connection->policy.resetcheck) {
+      exproto->reset_session();
+    }
+    return reuse_connection(existing, exproto);
+  }
+
+  if (exproto->client_cookie == client_cookie) {
+    // session establishment interrupted between client_ident and server_ident,
+    // continuing...
+    ldout(cct, 1) << __func__ << " found previous session existing=" << existing
+                  << ", continuing session establishment." << dendl;
+    return reuse_connection(existing, exproto);
+  }
+
+  if (exproto->state == READY || exproto->state == STANDBY) {
+    ldout(cct, 1) << __func__ << " existing=" << existing
+                  << " is READY/STANDBY, lets reuse it" << dendl;
+    return reuse_connection(existing, exproto);
+  }
+
+  // Looks like a connection race: server and client are both connecting to
+  // each other at the same time.
+  // This connection wins when the peer's msgr2 address compares lower than
+  // ours, or when the existing connection has a server policy.
+  if (connection->peer_addrs->msgr2_addr() <
+          messenger->get_myaddrs().msgr2_addr() ||
+      existing->policy.server) {
+    // this connection wins
+    ldout(cct, 1) << __func__
+                  << " connection race detected, replacing existing="
+                  << existing << " socket by this connection's socket" << dendl;
+    return reuse_connection(existing, exproto);
+  } else {
+    // the existing connection wins
+    ldout(cct, 1)
+        << __func__
+        << " connection race detected, this connection loses to existing="
+        << existing << dendl;
+    // The two addresses are expected to differ, so reaching this branch
+    // means the peer's address is strictly greater than ours.
+    ceph_assert(connection->peer_addrs->msgr2_addr() >
+                messenger->get_myaddrs().msgr2_addr());
+
+    // make sure we follow through with opening the existing
+    // connection (if it isn't yet open) since we know the peer
+    // has something to send to us.
+    existing->send_keepalive();
+    auto wait = WaitFrame::Encode();
+    return WRITE(wait, "wait", read_frame);
+  }
+}
+
+// Hand this connection's socket and negotiated state over to `existing`
+// (whose protocol object is `exproto`) and retire this connection.
+//
+// The function itself, running in this connection's event center thread:
+//   1. stops watching this connection's fd,
+//   2. copies the negotiated parameters into exproto,
+//   3. moves the socket and session stream handlers into temporaries,
+//   4. stops this connection and queues a reset for it,
+//   5. marks exproto as "replacing" and puts existing into state NONE,
+//   6. submits `deactivate_existing` to existing's event center.
+// The submitted callbacks then install the socket on `existing`, move it to
+// this connection's worker/center and resume the handshake with either
+// send_server_ident() or send_reconnect_ok().
+//
+// Always returns nullptr: nothing further runs on this connection.
+// NOTE(review): callers visible in this file hold existing->lock when
+// calling; this function additionally takes existing->write_lock.
+CtPtr ProtocolV2::reuse_connection(AsyncConnectionRef existing,
+                                   ProtocolV2 *exproto) {
+  ldout(cct, 20) << __func__ << " existing=" << existing
+                 << " reconnect=" << reconnecting << dendl;
+
+  connection->inject_delay();
+
+  // held until this function returns
+  std::lock_guard<std::mutex> l(existing->write_lock);
+
+  // stop receiving read/write events for the socket we are about to give away
+  connection->center->delete_file_event(connection->cs.fd(),
+                                        EVENT_READABLE | EVENT_WRITABLE);
+
+  if (existing->delay_state) {
+    existing->delay_state->flush();
+    ceph_assert(!connection->delay_state);
+  }
+  exproto->reset_recv_state();
+  exproto->pre_auth.enabled = false;
+
+  // For a fresh session (not a reconnect) the parameters negotiated on this
+  // connection replace the ones stored in the existing protocol object.
+  if (!reconnecting) {
+    exproto->peer_supported_features = peer_supported_features;
+    exproto->tx_frame_asm.set_is_rev1(tx_frame_asm.get_is_rev1());
+    exproto->rx_frame_asm.set_is_rev1(rx_frame_asm.get_is_rev1());
+
+    exproto->client_cookie = client_cookie;
+    exproto->peer_name = peer_name;
+    exproto->connection_features = connection_features;
+    existing->set_features(connection_features);
+  }
+  exproto->peer_global_seq = peer_global_seq;
+
+  ceph_assert(connection->center->in_thread());
+  // Take everything we need out of this connection before stop() below.
+  auto temp_cs = std::move(connection->cs);
+  EventCenter *new_center = connection->center;
+  Worker *new_worker = connection->worker;
+  // we can steal the session_stream_handlers under the assumption
+  // this happens in the event center's thread as there should be
+  // no user outside its boundaries (similarly to e.g. outgoing_bl).
+  auto temp_stream_handlers = std::move(session_stream_handlers);
+  exproto->auth_meta = auth_meta;
+
+  ldout(messenger->cct, 5) << __func__ << " stop myself to swap existing"
+                           << dendl;
+
+  // avoid _stop shutdown replacing socket
+  // queue a reset on the new connection, which we're dumping for the old
+  stop();
+
+  connection->dispatch_queue->queue_reset(connection);
+
+  // Park the existing connection: no writes, flagged as being replaced,
+  // state NONE until the callbacks below finish the transfer.
+  exproto->can_write = false;
+  exproto->write_in_progress = false;
+  exproto->reconnecting = reconnecting;
+  exproto->replacing = true;
+  existing->state_offset = 0;
+  // avoid previous thread modify event
+  exproto->state = NONE;
+  existing->state = AsyncConnection::STATE_NONE;
+  // Discard existing prefetch buffer in `recv_buf`
+  existing->recv_start = existing->recv_end = 0;
+  // there shouldn't exist any buffer
+  ceph_assert(connection->recv_start == connection->recv_end);
+
+  // First stage, submitted to existing's *current* event center.  The moved
+  // socket is bound as the callback's argument so the closure owns it.
+  auto deactivate_existing = std::bind(
+      [ existing,
+        new_worker,
+        new_center,
+        exproto,
+        temp_stream_handlers=std::move(temp_stream_handlers)
+      ](ConnectedSocket &cs) mutable {
+        // we need to delete time event in original thread
+        {
+          std::lock_guard<std::mutex> l(existing->lock);
+          existing->write_lock.lock();
+          exproto->requeue_sent();
+          // XXX: do we really need the locking for `outgoing_bl`? There is
+          // a comment just above its definition saying "lockfree, only used
+          // in own thread". I'm following lockfull schema just in the case.
+          // From performance point of view it should be fine – this happens
+          // far away from hot paths.
+          existing->outgoing_bl.clear();
+          existing->open_write = false;
+          exproto->session_stream_handlers = std::move(temp_stream_handlers);
+          existing->write_lock.unlock();
+          if (exproto->state == NONE) {
+            // Still parked: drop the old socket, adopt the new one and move
+            // the connection over to the new worker and event center.
+            existing->shutdown_socket();
+            existing->cs = std::move(cs);
+            existing->worker->references--;
+            new_worker->references++;
+            existing->logger = new_worker->get_perf_counter();
+            existing->worker = new_worker;
+            existing->center = new_center;
+            if (existing->delay_state)
+              existing->delay_state->set_center(new_center);
+          } else if (exproto->state == CLOSED) {
+            // existing was closed in the meantime: the stolen socket is of
+            // no use, so close it from the center it came from.
+            auto back_to_close = std::bind(
+                [](ConnectedSocket &cs) mutable { cs.close(); }, std::move(cs));
+            new_center->submit_to(new_center->get_id(),
+                                  std::move(back_to_close), true);
+            return;
+          } else {
+            // any other state change while parked is unexpected
+            ceph_abort();
+          }
+        }
+
+        // Before changing existing->center, it may already exists some
+        // events in existing->center's queue. Then if we mark down
+        // `existing`, it will execute in another thread and clean up
+        // connection. Previous event will result in segment fault
+        //
+        // Second stage: runs in existing's (now updated) event center and
+        // restarts the handshake on the adopted socket.
+        auto transfer_existing = [existing, exproto]() mutable {
+          std::lock_guard<std::mutex> l(existing->lock);
+          if (exproto->state == CLOSED) return;
+          ceph_assert(exproto->state == NONE);
+
+          exproto->state = SESSION_ACCEPTING;
+          // we have called shutdown_socket above
+          ceph_assert(existing->last_tick_id == 0);
+          // restart timer since we are going to re-build connection
+          existing->last_connect_started = ceph::coarse_mono_clock::now();
+          existing->last_tick_id = existing->center->create_time_event(
+            existing->connect_timeout_us, existing->tick_handler);
+          existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED;
+          existing->center->create_file_event(existing->cs.fd(), EVENT_READABLE,
+                                              existing->read_handler);
+          // answer the peer according to the kind of handshake in progress
+          if (!exproto->reconnecting) {
+            exproto->run_continuation(exproto->send_server_ident());
+          } else {
+            exproto->run_continuation(exproto->send_reconnect_ok());
+          }
+        };
+        // run inline when already on the right thread, otherwise hand over
+        if (existing->center->in_thread())
+          transfer_existing();
+        else
+          existing->center->submit_to(existing->center->get_id(),
+                                      std::move(transfer_existing), true);
+      },
+      std::move(temp_cs));
+
+  existing->center->submit_to(existing->center->get_id(),
+                              std::move(deactivate_existing), true);
+  return nullptr;
+}
+
+// Server side: answer the peer with a ServerIdent frame.
+//
+// Resets the sequence counters, picks a fresh server cookie for non-lossy
+// policies, encodes the frame, then registers the connection with the
+// messenger through accept_conn().  On success the accept is announced to
+// the dispatch queue / messenger and the frame is written, continuing in
+// server_ready().  If accept_conn() fails, or the protocol state is no
+// longer SESSION_ACCEPTING once the lock has been re-taken, the result of
+// _fault() is returned instead.
+//
+// NOTE(review): connection->lock is released and re-acquired inside, so the
+// caller is assumed to hold it on entry -- confirm against callers.
+CtPtr ProtocolV2::send_server_ident() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // this is required for the case when this connection is being replaced
+  out_seq = discard_requeued_up_to(out_seq, 0);
+  in_seq = 0;
+
+  // lossy connections keep whatever server_cookie is currently stored
+  if (!connection->policy.lossy) {
+    server_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);
+  }
+
+  uint64_t flags = 0;
+  if (connection->policy.lossy) {
+    flags = flags | CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  uint64_t gs = messenger->get_global_seq();
+  auto server_ident = ServerIdentFrame::Encode(
+          messenger->get_myaddrs(),
+          messenger->get_myname().num(),
+          gs,
+          connection->policy.features_supported,
+          connection->policy.features_required | msgr2_required,
+          flags,
+          server_cookie);
+
+  ldout(cct, 5) << __func__ << " sending identification:"
+                << " addrs=" << messenger->get_myaddrs()
+                << " gid=" << messenger->get_myname().num()
+                << " global_seq=" << gs << " features_supported=" << std::hex
+                << connection->policy.features_supported
+                << " features_required="
+		            << (connection->policy.features_required | msgr2_required)
+                << " flags=" << flags
+                << " cookie=" << server_cookie << std::dec << dendl;
+
+  connection->lock.unlock();
+  // Because "replacing" will prevent other connections preempt this addr,
+  // it's safe that here we don't acquire Connection's lock
+  ssize_t r = messenger->accept_conn(connection);
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+                  << connection->peer_addrs->msgr2_addr()
+                  << " just fail later one(this)" << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+  // The state may have changed while the lock was dropped above; in that
+  // case undo the registration done by accept_conn().
+  if (state != SESSION_ACCEPTING) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept_conn, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED || state == NONE);
+    messenger->unregister_conn(connection);
+    connection->inject_delay();
+    return _fault();
+  }
+
+  connection->set_features(connection_features);
+
+  // notify
+  connection->dispatch_queue->queue_accept(connection);
+  messenger->ms_deliver_handle_fast_accept(connection);
+
+  INTERCEPT(12);
+
+  return WRITE(server_ident, "server ident", server_ready);
+}
+
+// Continuation run after the server-side ident / reconnect_ok frame has been
+// written: checks the delayed-delivery queue (when one is configured) and
+// then enters the common ready() path.
+CtPtr ProtocolV2::server_ready() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // nothing to verify when no delay queue is attached to the connection
+  if (!connection->delay_state) {
+    return ready();
+  }
+
+  ceph_assert(connection->delay_state->ready());
+  return ready();
+}
+
+// Server side: acknowledge a reconnect with a ReconnectOk frame.
+//
+// Drops requeued messages up to message_seq, encodes the frame carrying our
+// current in_seq, then registers the connection with the messenger through
+// accept_conn().  On success the accept is announced and the frame is
+// written, continuing in server_ready().  If accept_conn() fails, or the
+// protocol state is no longer SESSION_ACCEPTING once the lock has been
+// re-taken, the result of _fault() is returned instead.
+//
+// NOTE(review): connection->lock is released and re-acquired inside, so the
+// caller is assumed to hold it on entry -- confirm against callers.
+CtPtr ProtocolV2::send_reconnect_ok() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  out_seq = discard_requeued_up_to(out_seq, message_seq);
+
+  // the frame reports the last sequence number we have received
+  uint64_t ms = in_seq;
+  auto reconnect_ok = ReconnectOkFrame::Encode(ms);
+
+  ldout(cct, 5) << __func__ << " sending reconnect_ok: msg_seq=" << ms << dendl;
+
+  connection->lock.unlock();
+  // Because "replacing" will prevent other connections preempt this addr,
+  // it's safe that here we don't acquire Connection's lock
+  ssize_t r = messenger->accept_conn(connection);
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+                  << connection->peer_addrs->msgr2_addr()
+                  << " just fail later one(this)" << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+  // The state may have changed while the lock was dropped above; in that
+  // case undo the registration done by accept_conn().
+  if (state != SESSION_ACCEPTING) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept_conn, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED || state == NONE);
+    messenger->unregister_conn(connection);
+    connection->inject_delay();
+    return _fault();
+  }
+
+  // notify
+  connection->dispatch_queue->queue_accept(connection);
+  messenger->ms_deliver_handle_fast_accept(connection);
+
+  INTERCEPT(14);
+
+  return WRITE(reconnect_ok, "reconnect ok", server_ready);
+}
diff --git a/src/msg/async/ProtocolV2.h b/src/msg/async/ProtocolV2.h
new file mode 100644
index 00000000..4941cea5
--- /dev/null
+++ b/src/msg/async/ProtocolV2.h
@@ -0,0 +1,259 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_V2_
+#define _MSG_ASYNC_PROTOCOL_V2_
+
+#include "Protocol.h"
+#include "crypto_onwire.h"
+#include "frames_v2.h"
+
+// Implementation of the Protocol interface for the msgr2 wire protocol.
+//
+// The class is written as a state machine driven by continuations
+// (Ct<ProtocolV2>): every handler returns the next continuation to run, or
+// nullptr when there is nothing to run right now.  The declarations are
+// grouped into shared plumbing, the client-side handshake and the
+// server-side handshake.
+class ProtocolV2 : public Protocol {
+private:
+  // Protocol states.  The order must match the name table in
+  // get_state_name() below.
+  enum State {
+    NONE,
+    START_CONNECT,
+    BANNER_CONNECTING,
+    HELLO_CONNECTING,
+    AUTH_CONNECTING,
+    AUTH_CONNECTING_SIGN,
+    SESSION_CONNECTING,
+    SESSION_RECONNECTING,
+    START_ACCEPT,
+    BANNER_ACCEPTING,
+    HELLO_ACCEPTING,
+    AUTH_ACCEPTING,
+    AUTH_ACCEPTING_MORE,
+    AUTH_ACCEPTING_SIGN,
+    SESSION_ACCEPTING,
+    READY,
+    THROTTLE_MESSAGE,
+    THROTTLE_BYTES,
+    THROTTLE_DISPATCH_QUEUE,
+    THROTTLE_DONE,
+    READ_MESSAGE_COMPLETE,
+    STANDBY,
+    WAIT,
+    CLOSED
+  };
+
+  // Return the printable name of `state`.
+  //
+  // The argument is a plain int, so it is range-checked before indexing the
+  // table: a value outside the State enum yields "UNKNOWN" instead of
+  // reading past the end of the array.  The table is static so it is not
+  // rebuilt on every call.
+  static const char *get_state_name(int state) {
+    static const char *const statenames[] = {"NONE",
+                                             "START_CONNECT",
+                                             "BANNER_CONNECTING",
+                                             "HELLO_CONNECTING",
+                                             "AUTH_CONNECTING",
+                                             "AUTH_CONNECTING_SIGN",
+                                             "SESSION_CONNECTING",
+                                             "SESSION_RECONNECTING",
+                                             "START_ACCEPT",
+                                             "BANNER_ACCEPTING",
+                                             "HELLO_ACCEPTING",
+                                             "AUTH_ACCEPTING",
+                                             "AUTH_ACCEPTING_MORE",
+                                             "AUTH_ACCEPTING_SIGN",
+                                             "SESSION_ACCEPTING",
+                                             "READY",
+                                             "THROTTLE_MESSAGE",
+                                             "THROTTLE_BYTES",
+                                             "THROTTLE_DISPATCH_QUEUE",
+                                             "THROTTLE_DONE",
+                                             "READ_MESSAGE_COMPLETE",
+                                             "STANDBY",
+                                             "WAIT",
+                                             "CLOSED"};
+    constexpr size_t num_states = sizeof(statenames) / sizeof(statenames[0]);
+    // catch a State added without a matching name at compile time
+    static_assert(num_states == static_cast<size_t>(CLOSED) + 1,
+                  "statenames must have one entry per State");
+    if (state < 0 || static_cast<size_t>(state) >= num_states) {
+      return "UNKNOWN";
+    }
+    return statenames[state];
+  }
+
+  // TODO: move into auth_meta?
+  ceph::crypto::onwire::rxtx_t session_stream_handlers;
+
+  entity_name_t peer_name;
+  State state;
+  uint64_t peer_supported_features;  // CEPH_MSGR2_FEATURE_*
+
+  // session identification and sequencing
+  uint64_t client_cookie;
+  uint64_t server_cookie;
+  uint64_t global_seq;
+  uint64_t connect_seq;
+  uint64_t peer_global_seq;
+  uint64_t message_seq;
+  bool reconnecting;
+  bool replacing;
+  bool can_write;
+  // one queued outgoing message
+  struct out_queue_entry_t {
+    bool is_prepared {false};
+    Message* m {nullptr};
+  };
+  // outgoing messages, grouped by an int key
+  // NOTE(review): the key looks like the message priority -- confirm in .cc
+  std::map<int, std::list<out_queue_entry_t>> out_queue;
+  std::list<Message *> sent;
+  std::atomic<uint64_t> out_seq{0};
+  std::atomic<uint64_t> in_seq{0};
+  std::atomic<uint64_t> ack_left{0};
+
+  using ProtFuncPtr = void (ProtocolV2::*)();
+  Ct<ProtocolV2> *bannerExchangeCallback;
+
+  // frame assemblers for the transmit and receive directions
+  ceph::msgr::v2::FrameAssembler tx_frame_asm;
+  ceph::msgr::v2::FrameAssembler rx_frame_asm;
+
+  // buffers for the frame currently being received
+  ceph::bufferlist rx_preamble;
+  ceph::bufferlist rx_epilogue;
+  ceph::msgr::v2::segment_bls_t rx_segments_data;
+  ceph::msgr::v2::Tag next_tag;
+  utime_t backoff;  // backoff time
+  utime_t recv_stamp;
+  utime_t throttle_stamp;
+
+  // bytes exchanged before authentication completes; `enabled` is cleared
+  // e.g. by reuse_connection()
+  struct {
+    ceph::bufferlist rxbuf;
+    ceph::bufferlist txbuf;
+    bool enabled {true};
+  } pre_auth;
+
+  bool keepalive;
+  bool write_in_progress = false;
+
+  ostream &_conn_prefix(std::ostream *_dout);
+  void run_continuation(Ct<ProtocolV2> *pcontinuation);
+  void run_continuation(Ct<ProtocolV2> &continuation);
+
+  // low-level read/write helpers; `next` is the continuation to resume with
+  Ct<ProtocolV2> *read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next,
+                       rx_buffer_t&& buffer);
+  template <class F>
+  Ct<ProtocolV2> *write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        F &frame);
+  Ct<ProtocolV2> *write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        bufferlist &buffer);
+
+  template <class F>
+  bool append_frame(F& frame);
+
+  void requeue_sent();
+  uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq);
+  void reset_recv_state();
+  void reset_security();
+  void reset_throttle();
+  Ct<ProtocolV2> *_fault();
+  void discard_out_queue();
+  void reset_session();
+  void prepare_send_message(uint64_t features, Message *m);
+  out_queue_entry_t _get_next_outgoing();
+  ssize_t write_message(Message *m, bool more);
+  void handle_message_ack(uint64_t seq);
+
+  // banner exchange (used by both client and server)
+  CONTINUATION_DECL(ProtocolV2, _wait_for_peer_banner);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner_payload);
+
+  Ct<ProtocolV2> *_banner_exchange(Ct<ProtocolV2> &callback);
+  Ct<ProtocolV2> *_wait_for_peer_banner();
+  Ct<ProtocolV2> *_handle_peer_banner(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *_handle_peer_banner_payload(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *handle_hello(ceph::bufferlist &payload);
+
+  // frame reading and message throttling
+  CONTINUATION_DECL(ProtocolV2, read_frame);
+  CONTINUATION_DECL(ProtocolV2, finish_auth);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_preamble_main);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_segment);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_epilogue_main);
+  CONTINUATION_DECL(ProtocolV2, throttle_message);
+  CONTINUATION_DECL(ProtocolV2, throttle_bytes);
+  CONTINUATION_DECL(ProtocolV2, throttle_dispatch_queue);
+
+  Ct<ProtocolV2> *read_frame();
+  Ct<ProtocolV2> *finish_auth();
+  Ct<ProtocolV2> *finish_client_auth();
+  Ct<ProtocolV2> *handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *read_frame_segment();
+  Ct<ProtocolV2> *handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r);
+  Ct<ProtocolV2> *_handle_read_frame_segment();
+  Ct<ProtocolV2> *handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *_handle_read_frame_epilogue_main();
+  Ct<ProtocolV2> *handle_read_frame_dispatch();
+  Ct<ProtocolV2> *handle_frame_payload();
+
+  Ct<ProtocolV2> *ready();
+
+  Ct<ProtocolV2> *handle_message();
+  Ct<ProtocolV2> *throttle_message();
+  Ct<ProtocolV2> *throttle_bytes();
+  Ct<ProtocolV2> *throttle_dispatch_queue();
+  Ct<ProtocolV2> *read_message_data_prepare();
+
+  Ct<ProtocolV2> *handle_keepalive2(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_keepalive2_ack(ceph::bufferlist &payload);
+
+  Ct<ProtocolV2> *handle_message_ack(ceph::bufferlist &payload);
+
+public:
+  uint64_t connection_features;
+
+  ProtocolV2(AsyncConnection *connection);
+  virtual ~ProtocolV2();
+
+  // Protocol interface
+  virtual void connect() override;
+  virtual void accept() override;
+  virtual bool is_connected() override;
+  virtual void stop() override;
+  virtual void fault() override;
+  virtual void send_message(Message *m) override;
+  virtual void send_keepalive() override;
+
+  virtual void read_event() override;
+  virtual void write_event() override;
+  virtual bool is_queued() override;
+
+private:
+  // Client Protocol
+  CONTINUATION_DECL(ProtocolV2, start_client_banner_exchange);
+  CONTINUATION_DECL(ProtocolV2, post_client_banner_exchange);
+
+  Ct<ProtocolV2> *start_client_banner_exchange();
+  Ct<ProtocolV2> *post_client_banner_exchange();
+  // convenience overload: send an auth request with an empty method list
+  inline Ct<ProtocolV2> *send_auth_request() {
+    std::vector<uint32_t> empty;
+    return send_auth_request(empty);
+  }
+  Ct<ProtocolV2> *send_auth_request(std::vector<uint32_t> &allowed_methods);
+  Ct<ProtocolV2> *handle_auth_bad_method(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_reply_more(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_done(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_signature(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *send_client_ident();
+  Ct<ProtocolV2> *send_reconnect();
+  Ct<ProtocolV2> *handle_ident_missing_features(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_session_reset(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_session_retry(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_session_retry_global(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_wait(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_reconnect_ok(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_server_ident(ceph::bufferlist &payload);
+
+  // Server Protocol
+  CONTINUATION_DECL(ProtocolV2, start_server_banner_exchange);
+  CONTINUATION_DECL(ProtocolV2, post_server_banner_exchange);
+  CONTINUATION_DECL(ProtocolV2, server_ready);
+
+  Ct<ProtocolV2> *start_server_banner_exchange();
+  Ct<ProtocolV2> *post_server_banner_exchange();
+  Ct<ProtocolV2> *handle_auth_request(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_request_more(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *_handle_auth_request(bufferlist& auth_payload, bool more);
+  Ct<ProtocolV2> *_auth_bad_method(int r);
+  Ct<ProtocolV2> *handle_client_ident(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_ident_missing_features_write(int r);
+  Ct<ProtocolV2> *handle_reconnect(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_existing_connection(AsyncConnectionRef existing);
+  Ct<ProtocolV2> *reuse_connection(AsyncConnectionRef existing,
+                                   ProtocolV2 *exproto);
+  Ct<ProtocolV2> *send_server_ident();
+  Ct<ProtocolV2> *send_reconnect_ok();
+  Ct<ProtocolV2> *server_ready();
+
+  size_t get_current_msg_size() const;
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_V2_ */
diff --git a/src/msg/async/Stack.cc b/src/msg/async/Stack.cc
new file mode 100644
index 00000000..8976c3cc
--- /dev/null
+++ b/src/msg/async/Stack.cc
@@ -0,0 +1,217 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <mutex>
+
+#include "include/compat.h"
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "PosixStack.h"
+#ifdef HAVE_RDMA
+#include "rdma/RDMAStack.h"
+#endif
+#ifdef HAVE_DPDK
+#include "dpdk/DPDKStack.h"
+#endif
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "stack "
+
+// Build the thread body for worker `i`.
+//
+// The returned callable names the calling thread after the worker, makes it
+// the owner of the worker's event center, initializes the worker and then
+// processes events until `done` is set on the worker, after which the
+// worker is reset and destroyed.  Nothing runs until the callable is
+// invoked (see spawn_worker() in start()).
+std::function<void ()> NetworkStack::add_thread(unsigned i)
+{
+  Worker *w = workers[i];
+  return [this, w]() {
+      // "msgr-worker-" alone is 12 characters, so an unbounded sprintf()
+      // would write past the 16-byte buffer for ids of four or more digits.
+      // snprintf() truncates instead and always NUL-terminates.
+      char tp_name[16];
+      snprintf(tp_name, sizeof(tp_name), "msgr-worker-%u", w->id);
+      ceph_pthread_setname(pthread_self(), tp_name);
+      // upper bound, in microseconds, for one process_events() wait
+      const unsigned EventMaxWaitUs = 30000000;
+      w->center.set_owner();
+      ldout(cct, 10) << __func__ << " starting" << dendl;
+      w->initialize();
+      w->init_done();
+      while (!w->done) {
+        ldout(cct, 30) << __func__ << " calling event process" << dendl;
+
+        // filled in by process_events() and accumulated into the worker's
+        // running-time perf counter below
+        ceph::timespan dur;
+        int r = w->center.process_events(EventMaxWaitUs, &dur);
+        if (r < 0) {
+          ldout(cct, 20) << __func__ << " process events failed: "
+                         << cpp_strerror(errno) << dendl;
+          // TODO do something?
+        }
+        w->perf_logger->tinc(l_msgr_running_total_time, dur);
+      }
+      w->reset();
+      w->destroy();
+  };
+}
+
+// Factory for the network stack selected by transport type `t`.
+// "posix" is always available; "rdma" and "dpdk" only when compiled in.
+// Any other value is logged as an error and aborts the process.
+std::shared_ptr<NetworkStack> NetworkStack::create(CephContext *c, const string &t)
+{
+  if (t == "posix") {
+    return std::make_shared<PosixNetworkStack>(c, t);
+  }
+#ifdef HAVE_RDMA
+  if (t == "rdma") {
+    return std::make_shared<RDMAStack>(c, t);
+  }
+#endif
+#ifdef HAVE_DPDK
+  if (t == "dpdk") {
+    return std::make_shared<DPDKStack>(c, t);
+  }
+#endif
+
+  // no backend matched the requested type
+  lderr(c) << __func__ << " ms_async_transport_type " << t <<
+    " is not supported! " << dendl;
+  ceph_abort();
+  return nullptr;
+}
+
+// Allocate the worker with index `i` for the transport named by `type`.
+// "posix" is always available; "rdma" and "dpdk" only when compiled in.
+// Any other value is logged as an error and aborts the process.
+Worker* NetworkStack::create_worker(CephContext *c, const string &type, unsigned i)
+{
+  if (type == "posix") {
+    return new PosixWorker(c, i);
+  }
+#ifdef HAVE_RDMA
+  if (type == "rdma") {
+    return new RDMAWorker(c, i);
+  }
+#endif
+#ifdef HAVE_DPDK
+  if (type == "dpdk") {
+    return new DPDKWorker(c, i);
+  }
+#endif
+
+  // no backend matched the requested type
+  lderr(c) << __func__ << " ms_async_transport_type " << type <<
+    " is not supported! " << dendl;
+  ceph_abort();
+  return nullptr;
+}
+
+// Create the stack for transport `t` and allocate its workers.
+// The worker count comes from ms_async_op_threads and is capped at
+// EventCenter::MAX_EVENTCENTER.  Workers are created and their event
+// centers initialized here; threads are only spawned by start().
+NetworkStack::NetworkStack(CephContext *c, const string &t)
+  : type(t),
+    started(false),
+    cct(c)
+{
+  ceph_assert(cct->_conf->ms_async_op_threads > 0);
+
+  num_workers = cct->_conf->ms_async_op_threads;
+  if (num_workers >= EventCenter::MAX_EVENTCENTER) {
+    ldout(cct, 0) << __func__ << " max thread limit is "
+                  << EventCenter::MAX_EVENTCENTER << ", switching to this now. "
+                  << "Higher thread values are unnecessary and currently unsupported."
+                  << dendl;
+    num_workers = EventCenter::MAX_EVENTCENTER;
+  }
+
+  // initial event-table size handed to every event center
+  const int InitEventNumber = 5000;
+  for (unsigned idx = 0; idx < num_workers; ++idx) {
+    Worker *worker = create_worker(cct, type, idx);
+    worker->center.init(InitEventNumber, idx, type);
+    workers.push_back(worker);
+  }
+}
+
+// Spawn a thread for every worker that is not initialized yet, then block
+// until all workers report that initialization is done.  Calling start() on
+// an already started stack is a no-op.
+void NetworkStack::start()
+{
+  {
+    // thread creation and the `started` flag are protected by pool_spin;
+    // the lock is dropped before waiting on the workers
+    std::lock_guard<decltype(pool_spin)> guard(pool_spin);
+
+    if (started) {
+      return;
+    }
+
+    for (unsigned idx = 0; idx < num_workers; ++idx) {
+      if (!workers[idx]->is_init()) {
+        std::function<void ()> body = add_thread(idx);
+        spawn_worker(idx, std::move(body));
+      }
+    }
+    started = true;
+  }
+
+  for (unsigned idx = 0; idx < num_workers; ++idx) {
+    workers[idx]->wait_for_init();
+  }
+}
+
+// Return the worker that currently holds the fewest references and take a
+// reference on it for the caller.
+Worker* NetworkStack::get_worker()
+{
+  ldout(cct, 30) << __func__ << dendl;
+
+  Worker* current_best = nullptr;
+  // seeded with a large value so the first worker examined is accepted
+  unsigned lowest_load = std::numeric_limits<int>::max();
+
+  {
+    // Scan under pool_spin.  There is deliberately no shortcut for a worker
+    // with zero references: that case is too rare to be worth special
+    // handling.
+    std::lock_guard<decltype(pool_spin)> guard(pool_spin);
+    for (unsigned idx = 0; idx < num_workers; ++idx) {
+      Worker *candidate = workers[idx];
+      const unsigned load = candidate->references.load();
+      if (load < lowest_load) {
+        lowest_load = load;
+        current_best = candidate;
+      }
+    }
+  }
+
+  ceph_assert(current_best);
+  // the reference is taken after the lock has been released
+  ++current_best->references;
+  return current_best;
+}
+
+// Ask every worker to finish, wake its event center and join its thread,
+// one worker at a time, while holding pool_spin.  Clears `started`.
+void NetworkStack::stop()
+{
+  std::lock_guard<decltype(pool_spin)> guard(pool_spin);
+  for (unsigned idx = 0; idx != num_workers; ++idx) {
+    Worker *worker = workers[idx];
+    worker->done = true;
+    // wake the event center so the thread notices `done`
+    worker->center.wakeup();
+    join_worker(idx);
+  }
+  started = false;
+}
+
+// Event callback used as a countdown barrier: it is created with the number
+// of expected invocations, every do_request() decrements the counter, and
+// wait() blocks until the counter has reached zero.
+class C_drain : public EventCallback {
+  Mutex drain_lock;
+  Cond drain_cond;
+  unsigned drain_count;
+
+ public:
+  explicit C_drain(size_t c)
+      : drain_lock("C_drain::drain_lock"),
+        drain_count(c) {}
+
+  // One expected invocation has arrived; signal the waiter on the last one.
+  void do_request(uint64_t id) override {
+    Mutex::Locker l(drain_lock);
+    if (--drain_count == 0) {
+      drain_cond.Signal();
+    }
+  }
+
+  // Block until every expected invocation has been seen.
+  void wait() {
+    Mutex::Locker l(drain_lock);
+    while (drain_count != 0) {
+      drain_cond.Wait(drain_lock);
+    }
+  }
+};
+
+// Dispatch one C_drain callback to every worker's event center and block
+// until all of them have run it.  Must not be called from a worker thread
+// (asserted below), since that worker could never run its own callback
+// while it is blocked here.
+void NetworkStack::drain()
+{
+  ldout(cct, 30) << __func__ << " started." << dendl;
+  const pthread_t cur = pthread_self();
+
+  std::unique_lock<decltype(pool_spin)> pool_guard(pool_spin);
+  // one expected callback invocation per worker
+  C_drain drain(num_workers);
+  for (unsigned i = 0; i < num_workers; ++i) {
+    ceph_assert(cur != workers[i]->center.get_owner());
+    EventCallbackRef drain_cb(&drain);
+    workers[i]->center.dispatch_event_external(drain_cb);
+  }
+  // release the pool lock before blocking on the workers
+  pool_guard.unlock();
+
+  drain.wait();
+  ldout(cct, 30) << __func__ << " end." << dendl;
+}
diff --git a/src/msg/async/Stack.h b/src/msg/async/Stack.h
new file mode 100644
index 00000000..a093dadb
--- /dev/null
+++ b/src/msg/async/Stack.h
@@ -0,0 +1,356 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNC_STACK_H
+#define CEPH_MSG_ASYNC_STACK_H
+
+#include "include/spinlock.h"
+#include "common/perf_counters.h"
+#include "msg/msg_types.h"
+#include "msg/async/Event.h"
+
+class Worker;
+/// Backend-specific implementation of a connected stream socket.
+///
+/// ConnectedSocket owns one instance of this interface and forwards each of
+/// its public calls to the method of the same name declared here.
+class ConnectedSocketImpl {
+ public:
+  virtual ~ConnectedSocketImpl() {}
+  virtual int is_connected() = 0;
+  // NOTE(review): return-value conventions for read/zero_copy_read/send
+  // (byte count vs. negative errno) are defined by the backends -- confirm there.
+  virtual ssize_t read(char*, size_t) = 0;
+  virtual ssize_t zero_copy_read(bufferptr&) = 0;
+  virtual ssize_t send(bufferlist &bl, bool more) = 0;
+  virtual void shutdown() = 0;
+  virtual void close() = 0;
+  virtual int fd() const = 0;
+  virtual int socket_fd() const = 0;
+};
+
+class ConnectedSocket;
+/// Options handed to Worker::listen()/Worker::connect() and
+/// ServerSocketImpl::accept().
+///
+/// NOTE(review): the per-field meanings below are inferred from the names;
+/// the code that consumes them is not in this header -- confirm in the backends.
+struct SocketOptions {
+  bool nonblock = true;  ///< presumably: make the socket non-blocking
+  bool nodelay = true;  ///< presumably: disable Nagle (TCP_NODELAY)
+  int rcbuf_size = 0;  ///< presumably: receive buffer size; 0 = leave default
+  int priority = -1;  ///< presumably: socket priority; -1 = leave default
+  entity_addr_t connect_bind_addr;  ///< presumably: local addr to bind before connect
+};
+
+/// \cond internal
+/// Backend-specific implementation of a listening socket.
+///
+/// ServerSocket owns one instance and forwards accept(), abort_accept() and
+/// fd() to it; addr_slot is exposed through ServerSocket::get_addr_slot().
+class ServerSocketImpl {
+ public:
+  unsigned addr_type; ///< entity_addr_t::TYPE_*
+  unsigned addr_slot; ///< position of our addr in myaddrs().v
+  ServerSocketImpl(unsigned type, unsigned slot)
+    : addr_type(type), addr_slot(slot) {}
+  virtual ~ServerSocketImpl() {}
+  /// Accept one connection into \p sock; \p out receives an address
+  /// (presumably the peer's -- confirm in the backends).
+  virtual int accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) = 0;
+  /// Stop accepting; called by ServerSocket's destructor and abort_accept().
+  virtual void abort_accept() = 0;
+  /// Get file descriptor
+  virtual int fd() const = 0;
+};
+/// \endcond
+
+/// \addtogroup networking-module
+/// @{
+
+/// A TCP (or other stream-based protocol) connection.
+///
+/// A \c ConnectedSocket represents a full-duplex stream between
+/// two endpoints, a local endpoint and a remote endpoint.
+class ConnectedSocket {
+  std::unique_ptr<ConnectedSocketImpl> _csi;
+
+ public:
+  /// Constructs a \c ConnectedSocket not corresponding to a connection
+  ConnectedSocket() {}
+  /// \cond internal
+  explicit ConnectedSocket(std::unique_ptr<ConnectedSocketImpl> csi)
+      : _csi(std::move(csi)) {}
+  /// \endcond
+  /// Closes the implementation if one is still attached.
+  ~ConnectedSocket() {
+    if (_csi)
+      _csi->close();
+  }
+  /// Moves a \c ConnectedSocket object.
+  ConnectedSocket(ConnectedSocket&& cs) = default;
+  /// Move-assigns a \c ConnectedSocket object.
+  ConnectedSocket& operator=(ConnectedSocket&& cs) = default;
+
+  // Except for close() and operator bool, every method below forwards
+  // directly to the implementation and therefore requires a non-empty socket.
+
+  /// Forwards to the implementation's is_connected().
+  int is_connected() {
+    return _csi->is_connected();
+  }
+  /// Read from the socket with copy.
+  ///
+  /// Copies data received from the remote endpoint into \p buf
+  /// (at most \p len bytes are requested).
+  ssize_t read(char* buf, size_t len) {
+    return _csi->read(buf, len);
+  }
+  /// Read from the socket without copy.
+  ///
+  /// Forwards to the implementation's zero_copy_read(), which hands the
+  /// received data back through \p data.
+  ssize_t zero_copy_read(bufferptr &data) {
+    return _csi->zero_copy_read(data);
+  }
+  /// Send data to the remote endpoint.
+  ///
+  /// Forwards \p bl and the \p more flag to the implementation's send().
+  ssize_t send(bufferlist &bl, bool more) {
+    return _csi->send(bl, more);
+  }
+  /// Shuts the socket down.
+  ///
+  /// Current or future writes that have not been successfully flushed
+  /// will immediately fail with an error.  This is useful to abort
+  /// operations on a socket that is not making progress due to a
+  /// peer failure.
+  void shutdown() {
+    _csi->shutdown();
+  }
+  /// Closes the socket and detaches the implementation.
+  ///
+  /// After this call the object is empty (operator bool yields false).
+  /// Calling close() on an empty socket is a no-op, so it is safe to call
+  /// it more than once.
+  void close() {
+    if (!_csi)
+      return;
+    _csi->close();
+    _csi.reset();
+  }
+
+  /// Get file descriptor
+  int fd() const {
+    return _csi->fd();
+  }
+  /// Forwards to the implementation's socket_fd().
+  int socket_fd() const {
+    return _csi->socket_fd();
+  }
+
+  /// True while an implementation is attached.
+  explicit operator bool() const {
+    return _csi != nullptr;
+  }
+};
+/// @}
+
+/// \addtogroup networking-module
+/// @{
+
+/// A listening socket, waiting to accept incoming network connections.
+class ServerSocket {
+  std::unique_ptr<ServerSocketImpl> _ssi;
+ public:
+  /// Constructs a \c ServerSocket not corresponding to a connection
+  ServerSocket() {}
+  /// \cond internal
+  explicit ServerSocket(std::unique_ptr<ServerSocketImpl> ssi)
+      : _ssi(std::move(ssi)) {}
+  /// Aborts accepting if an implementation is still attached.
+  ~ServerSocket() {
+    if (_ssi)
+      _ssi->abort_accept();
+  }
+  /// \endcond
+  /// Moves a \c ServerSocket object.
+  ServerSocket(ServerSocket&& ss) = default;
+  /// Move-assigns a \c ServerSocket object.
+  ServerSocket& operator=(ServerSocket&& cs) = default;
+
+  // accept(), fd() and get_addr_slot() forward directly to the
+  // implementation and therefore require a non-empty socket.
+
+  /// Accepts the next connection to successfully connect to this socket.
+  ///
+  /// Fills \p sock with a \ref ConnectedSocket representing the connection
+  /// and \p out with a \ref entity_addr_t describing the remote endpoint.
+  int accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) {
+    return _ssi->accept(sock, opt, out, w);
+  }
+
+  /// Stops any \ref accept() in progress and detaches the implementation.
+  ///
+  /// Current and future \ref accept() calls will terminate immediately
+  /// with an error.  Calling this on an empty socket is a no-op, so it is
+  /// safe to call it more than once.
+  void abort_accept() {
+    if (!_ssi)
+      return;
+    _ssi->abort_accept();
+    _ssi.reset();
+  }
+
+  /// Get file descriptor
+  int fd() const {
+    return _ssi->fd();
+  }
+
+  /// get listen/bind addr
+  unsigned get_addr_slot() const {
+    return _ssi->addr_slot;
+  }
+
+  /// True while an implementation is attached.
+  explicit operator bool() const {
+    return _ssi != nullptr;
+  }
+};
+/// @}
+
+class NetworkStack;
+
+// Perf counter indices registered by each Worker.  l_msgr_first and
+// l_msgr_last are the exclusive bounds passed to PerfCountersBuilder.
+enum {
+  l_msgr_first = 94000,
+  // message / byte / connection counters
+  l_msgr_recv_messages,
+  l_msgr_send_messages,
+  l_msgr_recv_bytes,
+  l_msgr_send_bytes,
+  l_msgr_created_connections,
+  l_msgr_active_connections,
+
+  // time counters (registered with add_time)
+  l_msgr_running_total_time,
+  l_msgr_running_send_time,
+  l_msgr_running_recv_time,
+  l_msgr_running_fast_dispatch_time,
+
+  l_msgr_last,
+};
+
+/// One messenger worker: owns an EventCenter and a set of perf counters and
+/// keeps a reference count of its current users.
+///
+/// Backends derive from this class and implement listen() and connect().
+class Worker {
+  std::mutex init_lock;
+  std::condition_variable init_cond;
+  bool init = false;  ///< guarded by init_lock
+
+ public:
+  bool done = false;
+
+  CephContext *cct;
+  PerfCounters *perf_logger;
+  unsigned id;
+
+  std::atomic_uint references;
+  EventCenter center;
+
+  Worker(const Worker&) = delete;
+  Worker& operator=(const Worker&) = delete;
+
+  /// Creates worker number \p i and registers its perf counters, named
+  /// "AsyncMessenger::Worker-<i>", with \p c's perf counter collection.
+  Worker(CephContext *c, unsigned i)
+    : cct(c), perf_logger(nullptr), id(i), references(0), center(c) {
+    char name[128];
+    // bounded formatting: never write past the end of name[]
+    snprintf(name, sizeof(name), "AsyncMessenger::Worker-%u", id);
+    // initialize perf_logger
+    PerfCountersBuilder plb(cct, name, l_msgr_first, l_msgr_last);
+
+    plb.add_u64_counter(l_msgr_recv_messages, "msgr_recv_messages", "Network received messages");
+    plb.add_u64_counter(l_msgr_send_messages, "msgr_send_messages", "Network sent messages");
+    plb.add_u64_counter(l_msgr_recv_bytes, "msgr_recv_bytes", "Network received bytes", nullptr, 0, unit_t(UNIT_BYTES));
+    plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network sent bytes", nullptr, 0, unit_t(UNIT_BYTES));
+    plb.add_u64_counter(l_msgr_active_connections, "msgr_active_connections", "Active connection number");
+    plb.add_u64_counter(l_msgr_created_connections, "msgr_created_connections", "Created connection number");
+
+    plb.add_time(l_msgr_running_total_time, "msgr_running_total_time", "The total time of thread running");
+    plb.add_time(l_msgr_running_send_time, "msgr_running_send_time", "The total time of message sending");
+    plb.add_time(l_msgr_running_recv_time, "msgr_running_recv_time", "The total time of message receiving");
+    plb.add_time(l_msgr_running_fast_dispatch_time, "msgr_running_fast_dispatch_time", "The total time of fast dispatch");
+
+    perf_logger = plb.create_perf_counters();
+    cct->get_perfcounters_collection()->add(perf_logger);
+  }
+  /// Unregisters and frees the perf counters created by the constructor.
+  virtual ~Worker() {
+    if (perf_logger) {
+      cct->get_perfcounters_collection()->remove(perf_logger);
+      delete perf_logger;
+    }
+  }
+
+  virtual int listen(entity_addr_t &addr, unsigned addr_slot,
+                     const SocketOptions &opts, ServerSocket *) = 0;
+  virtual int connect(const entity_addr_t &addr,
+                      const SocketOptions &opts, ConnectedSocket *socket) = 0;
+  virtual void destroy() {}
+
+  virtual void initialize() {}
+  PerfCounters *get_perf_counter() { return perf_logger; }
+  /// Drops one reference; asserts that at least one was held.
+  void release_worker() {
+    int oldref = references.fetch_sub(1);
+    ceph_assert(oldref > 0);
+  }
+  /// Marks the worker initialized and wakes everyone in wait_for_init().
+  void init_done() {
+    std::lock_guard<std::mutex> l(init_lock);
+    init = true;
+    init_cond.notify_all();
+  }
+  bool is_init() {
+    std::lock_guard<std::mutex> l(init_lock);
+    return init;
+  }
+  /// Blocks until init_done() has been called.
+  void wait_for_init() {
+    std::unique_lock<std::mutex> l(init_lock);
+    while (!init)
+      init_cond.wait(l);
+  }
+  /// Returns the worker to its not-initialized, not-done state.
+  void reset() {
+    {
+      std::lock_guard<std::mutex> l(init_lock);
+      init = false;
+      init_cond.notify_all();
+    }
+    // as before, `done` is cleared after init_lock has been released
+    done = false;
+  }
+};
+
+/// Owner of the pool of Worker objects for one messenger backend.
+///
+/// Concrete backends derive from this class and implement spawn_worker()
+/// and join_worker(); the destructor deletes every Worker in `workers`.
+class NetworkStack {
+  std::string type;
+  unsigned num_workers = 0;
+  ceph::spinlock pool_spin;  // taken by stop() and drain() around worker iteration
+  bool started = false;
+
+  std::function<void ()> add_thread(unsigned i);
+
+ protected:
+  CephContext *cct;
+  vector<Worker*> workers;
+
+  explicit NetworkStack(CephContext *c, const string &t);
+ public:
+  NetworkStack(const NetworkStack &) = delete;
+  NetworkStack& operator=(const NetworkStack &) = delete;
+  virtual ~NetworkStack() {
+    for (auto &&w : workers)
+      delete w;
+  }
+
+  static std::shared_ptr<NetworkStack> create(
+          CephContext *c, const string &type);
+
+  static Worker* create_worker(
+          CephContext *c, const string &t, unsigned i);
+  // A backend must override this method if it supports zero-copy read.
+  virtual bool support_zero_copy_read() const { return false; }
+  // A backend must override this method if it does not support a shared
+  // listen table.
+  // For example, the posix backend has a global in-kernel listen table: if
+  // one thread binds a port, the other threads are aware of it as well.
+  // The dpdk backend, however, maintains a listen table per thread, so each
+  // thread has to bind the port itself.
+  virtual bool support_local_listen_table() const { return false; }
+  virtual bool nonblock_connect_need_writable_event() const { return true; }
+
+  void start();
+  void stop();
+  virtual Worker *get_worker();
+  // Direct indexed access; `i` is not range-checked.
+  Worker *get_worker(unsigned i) {
+    return workers[i];
+  }
+  void drain();
+  unsigned get_num_worker() const {
+    return num_workers;
+  }
+
+  // direct is used in tests only
+  // Per-worker hooks that each backend has to implement; stop() calls
+  // join_worker(i) for every worker.
+  virtual void spawn_worker(unsigned i, std::function<void ()> &&) = 0;
+  virtual void join_worker(unsigned i) = 0;
+
+  virtual bool is_ready() { return true; };
+  virtual void ready() { };
+};
+
+#endif //CEPH_MSG_ASYNC_STACK_H
diff --git a/src/msg/async/crypto_onwire.cc b/src/msg/async/crypto_onwire.cc
new file mode 100644
index 00000000..4e423406
--- /dev/null
+++ b/src/msg/async/crypto_onwire.cc
@@ -0,0 +1,311 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <array>
+#include <openssl/evp.h>
+
+#include "crypto_onwire.h"
+
+#include "common/debug.h"
+#include "common/ceph_crypto.h"
+#include "include/types.h"
+
+#define dout_subsys ceph_subsys_ms
+
+namespace ceph::crypto::onwire {
+
+// AES-128-GCM parameters, all in bytes: key (16 bytes = 128 bits), IV/nonce,
+// authentication tag and cipher block size.
+static constexpr const std::size_t AESGCM_KEY_LEN{16};
+static constexpr const std::size_t AESGCM_IV_LEN{12};
+static constexpr const std::size_t AESGCM_TAG_LEN{16};
+static constexpr const std::size_t AESGCM_BLOCK_LEN{16};
+
+// The GCM IV as kept by the handlers: a 32-bit field followed by a 64-bit
+// field.  The struct is packed so that it is exactly AESGCM_IV_LEN bytes
+// (asserted below) and can be handed to OpenSSL as a raw byte buffer.
+struct nonce_t {
+  ceph_le32 fixed;
+  ceph_le64 counter;
+
+  // Byte-wise comparison of the whole nonce; valid because the struct is
+  // packed and therefore has no padding bytes.
+  bool operator==(const nonce_t& rhs) const {
+    return !memcmp(this, &rhs, sizeof(*this));
+  }
+} __attribute__((packed));
+static_assert(sizeof(nonce_t) == AESGCM_IV_LEN);
+
+using key_t = std::array<std::uint8_t, AESGCM_KEY_LEN>;
+
+// http://www.mindspring.com/~dmcgrew/gcm-nist-6.pdf
+// https://www.openssl.org/docs/man1.0.2/crypto/EVP_aes_128_gcm.html#GCM-mode
+// https://wiki.openssl.org/index.php/EVP_Authenticated_Encryption_and_Decryption
+// https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
+// Transmit-side handler: encrypts outgoing data with AES-128-GCM and appends
+// the authentication tag.  One reset_tx_handler() call starts a sequence of
+// authenticated_encrypt_update() calls that authenticated_encrypt_final()
+// finishes.
+class AES128GCM_OnWireTxHandler : public ceph::crypto::onwire::TxHandler {
+  CephContext* const cct;
+  std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx;
+  // output accumulated for the current sequence (ciphertext, then tag)
+  ceph::bufferlist buffer;
+  // nonce: IV for the next sequence; initial_nonce: the value we started with
+  nonce_t nonce, initial_nonce;
+  // set once the initial nonce has been consumed, so reset_tx_handler() can
+  // detect the nonce wrapping around to its starting value
+  bool used_initial_nonce;
+  bool new_nonce_format;  // 64-bit counter?
+  static_assert(sizeof(nonce) == AESGCM_IV_LEN);
+
+public:
+  // Sets up the cipher context in two steps: select AES-128-GCM first, then
+  // install the key.  The IV is supplied per sequence by reset_tx_handler().
+  // Throws std::runtime_error if OpenSSL rejects either step.
+  AES128GCM_OnWireTxHandler(CephContext* const cct,
+			    const key_t& key,
+			    const nonce_t& nonce,
+			    bool new_nonce_format)
+    : cct(cct),
+      ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free),
+      nonce(nonce), initial_nonce(nonce), used_initial_nonce(false),
+      new_nonce_format(new_nonce_format) {
+    ceph_assert_always(ectx);
+    ceph_assert_always(key.size() * CHAR_BIT == 128);
+
+    if (1 != EVP_EncryptInit_ex(ectx.get(), EVP_aes_128_gcm(),
+			        nullptr, nullptr, nullptr)) {
+      throw std::runtime_error("EVP_EncryptInit_ex failed");
+    }
+
+    if(1 != EVP_EncryptInit_ex(ectx.get(), nullptr, nullptr,
+			       key.data(), nullptr)) {
+      throw std::runtime_error("EVP_EncryptInit_ex failed");
+    }
+  }
+
+  // Wipes both nonce copies before the object goes away.
+  ~AES128GCM_OnWireTxHandler() override {
+    ::ceph::crypto::zeroize_for_security(&nonce, sizeof(nonce));
+    ::ceph::crypto::zeroize_for_security(&initial_nonce, sizeof(initial_nonce));
+  }
+
+  void reset_tx_handler(const uint32_t* first, const uint32_t* last) override;
+
+  void authenticated_encrypt_update(const ceph::bufferlist& plaintext) override;
+  ceph::bufferlist authenticated_encrypt_final() override;
+};
+
+// Starts a new encrypt sequence.
+//
+// [first, last) holds the sizes of the segments that will be passed to
+// authenticated_encrypt_update(); their sum plus the tag length is reserved
+// in the output buffer up front.
+//
+// Throws TxHandlerError when the nonce has wrapped around to its initial
+// value, and std::runtime_error when OpenSSL fails to accept the IV.
+void AES128GCM_OnWireTxHandler::reset_tx_handler(const uint32_t* first,
+                                                 const uint32_t* last)
+{
+  // The initial nonce may be used exactly once; meeting it a second time
+  // means every nonce has been handed out, and reusing one is not allowed.
+  if (nonce == initial_nonce) {
+    if (used_initial_nonce) {
+      throw ceph::crypto::onwire::TxHandlerError("out of nonces");
+    }
+    used_initial_nonce = true;
+  }
+
+  // install the current nonce as the IV; cipher and key stay as configured
+  if(1 != EVP_EncryptInit_ex(ectx.get(), nullptr, nullptr, nullptr,
+      reinterpret_cast<const unsigned char*>(&nonce))) {
+    throw std::runtime_error("EVP_EncryptInit_ex failed");
+  }
+
+  // the previous sequence must have used up everything it reserved
+  ceph_assert(buffer.get_append_buffer_unused_tail_length() == 0);
+  buffer.reserve(std::accumulate(first, last, AESGCM_TAG_LEN));
+
+  // advance the nonce for the *next* sequence
+  if (!new_nonce_format) {
+    // msgr2.0: 32-bit counter followed by 64-bit fixed field,
+    // susceptible to overflow!
+    // (in this legacy layout the leading 32-bit member, named `fixed`,
+    // is the one that counts)
+    nonce.fixed = nonce.fixed + 1;
+  } else {
+    nonce.counter = nonce.counter + 1;
+  }
+}
+
+// Encrypts one segment of plaintext and appends the ciphertext to the
+// handler's output buffer.  Throws std::runtime_error on OpenSSL failure.
+void AES128GCM_OnWireTxHandler::authenticated_encrypt_update(
+  const ceph::bufferlist& plaintext)
+{
+  // reset_tx_handler() has to have reserved room for this segment already
+  ceph_assert(buffer.get_append_buffer_unused_tail_length() >=
+              plaintext.length());
+  auto out = buffer.append_hole(plaintext.length());
+
+  // encrypt each source buffer straight into the hole, advancing as we go
+  for (const auto& src : plaintext.buffers()) {
+    auto* const dst_ptr = reinterpret_cast<unsigned char*>(out.c_str());
+    const auto* const src_ptr =
+      reinterpret_cast<const unsigned char*>(src.c_str());
+    int produced = 0;
+
+    if (1 != EVP_EncryptUpdate(ectx.get(), dst_ptr, &produced,
+                               src_ptr, src.length())) {
+      throw std::runtime_error("EVP_EncryptUpdate failed");
+    }
+    // GCM is length-preserving: output size must equal input size
+    ceph_assert_always(produced >= 0);
+    ceph_assert(static_cast<unsigned>(produced) == src.length());
+    out.advance(produced);
+  }
+
+  ldout(cct, 15) << __func__
+		 << " plaintext.length()=" << plaintext.length()
+		 << " buffer.length()=" << buffer.length()
+		 << dendl;
+}
+
+// Finishes the current encrypt sequence: finalizes the cipher, writes the
+// authentication tag into the last reserved block and hands the accumulated
+// output buffer over to the caller.  Throws std::runtime_error on OpenSSL
+// failure.
+ceph::bufferlist AES128GCM_OnWireTxHandler::authenticated_encrypt_final()
+{
+  // exactly one block may be left of the reserved space: the tag slot
+  ceph_assert(buffer.get_append_buffer_unused_tail_length() ==
+              AESGCM_BLOCK_LEN);
+  auto tag_slot = buffer.append_hole(AESGCM_BLOCK_LEN);
+
+  int final_len = 0;
+  const int finalized = EVP_EncryptFinal_ex(
+    ectx.get(),
+    reinterpret_cast<unsigned char*>(tag_slot.c_str()),
+    &final_len);
+  if (finalized != 1) {
+    throw std::runtime_error("EVP_EncryptFinal_ex failed");
+  }
+  // finalization must not emit any further ciphertext
+  ceph_assert_always(final_len == 0);
+
+  // the tag goes into the block-sized hole, so both sizes have to agree
+  static_assert(AESGCM_BLOCK_LEN == AESGCM_TAG_LEN);
+  const int tag_fetched = EVP_CIPHER_CTX_ctrl(
+    ectx.get(), EVP_CTRL_GCM_GET_TAG, AESGCM_TAG_LEN, tag_slot.c_str());
+  if (tag_fetched != 1) {
+    throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed");
+  }
+
+  ldout(cct, 15) << __func__
+		 << " buffer.length()=" << buffer.length()
+		 << " final_len=" << final_len
+		 << dendl;
+  // `buffer` is a member, so the move is required to give it away
+  return std::move(buffer);
+}
+
+// RX PART
+// Receive-side handler: decrypts incoming data in place with AES-128-GCM and
+// verifies the authentication tag at the end of each sequence.
+class AES128GCM_OnWireRxHandler : public ceph::crypto::onwire::RxHandler {
+  CephContext* const cct;
+  std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx;
+  nonce_t nonce;  // IV for the next decrypt sequence
+  bool new_nonce_format;  // 64-bit counter?
+  static_assert(sizeof(nonce) == AESGCM_IV_LEN);
+
+public:
+  // Sets up the cipher context in two steps: select AES-128-GCM first, then
+  // install the key.  The IV is supplied per sequence by reset_rx_handler().
+  // Throws std::runtime_error if OpenSSL rejects either step.
+  AES128GCM_OnWireRxHandler(CephContext* const cct,
+			    const key_t& key,
+			    const nonce_t& nonce,
+			    bool new_nonce_format)
+    : cct(cct),
+      ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free),
+      nonce(nonce), new_nonce_format(new_nonce_format) {
+    ceph_assert_always(ectx);
+    ceph_assert_always(key.size() * CHAR_BIT == 128);
+
+    if (1 != EVP_DecryptInit_ex(ectx.get(), EVP_aes_128_gcm(),
+			        nullptr, nullptr, nullptr)) {
+      throw std::runtime_error("EVP_DecryptInit_ex failed");
+    }
+
+    if(1 != EVP_DecryptInit_ex(ectx.get(), nullptr, nullptr,
+			       key.data(), nullptr)) {
+      throw std::runtime_error("EVP_DecryptInit_ex failed");
+    }
+  }
+
+  // Wipes the nonce before the object goes away.
+  ~AES128GCM_OnWireRxHandler() override {
+    ::ceph::crypto::zeroize_for_security(&nonce, sizeof(nonce));
+  }
+
+  // The sender appends the GCM tag at the final step.
+  std::uint32_t get_extra_size_at_final() override {
+    return AESGCM_TAG_LEN;
+  }
+  void reset_rx_handler() override;
+  void authenticated_decrypt_update(ceph::bufferlist& bl) override;
+  void authenticated_decrypt_update_final(ceph::bufferlist& bl) override;
+};
+
+// Starts a new decrypt sequence: installs the current nonce as the IV and
+// then advances the nonce for the sequence after this one.  Throws
+// std::runtime_error when OpenSSL fails to accept the IV.
+void AES128GCM_OnWireRxHandler::reset_rx_handler()
+{
+  const auto* const iv = reinterpret_cast<const unsigned char*>(&nonce);
+  if (1 != EVP_DecryptInit_ex(ectx.get(), nullptr, nullptr, nullptr, iv)) {
+    throw std::runtime_error("EVP_DecryptInit_ex failed");
+  }
+
+  if (new_nonce_format) {
+    nonce.counter = nonce.counter + 1;
+  } else {
+    // msgr2.0: 32-bit counter followed by 64-bit fixed field,
+    // susceptible to overflow!
+    nonce.fixed = nonce.fixed + 1;
+  }
+}
+
+// Decrypts every buffer of `bl` in place.  Throws std::runtime_error on
+// OpenSSL failure.
+void AES128GCM_OnWireRxHandler::authenticated_decrypt_update(
+  ceph::bufferlist& bl)
+{
+  // discard cached crcs as we will be writing through c_str()
+  bl.invalidate_crc();
+  for (auto& seg : bl.buffers()) {
+    // input and output alias: the ciphertext is overwritten with plaintext
+    auto* const data =
+      reinterpret_cast<unsigned char*>(const_cast<char*>(seg.c_str()));
+    int produced = 0;
+
+    const int rc =
+      EVP_DecryptUpdate(ectx.get(), data, &produced, data, seg.length());
+    if (rc != 1) {
+      throw std::runtime_error("EVP_DecryptUpdate failed");
+    }
+    // GCM is length-preserving: output size must equal input size
+    ceph_assert_always(produced >= 0);
+    ceph_assert(static_cast<unsigned>(produced) == seg.length());
+  }
+}
+
+// Finishes a decrypt sequence and verifies its authentication tag.
+//
+// On entry `bl` holds optional trailing ciphertext followed by the
+// AESGCM_TAG_LEN-byte tag.  On return the tag has been removed from `bl`
+// and any remaining bytes have been decrypted in place.
+//
+// Throws MsgAuthError when tag verification fails and std::runtime_error
+// when OpenSSL refuses the tag.
+void AES128GCM_OnWireRxHandler::authenticated_decrypt_update_final(
+  ceph::bufferlist& bl)
+{
+  unsigned orig_len = bl.length();
+  ceph_assert(orig_len >= AESGCM_TAG_LEN);
+
+  // Decrypt the optional data.  The caller is only obliged to provide the
+  // tag, but it may supply ciphertext in front of it as well; in that case
+  // this call combines the last update step with the final step.
+  ceph::bufferlist auth_tag;
+  bl.splice(orig_len - AESGCM_TAG_LEN, AESGCM_TAG_LEN, &auth_tag);
+  if (bl.length() > 0) {
+    authenticated_decrypt_update(bl);
+  }
+
+  // we need to ensure the tag is stored in contiguous memory.
+  if (1 != EVP_CIPHER_CTX_ctrl(ectx.get(), EVP_CTRL_GCM_SET_TAG,
+	AESGCM_TAG_LEN, auth_tag.c_str())) {
+    throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed");
+  }
+
+  // No bytes are expected to be appended here; the call exists solely to
+  // authenticate the message.
+  {
+    int final_len = 0;
+    if (0 >= EVP_DecryptFinal_ex(ectx.get(), nullptr, &final_len)) {
+      throw MsgAuthError();
+    }
+    ceph_assert_always(final_len == 0);
+    ceph_assert(bl.length() + AESGCM_TAG_LEN == orig_len);
+  }
+}
+
+// Builds the rx/tx handler pair for a connection.
+//
+// In secure mode the connection secret is read as
+//   [ key | rx nonce | tx nonce ]
+// and both handlers share the key.  With `crossed` set the two nonces swap
+// roles.  Outside secure mode both handlers are null.
+ceph::crypto::onwire::rxtx_t ceph::crypto::onwire::rxtx_t::create_handler_pair(
+  CephContext* cct,
+  const AuthConnectionMeta& auth_meta,
+  bool new_nonce_format,
+  bool crossed)
+{
+  if (!auth_meta.is_mode_secure()) {
+    return { nullptr, nullptr };
+  }
+
+  // byte offsets of the three fields inside the connection secret
+  constexpr std::size_t key_off = 0;
+  constexpr std::size_t rx_nonce_off = key_off + sizeof(key_t);
+  constexpr std::size_t tx_nonce_off = rx_nonce_off + sizeof(nonce_t);
+
+  ceph_assert_always(auth_meta.connection_secret.length() >=
+    sizeof(key_t) + 2 * sizeof(nonce_t));
+  const char* const secret = auth_meta.connection_secret.c_str();
+
+  key_t key;
+  ::memcpy(key.data(), secret + key_off, sizeof(key));
+
+  nonce_t rx_nonce;
+  ::memcpy(&rx_nonce, secret + rx_nonce_off, sizeof(rx_nonce));
+
+  nonce_t tx_nonce;
+  ::memcpy(&tx_nonce, secret + tx_nonce_off, sizeof(tx_nonce));
+
+  return {
+    std::make_unique<AES128GCM_OnWireRxHandler>(
+      cct, key, crossed ? tx_nonce : rx_nonce, new_nonce_format),
+    std::make_unique<AES128GCM_OnWireTxHandler>(
+      cct, key, crossed ? rx_nonce : tx_nonce, new_nonce_format)
+  };
+}
+
+} // namespace ceph::crypto::onwire
diff --git a/src/msg/async/crypto_onwire.h b/src/msg/async/crypto_onwire.h
new file mode 100644
index 00000000..55f75508
--- /dev/null
+++ b/src/msg/async/crypto_onwire.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_CRYPTO_ONWIRE_H
+#define CEPH_CRYPTO_ONWIRE_H
+
+#include <cstdint>
+#include <memory>
+
+#include "auth/Auth.h"
+#include "include/buffer.h"
+
+namespace ceph::math {
+
+// TODO
+// Placeholder (see the TODO above): wraps a single T that is constructed by
+// perfect-forwarding the constructor arguments.  Nothing else is implemented
+// yet.  Note that the constructor is private as written, because no access
+// specifier precedes it in this `class`.
+template <typename T>
+class always_aligned_t {
+  T val;
+
+  template <class... Args>
+  always_aligned_t(Args&&... args)
+    : val(std::forward<Args>(args)...) {
+  }
+};
+
+} // namespace ceph::math
+
+namespace ceph::crypto::onwire {
+
+// Thrown by RxHandler implementations when authentication of a received
+// message fails; what() is "message signature mismatch".
+struct MsgAuthError : public std::runtime_error {
+  MsgAuthError()
+    : runtime_error("message signature mismatch") {
+  }
+};
+
+// Error raised by a TxHandler; what() is "tx handler error: " followed by
+// the text passed to the constructor.
+struct TxHandlerError : public std::runtime_error {
+  TxHandlerError(const char* what)
+    : std::runtime_error(std::string("tx handler error: ") + what) {}
+};
+
+// Interface of the transmit-side on-wire crypto handler.  A round consists of
+// reset_tx_handler(), any number of authenticated_encrypt_update() calls and
+// one authenticated_encrypt_final().
+struct TxHandler {
+  virtual ~TxHandler() = default;
+
+  // Instance of TxHandler must be reset before doing any encrypt-update
+  // step. This applies also to situation when encrypt-final was already
+  // called and another round of update-...-update-final will take place.
+  //
+  // The input parameter informs the implementation how the -update sequence
+  // is fragmented and allows it to make a conscious decision about allocating
+  // or reusing the provided memory. One implementation could do in-place
+  // encryption while another might prefer one huge output buffer.
+  //
+  // It's undefined what will happen if the client doesn't follow the order.
+  //
+  // TODO: switch to always_aligned_t
+  virtual void reset_tx_handler(const uint32_t* first,
+                                const uint32_t* last) = 0;
+
+  // Convenience overload: forwards the initializer list as a [first, last)
+  // pointer range, or (nullptr, nullptr) when the list is empty.
+  void reset_tx_handler(std::initializer_list<uint32_t> update_size_sequence) {
+    if (update_size_sequence.size() > 0) {
+      const uint32_t* first = &*update_size_sequence.begin();
+      reset_tx_handler(first, first + update_size_sequence.size());
+    } else {
+      reset_tx_handler(nullptr, nullptr);
+    }
+  }
+
+  // Perform encryption. Client gives full ownership right to provided
+  // bufferlist. The method MUST NOT be called after _final() if there
+  // was no call to _reset().
+  virtual void authenticated_encrypt_update(
+    const ceph::bufferlist& plaintext) = 0;
+
+  // Generates the authentication signature and returns the bufferlist built
+  // from the plaintext passed to the preceding _update() calls.
+  virtual ceph::bufferlist authenticated_encrypt_final() = 0;
+};
+
+// Interface of the receive-side on-wire crypto handler.  A round consists of
+// reset_rx_handler(), any number of authenticated_decrypt_update() calls and
+// one authenticated_decrypt_update_final().
+class RxHandler {
+public:
+  virtual ~RxHandler() = default;
+
+  // The transmitter can append extra bytes of ciphertext at the -final step.
+  // This method returns how much was added, and thus lets the client
+  // translate plaintext size into the ciphertext size to grab from the wire.
+  virtual std::uint32_t get_extra_size_at_final() = 0;
+
+  // Instance of RxHandler must be reset before doing any decrypt-update
+  // step. This applies also to situation when decrypt-final was already
+  // called and another round of update-...-update-final will take place.
+  virtual void reset_rx_handler() = 0;
+
+  // Perform decryption. The ciphertext must ALWAYS be aligned to 16 bytes.
+  virtual void authenticated_decrypt_update(ceph::bufferlist& bl) = 0;
+
+  // Perform decryption of the last portion of ciphertext and verify the
+  // signature for the overall decryption sequence.
+  // Throws on integrity/authenticity check failure.
+  virtual void authenticated_decrypt_update_final(ceph::bufferlist& bl) = 0;
+};
+
+// The receive/transmit handler pair of one connection.  Both pointers are
+// null when create_handler_pair() is called for a non-secure connection.
+struct rxtx_t {
+  //rxtx_t(rxtx_t&& r) : rx(std::move(rx)), tx(std::move(tx)) {}
+  // Each peer can use different handlers.
+  // Hmm, isn't that too much flexibility?
+  std::unique_ptr<RxHandler> rx;
+  std::unique_ptr<TxHandler> tx;
+
+  // Builds the handler pair from the connection secret in auth_meta.
+  // `crossed` swaps which of the two nonces in the secret goes to rx and
+  // which to tx.
+  static rxtx_t create_handler_pair(
+    CephContext* ctx,
+    const class AuthConnectionMeta& auth_meta,
+    bool new_nonce_format,
+    bool crossed);
+};
+
+} // namespace ceph::crypto::onwire
+
+#endif // CEPH_CRYPTO_ONWIRE_H
diff --git a/src/msg/async/dpdk/ARP.cc b/src/msg/async/dpdk/ARP.cc
new file mode 100644
index 00000000..dedc9e3c
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "ARP.h"
+
+// Registers this handler with `a` under `proto_num`; the destructor removes
+// the registration again.
+arp_for_protocol::arp_for_protocol(arp& a, uint16_t proto_num)
+    : _arp(a), _proto_num(proto_num)
+{
+  _arp.add(proto_num, this);
+}
+
+// Unregisters this handler from the arp object it was registered with.
+arp_for_protocol::~arp_for_protocol()
+{
+  _arp.del(_proto_num);
+}
+
+// Hooks ARP into `netif` as an L3 protocol: get_packet() is the source of
+// outgoing packets, process_packet() handles received ones and forward()
+// is installed as the forwarding callback.
+arp::arp(interface* netif):
+    _netif(netif),
+    _proto(netif, eth_protocol_num::arp, [this] { return get_packet(); }),
+    _rx_packets(
+        _proto.receive(
+            [this] (Packet p, ethernet_address ea) {
+              return process_packet(std::move(p), ea);
+            },
+            [this](forward_hash& out_hash_data, Packet& p, size_t off) {
+              return forward(out_hash_data, p, off);
+            }
+        )
+    )
+{}
+
+// Pops the oldest queued outgoing packet; the returned Tub is left empty
+// when nothing is queued.
+Tub<l3_protocol::l3packet> arp::get_packet()
+{
+  Tub<l3_protocol::l3packet> next;
+  if (_packetq.empty()) {
+    return next;
+  }
+  next = std::move(_packetq.front());
+  _packetq.pop_front();
+  return next;
+}
+
+// Delegates the forwarding decision to the handler registered for the
+// packet's protocol type.  Returns false when the ARP header cannot be read
+// or no handler is registered for that type.
+bool arp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  auto ah = p.get_header<arp_hdr>(off);
+  if (!ah) {
+    // Packet contents come from the network: get_header() may yield a null
+    // pointer when the packet is too short to hold the header.
+    return false;
+  }
+  auto i = _arp_for_protocol.find(ntoh(ah->ptype));
+  if (i == _arp_for_protocol.end()) {
+    return false;
+  }
+  return i->second->forward(out_hash_data, p, off);
+}
+
+// Registers `afp` as the handler for `proto_num`, replacing any handler
+// previously registered for that number.
+void arp::add(uint16_t proto_num, arp_for_protocol* afp)
+{
+  _arp_for_protocol[proto_num] = afp;
+}
+
+// Removes the handler registered for `proto_num`, if there is one.
+void arp::del(uint16_t proto_num)
+{
+  _arp_for_protocol.erase(proto_num);
+}
+
+// Hands a received ARP packet to the handler registered for its protocol
+// type.  Packets whose header cannot be read, or for which no handler is
+// registered, are dropped.  Always returns 0.
+int arp::process_packet(Packet p, ethernet_address from)
+{
+  auto hdr = p.get_header<arp_hdr>();
+  if (!hdr) {
+    // Packet contents come from the network: get_header() may yield a null
+    // pointer when the packet is too short to hold the header.
+    return 0;
+  }
+  auto ah = hdr->ntoh();
+  auto i = _arp_for_protocol.find(ah.ptype);
+  if (i != _arp_for_protocol.end()) {
+    i->second->received(std::move(p));
+  }
+  return 0;
+}
diff --git a/src/msg/async/dpdk/ARP.h b/src/msg/async/dpdk/ARP.h
new file mode 100644
index 00000000..54569564
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.h
@@ -0,0 +1,301 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#ifndef CEPH_MSG_ARP_H_
+#define CEPH_MSG_ARP_H_
+
+#include <errno.h>
+
+#include <unordered_map>
+#include <functional>
+
+#include "msg/async/Event.h"
+
+#include "ethernet.h"
+#include "circular_buffer.h"
+#include "ip_types.h"
+#include "net.h"
+#include "Packet.h"
+
+class arp;
+template <typename L3>
+class arp_for;
+
+// Base class for a per-protocol ARP handler.  The ARP layer (class arp) keeps
+// a map from protocol number to arp_for_protocol* and hands packets of that
+// protocol type to the matching handler.
+class arp_for_protocol {
+ protected:
+  arp& _arp;              // ARP layer this handler is attached to
+  uint16_t _proto_num;    // protocol number this handler is registered under
+ public:
+  arp_for_protocol(arp& a, uint16_t proto_num);
+  virtual ~arp_for_protocol();
+  // Called with each received ARP packet whose protocol type matches.
+  virtual int received(Packet p) = 0;
+  // Forwarding hook; the default implementation declines (returns false).
+  virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return false; }
+};
+
+class interface;
+
+// ARP layer of one network interface.  Receives ARP packets, looks at the
+// protocol type in the header and dispatches to the arp_for_protocol handler
+// registered for it; outgoing ARP packets are queued in _packetq and handed
+// out through get_packet().
+class arp {
+  interface* _netif;
+  l3_protocol _proto;
+  subscription<Packet, ethernet_address> _rx_packets;
+  // protocol number -> handler (raw pointers; handlers add/del themselves)
+  std::unordered_map<uint16_t, arp_for_protocol*> _arp_for_protocol;
+  // outgoing packets waiting to be picked up via get_packet()
+  circular_buffer<l3_protocol::l3packet> _packetq;
+ private:
+  // Leading part of the ARP header: just enough to find the protocol type.
+  struct arp_hdr {
+    uint16_t htype;
+    uint16_t ptype;
+    // Returns a copy with both fields converted by ::ntoh.
+    arp_hdr ntoh() {
+      arp_hdr hdr = *this;
+      hdr.htype = ::ntoh(htype);
+      hdr.ptype = ::ntoh(ptype);
+      return hdr;
+    }
+    // Returns a copy with both fields converted by ::hton.
+    arp_hdr hton() {
+      arp_hdr hdr = *this;
+      hdr.htype = ::hton(htype);
+      hdr.ptype = ::hton(ptype);
+      return hdr;
+    }
+  };
+ public:
+  explicit arp(interface* netif);
+  // Register / unregister the handler for a protocol number.
+  void add(uint16_t proto_num, arp_for_protocol* afp);
+  void del(uint16_t proto_num);
+ private:
+  // Hardware address of the underlying interface.
+  ethernet_address l2self() { return _netif->hw_address(); }
+  int process_packet(Packet p, ethernet_address from);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  Tub<l3_protocol::l3packet> get_packet();
+  template <class l3_proto>
+  friend class arp_for;
+};
+
+// ARP resolver for the L3 protocol `L3`.  Holds the table of resolved
+// l3addr -> l2addr mappings (_table) and the resolutions still in flight
+// (_in_progress), whose waiters are called back when the address is learned
+// or when the retry timer fires.
+template <typename L3>
+class arp_for : public arp_for_protocol {
+ public:
+  using l2addr = ethernet_address;
+  using l3addr = typename L3::address_type;
+ private:
+  // Upper bound on queued waiters per unresolved address; wait() answers
+  // -EBUSY beyond it.
+  static constexpr auto max_waiters = 512;
+  enum oper {
+    op_request = 1,
+    op_reply = 2,
+  };
+  // Full ARP header for this L2/L3 address pair.
+  struct arp_hdr {
+    uint16_t htype;
+    uint16_t ptype;
+    uint8_t hlen;
+    uint8_t plen;
+    uint16_t oper;
+    l2addr sender_hwaddr;
+    l3addr sender_paddr;
+    l2addr target_hwaddr;
+    l3addr target_paddr;
+
+    // Returns a copy with every multi-byte field converted via ntoh
+    // (hlen/plen are single bytes and copied unchanged).
+    arp_hdr ntoh() {
+      arp_hdr hdr = *this;
+      hdr.htype = ::ntoh(htype);
+      hdr.ptype = ::ntoh(ptype);
+      hdr.oper = ::ntoh(oper);
+      hdr.sender_hwaddr = sender_hwaddr.ntoh();
+      hdr.sender_paddr = sender_paddr.ntoh();
+      hdr.target_hwaddr = target_hwaddr.ntoh();
+      hdr.target_paddr = target_paddr.ntoh();
+      return hdr;
+    }
+
+    // Returns a copy with every multi-byte field converted via hton.
+    arp_hdr hton() {
+      arp_hdr hdr = *this;
+      hdr.htype = ::hton(htype);
+      hdr.ptype = ::hton(ptype);
+      hdr.oper = ::hton(oper);
+      hdr.sender_hwaddr = sender_hwaddr.hton();
+      hdr.sender_paddr = sender_paddr.hton();
+      hdr.target_hwaddr = target_hwaddr.hton();
+      hdr.target_paddr = target_paddr.hton();
+      return hdr;
+    }
+  };
+  // State of one unresolved address: the callbacks (with their packets)
+  // waiting for it, and the id of its retry timer event.
+  struct resolution {
+    std::vector<std::pair<resolution_cb, Packet>> _waiters;
+    uint64_t timeout_fd;
+  };
+  // Timer callback for an unresolved address: re-sends the query, fails all
+  // current waiters with -ETIMEDOUT and re-arms itself.
+  // NOTE(review): first_request is stored but never read in this class.
+  // NOTE(review): instances are created with `new` in wait() and nothing in
+  // this file deletes them - confirm who owns time-event callbacks.
+  class C_handle_arp_timeout : public EventCallback {
+    arp_for *arp;
+    l3addr paddr;
+    bool first_request;
+
+   public:
+    C_handle_arp_timeout(arp_for *a, l3addr addr, bool first):
+        arp(a), paddr(addr), first_request(first) {}
+    // NOTE(review): presumably overrides EventCallback::do_request - no
+    // `override` specifier here, confirm against Event.h.
+    void do_request(uint64_t r) {
+      arp->send_query(paddr);
+      auto &res = arp->_in_progress[paddr];
+
+      for (auto& p : res._waiters) {
+        p.first(ethernet_address(), std::move(p.second), -ETIMEDOUT);
+      }
+      res._waiters.clear();
+      // Re-arm with the same callback object; the interval is presumably in
+      // microseconds (i.e. one second) - confirm against EventCenter.
+      res.timeout_fd = arp->center->create_time_event(
+          1*1000*1000, this);
+    }
+  };
+  friend class C_handle_arp_timeout;
+
+ private:
+  CephContext *cct;
+  EventCenter *center;
+  // Own L3 address; starts as the broadcast address until set_self_addr().
+  l3addr _l3self = L3::broadcast_address();
+  std::unordered_map<l3addr, l2addr> _table;
+  std::unordered_map<l3addr, resolution> _in_progress;
+ private:
+  Packet make_query_packet(l3addr paddr);
+  virtual int received(Packet p) override;
+  int handle_request(arp_hdr* ah);
+  l2addr l2self() { return _arp.l2self(); }
+  void send(l2addr to, Packet &&p);
+ public:
+  void send_query(const l3addr& paddr);
+  // Registers for L3's ARP protocol type and seeds the table with the
+  // broadcast L3 -> broadcast L2 mapping.
+  explicit arp_for(CephContext *c, arp& a, EventCenter *cen)
+      : arp_for_protocol(a, L3::arp_protocol_type()), cct(c), center(cen) {
+    _table[L3::broadcast_address()] = ethernet::broadcast_address();
+  }
+  // Cancels the retry timers of all resolutions still in flight.
+  ~arp_for() {
+    for (auto && p : _in_progress)
+      center->delete_time_event(p.second.timeout_fd);
+  }
+  void wait(const l3addr& addr, Packet p, resolution_cb cb);
+  void learn(l2addr l2, l3addr l3);
+  void run();
+  // Replaces the table entry of the previous own address with
+  // addr -> own hardware address and records addr as the own L3 address.
+  void set_self_addr(l3addr addr) {
+    _table.erase(_l3self);
+    _table[addr] = l2self();
+    _l3self = addr;
+  }
+  friend class arp;
+};
+
+// Queues `p` on the ARP layer's outgoing queue, addressed to `to` with the
+// ARP ethertype.
+template <typename L3>
+void arp_for<L3>::send(l2addr to, Packet &&p) {
+  l3_protocol::l3packet outgoing{eth_protocol_num::arp, to, std::move(p)};
+  _arp._packetq.push_back(std::move(outgoing));
+}
+
+// Builds an ARP request asking for the hardware address of `paddr`.  The
+// header is filled in host order, converted with hton() and copied into the
+// returned Packet.
+template <typename L3>
+Packet arp_for<L3>::make_query_packet(l3addr paddr) {
+  arp_hdr query;
+  // hardware / protocol description
+  query.htype = ethernet::arp_hardware_type();
+  query.ptype = L3::arp_protocol_type();
+  query.hlen = sizeof(l2addr);
+  query.plen = sizeof(l3addr);
+  query.oper = op_request;
+  // who is asking
+  query.sender_hwaddr = l2self();
+  query.sender_paddr = _l3self;
+  // who is being asked for
+  query.target_hwaddr = ethernet::broadcast_address();
+  query.target_paddr = paddr;
+
+  arp_hdr wire = query.hton();
+  return Packet(reinterpret_cast<char*>(&wire), sizeof(wire));
+}
+
+// Sends an ARP request for `paddr` to the Ethernet broadcast address.
+template <typename L3>
+void arp_for<L3>::send_query(const l3addr& paddr) {
+  Packet query = make_query_packet(paddr);
+  send(ethernet::broadcast_address(), std::move(query));
+}
+
+// Records the mapping paddr -> hwaddr.  If a resolution for paddr was in
+// flight, its retry timer is cancelled, every waiter is called back with the
+// learned address and status 0, and the in-flight entry is removed.
+template <typename L3>
+void arp_for<L3>::learn(l2addr hwaddr, l3addr paddr) {
+  _table[paddr] = hwaddr;
+
+  auto pending = _in_progress.find(paddr);
+  if (pending == _in_progress.end()) {
+    return;
+  }
+
+  resolution& res = pending->second;
+  center->delete_time_event(res.timeout_fd);
+  for (auto& waiter : res._waiters) {
+    waiter.first(hwaddr, std::move(waiter.second), 0);
+  }
+  _in_progress.erase(pending);
+}
+
+// Resolves `paddr` for packet `p` and reports through `cb`.
+//
+// - Already in the table: cb is invoked immediately with the address and 0.
+// - Otherwise the (cb, p) pair is queued on the in-flight resolution for
+//   paddr; the first waiter also arms the retry timer and sends the query.
+// - If the waiter list is already at max_waiters, cb is invoked with -EBUSY.
+template <typename L3>
+void arp_for<L3>::wait(const l3addr& paddr, Packet p, resolution_cb cb) {
+  const auto known = _table.find(paddr);
+  if (known != _table.end()) {
+    cb(known->second, std::move(p), 0);
+    return ;
+  }
+
+  auto pending = _in_progress.find(paddr);
+  const bool first_request = (pending == _in_progress.end());
+
+  resolution* res = nullptr;
+  if (first_request) {
+    // Creates the in-flight entry, then arms the timer before querying.
+    res = &_in_progress[paddr];
+    res->timeout_fd = center->create_time_event(
+        1*1000*1000, new C_handle_arp_timeout(this, paddr, first_request));
+    send_query(paddr);
+  } else {
+    res = &pending->second;
+  }
+
+  if (res->_waiters.size() >= max_waiters) {
+    cb(ethernet_address(), std::move(p), -EBUSY);
+  } else {
+    res->_waiters.emplace_back(cb, std::move(p));
+  }
+}
+
+// Handles one received ARP packet for this protocol.  Packets that are too
+// short for the header, or whose hlen/plen do not match the address sizes,
+// are ignored.  Requests go to handle_request(); replies are passed to the
+// interface's arp_learn(); any other operation is ignored.  Returns the
+// result of handle_request() for requests and 0 otherwise.
+template <typename L3>
+int arp_for<L3>::received(Packet p) {
+  auto raw = p.get_header<arp_hdr>();
+  if (!raw) {
+    return 0;
+  }
+
+  auto hdr = raw->ntoh();
+  const bool sizes_match =
+      hdr.hlen == sizeof(l2addr) && hdr.plen == sizeof(l3addr);
+  if (!sizes_match) {
+    return 0;
+  }
+
+  if (hdr.oper == op_request) {
+    return handle_request(&hdr);
+  }
+  if (hdr.oper == op_reply) {
+    _arp._netif->arp_learn(hdr.sender_hwaddr, hdr.sender_paddr);
+  }
+  return 0;
+}
+
+// Answers an ARP request (header `ah`, in host order) when it asks for our
+// own L3 address and that address has been configured (i.e. is no longer the
+// broadcast placeholder).  The header is rewritten in place into the reply.
+// Always returns 0.
+template <typename L3>
+int arp_for<L3>::handle_request(arp_hdr* ah) {
+  if (ah->target_paddr == _l3self
+      && _l3self != L3::broadcast_address()) {
+    // Keep the destination in host order: after hton() below the header
+    // fields are in wire order and should not be used as a send() argument.
+    const l2addr requester = ah->sender_hwaddr;
+    ah->oper = op_reply;
+    ah->target_hwaddr = requester;
+    ah->target_paddr = ah->sender_paddr;
+    ah->sender_hwaddr = l2self();
+    ah->sender_paddr = _l3self;
+    *ah = ah->hton();
+    send(requester, Packet(reinterpret_cast<char*>(ah), sizeof(*ah)));
+  }
+  return 0;
+}
+
+#endif /* CEPH_MSG_ARP_H_ */
diff --git a/src/msg/async/dpdk/DPDK.cc b/src/msg/async/dpdk/DPDK.cc
new file mode 100644
index 00000000..278efe9e
--- /dev/null
+++ b/src/msg/async/dpdk/DPDK.cc
@@ -0,0 +1,1267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include <atomic>
+#include <vector>
+#include <queue>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_eal.h>
+#include <rte_pci.h>
+#include <rte_ethdev.h>
+#include <rte_cycles.h>
+#include <rte_memzone.h>
+
+#include "include/page.h"
+#include "align.h"
+#include "IP.h"
+#include "const.h"
+#include "dpdk_rte.h"
+#include "DPDK.h"
+#include "toeplitz.h"
+
+#include "common/Cycles.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+
+// Returns the address of `p` as an untyped pointer, for passing it as the
+// opaque argument of a mempool init callback.
+void* as_cookie(struct rte_pktmbuf_pool_private& p) {
+  return &p;
+}
+
+#ifndef MARKER
+typedef void    *MARKER[0];   /**< generic marker for a point in a structure */
+#endif
+
+/******************* Net device related constants *****************************/
+// Number of descriptors requested for a queue ring.
+static constexpr uint16_t default_ring_size      = 512;
+
+//
+// We need 2 times the ring size of buffers because of the way PMDs
+// refill the ring.
+//
+static constexpr uint16_t mbufs_per_queue_rx     = 2 * default_ring_size;
+static constexpr uint16_t rx_gc_thresh           = 64;
+
+//
+// No need to keep more descriptors in the air than can be sent in a single
+// rte_eth_tx_burst() call.
+//
+static constexpr uint16_t mbufs_per_queue_tx     = 2 * default_ring_size;
+
+// Per-lcore cache size passed to rte_mempool_create().
+static constexpr uint16_t mbuf_cache_size        = 512;
+//
+// Size of the data buffer in the non-inline case.
+//
+// We may want to change (increase) this value in future, while the
+// inline_mbuf_data_size value will unlikely change due to reasons described
+// above.
+//
+static constexpr size_t mbuf_data_size = 4096;
+
+// Bytes that precede the data area of every mbuf: the rte_mbuf structure
+// itself plus the configured headroom.
+static constexpr uint16_t mbuf_overhead          =
+                          sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM;
+//
+// We'll allocate 2K data buffers for an inline case because this would require
+// a single page per mbuf. If we used 4K data buffers here it would require 2
+// pages for a single buffer (due to "mbuf_overhead") and this is a much more
+// demanding memory constraint.
+//
+static constexpr size_t inline_mbuf_data_size = 2048;
+
+
+// (INLINE_MBUF_DATA_SIZE(2K)*32 = 64K = Max TSO/LRO size) + 1 mbuf for headers
+static constexpr uint8_t max_frags = 32 + 1;
+
+//
+// Intel's 40G NIC HW limit for a number of fragments in an xmit segment.
+//
+// See Chapter 8.4.1 "Transmit Packet in System Memory" of the xl710 devices
+// spec. for more details.
+//
+static constexpr uint8_t i40e_max_xmit_segment_frags = 8;
+
+//
+// VMWare's virtual NIC limit for a number of fragments in an xmit segment.
+//
+// see drivers/net/vmxnet3/base/vmxnet3_defs.h VMXNET3_MAX_TXD_PER_PKT
+//
+static constexpr uint8_t vmxnet3_max_xmit_segment_frags = 16;
+
+// Total object size of an inline mbuf: data area plus mbuf_overhead.
+static constexpr uint16_t inline_mbuf_size = inline_mbuf_data_size + mbuf_overhead;
+
+// Alignment used by qp_mempool_obj_size() when sizing the pools.
+static size_t huge_page_size = 512 * CEPH_PAGE_SIZE;
+
+// Returns the combined size of the two per-queue mempool objects (one
+// computed from mbuf_overhead, one from inline_mbuf_size), each including the
+// pktmbuf pool private area and rounded up to huge_page_size.
+uint32_t qp_mempool_obj_size()
+{
+  //
+  // We will align each size to huge page size because DPDK allocates
+  // physically contiguous memory region for each pool object.
+  //
+  const size_t priv_size = sizeof(struct rte_pktmbuf_pool_private);
+  uint32_t total = 0;
+
+  // Rx
+  struct rte_mempool_objsz rx_objsz = {};
+  total += align_up(
+      rte_mempool_calc_obj_size(mbuf_overhead, 0, &rx_objsz) + priv_size,
+      huge_page_size);
+
+  // Tx
+  struct rte_mempool_objsz tx_objsz = {};
+  total += align_up(
+      rte_mempool_calc_obj_size(inline_mbuf_size, 0, &tx_objsz) + priv_size,
+      huge_page_size);
+
+  return total;
+}
+
+// Prefix of the mempool names; init_rx_mbuf_pool() appends the queue id and
+// an "_rx" suffix.
+static constexpr const char* pktmbuf_pool_name   = "dpdk_net_pktmbuf_pool";
+
+/*
+ * When doing reads from the NIC queues, use this batch size
+ */
+static constexpr uint8_t packet_read_size        = 32;
+/******************************************************************************/
+
+// First stage of port bring-up: reads the device capabilities for _port_idx,
+// applies driver-specific workarounds, derives the Tx queue flags, RSS and
+// offload settings, and configures the port with _num_queues rx/tx queues.
+//
+// Returns 0 on success, or the non-zero value returned by
+// rte_eth_dev_configure() on failure.
+int DPDKDevice::init_port_start()
+{
+  ceph_assert(_port_idx < rte_eth_dev_count());
+
+  rte_eth_dev_info_get(_port_idx, &_dev_info);
+
+  //
+  // This is a workaround for a missing handling of a HW limitation in the
+  // DPDK i40e driver. This and all related to _is_i40e_device code should be
+  // removed once this handling is added.
+  //
+  if (std::string("rte_i40evf_pmd") == _dev_info.driver_name ||
+      std::string("rte_i40e_pmd") == _dev_info.driver_name) {
+    ldout(cct, 1) << __func__ << " Device is an Intel's 40G NIC. Enabling 8 fragments hack!" << dendl;
+    _is_i40e_device = true;
+  }
+
+  if (std::string("rte_vmxnet3_pmd") == _dev_info.driver_name) {
+    ldout(cct, 1) << __func__ << " Device is a VMWare Virtual NIC. Enabling 16 fragments hack!" << dendl;
+    _is_vmxnet3_device = true;
+  }
+
+  //
+  // Another workaround: this time for a lack of number of RSS bits.
+  // ixgbe PF NICs support up to 16 RSS queues.
+  // ixgbe VF NICs support up to 4 RSS queues.
+  // i40e PF NICs support up to 64 RSS queues.
+  // i40e VF NICs support up to 16 RSS queues.
+  //
+  if (std::string("rte_ixgbe_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
+  } else if (std::string("rte_ixgbevf_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)4);
+  } else if (std::string("rte_i40e_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)64);
+  } else if (std::string("rte_i40evf_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
+  }
+
+  // Clear txq_flags - we want to support all available offload features
+  // except for multi-mempool and refcnt'ing which we don't need
+  _dev_info.default_txconf.txq_flags =
+      ETH_TXQ_FLAGS_NOMULTMEMP | ETH_TXQ_FLAGS_NOREFCOUNT;
+
+  //
+  // Disable features that are not supported by port's HW
+  //
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+  }
+
+  /* for port configuration all features are off by default */
+  rte_eth_conf port_conf = { 0 };
+
+  ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": max_rx_queues "
+                << _dev_info.max_rx_queues << "  max_tx_queues "
+                << _dev_info.max_tx_queues << dendl;
+
+  // Never use more queues than the device offers in either direction.
+  _num_queues = std::min({_num_queues, _dev_info.max_rx_queues, _dev_info.max_tx_queues});
+
+  ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": using "
+                << _num_queues << " queues" << dendl;
+
+  // Set RSS mode: RSS is enabled only when more than one queue is in use;
+  // with a single queue the port is configured with ETH_MQ_RX_NONE.
+  if (_num_queues > 1) {
+    if (_dev_info.hash_key_size == 40) {
+      _rss_key = default_rsskey_40bytes;
+    } else if (_dev_info.hash_key_size == 52) {
+      _rss_key = default_rsskey_52bytes;
+    } else if (_dev_info.hash_key_size != 0) {
+      // WTF?!!
+      rte_exit(EXIT_FAILURE,
+               "Port %d: We support only 40 or 52 bytes RSS hash keys, %d bytes key requested",
+               _port_idx, _dev_info.hash_key_size);
+    } else {
+      // Device reports no key size: fall back to the 40-byte default key.
+      _rss_key = default_rsskey_40bytes;
+      _dev_info.hash_key_size = 40;
+    }
+
+    port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+    port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
+    if (_dev_info.hash_key_size) {
+      port_conf.rx_adv_conf.rss_conf.rss_key = const_cast<uint8_t *>(_rss_key.data());
+      port_conf.rx_adv_conf.rss_conf.rss_key_len = _dev_info.hash_key_size;
+    }
+  } else {
+    port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
+  }
+
+  if (_num_queues > 1) {
+    if (_dev_info.reta_size) {
+      // RETA size should be a power of 2
+      ceph_assert((_dev_info.reta_size & (_dev_info.reta_size - 1)) == 0);
+
+      // Set the RSS table to the correct size
+      _redir_table.resize(_dev_info.reta_size);
+      _rss_table_bits = std::lround(std::log2(_dev_info.reta_size));
+      ldout(cct, 5) << __func__ << " Port " << int(_port_idx)
+                    << ": RSS table size is " << _dev_info.reta_size << dendl;
+    } else {
+      // FIXME: same with sw_reta
+      _redir_table.resize(128);
+      _rss_table_bits = std::lround(std::log2(128));
+    }
+  } else {
+    _redir_table.push_back(0);
+  }
+
+  // Set Rx VLAN stripping
+  if (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
+    port_conf.rxmode.hw_vlan_strip = 1;
+  }
+
+  // Enable HW CRC stripping
+  port_conf.rxmode.hw_strip_crc = 1;
+
+#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT
+  // Enable LRO
+  if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) {
+    ldout(cct, 1) << __func__ << " LRO is on" << dendl;
+    port_conf.rxmode.enable_lro = 1;
+    _hw_features.rx_lro = true;
+  } else
+#endif
+    ldout(cct, 1) << __func__ << " LRO is off" << dendl;
+
+  // Check that all CSUM features are either all set all together or not set
+  // all together. If this assumption breaks we need to rework the below logic
+  // by splitting the csum offload feature bit into separate bits for IPv4,
+  // TCP.
+  ceph_assert(((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+          (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) ||
+         (!(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+          !(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)));
+
+  // Set Rx checksum checking
+  if ((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+      (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
+    ldout(cct, 1) << __func__ << " RX checksum offload supported" << dendl;
+    port_conf.rxmode.hw_ip_checksum = 1;
+    _hw_features.rx_csum_offload = 1;
+  }
+
+  if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
+    ldout(cct, 1) << __func__ << " TX ip checksum offload supported" << dendl;
+    _hw_features.tx_csum_ip_offload = 1;
+  }
+
+  // TSO is supported starting from DPDK v1.8
+  if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
+    ldout(cct, 1) << __func__ << " TSO is supported" << dendl;
+    _hw_features.tx_tso = 1;
+  }
+
+  // Only the TCP checksum capability decides the L4 Tx checksum offload
+  // feature bit, so there is no cross-capability consistency to assert here.
+  if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) {
+    ldout(cct, 1) << __func__ << " TX TCP checksum offload supported" << dendl;
+    _hw_features.tx_csum_l4_offload = 1;
+  }
+
+  int retval;
+
+  ldout(cct, 1) << __func__ << " Port " << int(_port_idx) << " init ... " << dendl;
+
+  /*
+   * Standard DPDK port initialisation - config port, then set up
+   * rx and tx rings.
+   */
+  if ((retval = rte_eth_dev_configure(_port_idx, _num_queues, _num_queues,
+                                      &port_conf)) != 0) {
+    lderr(cct) << __func__ << " failed to configure port " << (int)_port_idx
+               << " rx/tx queues " << _num_queues << " error " << cpp_strerror(retval) << dendl;
+    return retval;
+  }
+
+  //rte_eth_promiscuous_enable(port_num);
+  ldout(cct, 1) << __func__ << " done." << dendl;
+
+  return 0;
+}
+
+// Switches hardware flow control on the port to RTE_FC_FULL or RTE_FC_NONE
+// according to _enable_fc.  The current settings are read first so only the
+// mode field changes.  A device answering -ENOTSUP to either call is logged
+// and left untouched; any other negative result aborts the process.
+void DPDKDevice::set_hw_flow_control()
+{
+  // Read the port's current/default flow control settings
+  struct rte_eth_fc_conf fc_conf;
+  int rc = rte_eth_dev_flow_ctrl_get(_port_idx, &fc_conf);
+  if (rc == -ENOTSUP) {
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx)
+                  << ": not support to get hardware flow control settings: " << rc << dendl;
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl;
+    return;
+  }
+  if (rc < 0) {
+    lderr(cct) << __func__ << " port " << int(_port_idx)
+               << ": failed to get hardware flow control settings: " << rc << dendl;
+    ceph_abort();
+  }
+
+  fc_conf.mode = _enable_fc ? RTE_FC_FULL : RTE_FC_NONE;
+
+  rc = rte_eth_dev_flow_ctrl_set(_port_idx, &fc_conf);
+  if (rc == -ENOTSUP) {
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx)
+                  << ": not support to set hardware flow control settings: " << rc << dendl;
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl;
+    return;
+  }
+  if (rc < 0) {
+    lderr(cct) << __func__ << " port " << int(_port_idx)
+               << ": failed to set hardware flow control settings: " << rc << dendl;
+    ceph_abort();
+  }
+
+  ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ":  HW FC " << _enable_fc << dendl;
+}
+
+// Final stage of port bring-up: applies flow control, starts the port, sets
+// up the RSS hash function and redirection table when more than one queue is
+// used, and waits for the link to come up.
+//
+// Returns 0 on success and -1 if the port cannot be started, the hash
+// function cannot be set, or the link does not come up.
+int DPDKDevice::init_port_fini()
+{
+  // Changing FC requires HW reset, so set it before the port is initialized.
+  set_hw_flow_control();
+
+  // The port index is cast to int in log output, as elsewhere in this file,
+  // so that it is printed as a number rather than as a character.
+  if (rte_eth_dev_start(_port_idx) != 0) {
+    lderr(cct) << __func__ << " can't start port " << int(_port_idx) << dendl;
+    return -1;
+  }
+
+  if (_num_queues > 1) {
+    // rte_eth_dev_filter_supported() returns 0 when the filter is supported.
+    if (!rte_eth_dev_filter_supported(_port_idx, RTE_ETH_FILTER_HASH)) {
+      ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": HASH FILTER configuration is supported" << dendl;
+
+      // Setup HW to use the TOEPLITZ hash function as an RSS hash function
+      struct rte_eth_hash_filter_info info = {};
+
+      info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
+      info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+
+      if (rte_eth_dev_filter_ctrl(_port_idx, RTE_ETH_FILTER_HASH,
+                                  RTE_ETH_FILTER_SET, &info) < 0) {
+        lderr(cct) << __func__ << " cannot set hash function on a port " << int(_port_idx) << dendl;
+        return -1;
+      }
+    }
+
+    set_rss_table();
+  }
+
+  // Wait for a link
+  if (check_port_link_status() < 0) {
+    lderr(cct) << __func__ << " port link up failed " << int(_port_idx) << dendl;
+    return -1;
+  }
+
+  ldout(cct, 5) << __func__ << " created DPDK device" << dendl;
+  return 0;
+}
+
+// Sets this queue up to proxy packets according to `cpu_weights` (must not be
+// empty).  When the only entry is this queue itself nothing is configured;
+// otherwise a packet provider draining _proxy_packetq is registered and the
+// software redirection table is built from the weights.
+void DPDKQueuePair::configure_proxies(const std::map<unsigned, float>& cpu_weights) {
+  ceph_assert(!cpu_weights.empty());
+
+  // special case queue sending to self only, to avoid requiring a hash value
+  const bool self_only =
+      cpu_weights.size() == 1 && cpu_weights.begin()->first == _qid;
+  if (self_only) {
+    return;
+  }
+
+  auto drain_proxy_queue = [this] {
+    Tub<Packet> next;
+    if (_proxy_packetq.empty()) {
+      return next;
+    }
+    next = std::move(_proxy_packetq.front());
+    _proxy_packetq.pop_front();
+    return next;
+  };
+  register_packet_provider(std::move(drain_proxy_queue));
+  build_sw_reta(cpu_weights);
+}
+
+// Builds the 128-entry software redirection table from `cpu_weights`
+// (cpu id -> weight): each cpu receives a contiguous run of slots
+// proportional to its share of the total weight, in map order.
+//
+// The table is zero-initialised and the fill loop is bounded by the table
+// size, so degenerate weights (e.g. a total of zero) cannot leave
+// uninitialised slots or write past the end of the table.
+void DPDKQueuePair::build_sw_reta(const std::map<unsigned, float>& cpu_weights) {
+  float total_weight = 0;
+  for (auto&& x : cpu_weights) {
+    total_weight += x.second;
+  }
+  float accum = 0;
+  unsigned idx = 0;
+  std::array<uint8_t, 128> reta{};
+  for (auto&& entry : cpu_weights) {
+    auto cpu = entry.first;
+    auto weight = entry.second;
+    accum += weight;
+    // Fill up to the rounded cumulative share of this cpu.
+    while (idx < reta.size() &&
+           idx < (accum / total_weight * reta.size() - 0.5)) {
+      reta[idx++] = cpu;
+    }
+  }
+  _sw_reta = reta;
+}
+
+
+// Creates (or looks up) the Rx mbuf pool of this queue.  When the pool has to
+// be created, it also reserves a memzone with additional data buffers, which
+// are recorded in _alloc_bufs, and sets up the Rx queue on the device.
+//
+// Returns true when the pool is available, false if the pool or the Rx queue
+// could not be set up.
+bool DPDKQueuePair::init_rx_mbuf_pool()
+{
+  std::string name = std::string(pktmbuf_pool_name) + std::to_string(_qid) + "_rx";
+
+  // reserve the memory for Rx buffers containers
+  _rx_free_pkts.reserve(mbufs_per_queue_rx);
+  _rx_free_bufs.reserve(mbufs_per_queue_rx);
+
+  _pktmbuf_pool_rx = rte_mempool_lookup(name.c_str());
+  if (!_pktmbuf_pool_rx) {
+    ldout(cct, 1) << __func__ << " Creating Rx mbuf pool '" << name.c_str()
+                  << "' [" << mbufs_per_queue_rx << " mbufs] ..."<< dendl;
+
+    //
+    // Don't pass single-producer/single-consumer flags to mbuf create as it
+    // seems faster to use a cache instead.
+    //
+    struct rte_pktmbuf_pool_private roomsz = {};
+    roomsz.mbuf_data_room_size = mbuf_data_size + RTE_PKTMBUF_HEADROOM;
+    _pktmbuf_pool_rx = rte_mempool_create(
+        name.c_str(),
+        mbufs_per_queue_rx, mbuf_overhead + mbuf_data_size,
+        mbuf_cache_size,
+        sizeof(struct rte_pktmbuf_pool_private),
+        rte_pktmbuf_pool_init, as_cookie(roomsz),
+        rte_pktmbuf_init, nullptr,
+        rte_socket_id(), 0);
+    if (!_pktmbuf_pool_rx) {
+      lderr(cct) << __func__ << " Failed to create mempool for rx" << dendl;
+      return false;
+    }
+
+    //
+    // allocate more data buffer
+    // NOTE(review): bufs_count is not checked for being positive; a
+    // configured count at or below mbufs_per_queue_rx looks unhandled -
+    // confirm the option's valid range.
+    int bufs_count =  cct->_conf->ms_dpdk_rx_buffer_count_per_core - mbufs_per_queue_rx;
+    int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
+    std::string mz_name = "rx_buffer_data" + std::to_string(_qid);
+    const struct rte_memzone *mz = rte_memzone_reserve_aligned(mz_name.c_str(),
+          mbuf_data_size*bufs_count, _pktmbuf_pool_rx->socket_id, mz_flags, mbuf_data_size);
+    ceph_assert(mz);
+    // Walk the zone in mbuf_data_size steps.  A char* is used because
+    // arithmetic on void* is a compiler extension, not standard C++.
+    char* m = static_cast<char*>(mz->addr);
+    for (int i = 0; i < bufs_count; i++) {
+      ceph_assert(m);
+      _alloc_bufs.push_back(static_cast<void*>(m));
+      m += mbuf_data_size;
+    }
+
+    if (rte_eth_rx_queue_setup(_dev_port_idx, _qid, default_ring_size,
+                               rte_eth_dev_socket_id(_dev_port_idx),
+                               _dev->def_rx_conf(), _pktmbuf_pool_rx) < 0) {
+      lderr(cct) << __func__ << " cannot initialize rx queue" << dendl;
+      return false;
+    }
+  }
+
+  return _pktmbuf_pool_rx != nullptr;
+}
+
+// Polls the link state of the port until it reports "up" or the retry budget
+// is exhausted.
+//
+// Returns 0 once the link is up, -1 if it is still down after the retries.
+int DPDKDevice::check_port_link_status()
+{
+  int count = 0;
+
+  ldout(cct, 20) << __func__ << dendl;
+  const int sleep_time = 100 * 1000;  /* usleep() argument: 100ms per poll */
+  const int max_check_time = 90;  /* 9s (90 * 100ms) in total */
+  while (true) {
+    struct rte_eth_link link;
+    memset(&link, 0, sizeof(link));
+    rte_eth_link_get_nowait(_port_idx, &link);
+
+    if (link.link_status) {
+      ldout(cct, 5) << __func__ << " done port "
+                    << static_cast<unsigned>(_port_idx)
+                    << " link Up - speed " << link.link_speed
+                    << " Mbps - "
+                    << ((link.link_duplex == ETH_LINK_FULL_DUPLEX) ? ("full-duplex") : ("half-duplex"))
+                    << dendl;
+      break;
+    }
+
+    if (count++ >= max_check_time) {
+      // Port index printed as a number, matching the success message above.
+      lderr(cct) << __func__ << " done port " << static_cast<unsigned>(_port_idx)
+                 << " link down" << dendl;
+      return -1;
+    }
+
+    ldout(cct, 20) << __func__ << " not ready, continue to wait." << dendl;
+    usleep(sleep_time);
+  }
+  return 0;
+}
+
+// Event callback that triggers a device statistics refresh on the queue pair
+// it was created for.
+class C_handle_dev_stats : public EventCallback {
+  DPDKQueuePair *_qp;
+ public:
+  C_handle_dev_stats(DPDKQueuePair *qp)
+    : _qp(qp)
+  {}
+  // The event id is not used.
+  void do_request(uint64_t /*id*/) {
+    _qp->handle_stats();
+  }
+};
+
+// Constructs the queue pair `qid` of device `dev`: initialises the Rx mbuf
+// pool (aborting on failure), checks the tx_buf layout assumptions at compile
+// time, registers the per-queue perf counters under the name "queue<qid>" and,
+// for queue 0 only, schedules the first device statistics event.
+DPDKQueuePair::DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid)
+  : cct(c), _dev(dev), _dev_port_idx(dev->port_idx()), center(cen), _qid(qid),
+    _tx_poller(this), _rx_gc_poller(this), _tx_buf_factory(c, dev, qid),
+    _tx_gc_poller(this)
+{
+  if (!init_rx_mbuf_pool()) {
+    lderr(cct) << __func__ << " cannot initialize mbuf pools" << dendl;
+    ceph_abort();
+  }
+
+  // Layout requirements of tx_buf relative to the mbuf it is embedded in.
+  static_assert(offsetof(tx_buf, private_end) -
+                offsetof(tx_buf, private_start) <= RTE_PKTMBUF_HEADROOM,
+                "RTE_PKTMBUF_HEADROOM is less than DPDKQueuePair::tx_buf size! "
+                "Increase the headroom size in the DPDK configuration");
+  static_assert(offsetof(tx_buf, _mbuf) == 0,
+                "There is a pad at the beginning of the tx_buf before _mbuf "
+                "field!");
+  static_assert((inline_mbuf_data_size & (inline_mbuf_data_size - 1)) == 0,
+                "inline_mbuf_data_size has to be a power of two!");
+
+  // Per-queue perf counters.
+  std::string name(std::string("queue") + std::to_string(qid));
+  PerfCountersBuilder plb(cct, name, l_dpdk_qp_first, l_dpdk_qp_last);
+
+  plb.add_u64_counter(l_dpdk_qp_rx_packets, "dpdk_receive_packets", "DPDK received packets");
+  plb.add_u64_counter(l_dpdk_qp_tx_packets, "dpdk_send_packets", "DPDK sendd packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_bad_checksum_errors, "dpdk_receive_bad_checksum_errors", "DPDK received bad checksum packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_no_memory_errors, "dpdk_receive_no_memory_errors", "DPDK received no memory packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_bytes, "dpdk_receive_bytes", "DPDK received bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_tx_bytes, "dpdk_send_bytes", "DPDK sendd bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_rx_last_bunch, "dpdk_receive_last_bunch", "DPDK last received bunch");
+  plb.add_u64_counter(l_dpdk_qp_tx_last_bunch, "dpdk_send_last_bunch", "DPDK last send bunch");
+  plb.add_u64_counter(l_dpdk_qp_rx_fragments, "dpdk_receive_fragments", "DPDK received total fragments");
+  plb.add_u64_counter(l_dpdk_qp_tx_fragments, "dpdk_send_fragments", "DPDK sendd total fragments");
+  plb.add_u64_counter(l_dpdk_qp_rx_copy_ops, "dpdk_receive_copy_ops", "DPDK received copy operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_copy_ops, "dpdk_send_copy_ops", "DPDK sendd copy operations");
+  plb.add_u64_counter(l_dpdk_qp_rx_copy_bytes, "dpdk_receive_copy_bytes", "DPDK received copy bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_tx_copy_bytes, "dpdk_send_copy_bytes", "DPDK send copy bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_rx_linearize_ops, "dpdk_receive_linearize_ops", "DPDK received linearize operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_linearize_ops, "dpdk_send_linearize_ops", "DPDK send linearize operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_queue_length, "dpdk_send_queue_length", "DPDK send queue length");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+
+  // Only queue 0 polls the device-wide statistics.
+  // NOTE(review): device_stat_time_fd is not assigned here for other queues -
+  // confirm it has a default initialiser in the class definition.
+  if (!_qid)
+    device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+}
+
+/**
+ * Publish port-wide NIC statistics into the device perf counters and
+ * schedule the next collection.
+ *
+ * The counters of port _dev_port_idx are read with rte_eth_stats_get(). If
+ * that call reports an error, the error is logged and the function returns
+ * without creating a new time event.
+ */
+void DPDKQueuePair::handle_stats()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+
+  rte_eth_stats port_stats = {};
+  const int err = rte_eth_stats_get(_dev_port_idx, &port_stats);
+  if (err != 0) {
+    ldout(cct, 0) << __func__ << " failed to get port statistics: " << cpp_strerror(err) << dendl;
+    return;
+  }
+
+  // Aggregate error totals.
+  _dev->perf_logger->set(l_dpdk_dev_rx_total_errors, port_stats.ierrors);
+  _dev->perf_logger->set(l_dpdk_dev_tx_total_errors, port_stats.oerrors);
+
+  // Receive-side drop reasons.
+  _dev->perf_logger->set(l_dpdk_dev_rx_dropped_errors, port_stats.imissed);
+  _dev->perf_logger->set(l_dpdk_dev_rx_nombuf_errors, port_stats.rx_nombuf);
+
+#if RTE_VERSION < RTE_VERSION_NUM(16,7,0,0)
+  // These two fields are only read when building against DPDK < 16.07.
+  _dev->perf_logger->set(l_dpdk_dev_rx_mcast, port_stats.imcasts);
+  _dev->perf_logger->set(l_dpdk_dev_rx_badcrc_errors, port_stats.ibadcrc);
+#endif
+
+  // Re-arm the statistics timer.
+  device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+}
+
+/**
+ * Refill the transmit queue from the registered packet providers and hand
+ * the queued packets to send().
+ *
+ * Providers are only polled while the queue holds fewer than
+ * refill_low_mark packets; polling stops once the queue reaches queue_cap
+ * packets, a full round yields no packet, or work_budget packets have been
+ * pulled. When ms_dpdk_debug_allow_loopback is set, packets whose source and
+ * destination MAC match are delivered straight to l2receive().
+ *
+ * @return true if the queue was non-empty and send() was invoked,
+ *         false otherwise.
+ */
+bool DPDKQueuePair::poll_tx() {
+  constexpr int refill_low_mark = 16;
+  constexpr int queue_cap = 128;
+  constexpr int work_budget = 256;
+
+  bool nonloopback = !cct->_conf->ms_dpdk_debug_allow_loopback;
+#ifdef CEPH_PERF_DEV
+  uint64_t start = Cycles::rdtsc();
+#endif
+  uint32_t total_work = 0;
+
+  if (_tx_packetq.size() < refill_low_mark) {
+    // Pull packets from the upper layers, one per provider per round.
+    uint32_t round_work;
+    do {
+      round_work = 0;
+      for (auto&& provider : _pkt_providers) {
+        auto pkt = provider();
+        if (!pkt) {
+          continue;
+        }
+        ++round_work;
+
+        if (likely(nonloopback)) {
+          _tx_packetq.push_back(std::move(*pkt));
+        } else {
+          // Debug loopback: short-circuit frames addressed to ourselves.
+          auto eth = pkt->get_header<eth_hdr>(0);
+          if (eth->dst_mac == eth->src_mac) {
+            _dev->l2receive(_qid, std::move(*pkt));
+          } else {
+            _tx_packetq.push_back(std::move(*pkt));
+          }
+        }
+
+        if (_tx_packetq.size() == queue_cap) {
+          break;
+        }
+      }
+      total_work += round_work;
+    } while (round_work && total_work < work_budget && _tx_packetq.size() < queue_cap);
+  }
+
+  if (_tx_packetq.empty()) {
+    return false;
+  }
+
+  uint64_t sent = send(_tx_packetq);
+  perf_logger->inc(l_dpdk_qp_tx_packets, sent);
+  perf_logger->set(l_dpdk_qp_tx_last_bunch, sent);
+#ifdef CEPH_PERF_DEV
+  tx_count += total_work;
+  tx_cycles += Cycles::rdtsc() - start;
+#endif
+  return true;
+}
+
+/**
+ * Build a multi-fragment Packet from a chained (multi-segment) mbuf.
+ *
+ * One fragment is created per segment of the chain. The packet's deleter
+ * pushes every segment's data pointer onto _alloc_bufs.
+ *
+ * @param m head of the mbuf chain
+ * @return the Packet referencing the chain's data
+ */
+inline Tub<Packet> DPDKQueuePair::from_mbuf_lro(rte_mbuf* m)
+{
+  _frags.clear();
+  _bufs.clear();
+
+  rte_mbuf* seg = m;
+  while (seg != nullptr) {
+    char* seg_data = rte_pktmbuf_mtod(seg, char*);
+
+    _frags.emplace_back(fragment{seg_data, rte_pktmbuf_data_len(seg)});
+    _bufs.push_back(seg_data);
+
+    seg = seg->next;
+  }
+
+  // The deleter takes over the collected data pointers (moved out of _bufs).
+  auto release_bufs = [this, held = std::move(_bufs)] {
+    for (auto&& b : held) {
+      _alloc_bufs.push_back(b);
+    }
+  };
+  return Packet(
+      _frags.begin(), _frags.end(), make_deleter(std::move(release_bufs)));
+}
+
+/**
+ * Wrap a received mbuf into a Packet.
+ *
+ * The mbuf is always queued on _rx_free_pkts and its segment count added to
+ * _num_rx_free_segs. Chained mbufs on an LRO-enabled device are delegated to
+ * from_mbuf_lro(); everything else becomes a single-fragment Packet whose
+ * deleter pushes the data pointer onto _alloc_bufs.
+ *
+ * @param m received mbuf
+ * @return the Packet referencing the mbuf's data
+ */
+inline Tub<Packet> DPDKQueuePair::from_mbuf(rte_mbuf* m)
+{
+  _rx_free_pkts.push_back(m);
+  _num_rx_free_segs += m->nb_segs;
+
+  const bool needs_lro_path =
+    _dev->hw_features_ref().rx_lro && !rte_pktmbuf_is_contiguous(m);
+  if (needs_lro_path) {
+    return from_mbuf_lro(m);
+  }
+
+  char* payload = rte_pktmbuf_mtod(m, char*);
+  return Packet(fragment{payload, rte_pktmbuf_data_len(m)},
+                make_deleter([this, payload] { _alloc_bufs.push_back(payload); }));
+}
+
+/**
+ * Give every segment of an mbuf cluster a fresh data buffer.
+ *
+ * Successfully refilled segments are appended to _rx_free_bufs. On the first
+ * failure the segment that could not be refilled (and, through its next
+ * pointer, the rest of the cluster) is pushed back onto _rx_free_pkts so it
+ * can be retried later.
+ *
+ * @param head first segment of the cluster
+ * @return true if all segments were refilled, false on allocation failure
+ */
+inline bool DPDKQueuePair::refill_one_cluster(rte_mbuf* head)
+{
+  rte_mbuf* seg = head;
+  while (seg != nullptr) {
+    if (!refill_rx_mbuf(seg, mbuf_data_size, _alloc_bufs)) {
+      // Keep the unprocessed tail for a later retry.
+      _rx_free_pkts.push_back(seg);
+      return false;
+    }
+    _rx_free_bufs.push_back(seg);
+    seg = seg->next;
+  }
+
+  return true;
+}
+
+/**
+ * Refill consumed RX mbufs and return them to the RX mempool.
+ *
+ * Runs only when the number of pending free segments has reached
+ * rx_gc_thresh or when forced.
+ *
+ * @param force run the collection regardless of the threshold
+ * @return true if, after the call, the number of pending free segments is
+ *         still at or above rx_gc_thresh
+ */
+bool DPDKQueuePair::rx_gc(bool force)
+{
+  const bool should_collect = force || _num_rx_free_segs >= rx_gc_thresh;
+  if (!should_collect) {
+    // Below the threshold and not forced: nothing to do.
+    return false;
+  }
+
+  ldout(cct, 10) << __func__ << " free segs " << _num_rx_free_segs
+                 << " thresh " << rx_gc_thresh
+                 << " free pkts " << _rx_free_pkts.size()
+                 << dendl;
+
+  // Drain from the back: back() + pop_back() leaves the container empty
+  // without needing a separate (linear) clear() afterwards.
+  while (!_rx_free_pkts.empty()) {
+    auto cluster = _rx_free_pkts.back();
+    _rx_free_pkts.pop_back();
+
+    if (!refill_one_cluster(cluster)) {
+      ldout(cct, 1) << __func__ << " get new mbuf failed " << dendl;
+      break;
+    }
+  }
+
+  for (auto&& refilled : _rx_free_bufs) {
+    rte_pktmbuf_prefree_seg(refilled);
+  }
+
+  if (_rx_free_bufs.size()) {
+    rte_mempool_put_bulk(_pktmbuf_pool_rx,
+                         (void **)_rx_free_bufs.data(),
+                         _rx_free_bufs.size());
+
+    // TODO: ceph_assert() in a fast path! Remove me ASAP!
+    ceph_assert(_num_rx_free_segs >= _rx_free_bufs.size());
+
+    _num_rx_free_segs -= _rx_free_bufs.size();
+    _rx_free_bufs.clear();
+
+    // TODO: ceph_assert() in a fast path! Remove me ASAP!
+    ceph_assert((_rx_free_pkts.empty() && !_num_rx_free_segs) ||
+           (!_rx_free_pkts.empty() && _num_rx_free_segs));
+  }
+
+  return _num_rx_free_segs >= rx_gc_thresh;
+}
+
+
+/**
+ * Convert a burst of received mbufs into Packets and pass them to the L2
+ * layer, updating the per-queue RX perf counters.
+ *
+ * @param bufs  array of received mbufs
+ * @param count number of valid entries in bufs
+ */
+void DPDKQueuePair::process_packets(
+    struct rte_mbuf **bufs, uint16_t count)
+{
+  uint64_t total_frags = 0;
+  uint64_t total_bytes = 0;
+
+  for (uint16_t idx = 0; idx < count; ++idx) {
+    struct rte_mbuf *mbuf = bufs[idx];
+    offload_info oi;
+
+    Tub<Packet> pkt = from_mbuf(mbuf);
+
+    // Drop the packet if the translation above has failed
+    if (!pkt) {
+      perf_logger->inc(l_dpdk_qp_rx_no_memory_errors);
+      continue;
+    }
+
+    total_frags += mbuf->nb_segs;
+    total_bytes += mbuf->pkt_len;
+
+    // Set the stripped VLAN value if available
+    const bool vlan_stripped =
+      (_dev->_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) &&
+      (mbuf->ol_flags & PKT_RX_VLAN_STRIPPED);
+    if (vlan_stripped) {
+      oi.vlan_tci = mbuf->vlan_tci;
+    }
+
+    //
+    // With rx_csum_offload on, the receive code for ip, tcp and udp
+    // assumes it does not need to verify the checksum again, because it
+    // is checked here: a packet flagged with a bad checksum is dropped.
+    //
+    if (_dev->get_hw_features().rx_csum_offload &&
+        (mbuf->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD))) {
+      perf_logger->inc(l_dpdk_qp_rx_bad_checksum_errors);
+      continue;
+    }
+
+    pkt->set_offload_info(oi);
+    if (mbuf->ol_flags & PKT_RX_RSS_HASH) {
+      pkt->set_rss_hash(mbuf->hash.rss);
+    }
+
+    _dev->l2receive(_qid, std::move(*pkt));
+  }
+
+  perf_logger->inc(l_dpdk_qp_rx_packets, count);
+  perf_logger->set(l_dpdk_qp_rx_last_bunch, count);
+  perf_logger->inc(l_dpdk_qp_rx_fragments, total_frags);
+  perf_logger->inc(l_dpdk_qp_rx_bytes, total_bytes);
+}
+
+/**
+ * Poll the RX queue once: read up to packet_read_size mbufs from the port
+ * and process them.
+ *
+ * With CEPH_PERF_DEV defined, the cycles spent and packets handled are
+ * accumulated, and an average is logged (and the accumulators reset) on an
+ * empty poll once more than 10000 packets have been received and at least
+ * one TX has been counted.
+ *
+ * @return true if at least one packet was received
+ */
+bool DPDKQueuePair::poll_rx_once()
+{
+  struct rte_mbuf *buf[packet_read_size];
+
+  /* read a port */
+#ifdef CEPH_PERF_DEV
+  uint64_t start = Cycles::rdtsc();
+#endif
+  uint16_t count = rte_eth_rx_burst(_dev_port_idx, _qid,
+                                       buf, packet_read_size);
+
+  /* Now process the NIC packets read */
+  if (likely(count > 0)) {
+    process_packets(buf, count);
+#ifdef CEPH_PERF_DEV
+    //
+    // Accumulate (not overwrite) the cycles: the average below divides
+    // rx_cycles by the cumulative rx_count, mirroring how poll_tx()
+    // accumulates tx_cycles.
+    //
+    rx_cycles += Cycles::rdtsc() - start;
+    rx_count += count;
+#endif
+  }
+#ifdef CEPH_PERF_DEV
+  else {
+    if (rx_count > 10000 && tx_count) {
+      ldout(cct, 0) << __func__ << " rx count=" << rx_count << " avg rx=" << Cycles::to_nanoseconds(rx_cycles)/rx_count << "ns "
+                    << " tx count=" << tx_count << " avg tx=" << Cycles::to_nanoseconds(tx_cycles)/tx_count << "ns"
+                    << dendl;
+      rx_count = rx_cycles = tx_count = tx_cycles = 0;
+    }
+  }
+#endif
+
+  return count;
+}
+
+/**
+ * Set up the TX buffer factory of one queue.
+ *
+ * Looks up the per-queue TX mempool by name; if it does not exist yet, the
+ * pool is created and the device TX queue is set up. Either failure aborts
+ * the process. Finally the factory is filled from the pool.
+ *
+ * @param c   Ceph context used for logging
+ * @param dev device the queue belongs to
+ * @param qid queue index; also part of the mempool name
+ */
+DPDKQueuePair::tx_buf_factory::tx_buf_factory(CephContext *c,
+        DPDKDevice *dev, uint8_t qid): cct(c)
+{
+  std::string name(pktmbuf_pool_name);
+  name += std::to_string(qid);
+  name += "_tx";
+
+  _pool = rte_mempool_lookup(name.c_str());
+  if (_pool == nullptr) {
+    ldout(cct, 0) << __func__ << " Creating Tx mbuf pool '" << name.c_str()
+                  << "' [" << mbufs_per_queue_tx << " mbufs] ..." << dendl;
+
+    //
+    // The buffers are pushed from the mempool into the circular_buffer
+    // and polled from there anyway, so a non-atomic mempool would be
+    // preferable in this case.
+    // NOTE(review): the flags argument passed below is 0 - confirm that
+    // this matches the intent stated above.
+    //
+    _pool = rte_mempool_create(name.c_str(),
+                               mbufs_per_queue_tx, inline_mbuf_size,
+                               mbuf_cache_size,
+                               sizeof(struct rte_pktmbuf_pool_private),
+                               rte_pktmbuf_pool_init, nullptr,
+                               rte_pktmbuf_init, nullptr,
+                               rte_socket_id(), 0);
+    if (_pool == nullptr) {
+      lderr(cct) << __func__ << " Failed to create mempool for Tx" << dendl;
+      ceph_abort();
+    }
+
+    const int setup_rc = rte_eth_tx_queue_setup(
+        dev->port_idx(), qid, default_ring_size,
+        rte_eth_dev_socket_id(dev->port_idx()),
+        dev->def_tx_conf());
+    if (setup_rc < 0) {
+      lderr(cct) << __func__ << " cannot initialize tx queue" << dendl;
+      ceph_abort();
+    }
+  }
+
+  // Fill the factory with the buffers of the mempool obtained above.
+  init_factory();
+}
+
+//
+// Decide whether the packet behind the given mbuf cluster has to be
+// linearized before it can be sent on an i40e device.
+//
+// Non-TSO: true when the cluster has more than
+// i40e_max_xmit_segment_frags segments.
+// TSO: true when some MSS-sized window of the payload spans more fragments
+// than are left once the header fragments have been accounted for.
+//
+bool DPDKQueuePair::tx_buf::i40e_should_linearize(rte_mbuf *head)
+{
+  bool is_tso = head->ol_flags & PKT_TX_TCP_SEG;
+
+  // For a non-TSO case: number of fragments should not exceed 8
+  if (!is_tso){
+    return head->nb_segs > i40e_max_xmit_segment_frags;
+  }
+
+  //
+  // For a TSO case each MSS window should not include more than 8
+  // fragments including headers.
+  //
+
+  // Calculate the number of frags containing headers.
+  //
+  // Note: we support neither VLAN nor tunneling thus headers size
+  // accounting is super simple.
+  //
+  size_t headers_size = head->l2_len + head->l3_len + head->l4_len;
+  unsigned hdr_frags = 0;
+  size_t cur_payload_len = 0;
+  rte_mbuf *cur_seg = head;
+
+  // Walk the chain until the accumulated length covers all the headers.
+  // After the loop cur_seg points past the last header-bearing segment.
+  while (cur_seg && cur_payload_len < headers_size) {
+    cur_payload_len += cur_seg->data_len;
+    cur_seg = cur_seg->next;
+    hdr_frags++;
+  }
+
+  //
+  // Header fragments will be used for each TSO segment, thus the
+  // maximum number of data segments will be 8 minus the number of
+  // header fragments.
+  //
+  // It's unclear from the spec how the first TSO segment is treated
+  // if the last fragment with headers contains some data bytes:
+  // whether this fragment will be accounted as a single fragment or
+  // as two separate fragments. We prefer to play it safe and assume
+  // that this fragment will be accounted as two separate fragments.
+  //
+  // NOTE(review): if hdr_frags could ever exceed
+  // i40e_max_xmit_segment_frags this subtraction would wrap around -
+  // confirm that headers never span that many fragments.
+  //
+  size_t max_win_size = i40e_max_xmit_segment_frags - hdr_frags;
+
+  // The whole cluster fits into one window: no need to inspect windows.
+  if (head->nb_segs <= max_win_size) {
+    return false;
+  }
+
+  // Get the data (without headers) part of the first data fragment
+  size_t prev_frag_data = cur_payload_len - headers_size;
+  auto mss = head->tso_segsz;
+
+  // Slide over the payload one MSS window at a time, counting how many
+  // fragments each window touches.
+  while (cur_seg) {
+    unsigned frags_in_seg = 0;
+    size_t cur_seg_size = 0;
+
+    // Bytes left over from the fragment that ended the previous window
+    // (or from the last header fragment) count as one fragment here.
+    if (prev_frag_data) {
+      cur_seg_size = prev_frag_data;
+      frags_in_seg++;
+      prev_frag_data = 0;
+    }
+
+    while (cur_seg_size < mss && cur_seg) {
+      cur_seg_size += cur_seg->data_len;
+      cur_seg = cur_seg->next;
+      frags_in_seg++;
+
+      if (frags_in_seg > max_win_size) {
+        return true;
+      }
+    }
+
+    // Carry the bytes that overshoot this window into the next one.
+    if (cur_seg_size > mss) {
+      prev_frag_data = cur_seg_size - mss;
+    }
+  }
+
+  return false;
+}
+
+//
+// Translate the packet's offload_info into the ol_flags and header-length
+// fields of the cluster's head mbuf: IP checksum, TCP checksum and TSO.
+//
+void DPDKQueuePair::tx_buf::set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head)
+{
+  auto oi = p.offload_info();
+
+  // IP checksum offload
+  if (oi.needs_ip_csum) {
+    head->ol_flags |= PKT_TX_IP_CKSUM;
+    // TODO: Take a VLAN header into an account here
+    head->l2_len = sizeof(struct ether_hdr);
+    head->l3_len = oi.ip_hdr_len;
+  }
+
+  // Everything below applies only to TCP on a device with L4 checksum offload
+  const bool tcp_l4_offload =
+    qp.port().get_hw_features().tx_csum_l4_offload &&
+    (oi.protocol == ip_protocol_num::tcp);
+  if (!tcp_l4_offload) {
+    return;
+  }
+
+  head->ol_flags |= PKT_TX_TCP_CKSUM;
+  // TODO: Take a VLAN header into an account here
+  head->l2_len = sizeof(struct ether_hdr);
+  head->l3_len = oi.ip_hdr_len;
+
+  if (!oi.tso_seg_size) {
+    return;
+  }
+
+  // TCP segmentation offload
+  ceph_assert(oi.needs_ip_csum);
+  head->ol_flags |= PKT_TX_TCP_SEG;
+  head->l4_len = oi.tcp_hdr_len;
+  head->tso_segsz = oi.tso_seg_size;
+}
+
+//
+// Build a tx_buf cluster for the given packet, fragment by fragment.
+//
+// Returns the head tx_buf of the cluster, or nullptr when no mbuf could be
+// obtained for one of the fragments (mbufs already taken for the cluster
+// are recycled). On success the packet is moved into the last segment's
+// tx_buf.
+//
+DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_zc(
+        CephContext *cct, Packet&& p, DPDKQueuePair& qp)
+{
+  // Too fragmented - linearize
+  if (p.nr_frags() > max_frags) {
+    p.linearize();
+    qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops);
+  }
+
+  // Re-entered via goto below when the built cluster has to be rebuilt
+  // from the linearized packet.
+ build_mbuf_cluster:
+  rte_mbuf *head = nullptr, *last_seg = nullptr;
+  unsigned nsegs = 0;
+
+  //
+  // Create a HEAD of the fragmented packet: check if frag0 has to be
+  // copied and if yes - send it in a copy way
+  //
+  if (!check_frag0(p)) {
+    if (!copy_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
+      ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl;
+      return nullptr;
+    }
+  } else if (!translate_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
+    ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl;
+    return nullptr;
+  }
+
+  unsigned total_nsegs = nsegs;
+
+  // Translate the remaining fragments and append each resulting chain
+  for (unsigned i = 1; i < p.nr_frags(); i++) {
+    rte_mbuf *h = nullptr, *new_last_seg = nullptr;
+    if (!translate_one_frag(qp, p.frag(i), h, new_last_seg, nsegs)) {
+      ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(i).size << dendl;
+      // Give back what has been chained to the head so far
+      me(head)->recycle();
+      return nullptr;
+    }
+
+    total_nsegs += nsegs;
+
+    // Attach a new buffers' chain to the packet chain
+    last_seg->next = h;
+    last_seg = new_last_seg;
+  }
+
+  // Update the HEAD buffer with the packet info
+  head->pkt_len = p.len();
+  head->nb_segs = total_nsegs;
+
+  set_cluster_offload_info(p, qp, head);
+
+  //
+  // If a packet hasn't been linearized already and the resulting
+  // cluster requires the linearisation due to HW limitation:
+  //
+  //    - Recycle the cluster.
+  //    - Linearize the packet.
+  //    - Build the cluster once again
+  //
+  // NOTE(review): the first condition does not depend on p.nr_frags(); if
+  // the rebuilt cluster still has nb_segs > max_frags after p.linearize(),
+  // this goto repeats - confirm that cannot happen for the supported
+  // packet sizes.
+  //
+  if (head->nb_segs > max_frags ||
+      (p.nr_frags() > 1 && qp.port().is_i40e_device() && i40e_should_linearize(head)) ||
+      (p.nr_frags() > vmxnet3_max_xmit_segment_frags && qp.port().is_vmxnet3_device())) {
+    me(head)->recycle();
+    p.linearize();
+    qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops);
+
+    goto build_mbuf_cluster;
+  }
+
+  // The last segment's tx_buf takes ownership of the packet object
+  me(last_seg)->set_packet(std::move(p));
+
+  return me(head);
+}
+
+//
+// Copy all fragments of the packet, back to back, into the chain of mbufs
+// starting at head. Every segment is filled up to inline_mbuf_data_size
+// bytes before moving on to the next one; data_len is set on each segment
+// as it is completed. The chain must be long enough to hold the packet
+// (asserted below).
+//
+void DPDKQueuePair::tx_buf::copy_packet_to_cluster(const Packet& p, rte_mbuf* head)
+{
+  // Destination cursor: current segment and write offset within it
+  rte_mbuf* cur_seg = head;
+  size_t cur_seg_offset = 0;
+  // Source cursor: current fragment and read offset within it
+  unsigned cur_frag_idx = 0;
+  size_t cur_frag_offset = 0;
+
+  while (true) {
+    // Copy as much as both the rest of the fragment and the rest of the
+    // segment allow
+    size_t to_copy = std::min(p.frag(cur_frag_idx).size - cur_frag_offset,
+                              inline_mbuf_data_size - cur_seg_offset);
+
+    memcpy(rte_pktmbuf_mtod_offset(cur_seg, void*, cur_seg_offset),
+           p.frag(cur_frag_idx).base + cur_frag_offset, to_copy);
+
+    cur_frag_offset += to_copy;
+    cur_seg_offset += to_copy;
+
+    // Current fragment exhausted: advance to the next one, or finish
+    if (cur_frag_offset >= p.frag(cur_frag_idx).size) {
+      ++cur_frag_idx;
+      if (cur_frag_idx >= p.nr_frags()) {
+        //
+        // We are done - set the data size of the last segment
+        // of the cluster.
+        //
+        cur_seg->data_len = cur_seg_offset;
+        break;
+      }
+
+      cur_frag_offset = 0;
+    }
+
+    // Current segment full: close it and move on to the next segment
+    if (cur_seg_offset >= inline_mbuf_data_size) {
+      cur_seg->data_len = inline_mbuf_data_size;
+      cur_seg = cur_seg->next;
+      cur_seg_offset = 0;
+
+      // FIXME: assert in a fast-path - remove!!!
+      ceph_assert(cur_seg);
+    }
+  }
+}
+
+//
+// Build a tx_buf cluster for the packet by copying its data.
+//
+// The whole chain of mbufs is taken first; the data is copied only once
+// every required mbuf has been obtained. Returns the head tx_buf of the
+// cluster, or nullptr if the packet is empty or a tx_buf could not be
+// obtained (mbufs already chained are recycled in that case).
+//
+DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_copy(Packet&& p, DPDKQueuePair& qp)
+{
+  // Nothing to send for an empty packet
+  if (!p.len()) {
+    return nullptr;
+  }
+
+  //
+  // inline_mbuf_data_size is a power of two, so the number of required
+  // segments is the packet length rounded up to that size, divided by it.
+  //
+  auto rounded_len = align_up((size_t)p.len(), inline_mbuf_data_size);
+  unsigned nsegs = rounded_len / inline_mbuf_data_size;
+
+  tx_buf* txb = qp.get_tx_buf();
+  if (!txb) {
+    return nullptr;
+  }
+
+  rte_mbuf* head = txb->rte_mbuf_p();
+  rte_mbuf* last_seg = head;
+
+  // Chain the remaining (nsegs - 1) segments behind the head
+  for (unsigned seg = 1; seg < nsegs; ++seg) {
+    txb = qp.get_tx_buf();
+    if (!txb) {
+      me(head)->recycle();
+      return nullptr;
+    }
+
+    last_seg->next = txb->rte_mbuf_p();
+    last_seg = last_seg->next;
+  }
+
+  //
+  // From here on nothing can fail: describe the packet in the head mbuf,
+  // copy the payload and set the offload info.
+  //
+  head->pkt_len = p.len();
+  head->nb_segs = nsegs;
+
+  copy_packet_to_cluster(p, head);
+  set_cluster_offload_info(p, qp, head);
+
+  return me(head);
+}
+
+//
+// Take one tx_buf from the queue pair and copy up to
+// inline_mbuf_data_size bytes of data into its mbuf.
+//
+// m receives the mbuf that was filled. Returns the number of bytes copied,
+// or 0 if no tx_buf was available (m is left untouched in that case).
+//
+size_t DPDKQueuePair::tx_buf::copy_one_data_buf(
+    DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len)
+{
+  tx_buf* txb = qp.get_tx_buf();
+  if (txb == nullptr) {
+    return 0;
+  }
+
+  const size_t copy_len = std::min(buf_len, inline_mbuf_data_size);
+
+  m = txb->rte_mbuf_p();
+
+  // Equivalent of mbuf_put(): this mbuf carries exactly copy_len bytes
+  m->data_len = copy_len;
+  m->pkt_len  = copy_len;
+  memcpy(rte_pktmbuf_mtod(m, void*), data, copy_len);
+
+  qp.perf_logger->inc(l_dpdk_qp_tx_copy_ops);
+  qp.perf_logger->inc(l_dpdk_qp_tx_copy_bytes, copy_len);
+
+  return copy_len;
+}
+
+/**
+ * Fill the RSS redirection tables so that entries are spread round-robin
+ * over the _num_queues queues.
+ *
+ * The local table (_redir_table) is always filled. The hardware table is
+ * updated only when the device reports a non-zero reta_size; a failed
+ * hardware update terminates the process via rte_exit().
+ */
+void DPDKDevice::set_rss_table()
+{
+  // Local indirection table
+  unsigned next = 0;
+  for (auto& entry : _redir_table) {
+    entry = next % _num_queues;
+    ++next;
+  }
+
+  if (_dev_info.reta_size == 0)
+    return;
+
+  int reta_conf_size = std::max(1, _dev_info.reta_size / RTE_RETA_GROUP_SIZE);
+  rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
+
+  // HW indirection table: select every entry of each group and assign the
+  // queues with one counter running across all the groups
+  next = 0;
+  for (int group = 0; group < reta_conf_size; ++group) {
+    reta_conf[group].mask = ~0ULL;
+    for (auto& queue : reta_conf[group].reta) {
+      queue = next % _num_queues;
+      ++next;
+    }
+  }
+
+  const int rc = rte_eth_dev_rss_reta_update(_port_idx, reta_conf, _dev_info.reta_size);
+  if (rc) {
+    rte_exit(EXIT_FAILURE, "Port %d: Failed to update an RSS indirection table", _port_idx);
+  }
+}
+
+/******************************** Interface functions *************************/
+
+/**
+ * Create the DPDKDevice for the given port.
+ *
+ * Terminates the process through rte_exit() when DPDK reports no Ethernet
+ * ports at all.
+ *
+ * @param cct       Ceph context
+ * @param cores     forwarded to the DPDKDevice constructor
+ * @param port_idx  index of the port to use
+ * @param use_lro   forwarded to the DPDKDevice constructor
+ * @param enable_fc forwarded to the DPDKDevice constructor
+ * @return the newly created device
+ */
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+    CephContext *cct,
+    unsigned cores,
+    uint8_t port_idx,
+    bool use_lro,
+    bool enable_fc)
+{
+  // We need at least one DPDK-able port; rte_exit() does not return.
+  if (rte_eth_dev_count() == 0) {
+    rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
+  }
+  ldout(cct, 10) << __func__ << " ports number: " << int(rte_eth_dev_count()) << dendl;
+
+  std::unique_ptr<DPDKDevice> device(
+      new DPDKDevice(cct, port_idx, cores, use_lro, enable_fc));
+  return device;
+}
diff --git a/src/msg/async/dpdk/DPDK.h b/src/msg/async/dpdk/DPDK.h
new file mode 100644
index 00000000..fa12af6b
--- /dev/null
+++ b/src/msg/async/dpdk/DPDK.h
@@ -0,0 +1,918 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_DPDK_DEV_H
+#define CEPH_DPDK_DEV_H
+
+#include <memory>
+#include <functional>
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_version.h>
+
+#include "include/page.h"
+#include "common/Tub.h"
+#include "common/perf_counters.h"
+#include "msg/async/Event.h"
+#include "const.h"
+#include "circular_buffer.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "net.h"
+#include "toeplitz.h"
+
+
+// Deleter functor that releases the given pointer with ::free().
+struct free_deleter {
+  void operator()(void* ptr) {
+    ::free(ptr);
+  }
+};
+
+
+// Indices of the device-wide perf counters. l_dpdk_dev_first and
+// l_dpdk_dev_last are the bounds of the range; the entries in between are
+// the individual counters.
+enum {
+  l_dpdk_dev_first = 58800,
+  l_dpdk_dev_rx_mcast,            // set from rte_eth_stats::imcasts
+  l_dpdk_dev_rx_total_errors,     // set from rte_eth_stats::ierrors
+  l_dpdk_dev_tx_total_errors,     // set from rte_eth_stats::oerrors
+  l_dpdk_dev_rx_badcrc_errors,    // set from rte_eth_stats::ibadcrc
+  l_dpdk_dev_rx_dropped_errors,   // set from rte_eth_stats::imissed
+  l_dpdk_dev_rx_nombuf_errors,    // set from rte_eth_stats::rx_nombuf
+  l_dpdk_dev_last
+};
+
+// Indices of the per-queue-pair perf counters. l_dpdk_qp_first and
+// l_dpdk_qp_last are the bounds of the range; the entries in between are
+// the individual counters (rx = receive side, tx = send side).
+enum {
+  l_dpdk_qp_first = 58900,
+  l_dpdk_qp_rx_packets,               // received packets
+  l_dpdk_qp_tx_packets,               // sent packets
+  l_dpdk_qp_rx_bad_checksum_errors,   // received packets with a bad checksum
+  l_dpdk_qp_rx_no_memory_errors,      // received packets hitting no-memory
+  l_dpdk_qp_rx_bytes,                 // received bytes
+  l_dpdk_qp_tx_bytes,                 // sent bytes
+  l_dpdk_qp_rx_last_bunch,            // size of the last received bunch
+  l_dpdk_qp_tx_last_bunch,            // size of the last sent bunch
+  l_dpdk_qp_rx_fragments,             // total received fragments
+  l_dpdk_qp_tx_fragments,             // total sent fragments
+  l_dpdk_qp_rx_copy_ops,              // receive-side copy operations
+  l_dpdk_qp_tx_copy_ops,              // send-side copy operations
+  l_dpdk_qp_rx_copy_bytes,            // receive-side copied bytes
+  l_dpdk_qp_tx_copy_bytes,            // send-side copied bytes
+  l_dpdk_qp_rx_linearize_ops,         // receive-side linearize operations
+  l_dpdk_qp_tx_linearize_ops,         // send-side linearize operations
+  l_dpdk_qp_tx_queue_length,          // send queue length
+  l_dpdk_qp_last
+};
+
+class DPDKDevice;
+class DPDKWorker;
+
+class DPDKQueuePair {
+  using packet_provider_type = std::function<Tub<Packet> ()>;
+ public:
+  void configure_proxies(const std::map<unsigned, float>& cpu_weights);
+  // build REdirection TAble for cpu_weights map: target cpu -> weight
+  void build_sw_reta(const std::map<unsigned, float>& cpu_weights);
+  // Append a packet to _proxy_packetq; the packet is moved into the queue.
+  void proxy_send(Packet p) {
+    _proxy_packetq.push_back(std::move(p));
+  }
+  // Register a callable that yields packets to transmit (an empty Tub when
+  // it has none); poll_tx() polls the registered providers to refill the
+  // send queue.
+  void register_packet_provider(packet_provider_type func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  bool poll_tx();
+  friend class DPDKDevice;
+
+  class tx_buf_factory;
+
+  /**
+   * A transmit buffer: an rte_mbuf followed by the bookkeeping needed to
+   * hand the buffer back to its factory.
+   *
+   * The rte_mbuf is the first data member and me() converts an rte_mbuf
+   * pointer back to its tx_buf with a reinterpret_cast, so the member order
+   * below must not be changed.
+   */
+  class tx_buf {
+    friend class DPDKQueuePair;
+   public:
+    // Convert a pointer to the embedded rte_mbuf into its owning tx_buf.
+    // Only valid for mbufs that are the _mbuf member of a tx_buf.
+    static tx_buf* me(rte_mbuf* mbuf) {
+      return reinterpret_cast<tx_buf*>(mbuf);
+    }
+
+   private:
+    /**
+     * Checks if the original packet of a given cluster should be linearized
+     * due to HW limitations.
+     *
+     * @param head head of a cluster to check
+     *
+     * @return TRUE if a packet should be linearized.
+     */
+    static bool i40e_should_linearize(rte_mbuf *head);
+
+    /**
+     * Sets the offload info in the head buffer of an rte_mbufs cluster.
+     *
+     * @param p an original packet the cluster is built for
+     * @param qp QP handle
+     * @param head a head of an rte_mbufs cluster
+     */
+    static void set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head);
+
+    /**
+     * Creates a tx_buf cluster representing a given packet in a "zero-copy"
+     * way.
+     *
+     * @param cct Ceph context used for logging
+     * @param p packet to translate
+     * @param qp DPDKQueuePair handle
+     *
+     * @return the HEAD tx_buf of the cluster or nullptr in case of a
+     *         failure
+     */
+    static tx_buf* from_packet_zc(
+            CephContext *cct, Packet&& p, DPDKQueuePair& qp);
+
+    /**
+     * Copy the contents of the "packet" into the given cluster of
+     * rte_mbuf's.
+     *
+     * @note Size of the cluster has to be big enough to accommodate all the
+     *       contents of the given packet.
+     *
+     * @param p packet to copy
+     * @param head head of the rte_mbuf's cluster
+     */
+    static void copy_packet_to_cluster(const Packet& p, rte_mbuf* head);
+
+    /**
+     * Creates a tx_buf cluster representing a given packet in a "copy" way.
+     *
+     * @param p packet to translate
+     * @param qp DPDKQueuePair handle
+     *
+     * @return the HEAD tx_buf of the cluster or nullptr in case of a
+     *         failure
+     */
+    static tx_buf* from_packet_copy(Packet&& p, DPDKQueuePair& qp);
+
+    /**
+     * Generic handling of a single fragment: the fragment is split into a
+     * chain of rte_mbuf's, each of them filled by the given functor. On
+     * failure every mbuf already chained to the head is recycled.
+     *
+     * @param do_one_buf Functor responsible for a single rte_mbuf
+     *                   handling; returns the number of bytes it consumed,
+     *                   or 0 on failure
+     * @param qp DPDKQueuePair handle (in)
+     * @param frag Fragment to handle (in); its size must not be zero
+     * @param head Head of the cluster (out)
+     * @param last_seg Last segment of the cluster (out)
+     * @param nsegs Number of segments in the cluster (out)
+     *
+     * @return TRUE in case of success
+     */
+    template <class DoOneBufFunc>
+    static bool do_one_frag(DoOneBufFunc do_one_buf, DPDKQueuePair& qp,
+                            fragment& frag, rte_mbuf*& head,
+                            rte_mbuf*& last_seg, unsigned& nsegs) {
+      size_t len, left_to_set = frag.size;
+      char* base = frag.base;
+
+      rte_mbuf* m;
+
+      // TODO: ceph_assert() in a fast path! Remove me ASAP!
+      ceph_assert(frag.size);
+
+      // Create a HEAD of mbufs' cluster and set the first bytes into it
+      len = do_one_buf(qp, head, base, left_to_set);
+      if (!len) {
+        return false;
+      }
+
+      left_to_set -= len;
+      base += len;
+      nsegs = 1;
+
+      //
+      // Set the rest of the data into the new mbufs and chain them to
+      // the cluster.
+      //
+      rte_mbuf* prev_seg = head;
+      while (left_to_set) {
+        len = do_one_buf(qp, m, base, left_to_set);
+        if (!len) {
+          // Give back everything chained to the head so far
+          me(head)->recycle();
+          return false;
+        }
+
+        left_to_set -= len;
+        base += len;
+        nsegs++;
+
+        prev_seg->next = m;
+        prev_seg = m;
+      }
+
+      // Return the last mbuf in the cluster
+      last_seg = prev_seg;
+
+      return true;
+    }
+
+    /**
+     * Handling of a single fragment through set_one_data_buf().
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param frag Fragment to handle (in)
+     * @param head Head of the cluster (out)
+     * @param last_seg Last segment of the cluster (out)
+     * @param nsegs Number of segments in the cluster (out)
+     *
+     * @return TRUE in case of success
+     */
+    static bool translate_one_frag(DPDKQueuePair& qp, fragment& frag,
+                                   rte_mbuf*& head, rte_mbuf*& last_seg,
+                                   unsigned& nsegs) {
+      return do_one_frag(set_one_data_buf, qp, frag, head,
+                         last_seg, nsegs);
+    }
+
+    /**
+     * Copies one fragment into the cluster of rte_mbuf's.
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param frag Fragment to copy (in)
+     * @param head Head of the cluster (out)
+     * @param last_seg Last segment of the cluster (out)
+     * @param nsegs Number of segments in the cluster (out)
+     *
+     * We return the "last_seg" to avoid traversing the cluster in order to get
+     * it.
+     *
+     * @return TRUE in case of success
+     */
+    static bool copy_one_frag(DPDKQueuePair& qp, fragment& frag,
+                              rte_mbuf*& head, rte_mbuf*& last_seg,
+                              unsigned& nsegs) {
+      return do_one_frag(copy_one_data_buf, qp, frag, head,
+                         last_seg, nsegs);
+    }
+
+    /**
+     * Allocates a single rte_mbuf and sets it to point to a given data
+     * buffer.
+     *
+     * @note As the code stands, this function unconditionally returns the
+     *       result of copy_one_data_buf(); the zero-copy code after that
+     *       first return statement is unreachable (see the FIXME below).
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param m New allocated rte_mbuf (out)
+     * @param va virtual address of a data buffer (in)
+     * @param buf_len length of the data to copy (in)
+     *
+     * @return The actual number of bytes that has been set in the mbuf
+     */
+    static size_t set_one_data_buf(
+        DPDKQueuePair& qp, rte_mbuf*& m, char* va, size_t buf_len) {
+      static constexpr size_t max_frag_len = 15 * 1024; // 15K
+
+      // FIXME: current all tx buf is allocated without rte_malloc
+      return copy_one_data_buf(qp, m, va, buf_len);
+      //
+      // Currently we break a buffer on a 15K boundary because 82599
+      // devices have a 15.5K limitation on a maximum single fragment
+      // size.
+      //
+      rte_iova_t pa = rte_malloc_virt2iova(va);
+      if (!pa)
+        return copy_one_data_buf(qp, m, va, buf_len);
+
+      ceph_assert(buf_len);
+      tx_buf* buf = qp.get_tx_buf();
+      if (!buf) {
+        return 0;
+      }
+
+      size_t len = std::min(buf_len, max_frag_len);
+
+      buf->set_zc_info(va, pa, len);
+      m = buf->rte_mbuf_p();
+
+      return len;
+    }
+
+    /**
+     *  Allocates a single rte_mbuf and copies a given data into it.
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param m New allocated rte_mbuf (out)
+     * @param data Data to copy from (in)
+     * @param buf_len length of the data to copy (in)
+     *
+     * @return The actual number of bytes that has been copied
+     */
+    static size_t copy_one_data_buf(
+        DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len);
+
+    /**
+     * Checks if the first fragment of the given packet may go through the
+     * zero-copy flow. The intent is that the first 128 bytes (the packet
+     * headers) must not be split.
+     *
+     * @note The current implementation only checks that the first fragment
+     *       is at least 128 bytes long; it does not look at page
+     *       boundaries.
+     *
+     * @param p packet to check
+     *
+     * @return TRUE if packet is ok and FALSE otherwise.
+     */
+    static bool check_frag0(Packet& p)
+    {
+      //
+      // First frag is special - it has headers that should not be split.
+      // If the first fragment is too short to hold the first 128 bytes
+      // on its own, then send this packet in a (non-zero) copy flow.
+      // Otherwise - we are good to go.
+      //
+      if (p.frag(0).size < 128)
+        return false;
+
+      return true;
+    }
+
+   public:
+    // Remember the factory this buffer belongs to and save the mbuf fields
+    // that set_zc_info() overwrites, so that reset_zc() can restore them.
+    tx_buf(tx_buf_factory& fc) : _fc(fc) {
+
+      _buf_physaddr = _mbuf.buf_physaddr;
+      _data_off     = _mbuf.data_off;
+    }
+
+    // Access the embedded rte_mbuf.
+    rte_mbuf* rte_mbuf_p() { return &_mbuf; }
+
+    // Point the mbuf at an external data buffer of the given length and
+    // mark this tx_buf as used in the zero-copy flow.
+    void set_zc_info(void* va, phys_addr_t pa, size_t len) {
+      // mbuf_put()
+      _mbuf.data_len           = len;
+      _mbuf.pkt_len            = len;
+
+      // Set the mbuf to point to our data
+      _mbuf.buf_addr           = va;
+      _mbuf.buf_physaddr       = pa;
+      _mbuf.data_off           = 0;
+      _is_zc                   = true;
+    }
+
+    // Release the packet held by this buffer (if any) and undo the changes
+    // made by set_zc_info(). Does nothing when the buffer neither holds a
+    // packet nor was used in the zero-copy flow.
+    void reset_zc() {
+
+      //
+      // If this mbuf was the last in a cluster and contains an
+      // original packet object then call the destructor of the
+      // original packet object.
+      //
+      if (_p) {
+        //
+        // Reset the Tub. This in particular is going to call the
+        // "packet"'s destructor and leave the Tub without a value.
+        //
+        _p.destroy();
+
+      } else if (!_is_zc) {
+        return;
+      }
+
+      // Restore the rte_mbuf fields we trashed in set_zc_info()
+      _mbuf.buf_physaddr = _buf_physaddr;
+      _mbuf.buf_addr     = rte_mbuf_to_baddr(&_mbuf);
+      _mbuf.data_off     = _data_off;
+
+      _is_zc             = false;
+    }
+
+    // Reset every mbuf of the chain starting at this buffer and return the
+    // corresponding tx_buf's to the factory.
+    void recycle() {
+      struct rte_mbuf *m = &_mbuf, *m_next;
+
+      while (m != nullptr) {
+        // Read the link first: rte_pktmbuf_reset() is called on m next
+        m_next = m->next;
+        rte_pktmbuf_reset(m);
+        _fc.put(me(m));
+        m = m_next;
+      }
+    }
+
+    // Take ownership of the packet; it is destroyed in reset_zc().
+    void set_packet(Packet&& p) {
+      _p = std::move(p);
+    }
+
+   private:
+    // Must stay the first data member - see me()
+    struct rte_mbuf _mbuf;
+    MARKER private_start;
+    // Packet this buffer keeps alive, if any (see set_packet())
+    Tub<Packet> _p;
+    // Values of _mbuf.buf_physaddr / _mbuf.data_off saved by the
+    // constructor and restored by reset_zc()
+    phys_addr_t _buf_physaddr;
+    uint16_t _data_off;
+    // TRUE if underlying mbuf has been used in the zero-copy flow
+    bool _is_zc = false;
+    // buffers' factory the buffer came from
+    tx_buf_factory& _fc;
+    MARKER private_end;
+  };
+
+  class tx_buf_factory {
+    //
+    // Number of buffers to free in each GC iteration:
+    // We want the buffers to be allocated from the mempool as many as
+    // possible.
+    //
+    // On the other hand if there is no Tx for some time we want the
+    // completions to be eventually handled. Thus we choose the smallest
+    // possible packets count number here.
+    //
+    static constexpr int gc_count = 1;
+   public:
+    tx_buf_factory(CephContext *c, DPDKDevice *dev, uint8_t qid);
+    /**
+     * Collects whatever completed buffers are still available via gc() and
+     * then hands everything held in _ring back to _pool.
+     */
+    ~tx_buf_factory() {
+      // put all mbuf back into mempool in order to make the next factory work
+      // gc() returns false once get_one_completed() has nothing more to give.
+      while (gc());
+      rte_mempool_put_bulk(_pool, (void**)_ring.data(),
+                           _ring.size());
+    }
+
+
+    /**
+     * @note Should not be called if there are no free tx_buf's
+     *
+     * @return a free tx_buf object
+     */
+    tx_buf* get() {
+      // Prefer a buffer the HW has already completed; clear any zero-copy
+      // state it still carries before handing it out.
+      if (tx_buf *completed = get_one_completed()) {
+        completed->reset_zc();
+        return completed;
+      }
+
+      // Nothing completed right now: fall back to the factory's own cache,
+      // which may be empty as well.
+      if (_ring.empty()) {
+        return nullptr;
+      }
+
+      tx_buf *cached = _ring.back();
+      _ring.pop_back();
+      return cached;
+    }
+
+    void put(tx_buf* buf) {
+      buf->reset_zc();
+      _ring.push_back(buf);
+    }
+
+    /**
+     * Move up to gc_count completed buffers into the factory's cache.
+     *
+     * @return true if gc_count buffers were collected, false as soon as no
+     *         completed buffer is available.
+     */
+    bool gc() {
+      int collected = 0;
+      while (collected < gc_count) {
+        tx_buf *done = get_one_completed();
+        if (done == nullptr) {
+          return false;
+        }
+        put(done);
+        ++collected;
+      }
+      return true;
+    }
+   private:
+    /**
+     * Fill the mbufs circular buffer: after this the _pool will become
+     * empty. We will use it to catch the completed buffers:
+     *
+     * - Underlying PMD drivers will "free" the mbufs once they are
+     *   completed.
+     * - We will poll the _pktmbuf_pool_tx till it's empty and release
+     *   all the buffers from the freed mbufs.
+     */
+    void init_factory() {
+      while (rte_mbuf* mbuf = rte_pktmbuf_alloc(_pool)) {
+        _ring.push_back(new(tx_buf::me(mbuf)) tx_buf{*this});
+      }
+    }
+
+    /**
+     * PMD puts the completed buffers back into the mempool they have
+     * originally come from.
+     *
+     * @note rte_pktmbuf_alloc() resets the mbuf so there is no need to call
+     *       rte_pktmbuf_reset() here again.
+     *
+     * @return a single tx_buf that has been completed by HW.
+     */
+    tx_buf* get_one_completed() {
+      return tx_buf::me(rte_pktmbuf_alloc(_pool));
+    }
+
+   private:
+    CephContext *cct;
+    std::vector<tx_buf*> _ring;
+    rte_mempool* _pool = nullptr;
+  };
+
+ public:
+  explicit DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid);
+  ~DPDKQueuePair() {
+    if (device_stat_time_fd) {
+      center->delete_time_event(device_stat_time_fd);
+    }
+    rx_gc(true);
+  }
+
+  void rx_start() {
+    _rx_poller.construct(this);
+  }
+
+  uint32_t send(circular_buffer<Packet>& pb) {
+    // Zero-copy send
+    return _send(pb, [&] (Packet&& p) {
+      return tx_buf::from_packet_zc(cct, std::move(p), *this);
+    });
+  }
+
+  DPDKDevice& port() const { return *_dev; }
+  tx_buf* get_tx_buf() { return _tx_buf_factory.get(); }
+
+  void handle_stats();
+
+ private:
+  /**
+   * Turn queued packets into mbufs and hand them to the NIC.
+   *
+   * When no burst is pending, every packet in pb is converted with
+   * packet_to_tx_buf_p (stopping at the first packet for which it returns
+   * nullptr) and the resulting mbufs are collected in _tx_burst. The part
+   * of _tx_burst starting at _tx_burst_idx is then passed to
+   * rte_eth_tx_burst(). One packet is popped from pb per mbuf reported as
+   * sent, and the Tx fragment/byte perf counters are updated.
+   *
+   * @param pb queue of packets to transmit
+   * @param packet_to_tx_buf_p callable converting a Packet&& into a tx_buf*
+   * @return number of mbufs rte_eth_tx_burst() reported as sent in this call
+   */
+  template <class Func>
+  uint32_t _send(circular_buffer<Packet>& pb, Func &&packet_to_tx_buf_p) {
+    // Only build a new burst once the previous one has been fully sent.
+    if (_tx_burst.size() == 0) {
+      for (auto&& p : pb) {
+        // TODO: ceph_assert() in a fast path! Remove me ASAP!
+        ceph_assert(p.len());
+
+        tx_buf* buf = packet_to_tx_buf_p(std::move(p));
+        if (!buf) {
+          break;
+        }
+
+        _tx_burst.push_back(buf->rte_mbuf_p());
+      }
+    }
+
+    // Offer the not-yet-sent tail of the burst to the device.
+    uint16_t sent = rte_eth_tx_burst(_dev_port_idx, _qid,
+                                     _tx_burst.data() + _tx_burst_idx,
+                                     _tx_burst.size() - _tx_burst_idx);
+
+    uint64_t nr_frags = 0, bytes = 0;
+
+    // Account for each sent mbuf and drop its packet from the queue.
+    for (int i = 0; i < sent; i++) {
+      rte_mbuf* m = _tx_burst[_tx_burst_idx + i];
+      bytes    += m->pkt_len;
+      nr_frags += m->nb_segs;
+      pb.pop_front();
+    }
+
+    perf_logger->inc(l_dpdk_qp_tx_fragments, nr_frags);
+    perf_logger->inc(l_dpdk_qp_tx_bytes, bytes);
+
+    _tx_burst_idx += sent;
+
+    // Whole burst is out: start from scratch on the next call.
+    if (_tx_burst_idx == _tx_burst.size()) {
+      _tx_burst_idx = 0;
+      _tx_burst.clear();
+    }
+
+    return sent;
+  }
+
+  /**
+   * Allocate a new data buffer and set the mbuf to point to it.
+   *
+   * Do some DPDK hacks to work on PMD: it assumes that the buf_addr
+   * points to the private data of RTE_PKTMBUF_HEADROOM before the actual
+   * data buffer.
+   *
+   * @param m mbuf to update
+   */
+  static bool refill_rx_mbuf(rte_mbuf* m, size_t size,
+                             std::vector<void*> &datas) {
+    // No spare data buffer left to attach.
+    if (datas.empty()) {
+      return false;
+    }
+
+    void* const payload = datas.back();
+    datas.pop_back();
+
+    //
+    // Make the mbuf point to the data just taken. The PMD expects buf_addr
+    // to sit RTE_PKTMBUF_HEADROOM bytes ahead of the actual data, so both
+    // the virtual and the physical address are shifted back by the headroom.
+    //
+    char* const payload_bytes = (char*)payload;
+    m->buf_addr     = payload_bytes - RTE_PKTMBUF_HEADROOM;
+    m->buf_physaddr = rte_mem_virt2phy(payload) - RTE_PKTMBUF_HEADROOM;
+    return true;
+  }
+
+  bool init_rx_mbuf_pool();
+  bool rx_gc(bool force=false);
+  bool refill_one_cluster(rte_mbuf* head);
+
+  /**
+   * Polls for a burst of incoming packets. This function will not block and
+   * will immediately return after processing all available packets.
+   *
+   */
+  bool poll_rx_once();
+
+  /**
+   * Translates an rte_mbuf's into packet and feeds them to _rx_stream.
+   *
+   * @param bufs An array of received rte_mbuf's
+   * @param count Number of buffers in the bufs[]
+   */
+  void process_packets(struct rte_mbuf **bufs, uint16_t count);
+
+  /**
+   * Translate rte_mbuf into the "packet".
+   * @param m mbuf to translate
+   *
+   * @return a "optional" object representing the newly received data if in an
+   *         "engaged" state or an error if in a "disengaged" state.
+   */
+  Tub<Packet> from_mbuf(rte_mbuf* m);
+
+  /**
+   * Transform an LRO rte_mbuf cluster into the "packet" object.
+   * @param m HEAD of the mbufs' cluster to transform
+   *
+   * @return a "optional" object representing the newly received LRO packet if
+   *         in an "engaged" state or an error if in a "disengaged" state.
+   */
+  Tub<Packet> from_mbuf_lro(rte_mbuf* m);
+
+ private:
+  CephContext *cct;
+  std::vector<packet_provider_type> _pkt_providers;
+  Tub<std::array<uint8_t, 128>> _sw_reta;
+  circular_buffer<Packet> _proxy_packetq;
+  stream<Packet> _rx_stream;
+  circular_buffer<Packet> _tx_packetq;
+  std::vector<void*> _alloc_bufs;
+
+  PerfCounters *perf_logger;
+  DPDKDevice* _dev;
+  uint8_t _dev_port_idx;
+  EventCenter *center;
+  uint8_t _qid;
+  rte_mempool *_pktmbuf_pool_rx;
+  std::vector<rte_mbuf*> _rx_free_pkts;
+  std::vector<rte_mbuf*> _rx_free_bufs;
+  std::vector<fragment> _frags;
+  std::vector<char*> _bufs;
+  size_t _num_rx_free_segs = 0;
+  uint64_t device_stat_time_fd = 0;
+
+#ifdef CEPH_PERF_DEV
+  uint64_t rx_cycles = 0;
+  uint64_t rx_count = 0;
+  uint64_t tx_cycles = 0;
+  uint64_t tx_count = 0;
+#endif
+
+  class DPDKTXPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKTXPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKTXPoller"), qp(qp) {}
+
+    virtual int poll() {
+      return qp->poll_tx();
+    }
+  } _tx_poller;
+
+  class DPDKRXGCPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKRXGCPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKRXGCPoller"), qp(qp) {}
+
+    virtual int poll() {
+      return qp->rx_gc();
+    }
+  } _rx_gc_poller;
+  tx_buf_factory _tx_buf_factory;
+  class DPDKRXPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKRXPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKRXPoller"), qp(qp) {}
+
+    virtual int poll() {
+      return qp->poll_rx_once();
+    }
+  };
+  Tub<DPDKRXPoller> _rx_poller;
+  class DPDKTXGCPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKTXGCPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKTXGCPoller"), qp(qp) {}
+
+    virtual int poll() {
+      return qp->_tx_buf_factory.gc();
+    }
+  } _tx_gc_poller;
+  std::vector<rte_mbuf*> _tx_burst;
+  uint16_t _tx_burst_idx = 0;
+};
+
+class DPDKDevice {
+ public:
+  CephContext *cct;
+  PerfCounters *perf_logger;
+  std::vector<std::unique_ptr<DPDKQueuePair>> _queues;
+  std::vector<DPDKWorker*> workers;
+  size_t _rss_table_bits = 0;
+  uint8_t _port_idx;
+  uint16_t _num_queues;
+  unsigned cores;
+  hw_features _hw_features;
+  uint8_t _queues_ready = 0;
+  unsigned _home_cpu;
+  bool _use_lro;
+  bool _enable_fc;
+  std::vector<uint8_t> _redir_table;
+  rss_key_type _rss_key;
+  bool _is_i40e_device = false;
+  bool _is_vmxnet3_device = false;
+
+ public:
+  rte_eth_dev_info _dev_info = {};
+
+  /**
+   * The final stage of a port initialization.
+   * @note Must be called *after* all queues from stage (2) have been
+   *       initialized.
+   */
+  int init_port_fini();
+
+ private:
+  /**
+   * Port initialization consists of 3 main stages:
+   * 1) General port initialization which ends with a call to
+   *    rte_eth_dev_configure() where we request the needed number of Rx and
+   *    Tx queues.
+   * 2) Individual queues initialization. This is done in the constructor of
+   *    DPDKQueuePair class. In particular the memory pools for queues are allocated
+   *    in this stage.
+   * 3) The final stage of the initialization which starts with the call of
+   *    rte_eth_dev_start() after which the port becomes fully functional. We
+   *    will also wait for a link to get up in this stage.
+   */
+
+
+  /**
+   * First stage of the port initialization.
+   *
+   * @return 0 in case of success and an appropriate error code in case of an
+   *         error.
+   */
+  int init_port_start();
+
+  /**
+   * Check the link status of our port for up to 9s, and finally print it.
+   */
+  int check_port_link_status();
+
+  /**
+   * Configures the HW Flow Control
+   */
+  void set_hw_flow_control();
+
+ public:
+  DPDKDevice(CephContext *c, uint8_t port_idx, uint16_t num_queues, bool use_lro, bool enable_fc):
+      cct(c), _port_idx(port_idx), _num_queues(num_queues),
+      _home_cpu(0), _use_lro(use_lro),
+      _enable_fc(enable_fc) {
+    _queues = std::vector<std::unique_ptr<DPDKQueuePair>>(_num_queues);
+    /* now initialise the port we will use */
+    int ret = init_port_start();
+    if (ret != 0) {
+      rte_exit(EXIT_FAILURE, "Cannot initialise port %u\n", _port_idx);
+    }
+    string name(std::string("port") + std::to_string(port_idx));
+    PerfCountersBuilder plb(cct, name, l_dpdk_dev_first, l_dpdk_dev_last);
+
+    plb.add_u64_counter(l_dpdk_dev_rx_mcast, "dpdk_device_receive_multicast_packets", "DPDK received multicast packets");
+    plb.add_u64_counter(l_dpdk_dev_rx_badcrc_errors, "dpdk_device_receive_badcrc_errors", "DPDK received bad crc errors");
+
+    plb.add_u64_counter(l_dpdk_dev_rx_total_errors, "dpdk_device_receive_total_errors", "DPDK received total_errors");
+    plb.add_u64_counter(l_dpdk_dev_tx_total_errors, "dpdk_device_send_total_errors", "DPDK sendd total_errors");
+    plb.add_u64_counter(l_dpdk_dev_rx_dropped_errors, "dpdk_device_receive_dropped_errors", "DPDK received dropped errors");
+    plb.add_u64_counter(l_dpdk_dev_rx_nombuf_errors, "dpdk_device_receive_nombuf_errors", "DPDK received RX mbuf allocation errors");
+
+    perf_logger = plb.create_perf_counters();
+    cct->get_perfcounters_collection()->add(perf_logger);
+  }
+
+  /**
+   * Unregisters and frees the perf counters created by the constructor,
+   * then stops the port.
+   */
+  ~DPDKDevice() {
+    // The constructor added perf_logger to the CephContext's collection.
+    // Remove and delete it here, otherwise the counters leak and stay
+    // registered after the device is gone.
+    cct->get_perfcounters_collection()->remove(perf_logger);
+    delete perf_logger;
+    rte_eth_dev_stop(_port_idx);
+  }
+
+  DPDKQueuePair& queue_for_cpu(unsigned cpu) { return *_queues[cpu]; }
+  void l2receive(int qid, Packet p) {
+    _queues[qid]->_rx_stream.produce(std::move(p));
+  }
+  /**
+   * Register next_packet as the listener of the Rx stream of queue cpuid
+   * and start that queue's Rx polling.
+   *
+   * @param cpuid       index into _queues
+   * @param next_packet callback handed to the queue's _rx_stream
+   * @return the subscription returned by the stream's listen()
+   */
+  subscription<Packet> receive(unsigned cpuid, std::function<int (Packet)> next_packet) {
+    auto sub = _queues[cpuid]->_rx_stream.listen(std::move(next_packet));
+    _queues[cpuid]->rx_start();
+    // Return the local by name: it is moved (or elided) automatically,
+    // whereas an explicit std::move() here only inhibits copy elision.
+    return sub;
+  }
+  ethernet_address hw_address() {
+    struct ether_addr mac;
+    rte_eth_macaddr_get(_port_idx, &mac);
+
+    return mac.addr_bytes;
+  }
+  hw_features get_hw_features() {
+    return _hw_features;
+  }
+  const rss_key_type& rss_key() const { return _rss_key; }
+  uint16_t hw_queues_count() { return _num_queues; }
+  /**
+   * Create the queue pair with id qid on this device.
+   *
+   * @param c         CephContext passed to the queue pair
+   * @param center    EventCenter passed to the queue pair
+   * @param hugepages not used by this function
+   * @param qid       queue id passed to the queue pair
+   * @return the newly created queue pair, owned by the caller
+   */
+  std::unique_ptr<DPDKQueuePair> init_local_queue(CephContext *c, EventCenter *center, string hugepages, uint16_t qid) {
+    // Construct and return in one step; the former default-construct,
+    // assign and std::move() on return added nothing but a pessimizing move.
+    return std::unique_ptr<DPDKQueuePair>(new DPDKQueuePair(c, center, this, qid));
+  }
+  unsigned hash2qid(uint32_t hash) {
+    // return hash % hw_queues_count();
+    return _redir_table[hash & (_redir_table.size() - 1)];
+  }
+  void set_local_queue(unsigned i, std::unique_ptr<DPDKQueuePair> qp) {
+    ceph_assert(!_queues[i]);
+    _queues[i] = std::move(qp);
+  }
+  void unset_local_queue(unsigned i) {
+    ceph_assert(_queues[i]);
+    _queues[i].reset();
+  }
+  /**
+   * Pick the CPU that should handle a packet which arrived on the queue of
+   * src_cpuid.
+   *
+   * If that queue has no software redirection table the packet stays on
+   * src_cpuid. Otherwise hashfn() is evaluated, shifted right by
+   * _rss_table_bits and used (modulo the table size) to index the table.
+   *
+   * @param src_cpuid index of the queue the packet arrived on
+   * @param hashfn    callable returning the packet's hash; only invoked
+   *                  when a redirection table exists
+   * @return id of the destination CPU
+   */
+  template <typename Func>
+  unsigned forward_dst(unsigned src_cpuid, Func&& hashfn) {
+    auto& qp = queue_for_cpu(src_cpuid);
+    if (!qp._sw_reta)
+      return src_cpuid;
+
+    // Past the early return the table must be present. (Asserting the
+    // opposite here would abort on every lookup that has a table.)
+    ceph_assert(qp._sw_reta);
+    auto hash = hashfn() >> _rss_table_bits;
+    auto& reta = *qp._sw_reta;
+    return reta[hash % reta.size()];
+  }
+  unsigned hash2cpu(uint32_t hash) {
+    // there is an assumption here that qid == get_id() which will
+    // not necessary be true in the future
+    return forward_dst(hash2qid(hash), [hash] { return hash; });
+  }
+
+  hw_features& hw_features_ref() { return _hw_features; }
+
+  const rte_eth_rxconf* def_rx_conf() const {
+    return &_dev_info.default_rxconf;
+  }
+
+  const rte_eth_txconf* def_tx_conf() const {
+    return &_dev_info.default_txconf;
+  }
+
+  /**
+   *  Set the RSS table in the device and store it in the internal vector.
+   */
+  void set_rss_table();
+
+  uint8_t port_idx() { return _port_idx; }
+  bool is_i40e_device() const {
+    return _is_i40e_device;
+  }
+  bool is_vmxnet3_device() const {
+    return _is_vmxnet3_device;
+  }
+};
+
+
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+    CephContext *c, unsigned cores, uint8_t port_idx = 0,
+    bool use_lro = true, bool enable_fc = true);
+
+
+/**
+ * @return Number of bytes needed for mempool objects of each QP.
+ */
+uint32_t qp_mempool_obj_size();
+
+#endif // CEPH_DPDK_DEV_H
diff --git a/src/msg/async/dpdk/DPDKStack.cc b/src/msg/async/dpdk/DPDKStack.cc
new file mode 100644
index 00000000..3101ae57
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.cc
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <memory>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <tuple>
+
+#include "common/ceph_argparse.h"
+#include "dpdk_rte.h"
+#include "DPDKStack.h"
+#include "DPDK.h"
+#include "IP.h"
+#include "TCP-Stack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+#include "common/Cond.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdkstack "
+
+// Trampoline with an int(void*) signature: treats f as a pointer to a
+// std::function<void ()>, invokes it and reports success.
+static int dpdk_thread_adaptor(void* f)
+{
+  auto* entry = static_cast<std::function<void ()>*>(f);
+  (*entry)();
+  return 0;
+}
+
+/**
+ * Bring up the DPDK device shared by all workers and this worker's part of
+ * it.
+ *
+ * The function-local statics below are shared by every worker that calls
+ * this function; lock/cond take the workers through these steps:
+ *  1. Worker 0 creates the device; the others wait until create_stage has
+ *     moved past WAIT_DEVICE_STAGE.
+ *  2. Every worker whose id is below hw_queues_count() creates, configures
+ *     and registers its queue pair; any other worker aborts.
+ *  3. Worker 0 waits until `cores` queues are registered, runs
+ *     init_port_fini() and moves create_stage to DONE; the others wait.
+ *  4. Every worker records itself in sdev->workers and builds its Impl. The
+ *     worker that brings queue_init_done back to zero resets create_stage
+ *     and drops the static reference to the device.
+ */
+void DPDKWorker::initialize()
+{
+  static enum {
+    WAIT_DEVICE_STAGE,
+    WAIT_PORT_FIN_STAGE,
+    DONE
+  } create_stage = WAIT_DEVICE_STAGE;
+  static Mutex lock("DPDKStack::lock");
+  static Cond cond;
+  static unsigned queue_init_done = 0;
+  static unsigned cores = 0;
+  static std::shared_ptr<DPDKDevice> sdev;
+
+  unsigned i = center.get_id();
+  if (i == 0) {
+    // Worker 0 owns device creation; port id, LRO and flow control come
+    // from the configuration.
+    cores = cct->_conf->ms_async_op_threads;
+    std::unique_ptr<DPDKDevice> dev = create_dpdk_net_device(
+        cct, cores, cct->_conf->ms_dpdk_port_id,
+        cct->_conf->ms_dpdk_lro,
+        cct->_conf->ms_dpdk_hw_flow_control);
+    sdev = std::shared_ptr<DPDKDevice>(dev.release());
+    sdev->workers.resize(cores);
+    ldout(cct, 1) << __func__ << " using " << cores << " cores " << dendl;
+
+    Mutex::Locker l(lock);
+    create_stage = WAIT_PORT_FIN_STAGE;
+    cond.Signal();
+  } else {
+    // Everybody else waits for the device to exist.
+    Mutex::Locker l(lock);
+    while (create_stage <= WAIT_DEVICE_STAGE)
+      cond.Wait(lock);
+  }
+  ceph_assert(sdev);
+  if (i < sdev->hw_queues_count()) {
+    auto qp = sdev->init_local_queue(cct, &center, cct->_conf->ms_dpdk_hugepages, i);
+    std::map<unsigned, float> cpu_weights;
+    // Every CPU j beyond the HW queues that maps onto queue i gets weight 1.
+    // The entry must be keyed by j: writing cpu_weights[i] here would never
+    // add the proxied CPUs and is overwritten right below anyway.
+    for (unsigned j = sdev->hw_queues_count() + i % sdev->hw_queues_count();
+         j < cores; j+= sdev->hw_queues_count())
+      cpu_weights[j] = 1;
+    // The queue's own CPU gets the configured HW queue weight.
+    cpu_weights[i] = cct->_conf->ms_dpdk_hw_queue_weight;
+    qp->configure_proxies(cpu_weights);
+    sdev->set_local_queue(i, std::move(qp));
+    Mutex::Locker l(lock);
+    ++queue_init_done;
+    cond.Signal();
+  } else {
+    // Workers without a HW queue of their own are not supported.
+    // auto master = qid % sdev->hw_queues_count();
+    // sdev->set_local_queue(create_proxy_net_device(master, sdev.get()));
+    ceph_abort();
+  }
+  if (i == 0) {
+    {
+      // The port may only be finalized after all queues are initialized.
+      Mutex::Locker l(lock);
+      while (queue_init_done < cores)
+        cond.Wait(lock);
+    }
+
+    if (sdev->init_port_fini() < 0) {
+      lderr(cct) << __func__ << " init_port_fini failed " << dendl;
+      ceph_abort();
+    }
+    Mutex::Locker l(lock);
+    create_stage = DONE;
+    cond.Signal();
+  } else {
+    Mutex::Locker l(lock);
+    while (create_stage <= WAIT_PORT_FIN_STAGE)
+      cond.Wait(lock);
+  }
+
+  sdev->workers[i] = this;
+  _impl = std::unique_ptr<DPDKWorker::Impl>(
+          new DPDKWorker::Impl(cct, i, &center, sdev));
+  {
+    // The last worker to get here puts the shared statics back into their
+    // initial state; each Impl keeps its own reference to the device.
+    Mutex::Locker l(lock);
+    if (!--queue_init_done) {
+      create_stage = WAIT_DEVICE_STAGE;
+      sdev.reset();
+    }
+  }
+}
+
+using AvailableIPAddress = std::tuple<string, string, string>;
+// Splits the three option strings with string_to_vec() and appends one
+// (host, gateway, netmask) tuple per position to res. Returns false, leaving
+// res untouched, when no host was given or the three lists differ in length.
+static bool parse_available_address(
+        const string &ips, const string &gates, const string &masks, vector<AvailableIPAddress> &res)
+{
+  vector<string> hosts, gateways, netmasks;
+  string_to_vec(hosts, ips);
+  string_to_vec(gateways, gates);
+  string_to_vec(netmasks, masks);
+
+  const size_t count = hosts.size();
+  const bool consistent =
+      count != 0 && gateways.size() == count && netmasks.size() == count;
+  if (!consistent)
+    return false;
+
+  for (size_t idx = 0; idx != count; ++idx)
+    res.push_back(AvailableIPAddress{hosts[idx], gateways[idx], netmasks[idx]});
+  return true;
+}
+
+// Looks for the first entry of avails whose host string parses as an address
+// and is_same_host() as ip. On success stores the entry's index in res and
+// returns true; entries that do not parse are skipped.
+static bool match_available_address(const vector<AvailableIPAddress> &avails,
+                                    const entity_addr_t &ip, int &res)
+{
+  for (size_t idx = 0; idx != avails.size(); ++idx) {
+    const char *host = std::get<0>(avails[idx]).c_str();
+    entity_addr_t candidate;
+    if (candidate.parse(host) && candidate.is_same_host(ip)) {
+      res = idx;
+      return true;
+    }
+  }
+  return false;
+}
+
+// Builds the per-worker interface and IPv4 objects and applies the first
+// configured host/gateway/netmask triple. Aborts when
+// parse_available_address() rejects the three options.
+DPDKWorker::Impl::Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev)
+    : id(i), _netif(cct, dev, c), _dev(dev), _inet(cct, c, &_netif)
+{
+  const std::string host_opt = cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr");
+  const std::string gateway_opt = cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr");
+  const std::string netmask_opt = cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr");
+
+  vector<AvailableIPAddress> addrs;
+  if (!parse_available_address(host_opt, gateway_opt, netmask_opt, addrs)) {
+    lderr(cct) << __func__ << " no available address "
+               << host_opt << ", "
+               << gateway_opt << ", "
+               << netmask_opt << ", "
+               << dendl;
+    ceph_abort();
+  }
+
+  // Only the first configured triple is used.
+  AvailableIPAddress &first = addrs[0];
+  _inet.set_host_address(ipv4_address(std::get<0>(first)));
+  _inet.set_gw_address(ipv4_address(std::get<1>(first)));
+  _inet.set_netmask_address(ipv4_address(std::get<2>(first)));
+}
+
+DPDKWorker::Impl::~Impl()
+{
+  _dev->unset_local_queue(id);
+}
+
+/**
+ * Start listening on the port of sa using this worker's TCP instance.
+ *
+ * Only the port and the type of sa are passed on to tcpv4_listen(); the
+ * address family is asserted to be AF_INET.
+ *
+ * @param sa   address to listen on
+ * @param opt  socket options forwarded to tcpv4_listen()
+ * @param sock output server socket; asserted to be non-null
+ * @return the result of tcpv4_listen()
+ */
+int DPDKWorker::listen(entity_addr_t &sa, const SocketOptions &opt,
+                       ServerSocket *sock)
+{
+  ceph_assert(sa.get_family() == AF_INET);
+  ceph_assert(sock);
+
+  ldout(cct, 10) << __func__ << " addr " << sa << dendl;
+  // Disabled: per-listen selection of the host/gateway/netmask triple that
+  // matches sa. The addresses are set once in DPDKWorker::Impl::Impl().
+  // vector<AvailableIPAddress> tuples;
+  // bool parsed = parse_available_address(cct->_conf->ms_dpdk_host_ipv4_addr,
+  //                                       cct->_conf->ms_dpdk_gateway_ipv4_addr,
+  //                                       cct->_conf->ms_dpdk_netmask_ipv4_addr, tuples);
+  // if (!parsed) {
+  //   lderr(cct) << __func__ << " no available address "
+  //              << cct->_conf->ms_dpdk_host_ipv4_addr << ", "
+  //              << cct->_conf->ms_dpdk_gateway_ipv4_addr << ", "
+  //              << cct->_conf->ms_dpdk_netmask_ipv4_addr << ", "
+  //              << dendl;
+  //   return -EINVAL;
+  // }
+  // int idx;
+  // parsed = match_available_address(tuples, sa, idx);
+  // if (!parsed) {
+  //   lderr(cct) << __func__ << " no matched address for " << sa << dendl;
+  //   return -EINVAL;
+  // }
+  // _inet.set_host_address(ipv4_address(std::get<0>(tuples[idx])));
+  // _inet.set_gw_address(ipv4_address(std::get<1>(tuples[idx])));
+  // _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[idx])));
+  return tcpv4_listen(_impl->_inet.get_tcp(), sa.get_port(), opt, sa.get_type(),
+		      sock);
+}
+
+/**
+ * Open a connection to addr through this worker's TCP instance.
+ *
+ * @param addr   peer address
+ * @param opts   not used by this function
+ * @param socket output socket passed to tcpv4_connect()
+ * @return the result of tcpv4_connect()
+ */
+int DPDKWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+  // ceph_assert(addr.get_family() == AF_INET);
+  const int result = tcpv4_connect(_impl->_inet.get_tcp(), addr, socket);
+  // The attempt is logged after it was made, whatever its outcome.
+  ldout(cct, 10) << __func__ << " addr " << addr << dendl;
+  return result;
+}
+
+/**
+ * Launch worker i on a DPDK slave lcore.
+ *
+ * Stores func in funcs[i], makes sure the EAL is initialized, picks the
+ * i-th slave lcore and launches dpdk_thread_adaptor on it with a pointer to
+ * the stored function. Aborts if EAL initialization or the launch fails.
+ *
+ * @param i    worker index; also the slot used in funcs
+ * @param func entry point of the worker
+ */
+void DPDKStack::spawn_worker(unsigned i, std::function<void ()> &&func)
+{
+  // create a extra master thread
+  //
+  funcs[i] = std::move(func);
+  int r = 0;
+  r = dpdk::eal::init(cct);
+  if (r < 0) {
+    lderr(cct) << __func__ << " init dpdk rte failed, r=" << r << dendl;
+    ceph_abort();
+  }
+  // if dpdk::eal::init already called by NVMEDevice, we will select 1..n
+  // cores
+  // NOTE(review): the loop below only visits slave lcores; if
+  // rte_lcore_count() also counts the master lcore this check looks one too
+  // lenient - confirm.
+  ceph_assert(rte_lcore_count() >= i + 1);
+  unsigned core_id;
+  // Keep the original index: i is consumed as a countdown by the loop.
+  int j = i;
+  RTE_LCORE_FOREACH_SLAVE(core_id) {
+    if (i-- == 0) {
+      break;
+    }
+  }
+  // The launch itself is handed to execute_on_master().
+  dpdk::eal::execute_on_master([&]() {
+    r = rte_eal_remote_launch(dpdk_thread_adaptor, static_cast<void*>(&funcs[j]), core_id);
+    if (r < 0) {
+      lderr(cct) << __func__ << " remote launch failed, r=" << r << dendl;
+      ceph_abort();
+    }
+  });
+}
+
+/**
+ * Wait for the lcore running worker i to finish.
+ *
+ * The lcore is looked up the same way spawn_worker() chose it - as the i-th
+ * slave lcore - instead of assuming that it has id i + 1, which only holds
+ * when the lcore ids are contiguous and the master lcore is the first one.
+ *
+ * @param i worker index, as passed to spawn_worker()
+ */
+void DPDKStack::join_worker(unsigned i)
+{
+  unsigned core_id;
+  unsigned remaining = i;
+  RTE_LCORE_FOREACH_SLAVE(core_id) {
+    if (remaining-- == 0) {
+      break;
+    }
+  }
+  // The loop runs off the end (core_id == RTE_MAX_LCORE) when there are
+  // fewer than i + 1 slave lcores; never wait on an invalid id.
+  ceph_assert(core_id < RTE_MAX_LCORE);
+  dpdk::eal::execute_on_master([&]() {
+    rte_eal_wait_lcore(core_id);
+  });
+}
diff --git a/src/msg/async/dpdk/DPDKStack.h b/src/msg/async/dpdk/DPDKStack.h
new file mode 100644
index 00000000..a44ae383
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.h
@@ -0,0 +1,257 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_MSG_DPDKSTACK_H
+#define CEPH_MSG_DPDKSTACK_H
+
+#include <functional>
+
+#include "common/ceph_context.h"
+#include "common/Tub.h"
+
+#include "msg/async/Stack.h"
+#include "net.h"
+#include "const.h"
+#include "IP.h"
+#include "Packet.h"
+
+class interface;
+
+template <typename Protocol>
+class NativeConnectedSocketImpl;
+
+// DPDKServerSocketImpl
+template <typename Protocol>
+class DPDKServerSocketImpl : public ServerSocketImpl {
+  typename Protocol::listener _listener;
+ public:
+  DPDKServerSocketImpl(Protocol& proto, uint16_t port, const SocketOptions &opt,
+		       int type);
+  int listen() {
+    return _listener.listen();
+  }
+  virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+  virtual void abort_accept() override;
+  virtual int fd() const override {
+    return _listener.fd();
+  }
+};
+
+// NativeConnectedSocketImpl
+template <typename Protocol>
+class NativeConnectedSocketImpl : public ConnectedSocketImpl {
+  typename Protocol::connection _conn;
+  uint32_t _cur_frag = 0;
+  uint32_t _cur_off = 0;
+  Tub<Packet> _buf;
+  Tub<bufferptr> _cache_ptr;
+
+ public:
+  explicit NativeConnectedSocketImpl(typename Protocol::connection conn)
+          : _conn(std::move(conn)) {}
+  // Move constructor: takes over the connection, the packet currently being
+  // read together with the read position inside it, and any cached data.
+  // (The member is _buf; the former rhs.buf did not name any member.)
+  NativeConnectedSocketImpl(NativeConnectedSocketImpl &&rhs)
+      : _conn(std::move(rhs._conn)), _cur_frag(rhs._cur_frag),
+        _cur_off(rhs._cur_off), _buf(std::move(rhs._buf)),
+        _cache_ptr(std::move(rhs._cache_ptr)) {}
+  virtual int is_connected() override {
+    return _conn.is_connected();
+  }
+
+  /**
+   * Copy up to len bytes of received data into buf.
+   *
+   * Data is taken from _cache_ptr, which is refilled through
+   * zero_copy_read() whenever it is empty; a partially consumed buffer
+   * stays cached for the next call.
+   *
+   * @param buf destination
+   * @param len capacity of buf
+   * @return the number of bytes copied; -EAGAIN if nothing was copied; or
+   *         the value of a zero_copy_read() call that returned <= 0 other
+   *         than -EAGAIN (returned as is, even if earlier iterations of
+   *         this call already copied data).
+   */
+  virtual ssize_t read(char *buf, size_t len) override {
+    size_t left = len;
+    ssize_t r = 0;
+    size_t off = 0;
+    while (left > 0) {
+      if (!_cache_ptr) {
+        // Cache is empty: try to get the next chunk of received data.
+        _cache_ptr.construct();
+        r = zero_copy_read(*_cache_ptr);
+        if (r <= 0) {
+          _cache_ptr.destroy();
+          // No more data for now: report what has been copied so far.
+          if (r == -EAGAIN)
+            break;
+          return r;
+        }
+      }
+      if (_cache_ptr->length() <= left) {
+        // The whole cached chunk fits: consume it and drop the cache.
+        _cache_ptr->copy_out(0, _cache_ptr->length(), buf+off);
+        left -= _cache_ptr->length();
+        off += _cache_ptr->length();
+        _cache_ptr.destroy();
+      } else {
+        // Only part of the chunk fits: copy that part and advance the
+        // cached buffer past it.
+        _cache_ptr->copy_out(0, left, buf+off);
+        _cache_ptr->set_offset(_cache_ptr->offset() + left);
+        _cache_ptr->set_length(_cache_ptr->length() - left);
+        left = 0;
+        break;
+      }
+    }
+    return len - left ? len - left : -EAGAIN;
+  }
+
+  /**
+   * Hand out the next fragment of the current packet without copying it.
+   *
+   * Fetches a new packet from the connection when none is pending. data is
+   * set to a buffer over the fragment's memory whose deleter owns a Packet
+   * obtained from _buf->share() (presumably to keep that memory alive for
+   * as long as data exists - confirm against Packet::share()).
+   *
+   * @param data receives the fragment
+   * @return the length of data; -EAGAIN when the connection has no packet
+   *         to read; or the connection's errno value when it is <= 0.
+   */
+  virtual ssize_t zero_copy_read(bufferptr &data) override {
+    auto err = _conn.get_errno();
+    if (err <= 0)
+      return err;
+
+    if (!_buf) {
+      _buf = std::move(_conn.read());
+      if (!_buf)
+        return -EAGAIN;
+    }
+
+    fragment &f = _buf->frag(_cur_frag);
+    Packet p = _buf->share(_cur_off, f.size);
+    // The deleter does nothing by itself; it only carries p along.
+    auto del = std::bind(
+            [](Packet &p) {}, std::move(p));
+    data = buffer::claim_buffer(
+            f.size, f.base, make_deleter(std::move(del)));
+    if (++_cur_frag == _buf->nr_frags()) {
+      // Last fragment handed out: the next call starts with a new packet.
+      _cur_frag = 0;
+      _cur_off = 0;
+      _buf.destroy();
+    } else {
+      _cur_off += f.size;
+    }
+    ceph_assert(data.length());
+    return data.length();
+  }
+  /**
+   * Queue as much of bl as the connection currently accepts.
+   *
+   * Buffers of bl are turned into fragments until the amount reported by
+   * peek_sent_available() is reached. Only the first buffer may be
+   * truncated to that amount; a later buffer that does not fit completely
+   * is left for a subsequent call. The bytes taken are removed from bl and
+   * owned by the deleter of the Packet handed to the connection.
+   *
+   * @param bl   data to send; the consumed prefix is removed from it
+   * @param more not used by this function
+   * @return the connection's errno value if it is negative; 0 if nothing
+   *         can be sent right now; otherwise the result of _conn.send().
+   */
+  virtual ssize_t send(bufferlist &bl, bool more) override {
+    auto err = _conn.get_errno();
+    if (err < 0)
+      return (ssize_t)err;
+
+    size_t available = _conn.peek_sent_available();
+    if (available == 0) {
+      return 0;
+    }
+
+    std::vector<fragment> frags;
+    std::list<bufferptr>::const_iterator pb = bl.buffers().begin();
+    uint64_t left_pbrs = bl.buffers().size();
+    uint64_t len = 0;
+    uint64_t seglen = 0;
+    while (len < available && left_pbrs--) {
+      seglen = pb->length();
+      if (len + seglen > available) {
+        // don't continue if we enough at least 1 fragment since no available
+        // space for next ptr.
+        if (len > 0)
+          break;
+        seglen = std::min(seglen, available);
+      }
+      len += seglen;
+      frags.push_back(fragment{(char*)pb->c_str(), seglen});
+      ++pb;
+    }
+
+    if (len != bl.length()) {
+      // Partial send: move just the consumed prefix of bl into the deleter.
+      bufferlist swapped;
+      bl.splice(0, len, &swapped);
+      auto del = std::bind(
+              [](bufferlist &bl) {}, std::move(swapped));
+      return _conn.send(Packet(std::move(frags), make_deleter(std::move(del))));
+    } else {
+      // Everything fits: the deleter takes over the whole of bl.
+      auto del = std::bind(
+              [](bufferlist &bl) {}, std::move(bl));
+
+      return _conn.send(Packet(std::move(frags), make_deleter(std::move(del))));
+    }
+  }
+  virtual void shutdown() override {
+    _conn.close_write();
+  }
+  // FIXME need to impl close
+  virtual void close() override {
+    _conn.close_write();
+  }
+  virtual int fd() const override {
+    return _conn.fd();
+  }
+  virtual int socket_fd() const override {
+    return _conn.fd();
+  }
+
+};
+
+template <typename Protocol>
+DPDKServerSocketImpl<Protocol>::DPDKServerSocketImpl(
+  Protocol& proto, uint16_t port, const SocketOptions &opt, int type)
+  : ServerSocketImpl(type), _listener(proto.listen(port)) {}
+
+/**
+ * Accept one pending connection from the listener, if there is any.
+ *
+ * @param s       receives the connected socket
+ * @param options not used by this function
+ * @param out     if non-null, receives the peer address with its type set
+ *                to addr_type
+ * @param w       not used by this function
+ * @return 0 on success, the listener's errno value if it is negative, or
+ *         -EAGAIN when the listener has no connection to hand out.
+ */
+template <typename Protocol>
+int DPDKServerSocketImpl<Protocol>::accept(ConnectedSocket *s, const SocketOptions &options, entity_addr_t *out, Worker *w) {
+  const auto listener_err = _listener.get_errno();
+  if (listener_err < 0)
+    return listener_err;
+
+  auto accepted = _listener.accept();
+  if (!accepted)
+    return -EAGAIN;
+
+  if (out != nullptr) {
+    *out = accepted->remote_addr();
+    out->set_type(addr_type);
+  }
+
+  // Wrap the accepted connection into a socket implementation owned by *s.
+  *s = ConnectedSocket(
+          std::unique_ptr<NativeConnectedSocketImpl<Protocol>>(
+                  new NativeConnectedSocketImpl<Protocol>(std::move(*accepted))));
+  return 0;
+}
+
+template <typename Protocol>
+void DPDKServerSocketImpl<Protocol>::abort_accept() {
+  _listener.abort_accept();
+}
+
+class DPDKWorker : public Worker {
+  struct Impl {
+    unsigned id;
+    interface _netif;
+    std::shared_ptr<DPDKDevice> _dev;
+    ipv4 _inet;
+    Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev);
+    ~Impl();
+  };
+  std::unique_ptr<Impl> _impl;
+
+  virtual void initialize() override;
+  void set_ipv4_packet_filter(ip_packet_filter* filter) {
+    _impl->_inet.set_packet_filter(filter);
+  }
+  using tcp4 = tcp<ipv4_traits>;
+
+ public:
+  explicit DPDKWorker(CephContext *c, unsigned i): Worker(c, i) {}
+  virtual int listen(entity_addr_t &addr, const SocketOptions &opts, ServerSocket *) override;
+  virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+  void arp_learn(ethernet_address l2, ipv4_address l3) {
+    _impl->_inet.learn(l2, l3);
+  }
+  virtual void destroy() override {
+    _impl.reset();
+  }
+
+  friend class DPDKServerSocketImpl<tcp4>;
+};
+
+class DPDKStack : public NetworkStack {
+  vector<std::function<void()> > funcs;
+ public:
+  explicit DPDKStack(CephContext *cct, const string &t): NetworkStack(cct, t) {
+    funcs.resize(cct->_conf->ms_async_max_op_threads);
+  }
+  virtual bool support_zero_copy_read() const override { return true; }
+  virtual bool support_local_listen_table() const override { return true; }
+
+  virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override;
+  virtual void join_worker(unsigned i) override;
+};
+
+#endif
diff --git a/src/msg/async/dpdk/EventDPDK.cc b/src/msg/async/dpdk/EventDPDK.cc
new file mode 100644
index 00000000..5d291716
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "DPDKStack.h"
+#include "EventDPDK.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "DPDKDriver."
+
+// Driver initialisation hook. This driver performs no setup here: both
+// arguments are ignored and 0 is always returned.
+int DPDKDriver::init(EventCenter *c, int nevent)
+{
+	return 0;
+}
+
+/**
+ * Start listening for events on a userspace fd.
+ *
+ * @param fd        fd handled by the userspace event manager
+ * @param cur_mask  mask currently registered (only logged)
+ * @param add_mask  event bits to add
+ * @return 0 on success, otherwise the negative error code returned by
+ *         manager.listen()
+ */
+int DPDKDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+	ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask
+		       << " add_mask=" << add_mask << dendl;
+
+	int r = manager.listen(fd, add_mask);
+	if (r < 0) {
+		lderr(cct) << __func__ << " add fd=" << fd << " failed. "
+		           << cpp_strerror(-r) << dendl;
+		// Return the manager's own error code (the one just logged), as
+		// del_event() does. The global errno is unrelated to this call and
+		// could even be 0, which would report the failure as success.
+		return r;
+	}
+
+	return 0;
+}
+
+// Stop listening for the event bits in delmask on fd. cur_mask is only
+// logged. Returns 0 on success (or when delmask is EVENT_NONE), otherwise
+// the negative value returned by manager.unlisten().
+int DPDKDriver::del_event(int fd, int cur_mask, int delmask)
+{
+	ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
+		       << " delmask=" << delmask << dendl;
+
+	// Nothing to unregister when no event bits were requested.
+	if (delmask == EVENT_NONE) {
+		return 0;
+	}
+
+	const int rc = manager.unlisten(fd, delmask);
+	if (rc < 0) {
+		lderr(cct) << __func__ << " delete fd=" << fd << " delmask=" << delmask
+			   << " failed." << cpp_strerror(-rc) << dendl;
+		return rc;
+	}
+	return 0;
+}
+
+// Resize hook required by the driver interface. This driver does nothing
+// here: newsize is ignored and 0 is always returned.
+int DPDKDriver::resize_events(int newsize)
+{
+	return 0;
+}
+
+/**
+ * Poll the userspace event manager and report fired events.
+ *
+ * @param fired_events  resized to the number of fired events and filled with
+ *                      their fd/mask pairs when at least one event fired;
+ *                      left untouched otherwise
+ * @param tvp           passed through to manager.poll()
+ * @return the value returned by manager.poll()
+ */
+int DPDKDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+	// Compile-time bound: keeps the buffers below ordinary fixed-size arrays
+	// instead of (non-standard in C++) variable-length arrays.
+	constexpr int num_events = 512;
+	int events[num_events];
+	int masks[num_events];
+
+	int retval = manager.poll(events, masks, num_events, tvp);
+	if (retval > 0) {
+		fired_events.resize(retval);
+		for (int i = 0; i < retval; i++) {
+			fired_events[i].fd = events[i];
+			fired_events[i].mask = masks[i];
+		}
+	}
+	return retval;
+}
diff --git a/src/msg/async/dpdk/EventDPDK.h b/src/msg/async/dpdk/EventDPDK.h
new file mode 100644
index 00000000..541c2210
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_EVENTDPDK_H
+#define CEPH_EVENTDPDK_H
+
+#include "msg/async/Event.h"
+#include "msg/async/Stack.h"
+#include "UserspaceEvent.h"
+
+// EventDriver implementation that sources its events from a
+// UserspaceEventManager instead of a kernel polling facility.
+class DPDKDriver : public EventDriver {
+  CephContext *cct;
+
+ public:
+  // Event source used by add_event()/del_event()/event_wait(); public so
+  // that other code can reach it directly.
+  UserspaceEventManager manager;
+
+  explicit DPDKDriver(CephContext *c): cct(c), manager(c) {}
+  virtual ~DPDKDriver() { }
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) override;
+  // This driver never asks to be woken up.
+  bool need_wakeup() override { return false; }
+};
+
+#endif //CEPH_EVENTDPDK_H
diff --git a/src/msg/async/dpdk/IP.cc b/src/msg/async/dpdk/IP.cc
new file mode 100644
index 00000000..f730cded
--- /dev/null
+++ b/src/msg/async/dpdk/IP.cc
@@ -0,0 +1,470 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/perf_counters.h"
+
+#include "capture.h"
+#include "IP.h"
+#include "toeplitz.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+// Print an ipv4_address in dotted-decimal form, taking the most significant
+// byte of a.ip as the first octet.
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a) {
+  const auto addr = a.ip;
+  os << ((addr >> 24) & 0xff) << ".";
+  os << ((addr >> 16) & 0xff) << ".";
+  os << ((addr >> 8) & 0xff) << ".";
+  os << ((addr >> 0) & 0xff);
+  return os;
+}
+
+// Age after which an incomplete fragment set is dropped by frag_timeout();
+// utime_t(30, 0) is presumably 30 seconds -- TODO confirm utime_t's
+// constructor argument order.
+utime_t ipv4::_frag_timeout = utime_t(30, 0);
+// Out-of-class definitions for the static constexpr members declared (with
+// their values) in the class; needed before C++17 if they are odr-used.
+constexpr uint32_t ipv4::_frag_low_thresh;
+constexpr uint32_t ipv4::_frag_high_thresh;
+
+// Event callback that runs ipv4::frag_timeout() on the ipv4 instance it was
+// created for. The id passed by the event center is ignored.
+class C_handle_frag_timeout : public EventCallback {
+  ipv4 *_ipv4;
+
+ public:
+  // explicit: an ipv4* must never convert silently into a callback object.
+  explicit C_handle_frag_timeout(ipv4 *i): _ipv4(i) {}
+  // override: let the compiler verify this matches EventCallback's hook.
+  void do_request(uint64_t fd_or_id) override {
+    _ipv4->frag_timeout();
+  }
+};
+
+// Perf counter indices for the "ipv4" counter set built in the ipv4
+// constructor; l_dpdk_qp_first/l_dpdk_qp_last are the bounds passed to the
+// PerfCountersBuilder.
+enum {
+  l_dpdk_qp_first = 99000,
+  l_dpdk_total_linearize_operations,
+  l_dpdk_qp_last
+};
+
+// Build the IPv4 layer on top of netif:
+//  - registers with the L3 layer as the ipv4 ethernet protocol, using
+//    get_packet() as the outgoing packet provider;
+//  - subscribes to received packets (handle_received_packet) and supplies
+//    the forwarding-hash callback (forward);
+//  - installs the TCP and ICMP handlers in the protocol table _l4;
+//  - creates and registers the "ipv4" perf counters;
+//  - allocates the fragment-timeout callback (frag_handler).
+// Host address, gateway and netmask start as 0 and no packet filter is set.
+ipv4::ipv4(CephContext *c, EventCenter *cen, interface* netif)
+  : cct(c), center(cen), _netif(netif), _global_arp(netif),
+    _arp(c, _global_arp, cen),
+    _host_address(0), _gw_address(0), _netmask(0),
+    _l3(netif, eth_protocol_num::ipv4, [this] { return get_packet(); }),
+    _rx_packets(
+      _l3.receive(
+        [this] (Packet p, ethernet_address ea) {
+          return handle_received_packet(std::move(p), ea);
+        },
+        [this] (forward_hash& out_hash_data, Packet& p, size_t off) {
+          return forward(out_hash_data, p, off);
+        }
+      )
+    ),
+    _tcp(*this, cen), _icmp(c, *this),
+    _l4({{ uint8_t(ip_protocol_num::tcp), &_tcp },
+         { uint8_t(ip_protocol_num::icmp), &_icmp }}),
+    _packet_filter(nullptr)
+{
+  PerfCountersBuilder plb(cct, "ipv4", l_dpdk_qp_first, l_dpdk_qp_last);
+  plb.add_u64_counter(l_dpdk_total_linearize_operations, "dpdk_ip_linearize_operations", "DPDK IP Packet linearization operations");
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+  frag_handler = new C_handle_frag_timeout(this);
+}
+
+// Collect the data used to pick the destination cpu for a packet: the
+// source and destination addresses exactly as stored in the packet and, for
+// unfragmented datagrams with a registered L4 handler, whatever that
+// handler adds. Always returns true.
+bool ipv4::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  auto wire_hdr = p.get_header<ip_hdr>(off);
+
+  out_hash_data.push_back(wire_hdr->src_ip.ip);
+  out_hash_data.push_back(wire_hdr->dst_ip.ip);
+
+  auto host_hdr = wire_hdr->ntoh();
+  auto handler = _l4[host_hdr.ip_proto];
+  // Only an atomic (unfragmented) datagram is hashed on L4 fields as well;
+  // anything else is forwarded according to the IP fields only.
+  if (handler && host_hdr.mf() == false && host_hdr.offset() == 0) {
+    handler->forward(out_hash_data, p, off + sizeof(ip_hdr));
+  }
+  return true;
+}
+
+/**
+ * Process one IPv4 packet received from the L3 layer.
+ *
+ * Steps: verify the header checksum (unless offloaded or the packet was
+ * already reassembled), validate the length fields, learn the sender's MAC
+ * when it is on our subnet, give the packet filter a chance to consume the
+ * packet, drop packets not addressed to us, reassemble fragments, and
+ * finally hand the payload to the registered L4 handler.
+ *
+ * @param p     the received packet, starting at the IP header
+ * @param from  ethernet address the packet came from
+ * @return always 0; malformed or unwanted packets are silently dropped
+ */
+int ipv4::handle_received_packet(Packet p, ethernet_address from)
+{
+  auto iph = p.get_header<ip_hdr>(0);
+  if (!iph) {
+    return 0;
+  }
+
+  // Skip checking csum of reassembled IP datagram
+  if (!get_hw_features().rx_csum_offload && !p.offload_info_ref().reassembled) {
+    checksummer csum;
+    csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+    if (csum.get() != 0) {
+      return 0;
+    }
+  }
+
+  auto h = iph->ntoh();
+  unsigned ip_len = h.len;
+  unsigned ip_hdr_len = h.ihl * 4;
+  unsigned pkt_len = p.len();
+  auto offset = h.offset();
+
+  ldout(cct, 10) << __func__ << " get " << std::hex << int(h.ip_proto)
+                 << std::dec << " packet from "
+                 << h.src_ip << " -> " << h.dst_ip << " id=" << h.id
+                 << " ip_len=" << ip_len << " ip_hdr_len=" << ip_hdr_len
+                 << " pkt_len=" << pkt_len << " offset=" << offset << dendl;
+
+  // Drop packets whose length fields are inconsistent: a header shorter
+  // than the fixed IP header, or a total length that does not even cover
+  // the header. Otherwise the header trimming done further down would be
+  // asked to remove more bytes than the packet holds.
+  if (ip_hdr_len < sizeof(ip_hdr) || ip_len < ip_hdr_len) {
+    return 0;
+  }
+
+  if (pkt_len > ip_len) {
+    // Trim extra data in the packet beyond IP total length
+    p.trim_back(pkt_len - ip_len);
+  } else if (pkt_len < ip_len) {
+    // Drop if it contains less than IP total length
+    return 0;
+  }
+  // Drop if the reassembled datagram will be larger than maximum IP size
+  if (offset + p.len() > ip_packet_len_max) {
+    return 0;
+  }
+
+  // FIXME: process options
+  if (in_my_netmask(h.src_ip) && h.src_ip != _host_address) {
+    ldout(cct, 20) << __func__ << " learn mac " << from << " with " << h.src_ip << dendl;
+    _arp.learn(from, h.src_ip);
+  }
+
+  if (_packet_filter) {
+    bool handled = false;
+    _packet_filter->handle(p, &h, from, handled);
+    if (handled) {
+      return 0;
+    }
+  }
+
+  if (h.dst_ip != _host_address) {
+    // FIXME: forward
+    return 0;
+  }
+
+  // Does this IP datagram need reassembly
+  auto mf = h.mf();
+  if (mf == true || offset != 0) {
+    frag_limit_mem();
+    auto frag_id = ipv4_frag_id{h.src_ip, h.dst_ip, h.id, h.ip_proto};
+    auto& frag = _frags[frag_id];
+    if (mf == false) {
+      frag.last_frag_received = true;
+    }
+    // This is a newly created frag_id
+    if (frag.mem_size == 0) {
+      _frags_age.push_back(frag_id);
+      frag.rx_time = ceph_clock_now();
+    }
+    auto added_size = frag.merge(h, offset, std::move(p));
+    _frag_mem += added_size;
+    if (frag.is_complete()) {
+      // All the fragments are received
+      auto dropped_size = frag.mem_size;
+      auto& ip_data = frag.data.map.begin()->second;
+      // Choose a cpu to forward this packet
+      auto cpu_id = center->get_id();
+      auto l4 = _l4[h.ip_proto];
+      if (l4) {
+        size_t l4_offset = 0;
+        forward_hash hash_data;
+        hash_data.push_back(hton(h.src_ip.ip));
+        hash_data.push_back(hton(h.dst_ip.ip));
+        l4->forward(hash_data, ip_data, l4_offset);
+        cpu_id = _netif->hash2cpu(toeplitz_hash(_netif->rss_key(), hash_data));
+      }
+
+      // No need to forward if the dst cpu is the current cpu
+      if (cpu_id == center->get_id()) {
+        // Without a registered handler cpu_id is never changed, so this
+        // branch is also reached for unknown protocols: the reassembled
+        // datagram is then simply dropped instead of dereferencing null.
+        if (l4) {
+          l4->received(std::move(ip_data), h.src_ip, h.dst_ip);
+        }
+      } else {
+        auto to = _netif->hw_address();
+        auto pkt = frag.get_assembled_packet(from, to);
+        _netif->forward(center, cpu_id, std::move(pkt));
+      }
+
+      // Delete this frag from _frags and _frags_age
+      frag_drop(frag_id, dropped_size);
+      _frags_age.remove(frag_id);
+      perf_logger->set(l_dpdk_total_linearize_operations,
+                       ipv4_packet_merger::linearizations());
+    } else {
+      // Some of the fragments are missing
+      // NOTE(review): this arms the timer only when frag_timefd is already
+      // set; it looks like the intent may be the opposite (arm when not yet
+      // armed) -- confirm before changing.
+      if (frag_timefd) {
+        frag_arm();
+      }
+    }
+    return 0;
+  }
+
+  auto l4 = _l4[h.ip_proto];
+  if (l4) {
+    // Trim IP header and pass to upper layer
+    p.trim_front(ip_hdr_len);
+    l4->received(std::move(p), h.src_ip, h.dst_ip);
+  }
+  return 0;
+}
+
+// Resolve the ethernet address a packet for `to` has to be sent to and pass
+// packet and callback on to the ARP layer.
+void ipv4::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) {
+  // A host inside our netmask is reached directly; everything else goes
+  // through the default gateway.
+  ipv4_address next_hop = in_my_netmask(to) ? to : _gw_address;
+
+  _arp.wait(std::move(next_hop), std::move(p), std::move(cb));
+}
+
+// Return the hardware features reported by the underlying interface.
+const hw_features& ipv4::get_hw_features() const
+{
+  return _netif->get_hw_features();
+}
+
+/**
+ * Wrap an L4 payload in IPv4 header(s) and queue the resulting L3 packet(s)
+ * on _packetq.
+ *
+ * If the payload does not fit the MTU (see needs_frag()) it is split into
+ * fragments. All fragments of one datagram carry the same identification,
+ * and every fragment except the last carries a payload that is a multiple
+ * of 8 octets, because the fragment offset field counts 8-octet units.
+ *
+ * @param to         destination IP address
+ * @param proto_num  L4 protocol number stored in the header
+ * @param p          L4 payload
+ * @param e_dst      destination ethernet address for the queued packet(s)
+ */
+void ipv4::send(ipv4_address to, ip_protocol_num proto_num,
+        Packet p, ethernet_address e_dst) {
+  auto needs_frag = this->needs_frag(p, proto_num, get_hw_features());
+
+  // FIXME: a proper id
+  // One identification per datagram: it is chosen here, once per send(),
+  // so that every fragment produced below shares it. A receiver can only
+  // reassemble fragments whose identification matches.
+  static uint16_t next_id = 0;
+  const uint16_t datagram_id = next_id++;
+
+  auto send_pkt = [this, to, proto_num, needs_frag, e_dst, datagram_id] (Packet& pkt, uint16_t remaining, uint16_t offset) mutable  {
+    auto iph = pkt.prepend_header<ip_hdr>();
+    iph->ihl = sizeof(*iph) / 4;
+    iph->ver = 4;
+    iph->dscp = 0;
+    iph->ecn = 0;
+    iph->len = pkt.len();
+    iph->id = datagram_id;
+    if (needs_frag) {
+      uint16_t mf = remaining > 0;
+      // The fragment offset is measured in units of 8 octets (64 bits)
+      auto off = offset / 8;
+      iph->frag = (mf << uint8_t(ip_hdr::frag_bits::mf)) | off;
+    } else {
+      iph->frag = 0;
+    }
+    iph->ttl = 64;
+    iph->ip_proto = (uint8_t)proto_num;
+    iph->csum = 0;
+    iph->src_ip = _host_address;
+    iph->dst_ip = to;
+    ldout(cct, 20) << " ipv4::send " << " id=" << iph->id << " " << _host_address << " -> " << to
+                   << " len " << pkt.len() << dendl;
+    *iph = iph->hton();
+
+    if (get_hw_features().tx_csum_ip_offload) {
+      iph->csum = 0;
+      pkt.offload_info_ref().needs_ip_csum = true;
+    } else {
+      checksummer csum;
+      csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+      iph->csum = csum.get();
+    }
+
+    _packetq.push_back(
+            l3_protocol::l3packet{eth_protocol_num::ipv4, e_dst, std::move(pkt)});
+  };
+
+  if (needs_frag) {
+    uint16_t offset = 0;
+    uint16_t remaining = p.len();
+    auto mtu = get_hw_features().mtu;
+    const uint16_t max_payload = uint16_t(mtu - ipv4_hdr_len_min);
+
+    while (remaining) {
+      auto can_send = std::min(max_payload, remaining);
+      if (can_send < remaining) {
+        // Not the last fragment: round the payload down to a multiple of
+        // 8 octets so the next fragment's offset is exactly representable.
+        can_send -= can_send % 8;
+        if (can_send == 0) {
+          // The MTU leaves no room for even one 8-octet unit; no progress
+          // is possible, so give up rather than loop forever.
+          break;
+        }
+      }
+      remaining -= can_send;
+      auto pkt = p.share(offset, can_send);
+      send_pkt(pkt, remaining, offset);
+      offset += can_send;
+    }
+  } else {
+    // The whole packet can be sent in one shot
+    send_pkt(p, 0, 0);
+  }
+}
+
+// Produce the next L3 packet to transmit, if any. When nothing is queued,
+// the registered L4 packet providers are polled round-robin (at most once
+// each); the first packet obtained is run through send(), which fills
+// _packetq. The front of _packetq is then returned; the result is empty if
+// the queue is still empty.
+Tub<l3_protocol::l3packet> ipv4::get_packet() {
+  // _packetq will be mostly empty here unless it holds remnants of a
+  // previously fragmented packet
+  if (_packetq.empty()) {
+    const size_t nr_providers = _pkt_providers.size();
+    for (size_t polled = 0; polled < nr_providers; ++polled) {
+      const auto slot = _pkt_provider_idx++;
+      auto l4p = _pkt_providers[slot]();
+      // Wrap around so the next poll resumes after this provider.
+      if (_pkt_provider_idx == _pkt_providers.size()) {
+        _pkt_provider_idx = 0;
+      }
+      if (!l4p) {
+        continue;
+      }
+      ldout(cct, 20) << " ipv4::get_packet len " << l4p->p.len() << dendl;
+      send(l4p->to, l4p->proto_num, std::move(l4p->p), l4p->e_dst);
+      break;
+    }
+  }
+
+  Tub<l3_protocol::l3packet> next;
+  if (_packetq.empty()) {
+    return next;
+  }
+  next = std::move(_packetq.front());
+  _packetq.pop_front();
+  return next;
+}
+
+// Keep the memory held by partially reassembled datagrams bounded: once
+// _frag_mem exceeds the high threshold, drop the oldest fragment sets until
+// the amount above the low threshold has been released or nothing is left.
+void ipv4::frag_limit_mem() {
+  if (_frag_mem <= _frag_high_thresh) {
+    return;
+  }
+  auto to_release = _frag_mem - _frag_low_thresh;
+  while (to_release != 0 && !_frags_age.empty()) {
+    // The oldest entry sits at the front of _frags_age
+    auto victim = _frags_age.front();
+    _frags_age.pop_front();
+
+    // Remove it from _frags as well
+    auto& entry = _frags[victim];
+    auto released = entry.mem_size;
+    frag_drop(victim, released);
+
+    to_release -= std::min(to_release, released);
+  }
+}
+
+// Timer callback: drop every fragment set older than _frag_timeout, oldest
+// first, then re-arm the timer if any fragment set remains; otherwise reset
+// the fragment memory accounting.
+void ipv4::frag_timeout() {
+  if (_frags.empty()) {
+    return;
+  }
+  auto now = ceph_clock_now();
+  while (!_frags_age.empty()) {
+    auto oldest_id = _frags_age.front();
+    auto& oldest = _frags[oldest_id];
+    if (!(now > oldest.rx_time + _frag_timeout)) {
+      // _frags_age is ordered by age, so the rest can only be younger
+      break;
+    }
+    auto dropped_size = oldest.mem_size;
+    // Drop from _frags
+    frag_drop(oldest_id, dropped_size);
+    // Drop from _frags_age
+    _frags_age.pop_front();
+  }
+  if (_frags.empty()) {
+    _frag_mem = 0;
+  } else {
+    frag_arm(now);
+  }
+}
+
+// Add one received fragment to this fragment set. The IP header of the
+// fragment at offset 0 is kept in `header`; the payload is merged into
+// `data` at `offset`. Returns the change in memory held by this set.
+int32_t ipv4::frag::merge(ip_hdr &h, uint16_t offset, Packet p) {
+  const uint32_t size_before = mem_size;
+  const unsigned hdr_bytes = h.ihl * 4;
+  // Store IP header
+  if (offset == 0) {
+    header = p.share(0, hdr_bytes);
+  }
+  // Store IP payload
+  p.trim_front(hdr_bytes);
+  data.merge(offset, std::move(p));
+  // Recompute the memory held by the header and all payload pieces
+  uint32_t total = header.memory();
+  for (const auto& piece : data.map) {
+    total += piece.second.memory();
+  }
+  mem_size = total;
+  return mem_size - size_before;
+}
+
+// A datagram is complete once the last fragment has been seen and merge()
+// has collapsed all received fragments into one packet starting at offset 0.
+bool ipv4::frag::is_complete() {
+  const auto first_offset = data.map.begin()->first;
+  const bool single_piece = data.map.size() == 1;
+  return last_frag_received && single_piece && first_offset == 0;
+}
+
+// Build a full ethernet frame (ethernet header + IP header + reassembled
+// payload) from this fragment set so it can be forwarded to another cpu.
+// The stored header and first payload packet are moved from, so the set
+// must not be used for reassembly afterwards. The returned packet is
+// flagged as reassembled in its offload info.
+Packet ipv4::frag::get_assembled_packet(ethernet_address from, ethernet_address to) {
+  auto& ip_header = header;
+  auto& ip_data = data.map.begin()->second;
+  // Append a ethernet header, needed for forwarding
+  auto eh = ip_header.prepend_header<eth_hdr>();
+  eh->src_mac = from;
+  eh->dst_mac = to;
+  eh->eth_proto = uint16_t(eth_protocol_num::ipv4);
+  *eh = eh->hton();
+  // Prepare a packet contains both ethernet header, ip header and ip data
+  ip_header.append(std::move(ip_data));
+  auto pkt = std::move(ip_header);
+  auto iph = pkt.get_header<ip_hdr>(sizeof(eth_hdr));
+  // len is the sum of each fragment
+  iph->len = hton(uint16_t(pkt.len() - sizeof(eth_hdr)));
+  // No fragmentation for the assembled datagram
+  iph->frag = 0;
+  // Since each fragment's csum is checked, no need to csum
+  // again for the assembled datagram
+  offload_info oi;
+  oi.reassembled = true;
+  pkt.set_offload_info(oi);
+  return pkt;
+}
+
+// Handle a received ICMP packet. Only echo requests are answered; anything
+// else (or a packet without a readable ICMP header) is ignored. The request
+// is turned into the reply in place: the type is rewritten, code and
+// checksum are cleared, and the checksum is recomputed over p.len() bytes
+// starting at the header. The `to` argument is not used.
+void icmp::received(Packet p, ipaddr from, ipaddr to) {
+  auto hdr = p.get_header<icmp_hdr>(0);
+  if (!hdr || hdr->type != icmp_hdr::msg_type::echo_request) {
+    return;
+  }
+  hdr->type = icmp_hdr::msg_type::echo_reply;
+  hdr->code = 0;
+  hdr->csum = 0;
+  checksummer csum;
+  // NOTE(review): sums p.len() bytes from the header pointer, which assumes
+  // the whole packet is contiguous in memory -- confirm.
+  csum.sum(reinterpret_cast<char*>(hdr), p.len());
+  hdr->csum = csum.get();
+
+  if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue
+    // Queue the reply for the sender once its ethernet address is known.
+    // NOTE(review): when resolution fails (r != 0) the space reserved above
+    // is not returned in this function -- confirm whether it is released
+    // elsewhere.
+    auto cb = [this, from] (const ethernet_address e_dst, Packet p, int r) mutable {
+        if (r == 0) {
+          _packetq.emplace_back(ipv4_traits::l4packet{from, std::move(p), e_dst, ip_protocol_num::icmp});
+        }
+    };
+    _inet.wait_l2_dst_address(from, std::move(p), cb);
+  }
+}
diff --git a/src/msg/async/dpdk/IP.h b/src/msg/async/dpdk/IP.h
new file mode 100644
index 00000000..480b4b95
--- /dev/null
+++ b/src/msg/async/dpdk/IP.h
@@ -0,0 +1,414 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#ifndef CEPH_MSG_IP_H_
+#define CEPH_MSG_IP_H_
+
+#include <arpa/inet.h>
+#include <unordered_map>
+#include <cstdint>
+#include <array>
+#include <map>
+#include <list>
+#include <chrono>
+
+#include "msg/async/Event.h"
+#include "common/Throttle.h"
+
+#include "array_map.h"
+#include "ARP.h"
+#include "IPChecksum.h"
+#include "ip_types.h"
+#include "const.h"
+#include "net.h"
+#include "PacketUtil.h"
+#include "toeplitz.h"
+
+class ipv4;
+template <ip_protocol_num ProtoNum>
+class ipv4_l4;
+
+template <typename InetTraits>
+class tcp;
+
+// Traits bundle describing IPv4 to the protocol-generic L4 code (e.g. the
+// tcp<InetTraits> template).
+struct ipv4_traits {
+  using address_type = ipv4_address;
+  using inet_type = ipv4_l4<ip_protocol_num::tcp>;
+  // An L4 payload ready to be handed to ipv4::send(): destination address,
+  // payload, resolved destination ethernet address and L4 protocol number.
+  struct l4packet {
+    ipv4_address to;
+    Packet p;
+    ethernet_address e_dst;
+    ip_protocol_num proto_num;
+  };
+  // Callback polled for outgoing packets; returns an empty Tub when it has
+  // nothing to send.
+  using packet_provider_type = std::function<Tub<l4packet> ()>;
+  // Feed the TCP pseudo header (source, destination, zero byte, protocol
+  // number, length) into csum.
+  static void tcp_pseudo_header_checksum(checksummer& csum, ipv4_address src, ipv4_address dst, uint16_t len) {
+    csum.sum_many(src.ip, dst.ip, uint8_t(0), uint8_t(ip_protocol_num::tcp), len);
+  }
+  static constexpr uint8_t ip_hdr_len_min = ipv4_hdr_len_min;
+};
+
+// Thin per-protocol facade over the ipv4 instance, used by an L4 protocol
+// to register its packet provider (tagged with ProtoNum) and to request
+// L2 address resolution.
+template <ip_protocol_num ProtoNum>
+class ipv4_l4 {
+ public:
+  ipv4& _inet;
+ public:
+  ipv4_l4(ipv4& inet) : _inet(inet) {}
+  void register_packet_provider(ipv4_traits::packet_provider_type func);
+  void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb);
+};
+
+// Interface implemented by the L4 protocol handlers stored in ipv4's
+// protocol table.
+class ip_protocol {
+ public:
+  virtual ~ip_protocol() {}
+  // Deliver a received payload together with its source and destination
+  // addresses.
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) = 0;
+  // Optionally add protocol-specific data to the forwarding hash; the
+  // default adds nothing and returns true.
+  virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return true; }
+};
+
+// Identifies an L4 connection by its local/foreign address and port pairs.
+template <typename InetTraits>
+struct l4connid {
+  using ipaddr = typename InetTraits::address_type;
+  using inet_type = typename InetTraits::inet_type;
+  // Hash functor for use in unordered containers (defined further below).
+  struct connid_hash;
+
+  ipaddr local_ip;
+  ipaddr foreign_ip;
+  uint16_t local_port;
+  uint16_t foreign_port;
+
+  bool operator==(const l4connid& x) const {
+    return local_ip == x.local_ip
+           && foreign_ip == x.foreign_ip
+           && local_port == x.local_port
+           && foreign_port == x.foreign_port;
+  }
+
+  // Toeplitz hash over (foreign ip, local ip, foreign port, local port),
+  // each converted with hton() first.
+  uint32_t hash(const rss_key_type& rss_key) {
+    forward_hash hash_data;
+    hash_data.push_back(hton(foreign_ip.ip));
+    hash_data.push_back(hton(local_ip.ip));
+    hash_data.push_back(hton(foreign_port));
+    hash_data.push_back(hton(local_port));
+    return toeplitz_hash(rss_key, hash_data);
+  }
+};
+
+// ip_protocol adapter that plugs the TCP implementation into ipv4's
+// protocol table. Owns the tcp<ipv4_traits> instance; ipv4 is a friend so
+// that it can reach it (see ipv4::get_tcp()).
+class ipv4_tcp final : public ip_protocol {
+  ipv4_l4<ip_protocol_num::tcp> _inet_l4;
+  std::unique_ptr<tcp<ipv4_traits>> _tcp;
+ public:
+  ipv4_tcp(ipv4& inet, EventCenter *c);
+  ~ipv4_tcp();
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) override;
+  virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) override;
+  friend class ipv4;
+};
+
+// Packed ICMP header layout: type, code, checksum and the 4-byte
+// "rest of header" field.
+struct icmp_hdr {
+  // Only the two message types handled by this stack are listed.
+  enum class msg_type : uint8_t {
+    echo_reply = 0,
+    echo_request = 8,
+  };
+  msg_type type;
+  uint8_t code;
+  uint16_t csum;
+  uint32_t rest;
+} __attribute__((packed));
+
+
+// Minimal ICMP implementation: received() (defined in the .cc file) handles
+// incoming packets and places replies on _packetq, from where the packet
+// provider registered in the constructor hands them to the IP layer.
+class icmp {
+ public:
+  using ipaddr = ipv4_address;
+  using inet_type = ipv4_l4<ip_protocol_num::icmp>;
+  // Registers a packet provider that pops the oldest queued packet (if any)
+  // and returns its length to the _queue_space throttle, which is created
+  // with a limit of 212992.
+  explicit icmp(CephContext *c, inet_type& inet)
+      : cct(c), _inet(inet), _queue_space(c, "DPDK::icmp::_queue_space", 212992) {
+    _inet.register_packet_provider([this] {
+      Tub<ipv4_traits::l4packet> l4p;
+      if (!_packetq.empty()) {
+        l4p = std::move(_packetq.front());
+        _packetq.pop_front();
+        _queue_space.put(l4p->p.len());
+      }
+      return l4p;
+    });
+  }
+  void received(Packet p, ipaddr from, ipaddr to);
+
+ private:
+  CephContext *cct;
+  // ipv4_l4<ip_protocol_num::icmp>
+  inet_type& _inet;
+  // Packets waiting to be picked up by the packet provider.
+  circular_buffer<ipv4_traits::l4packet> _packetq;
+  // Bounds the total length of packets sitting in _packetq.
+  Throttle _queue_space;
+};
+
+// ip_protocol adapter that plugs the icmp implementation into ipv4's
+// protocol table; received packets are passed straight to icmp::received().
+class ipv4_icmp final : public ip_protocol {
+  CephContext *cct;
+  ipv4_l4<ip_protocol_num::icmp> _inet_l4;
+  icmp _icmp;
+ public:
+  ipv4_icmp(CephContext *c, ipv4& inet) : cct(c), _inet_l4(inet), _icmp(c, _inet_l4) {}
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) override {
+    _icmp.received(std::move(p), from, to);
+  }
+  friend class ipv4;
+};
+
+struct ip_hdr;
+
+// Hook that sees received packets before normal IPv4 processing. An
+// implementation sets `handled` to true to indicate that it consumed the
+// packet, in which case the IP layer stops processing it.
+struct ip_packet_filter {
+  virtual ~ip_packet_filter() {};
+  virtual void handle(Packet& p, ip_hdr* iph, ethernet_address from, bool & handled) = 0;
+};
+
+// Key identifying the fragments that belong to one IP datagram: source and
+// destination address, identification field and protocol number.
+struct ipv4_frag_id {
+  // Hash functor for use in unordered containers (defined below).
+  struct hash;
+  ipv4_address src_ip;
+  ipv4_address dst_ip;
+  uint16_t identification;
+  uint8_t protocol;
+  bool operator==(const ipv4_frag_id& x) const {
+    return src_ip == x.src_ip &&
+           dst_ip == x.dst_ip &&
+           identification == x.identification &&
+           protocol == x.protocol;
+  }
+};
+
+// Hash for ipv4_frag_id: XOR of the standard hashes of its four fields.
+// Because XOR is symmetric, swapping src_ip and dst_ip yields the same hash.
+struct ipv4_frag_id::hash : private std::hash<ipv4_address>,
+                            private std::hash<uint16_t>, private std::hash<uint8_t> {
+  size_t operator()(const ipv4_frag_id& id) const noexcept {
+    using h1 = std::hash<ipv4_address>;
+    using h2 = std::hash<uint16_t>;
+    using h3 = std::hash<uint8_t>;
+    return h1::operator()(id.src_ip) ^
+           h1::operator()(id.dst_ip) ^
+           h2::operator()(id.identification) ^
+           h3::operator()(id.protocol);
+  }
+};
+
+// Tag type that gives the IPv4 fragment merger its own packet_merger
+// instantiation (keyed by a 32-bit offset).
+struct ipv4_tag {};
+using ipv4_packet_merger = packet_merger<uint32_t, ipv4_tag>;
+
+class interface;
+
+// The IPv4 layer of the userspace network stack. It sits between the L3
+// (ethernet) layer and the L4 protocol handlers: it validates and
+// dispatches received packets, reassembles fragments, and builds (and if
+// necessary fragments) outgoing packets obtained from the registered L4
+// packet providers.
+class ipv4 {
+ public:
+  using address_type = ipv4_address;
+  using proto_type = uint16_t;
+  static address_type broadcast_address() { return ipv4_address(0xffffffff); }
+  static proto_type arp_protocol_type() { return proto_type(eth_protocol_num::ipv4); }
+  CephContext *cct;
+  EventCenter *center;
+
+ private:
+  interface* _netif;
+  // L4 packet providers, polled round-robin by get_packet().
+  std::vector<ipv4_traits::packet_provider_type> _pkt_providers;
+  // Id of the time event created by frag_arm(); empty until first armed.
+  Tub<uint64_t> frag_timefd;
+  // Callback run by the fragment timer; allocated in the constructor and
+  // deleted in the destructor.
+  EventCallbackRef frag_handler;
+  arp _global_arp;
+  arp_for<ipv4> _arp;
+  ipv4_address _host_address;
+  ipv4_address _gw_address;
+  ipv4_address _netmask;
+  l3_protocol _l3;
+  subscription<Packet, ethernet_address> _rx_packets;
+  ipv4_tcp _tcp;
+  ipv4_icmp _icmp;
+  // L4 handler per IP protocol number.
+  array_map<ip_protocol*, 256> _l4;
+  ip_packet_filter *_packet_filter;
+  // Reassembly state for one fragmented datagram.
+  struct frag {
+    Packet header;
+    ipv4_packet_merger data;
+    utime_t rx_time;
+    uint32_t mem_size = 0;
+    // fragment with MF == 0 indicates it is the last fragment
+    bool last_frag_received = false;
+
+    Packet get_assembled_packet(ethernet_address from, ethernet_address to);
+    int32_t merge(ip_hdr &h, uint16_t offset, Packet p);
+    bool is_complete();
+  };
+  // Datagrams currently being reassembled, and their ids ordered oldest
+  // first.
+  std::unordered_map<ipv4_frag_id, frag, ipv4_frag_id::hash> _frags;
+  std::list<ipv4_frag_id> _frags_age;
+  static utime_t _frag_timeout;
+  // frag_limit_mem() starts dropping above the high threshold and aims to
+  // get back down to the low threshold.
+  static constexpr uint32_t _frag_low_thresh{3 * 1024 * 1024};
+  static constexpr uint32_t _frag_high_thresh{4 * 1024 * 1024};
+  // Memory currently accounted to partially reassembled datagrams.
+  uint32_t _frag_mem = 0;
+  // Outgoing L3 packets produced by send() and consumed by get_packet().
+  circular_buffer<l3_protocol::l3packet> _packetq;
+  unsigned _pkt_provider_idx = 0;
+  PerfCounters *perf_logger;
+
+ private:
+  int handle_received_packet(Packet p, ethernet_address from);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  Tub<l3_protocol::l3packet> get_packet();
+  // True when `a` and the host address agree on all bits set in the netmask.
+  bool in_my_netmask(ipv4_address a) const {
+    return !((a.ip ^ _host_address.ip) & _netmask.ip);
+  }
+  void frag_limit_mem();
+  // Forget a fragment set and subtract its size from the accounting.
+  void frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size) {
+    _frags.erase(frag_id);
+    _frag_mem -= dropped_size;
+  }
+  // Schedule frag_handler based on `now` + _frag_timeout.
+  void frag_arm(utime_t now) {
+    auto tp = now + _frag_timeout;
+    frag_timefd.construct(center->create_time_event(tp.to_nsec() / 1000, frag_handler));
+  }
+  // NOTE(review): unlike the overload above this does not add
+  // _frag_timeout to the current time -- confirm this is intended.
+  void frag_arm() {
+    auto now = ceph_clock_now();
+    frag_timefd.construct(center->create_time_event(now.to_nsec() / 1000, frag_handler));
+  }
+
+ public:
+  void frag_timeout();
+
+ public:
+  explicit ipv4(CephContext *c, EventCenter *cen, interface* netif);
+  // NOTE(review): only frag_handler is released here; perf_logger, created
+  // in the constructor, is not -- confirm whether it is cleaned up
+  // elsewhere.
+  ~ipv4() {
+    delete frag_handler;
+  }
+  // Also informs the ARP layer of our own address.
+  void set_host_address(ipv4_address ip) {
+    _host_address = ip;
+    _arp.set_self_addr(ip);
+  }
+  ipv4_address host_address() {
+    return _host_address;
+  }
+  void set_gw_address(ipv4_address ip) {
+    _gw_address = ip;
+  }
+  ipv4_address gw_address() const {
+    return _gw_address;
+  }
+  void set_netmask_address(ipv4_address ip) {
+    _netmask = ip;
+  }
+  ipv4_address netmask_address() const {
+    return _netmask;
+  }
+  interface *netif() const {
+    return _netif;
+  }
+  // TODO or something. Should perhaps truly be a list
+  // of filters. With ordering. And blackjack. Etc.
+  // But for now, a simple single raw pointer suffices
+  void set_packet_filter(ip_packet_filter *f) {
+    _packet_filter = f;
+  }
+  ip_packet_filter * packet_filter() const {
+    return _packet_filter;
+  }
+  void send(ipv4_address to, ip_protocol_num proto_num, Packet p, ethernet_address e_dst);
+  tcp<ipv4_traits>& get_tcp() { return *_tcp._tcp; }
+  void register_l4(proto_type id, ip_protocol* handler);
+  const hw_features& get_hw_features() const;
+  // A packet needs IP fragmentation when payload plus minimal IP header
+  // exceeds the MTU, unless it is TCP and the hardware does TSO.
+  static bool needs_frag(Packet& p, ip_protocol_num proto_num, hw_features hw_features) {
+    if (p.len() + ipv4_hdr_len_min <= hw_features.mtu)
+      return false;
+
+    if ((proto_num == ip_protocol_num::tcp && hw_features.tx_tso))
+      return false;
+
+    return true;
+  }
+  void learn(ethernet_address l2, ipv4_address l3) {
+    _arp.learn(l2, l3);
+  }
+  void register_packet_provider(ipv4_traits::packet_provider_type&& func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb);
+};
+
+// Register `func` with the IP layer, wrapped so that every packet it
+// produces is stamped with this facade's protocol number.
+template <ip_protocol_num ProtoNum>
+inline void ipv4_l4<ProtoNum>::register_packet_provider(
+    ipv4_traits::packet_provider_type func) {
+  auto stamping_provider = [func] {
+    auto produced = func();
+    if (produced) {
+      produced->proto_num = ProtoNum;
+    }
+    return produced;
+  };
+  _inet.register_packet_provider(std::move(stamping_provider));
+}
+
+// Forward an L2 address resolution request to the owning ipv4 instance.
+template <ip_protocol_num ProtoNum>
+inline void ipv4_l4<ProtoNum>::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) {
+  _inet.wait_l2_dst_address(to, std::move(p), std::move(cb));
+}
+
+// Packed IPv4 header layout. hton()/ntoh() return a byte-swapped copy;
+// the mf()/df()/offset() helpers read `frag` using host-order bit positions.
+struct ip_hdr {
+  // NOTE(review): ihl is declared before ver, which presumably relies on a
+  // little-endian bit-field layout -- confirm for other targets.
+  uint8_t ihl : 4;
+  uint8_t ver : 4;
+  uint8_t dscp : 6;
+  uint8_t ecn : 2;
+  uint16_t len;
+  uint16_t id;
+  uint16_t frag;
+  // Bit positions of the flags inside `frag`, plus the shift that turns the
+  // fragment offset field (8-octet units) into a byte offset.
+  enum class frag_bits : uint8_t { mf = 13, df = 14, reserved = 15, offset_shift = 3 };
+  uint8_t ttl;
+  uint8_t ip_proto;
+  uint16_t csum;
+  ipv4_address src_ip;
+  ipv4_address dst_ip;
+  uint8_t options[0];
+  // Copy of this header with the multi-byte fields converted by ::hton().
+  ip_hdr hton() {
+    ip_hdr hdr = *this;
+    hdr.len = ::hton(len);
+    hdr.id = ::hton(id);
+    hdr.frag = ::hton(frag);
+    hdr.csum = ::hton(csum);
+    hdr.src_ip.ip = ::hton(src_ip.ip);
+    hdr.dst_ip.ip = ::hton(dst_ip.ip);
+    return hdr;
+  }
+  // Copy of this header with the multi-byte fields converted by ::ntoh().
+  ip_hdr ntoh() {
+    ip_hdr hdr = *this;
+    hdr.len = ::ntoh(len);
+    hdr.id = ::ntoh(id);
+    hdr.frag = ::ntoh(frag);
+    hdr.csum = ::ntoh(csum);
+    hdr.src_ip = src_ip.ntoh();
+    hdr.dst_ip = dst_ip.ntoh();
+    return hdr;
+  }
+
+  // "More fragments" and "don't fragment" flags.
+  bool mf() { return frag & (1 << uint8_t(frag_bits::mf)); }
+  bool df() { return frag & (1 << uint8_t(frag_bits::df)); }
+  // Fragment offset in bytes: shifting left by 3 multiplies the 8-octet
+  // units by 8, and the conversion to uint16_t discards the three flag bits
+  // that were shifted past bit 15.
+  uint16_t offset() { return frag << uint8_t(frag_bits::offset_shift); }
+} __attribute__((packed));
+
+// Hash for l4connid: XOR of the standard hashes of both addresses and both
+// ports. Because XOR is symmetric, swapping the local and foreign ends
+// yields the same hash.
+template <typename InetTraits>
+struct l4connid<InetTraits>::connid_hash : private std::hash<ipaddr>, private std::hash<uint16_t> {
+  size_t operator()(const l4connid<InetTraits>& id) const noexcept {
+    using h1 = std::hash<ipaddr>;
+    using h2 = std::hash<uint16_t>;
+    return h1::operator()(id.local_ip)
+           ^ h1::operator()(id.foreign_ip)
+           ^ h2::operator()(id.local_port)
+           ^ h2::operator()(id.foreign_port);
+  }
+};
+
+#endif /* CEPH_MSG_IP_H_ */
diff --git a/src/msg/async/dpdk/IPChecksum.cc b/src/msg/async/dpdk/IPChecksum.cc
new file mode 100644
index 00000000..7a3253c1
--- /dev/null
+++ b/src/msg/async/dpdk/IPChecksum.cc
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include <arpa/inet.h>
+#include "net.h"
+#include "IPChecksum.h"
+
+// Adds `len` bytes starting at `data` to the running sum.
+//
+// Bytes are accumulated as big-endian words (ntohq/ntohs below).  `odd`
+// records whether an odd number of bytes has been consumed so far, so a
+// buffer may be fed in pieces split at any byte boundary: when `odd` is set
+// on entry, the first byte of this call is added as the low byte of the word
+// left unfinished by the previous call.
+//
+// A zero-length call is a no-op.
+void checksummer::sum(const char* data, size_t len) {
+  if (len == 0) {
+    // Without this guard a pending odd byte would make the code below read
+    // *data and wrap `len` around to SIZE_MAX, walking far out of bounds.
+    return;
+  }
+  auto orig_len = len;
+  if (odd) {
+    // completes the 16-bit word whose high byte came from the previous call
+    csum += uint8_t(*data++);
+    --len;
+  }
+  // bulk of the data, eight bytes at a time
+  auto p64 = reinterpret_cast<const uint64_t*>(data);
+  while (len >= 8) {
+    csum += ntohq(*p64++);
+    len -= 8;
+  }
+  // remaining whole 16-bit words
+  auto p16 = reinterpret_cast<const uint16_t*>(p64);
+  while (len >= 2) {
+    csum += ntohs(*p16++);
+    len -= 2;
+  }
+  auto p8 = reinterpret_cast<const uint8_t*>(p16);
+  if (len) {
+    // trailing single byte: high byte of a word the next call will complete
+    csum += *p8 << 8;
+  }
+  odd ^= orig_len & 1;
+}
+
+// Folds the 128-bit accumulator down to 16 bits, complements it and returns
+// the result converted with htons().  The accumulator itself is not changed.
+uint16_t checksummer::get() const {
+  const __int128 low64_mask = 0xffffffffffffffff;
+  // 128 -> 64 bits; the second step folds the carry produced by the first
+  const __int128 wide = (csum & low64_mask) + (csum >> 64);
+  uint64_t folded = (wide & low64_mask) + (wide >> 64);
+  // 64 -> 16 bits: add the four 16-bit lanes ...
+  folded = (folded & 0xffff) + ((folded >> 16) & 0xffff) + ((folded >> 32) & 0xffff) + (folded >> 48);
+  // ... then fold the carries back in, twice
+  for (int pass = 0; pass < 2; ++pass) {
+    folded = (folded & 0xffff) + (folded >> 16);
+  }
+  return htons(~folded);
+}
+
+// Adds every fragment of `p`, in order, to the running sum.
+void checksummer::sum(const Packet& p) {
+  const unsigned count = p.nr_frags();
+  for (unsigned i = 0; i < count; ++i) {
+    const fragment piece = p.frag(i);
+    sum(piece.base, piece.size);
+  }
+}
+
+// One-shot helper: checksums `len` bytes at `data` with a fresh checksummer.
+uint16_t ip_checksum(const void* data, size_t len) {
+  checksummer acc;
+  acc.sum(static_cast<const char*>(data), len);
+  return acc.get();
+}
diff --git a/src/msg/async/dpdk/IPChecksum.h b/src/msg/async/dpdk/IPChecksum.h
new file mode 100644
index 00000000..9af4a86b
--- /dev/null
+++ b/src/msg/async/dpdk/IPChecksum.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_CHECKSUM_H_
+#define CEPH_MSG_CHECKSUM_H_
+
+#include <cstdint>
+#include <cstddef>
+#include <arpa/inet.h>
+
+#include "Packet.h"
+
+uint16_t ip_checksum(const void* data, size_t len);
+
+// Incremental checksum accumulator.
+//
+// Values are added into the wide `csum` accumulator; `odd` is flipped for
+// every single byte added so that data can be fed at any byte alignment.
+// get() folds the accumulator into the final 16-bit value.
+struct checksummer {
+  __int128 csum = 0;
+  bool odd = false;
+  void sum(const char* data, size_t len);
+  void sum(const Packet& p);
+  // One byte: high half of a word at an even position, low half otherwise.
+  void sum(uint8_t data) {
+    csum += odd ? data : (data << 8);
+    odd = !odd;
+  }
+  // 16-bit value: added whole when aligned, otherwise byte by byte
+  // (high byte first).
+  void sum(uint16_t data) {
+    if (!odd) {
+      csum += data;
+      return;
+    }
+    sum(uint8_t(data >> 8));
+    sum(uint8_t(data));
+  }
+  // 32-bit value: added whole when aligned, otherwise as two 16-bit halves.
+  void sum(uint32_t data) {
+    if (!odd) {
+      csum += data;
+      return;
+    }
+    sum(uint16_t(data));
+    sum(uint16_t(data >> 16));
+  }
+  // Recursion terminator for the variadic overload below.
+  void sum_many() {}
+  // Adds each argument in order, using the sum() overload matching its type.
+  template <typename T0, typename... T>
+  void sum_many(T0 data, T... rest) {
+    sum(data);
+    sum_many(rest...);
+  }
+  uint16_t get() const;
+};
+
+#endif /* CEPH_MSG_CHECKSUM_H_ */
diff --git a/src/msg/async/dpdk/Packet.cc b/src/msg/async/dpdk/Packet.cc
new file mode 100644
index 00000000..6c2320a0
--- /dev/null
+++ b/src/msg/async/dpdk/Packet.cc
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <algorithm>
+#include <cctype>
+
+#include "capture.h"
+#include "Packet.h"
+
+constexpr size_t Packet::internal_data_size;
+constexpr size_t Packet::default_nr_frags;
+
+// Replaces the run of fragments that starts at index `at_frag` and together
+// holds at least `desired_size` bytes with a single newly allocated fragment
+// containing a copy of their bytes.
+//
+// The counting loop does not bound-check the fragment index, so the caller
+// must make sure the fragments from `at_frag` onward hold at least
+// `desired_size` bytes.
+void Packet::linearize(size_t at_frag, size_t desired_size) {
+  // make sure frags[0] no longer points into the impl's internal buffer
+  _impl->unuse_internal_data();
+  size_t nr_frags = 0;
+  size_t accum_size = 0;
+  // count how many whole fragments are needed to cover desired_size
+  while (accum_size < desired_size) {
+    accum_size += _impl->frags[at_frag + nr_frags].size;
+    ++nr_frags;
+  }
+  char *new_frag = new char[accum_size];
+  auto p = new_frag;
+  for (size_t i = 0; i < nr_frags; ++i) {
+    auto& f = _impl->frags[at_frag + i];
+    p = std::copy(f.base, f.base + f.size, p);
+  }
+  // collapse nr_frags into one fragment
+  std::copy(_impl->frags + at_frag + nr_frags, _impl->frags + _impl->_nr_frags,
+            _impl->frags + at_frag + 1);
+  _impl->_nr_frags -= nr_frags - 1;
+  _impl->frags[at_frag] = fragment{new_frag, accum_size};
+  if (at_frag == 0 && desired_size == len()) {
+    // We can drop the old buffer safely
+    // (the whole packet was copied; the previous deleter is moved into `x`,
+    // which is destroyed at the end of this scope)
+    auto x = std::move(_impl->_deleter);
+    _impl->_deleter = make_deleter([new_frag] { delete []new_frag; });
+  } else {
+    // Other fragments still reference the old buffers: keep the previous
+    // deleter alive inside the bound callable that frees new_frag.
+    auto del = std::bind(
+            [new_frag](deleter &d) { delete []new_frag; }, std::move(_impl->_deleter));
+    _impl->_deleter = make_deleter(std::move(del));
+  }
+}
+
+// Event callback used by Packet::free_on_cpu(): owns a deleter and a
+// completion function.  When the event is handled it destroys the deleter,
+// calls the function and then deletes itself, so instances must be created
+// with `new`.
+class C_free_on_cpu : public EventCallback {
+  deleter del;
+  std::function<void()> cb;
+ public:
+  C_free_on_cpu(deleter &&d, std::function<void()> &&c):
+      del(std::move(d)), cb(std::move(c)) {}
+  // `fd` is not used.
+  void do_request(uint64_t fd) {
+    // deleter needs to be moved from lambda capture to be destroyed here
+    // otherwise deleter destructor will be called on a cpu that called
+    // create_external_event when work_item is destroyed.
+    deleter xxx(std::move(del));
+    cb();
+    // self-owned: nothing may touch this object after this line
+    delete this;
+  }
+};
+
+// Returns a packet built from a copy of this packet's impl whose deleter,
+// when run, dispatches a C_free_on_cpu event to `center`; that event destroys
+// the original deleter and then calls `cb`.
+//
+// impl::copy() moves the deleter out of this packet's impl, so the returned
+// packet is the one that owns the release chain afterwards.
+Packet Packet::free_on_cpu(EventCenter *center, std::function<void()> cb)
+{
+  // the bound callable carries the current deleter as its argument
+  auto del = std::bind(
+      [center, cb] (deleter &del) mutable {
+        center->dispatch_event_external(new C_free_on_cpu(std::move(del), std::move(cb)));
+      }, std::move(_impl->_deleter));
+  // make new deleter that runs old deleter on an origin cpu
+  _impl->_deleter = make_deleter(deleter(), std::move(del));
+
+  return Packet(impl::copy(_impl.get()));
+}
+
+// Writes a human-readable dump of `p` in the form "Packet{frag, frag, ...}".
+//
+//  - A fragment whose bytes all lie in [9, 0x7f] is printed as a quoted
+//    string: printable characters verbatim, \r \n \t as those escapes, and
+//    any other byte as \xHH (two hex digits).
+//  - Any other fragment is printed as "{hh hh ...}", two hex digits per byte.
+//
+// Bytes are formatted through a digit table rather than by streaming
+// integers: streaming a uint8_t emits the raw character, and streaming
+// b / 16 and b % 16 as ints emits decimal numbers (0x1f would print "115").
+std::ostream& operator<<(std::ostream& os, const Packet& p) {
+  static const char hex_digits[] = "0123456789abcdef";
+  os << "Packet{";
+  bool first = true;
+  for (auto&& frag : p.fragments()) {
+    if (!first) {
+      os << ", ";
+    }
+    first = false;
+    if (std::all_of(frag.base, frag.base + frag.size, [] (int c) { return c >= 9 && c <= 0x7f; })) {
+      os << '"';
+      for (auto cur = frag.base; cur != frag.base + frag.size; ++cur) {
+        auto c = *cur;
+        if (isprint(c)) {
+          os << c;
+        } else if (c == '\r') {
+          os << "\\r";
+        } else if (c == '\n') {
+          os << "\\n";
+        } else if (c == '\t') {
+          os << "\\t";
+        } else {
+          uint8_t b = c;
+          os << "\\x" << hex_digits[b >> 4] << hex_digits[b & 0xf];
+        }
+      }
+      os << '"';
+    } else {
+      os << "{";
+      bool nfirst = true;
+      for (auto cur = frag.base; cur != frag.base + frag.size; ++cur) {
+        if (!nfirst) {
+          os << " ";
+        }
+        nfirst = false;
+        uint8_t b = *cur;
+        os << hex_digits[b >> 4] << hex_digits[b & 0xf];
+      }
+      os << "}";
+    }
+  }
+  os << "}";
+  return os;
+}
diff --git a/src/msg/async/dpdk/Packet.h b/src/msg/async/dpdk/Packet.h
new file mode 100644
index 00000000..db9cd2a7
--- /dev/null
+++ b/src/msg/async/dpdk/Packet.h
@@ -0,0 +1,550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_H_
+#define CEPH_MSG_PACKET_H_
+
+#include <vector>
+#include <algorithm>
+#include <iosfwd>
+
+#include "include/types.h"
+#include "common/Tub.h"
+#include "common/deleter.h"
+#include "msg/async/Event.h"
+
+#include "const.h"
+
+// One contiguous piece of packet data: start address and length in bytes.
+// Plain aggregate with no destructor; it does not release `base` itself.
+struct fragment {
+    char* base;
+    size_t size;
+};
+
+// Per-packet offload metadata stored in Packet::impl.  It is copied along
+// with the packet by impl::copy() and Packet::share().
+// NOTE(review): the individual flags are presumably consumed by the device
+// and L3/L4 code outside this header — confirm their exact meaning there.
+struct offload_info {
+  ip_protocol_num protocol = ip_protocol_num::unused;
+  bool needs_csum = false;
+  // default header lengths in bytes
+  uint8_t ip_hdr_len = 20;
+  uint8_t tcp_hdr_len = 20;
+  uint8_t udp_hdr_len = 8;
+  bool needs_ip_csum = false;
+  bool reassembled = false;
+  uint16_t tso_seg_size = 0;
+  // HW stripped VLAN header (CPU order)
+  Tub<uint16_t> vlan_tci;
+};
+
+// Zero-copy friendly packet class
+//
+// For implementing zero-copy, we need a flexible destructor that can
+// destroy packet data in different ways: decrementing a reference count,
+// or calling a free()-like function.
+//
+// Moreover, we need different destructors for each set of fragments within
+// a single packet. For example, a header and trailer might need delete[]
+// to be called, while the internal data needs a reference count to be
+// released.  Matters are complicated in that fragments can be split
+// (due to virtual/physical translation).
+//
+// To implement this, we associate each packet with a single destructor,
+// but allow composing a packet from another packet plus a fragment to
+// be added, with its own destructor, causing the destructors to be chained.
+//
+// The downside is that the data needed for the destructor is duplicated,
+// if it is already available in the fragment itself.
+//
+// As an optimization, when we allocate small fragments, we allocate some
+// extra space, so prepending to the packet does not require extra
+// allocations.  This is useful when adding headers.
+//
+// Move-only packet made of a list of fragments plus one deleter that
+// releases whatever the fragments reference.  All state lives in a
+// heap-allocated `impl` owned through a unique_ptr.
+class Packet {
+  // enough for lots of headers, not quite two cache lines:
+  static constexpr size_t internal_data_size = 128 - 16;
+  static constexpr size_t default_nr_frags = 4;
+
+  // Minimal begin()/end()/operator[] view over a fragment array so it can be
+  // used in range-for loops.  It only stores two pointers.
+  struct pseudo_vector {
+    fragment* _start;
+    fragment* _finish;
+    pseudo_vector(fragment* start, size_t nr)
+        : _start(start), _finish(_start + nr) {}
+    fragment* begin() { return _start; }
+    fragment* end() { return _finish; }
+    fragment& operator[](size_t idx) { return _start[idx]; }
+  };
+
+  // Packet state.  `frags` is a flexible array member; the storage behind it
+  // is provided by the class-specific operator new below, so an impl must be
+  // created through allocate() or `new` and never as a local or by copy.
+  struct impl {
+    // when destroyed, virtual destructor will reclaim resources
+    deleter _deleter;
+    // total number of bytes across all fragments
+    unsigned _len = 0;
+    // number of entries of frags[] in use
+    uint16_t _nr_frags = 0;
+    // number of entries of frags[] that were allocated
+    uint16_t _allocated_frags;
+    offload_info _offload_info;
+    Tub<uint32_t> rss_hash;
+    char data[internal_data_size]; // only frags[0] may use
+    unsigned headroom = internal_data_size; // in data
+    // FIXME: share data/frags space
+
+    fragment frags[];
+
+    explicit impl(size_t nr_frags = default_nr_frags);
+    impl(const impl&) = delete;
+    impl(fragment frag, size_t nr_frags = default_nr_frags);
+
+    pseudo_vector fragments() { return { frags, _nr_frags }; }
+
+    // Allocates an empty impl with room for max(nr_frags, default_nr_frags)
+    // fragments.
+    static std::unique_ptr<impl> allocate(size_t nr_frags) {
+      nr_frags = std::max(nr_frags, default_nr_frags);
+      return std::unique_ptr<impl>(new (nr_frags) impl(nr_frags));
+    }
+
+    // Builds a new impl with room for `nr` fragments that takes over the
+    // contents of `old`.  The deleter is moved out of `old`; the fragment
+    // descriptors and, if in use, the internal data bytes are copied.
+    static std::unique_ptr<impl> copy(impl* old, size_t nr) {
+      auto n = allocate(nr);
+      n->_deleter = std::move(old->_deleter);
+      n->_len = old->_len;
+      n->_nr_frags = old->_nr_frags;
+      n->headroom = old->headroom;
+      n->_offload_info = old->_offload_info;
+      // NOTE(review): construct() is handed the Tub itself rather than the
+      // value it holds — confirm against common/Tub.h that this copies the
+      // hash (and its empty state) and does not go through a bool conversion.
+      n->rss_hash.construct(old->rss_hash);
+      std::copy(old->frags, old->frags + old->_nr_frags, n->frags);
+      // re-point frags[0] into the new impl's own data[] when needed
+      old->copy_internal_fragment_to(n.get());
+      return std::move(n);
+    }
+
+    // Same as above, keeping the current number of fragments.
+    static std::unique_ptr<impl> copy(impl* old) {
+      return copy(old, old->_nr_frags);
+    }
+
+    // Returns `old` unchanged when it already has room for `extra_frags`
+    // more fragments; otherwise returns a copy with room for at least twice
+    // the fragments currently in use.
+    static std::unique_ptr<impl> allocate_if_needed(std::unique_ptr<impl> old, size_t extra_frags) {
+      if (old->_allocated_frags >= old->_nr_frags + extra_frags) {
+        return std::move(old);
+      }
+      return copy(old.get(), std::max<size_t>(old->_nr_frags + extra_frags, 2 * old->_nr_frags));
+    }
+    // Allocates sizeof(impl) plus storage for `nr_frags` fragment entries.
+    // nr_frags must fit in a uint16_t.
+    void* operator new(size_t size, size_t nr_frags = default_nr_frags) {
+      ceph_assert(nr_frags == uint16_t(nr_frags));
+      return ::operator new(size + nr_frags * sizeof(fragment));
+    }
+    // Matching the operator new above
+    void operator delete(void* ptr, size_t nr_frags) {
+      return ::operator delete(ptr);
+    }
+    // Since the above "placement delete" hides the global one, expose it
+    void operator delete(void* ptr) {
+      return ::operator delete(ptr);
+    }
+
+    // True when there is at least one fragment and frags[0].base points
+    // inside data[].
+    bool using_internal_data() const {
+      return _nr_frags
+              && frags[0].base >= data
+              && frags[0].base < data + internal_data_size;
+    }
+
+    // If frags[0] lives in data[], copies it to a malloc()ed buffer, points
+    // frags[0] at that buffer, appends a free-deleter for it and marks
+    // data[] as entirely free again.  Throws std::bad_alloc when malloc()
+    // fails.
+    void unuse_internal_data() {
+      if (!using_internal_data()) {
+        return;
+      }
+      auto buf = static_cast<char*>(::malloc(frags[0].size));
+      if (!buf) {
+        throw std::bad_alloc();
+      }
+      deleter d = make_free_deleter(buf);
+      std::copy(frags[0].base, frags[0].base + frags[0].size, buf);
+      frags[0].base = buf;
+      _deleter.append(std::move(d));
+      headroom = internal_data_size;
+    }
+    // If frags[0] lives in this impl's data[], copies its bytes into
+    // to->data at offset `headroom` (this impl's value) and points
+    // to->frags[0] there.  to->frags[0].size is not touched.
+    void copy_internal_fragment_to(impl* to) {
+      if (!using_internal_data()) {
+        return;
+      }
+      to->frags[0].base = to->data + headroom;
+      std::copy(frags[0].base, frags[0].base + frags[0].size,
+              to->frags[0].base);
+    }
+  };
+  explicit Packet(std::unique_ptr<impl>&& impl) : _impl(std::move(impl)) {}
+  std::unique_ptr<impl> _impl;
+public:
+  // Wraps `data` without copying it and with an empty deleter.
+  static Packet from_static_data(const char* data, size_t len) {
+    return {fragment{const_cast<char*>(data), len}, deleter()};
+  }
+
+  // build empty Packet
+  Packet();
+  // build empty Packet with nr_frags allocated
+  explicit Packet(size_t nr_frags);
+  // move existing Packet
+  Packet(Packet&& x) noexcept;
+  // copy data into Packet
+  Packet(const char* data, size_t len);
+  // copy data into Packet
+  explicit Packet(fragment frag);
+  // zero-copy single fragment
+  Packet(fragment frag, deleter del);
+  // zero-copy multiple fragments
+  Packet(std::vector<fragment> frag, deleter del);
+  // build Packet with iterator
+  template <typename Iterator>
+  Packet(Iterator begin, Iterator end, deleter del);
+  // append fragment (copying new fragment)
+  Packet(Packet&& x, fragment frag);
+  // prepend fragment (copying new fragment, with header optimization)
+  Packet(fragment frag, Packet&& x);
+  // prepend fragment (zero-copy)
+  Packet(fragment frag, deleter del, Packet&& x);
+  // append fragment (zero-copy)
+  Packet(Packet&& x, fragment frag, deleter d);
+  // append deleter
+  Packet(Packet&& x, deleter d);
+
+  // Move assignment: destroys the current contents, then move-constructs
+  // from `x` in place.  Self-assignment is a no-op.
+  Packet& operator=(Packet&& x) {
+    if (this != &x) {
+      this->~Packet();
+      new (this) Packet(std::move(x));
+    }
+    return *this;
+  }
+
+  // total payload bytes across all fragments
+  unsigned len() const { return _impl->_len; }
+  // len() plus sizeof(Packet::impl)
+  unsigned memory() const { return len() +  sizeof(Packet::impl); }
+
+  // Unchecked access to fragment `idx` (copy / reference).
+  fragment frag(unsigned idx) const { return _impl->frags[idx]; }
+  fragment& frag(unsigned idx) { return _impl->frags[idx]; }
+
+  unsigned nr_frags() const { return _impl->_nr_frags; }
+  pseudo_vector fragments() const { return { _impl->frags, _impl->_nr_frags }; }
+  fragment* fragment_array() const { return _impl->frags; }
+
+  // share Packet data (reference counted, non COW)
+  Packet share();
+  Packet share(size_t offset, size_t len);
+
+  void append(Packet&& p);
+
+  void trim_front(size_t how_much);
+  void trim_back(size_t how_much);
+
+  // get a header pointer, linearizing if necessary
+  template <typename Header>
+  Header* get_header(size_t offset = 0);
+
+  // get a header pointer, linearizing if necessary
+  char* get_header(size_t offset, size_t size);
+
+  // prepend a header (default-initializing it)
+  template <typename Header>
+  Header* prepend_header(size_t extra_size = 0);
+
+  // prepend a header (uninitialized!)
+  char* prepend_uninitialized_header(size_t size);
+
+  Packet free_on_cpu(EventCenter *c, std::function<void()> cb = []{});
+
+  // Collapses the whole packet into a single contiguous fragment.
+  void linearize() { return linearize(0, len()); }
+
+  // Releases the impl; the Packet must not be used afterwards except to be
+  // assigned to or destroyed.
+  void reset() { _impl.reset(); }
+
+  // Makes sure there is room for n_frags fragments in total.
+  void reserve(int n_frags) {
+    if (n_frags > _impl->_nr_frags) {
+      auto extra = n_frags - _impl->_nr_frags;
+      _impl = impl::allocate_if_needed(std::move(_impl), extra);
+    }
+  }
+  // Returns a copy of the stored hash holder.
+  Tub<uint32_t> rss_hash() {
+    return _impl->rss_hash;
+  }
+  void set_rss_hash(uint32_t hash) {
+    _impl->rss_hash.construct(hash);
+  }
+private:
+  void linearize(size_t at_frag, size_t desired_size);
+  bool allocate_headroom(size_t size);
+public:
+  // `class offload_info` is spelled out because the member functions below
+  // share the type's name.
+  class offload_info offload_info() const { return _impl->_offload_info; }
+  class offload_info& offload_info_ref() { return _impl->_offload_info; }
+  void set_offload_info(class offload_info oi) { _impl->_offload_info = oi; }
+};
+
+std::ostream& operator<<(std::ostream& os, const Packet& p);
+
+// Move constructor: takes over x's impl, leaving x._impl null.
+inline Packet::Packet(Packet&& x) noexcept
+    : _impl(std::move(x._impl)) {
+}
+
+// Empty impl; records how many fragment slots were allocated for it.
+// nr_frags is narrowed to uint16_t here (operator new asserts that it fits).
+inline Packet::impl::impl(size_t nr_frags)
+    : _len(0), _allocated_frags(nr_frags) {
+}
+
+// Builds an impl holding a private copy of `frag` as its only fragment.
+// Data of at most internal_data_size bytes is stored at the end of data[]
+// (consuming headroom); larger data goes to a malloc()ed buffer that is
+// released through the deleter.  Throws std::bad_alloc if malloc() fails.
+inline Packet::impl::impl(fragment frag, size_t nr_frags)
+    : _len(frag.size), _allocated_frags(nr_frags) {
+    ceph_assert(_allocated_frags > _nr_frags);
+  if (frag.size <= internal_data_size) {
+    headroom -= frag.size;
+    frags[0] = { data + headroom, frag.size };
+  } else {
+    auto buf = static_cast<char*>(::malloc(frag.size));
+    if (!buf) {
+      throw std::bad_alloc();
+    }
+    deleter d = make_free_deleter(buf);
+    frags[0] = { buf, frag.size };
+    _deleter.append(std::move(d));
+  }
+  // frags[0].base now points at the chosen storage; fill it
+  std::copy(frag.base, frag.base + frag.size, frags[0].base);
+  ++_nr_frags;
+}
+
+// Empty packet (impl::allocate() rounds the capacity up to default_nr_frags).
+inline Packet::Packet(): _impl(impl::allocate(1)) {
+}
+
+// Empty packet with room for at least nr_frags fragments.
+inline Packet::Packet(size_t nr_frags): _impl(impl::allocate(nr_frags)) {
+}
+
+// Packet holding a private copy of `frag` (see impl::impl(fragment, size_t)).
+inline Packet::Packet(fragment frag): _impl(new impl(frag)) {
+}
+
+// Packet holding a private copy of `size` bytes at `data`.  The const_cast
+// only adapts to fragment's non-const pointer; the source is read, not
+// written, by the delegated constructor.
+inline Packet::Packet(const char* data, size_t size):
+    Packet(fragment{const_cast<char*>(data), size}) {
+}
+
+// Zero-copy packet with a single fragment: `frag` is referenced as is and
+// `d` becomes the packet's deleter.
+inline Packet::Packet(fragment frag, deleter d)
+    : _impl(impl::allocate(1)) {
+  impl& state = *_impl;
+  state._deleter = std::move(d);
+  state.frags[state._nr_frags] = frag;
+  ++state._nr_frags;
+  state._len = frag.size;
+}
+
+// Zero-copy packet over several fragments: the descriptors in `frag` are
+// copied into the packet, the bytes they reference are not.  `d` becomes the
+// packet's deleter.
+inline Packet::Packet(std::vector<fragment> frag, deleter d)
+    : _impl(impl::allocate(frag.size())) {
+  _impl->_deleter = std::move(d);
+  _impl->_nr_frags = frag.size();
+  _impl->_len = 0;
+  for (size_t i = 0; i < frag.size(); ++i) {
+    _impl->frags[i] = frag[i];
+    _impl->_len += frag[i].size;
+  }
+}
+
+// Zero-copy packet over the fragment descriptors in [begin, end); `del`
+// becomes the packet's deleter.
+template <typename Iterator>
+inline Packet::Packet(Iterator begin, Iterator end, deleter del) {
+  unsigned frag_count = std::distance(begin, end);
+  unsigned total_len = 0;
+  for (Iterator it = begin; it != end; ++it) {
+    fragment& f = *it;
+    total_len += f.size;
+  }
+  _impl = impl::allocate(frag_count);
+  _impl->_deleter = std::move(del);
+  _impl->_len = total_len;
+  _impl->_nr_frags = frag_count;
+  std::copy(begin, end, _impl->frags);
+}
+
+// Takes over `x` and appends a private copy of `frag` as the last fragment.
+// The copy lives in a new[] buffer that is freed by a deleter chained onto
+// the existing one.
+inline Packet::Packet(Packet&& x, fragment frag)
+    : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+  _impl->_len += frag.size;
+  char* buf = new char[frag.size];
+  std::copy(frag.base, frag.base + frag.size, buf);
+  _impl->frags[_impl->_nr_frags++] = {buf, frag.size};
+  _impl->_deleter = make_deleter(std::move(_impl->_deleter), [buf] {
+    delete[] buf;
+  });
+}
+
+// Tries to reserve `size` bytes directly in front of frags[0] inside the
+// impl's internal buffer.  On success the packet length and frags[0] are
+// grown by `size` and true is returned; the new bytes are left
+// uninitialized.  Returns false, changing nothing, when headroom < size.
+inline bool Packet::allocate_headroom(size_t size) {
+  if (_impl->headroom >= size) {
+    _impl->_len += size;
+    if (!_impl->using_internal_data()) {
+      // frags[0] is external: shift every fragment up by one slot and insert
+      // an empty fragment at the very end of data[] to grow backwards from
+      _impl = impl::allocate_if_needed(std::move(_impl), 1);
+      std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags,
+              _impl->frags + _impl->_nr_frags + 1);
+      _impl->frags[0] = { _impl->data + internal_data_size, 0 };
+      ++_impl->_nr_frags;
+    }
+    _impl->headroom -= size;
+    _impl->frags[0].base -= size;
+    _impl->frags[0].size += size;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+
+// Takes over `x` and prepends a private copy of `frag`.  The copy is placed
+// in the internal headroom when it fits; otherwise it goes to a new[] buffer
+// inserted as a new first fragment and freed through the deleter chain.
+inline Packet::Packet(fragment frag, Packet&& x)
+    : _impl(std::move(x._impl)) {
+  // try to prepend into existing internal fragment
+  if (allocate_headroom(frag.size)) {
+    std::copy(frag.base, frag.base + frag.size, _impl->frags[0].base);
+    return;
+  } else {
+    // didn't work out, allocate and copy
+    _impl->unuse_internal_data();
+    _impl = impl::allocate_if_needed(std::move(_impl), 1);
+    _impl->_len += frag.size;
+    char *buf = new char[frag.size];
+    std::copy(frag.base, frag.base + frag.size, buf);
+    // shift the existing fragments up by one to free slot 0
+    std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags,
+            _impl->frags + _impl->_nr_frags + 1);
+    ++_impl->_nr_frags;
+    _impl->frags[0] = {buf, frag.size};
+    _impl->_deleter = make_deleter(
+            std::move(_impl->_deleter), [buf] { delete []buf; });
+  }
+}
+
+// Takes over `x` and appends `frag` without copying its bytes.  The packet's
+// previous deleter is appended to `d`, and `d` becomes the packet's deleter.
+inline Packet::Packet(Packet&& x, fragment frag, deleter d)
+    : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+  _impl->_len += frag.size;
+  _impl->frags[_impl->_nr_frags++] = frag;
+  d.append(std::move(_impl->_deleter));
+  _impl->_deleter = std::move(d);
+}
+
+// Takes over `x` and appends `d` to its deleter; the fragments are unchanged.
+inline Packet::Packet(Packet&& x, deleter d): _impl(std::move(x._impl)) {
+  _impl->_deleter.append(std::move(d));
+}
+
+// Appends the fragments of `p` after this packet's own, without copying the
+// referenced bytes (except that p's internal data, if any, is first moved to
+// a heap buffer).  An empty packet (_len == 0) simply becomes `p`.
+inline void Packet::append(Packet&& p) {
+  if (!_impl->_len) {
+    *this = std::move(p);
+    return;
+  }
+  _impl = impl::allocate_if_needed(std::move(_impl), p._impl->_nr_frags);
+  _impl->_len += p._impl->_len;
+  // after this, p's frags[0] no longer points into p's impl
+  p._impl->unuse_internal_data();
+  std::copy(p._impl->frags, p._impl->frags + p._impl->_nr_frags,
+            _impl->frags + _impl->_nr_frags);
+  _impl->_nr_frags += p._impl->_nr_frags;
+  // chain both deleters: p's deleter gets ours appended and becomes ours
+  p._impl->_deleter.append(std::move(_impl->_deleter));
+  _impl->_deleter = std::move(p._impl->_deleter);
+}
+
+// Returns a pointer to `size` contiguous bytes located `offset` bytes into
+// the packet, or nullptr when the packet is too short.  If the requested
+// range straddles fragments, the fragments involved are first merged into
+// one via linearize().
+inline char* Packet::get_header(size_t offset, size_t size) {
+  if (offset + size > _impl->_len) {
+    return nullptr;
+  }
+  // find the fragment containing `offset`, making `offset` relative to it
+  size_t idx = 0;
+  for (; idx != _impl->_nr_frags; ++idx) {
+    const size_t frag_size = _impl->frags[idx].size;
+    if (offset < frag_size) {
+      break;
+    }
+    offset -= frag_size;
+  }
+  if (idx == _impl->_nr_frags) {
+    return nullptr;
+  }
+  const bool spans_fragments = offset + size > _impl->frags[idx].size;
+  if (spans_fragments) {
+    linearize(idx, offset + size);
+  }
+  return _impl->frags[idx].base + offset;
+}
+
+// Typed wrapper around get_header(offset, sizeof(Header)); yields nullptr
+// when the untyped call does.
+template <typename Header>
+inline Header* Packet::get_header(size_t offset) {
+  return reinterpret_cast<Header*>(get_header(offset, sizeof(Header)));
+}
+
+// Drops `how_much` bytes from the start of the packet.  Asserts that
+// how_much does not exceed the packet length.
+inline void Packet::trim_front(size_t how_much) {
+  ceph_assert(how_much <= _impl->_len);
+  _impl->_len -= how_much;
+  size_t i = 0;
+  // skip the leading fragments that are removed entirely
+  while (how_much && how_much >= _impl->frags[i].size) {
+    how_much -= _impl->frags[i++].size;
+  }
+  // close the gap left by the removed fragments
+  std::copy(_impl->frags + i, _impl->frags + _impl->_nr_frags, _impl->frags);
+  _impl->_nr_frags -= i;
+  if (!_impl->using_internal_data()) {
+    // data[] is unused by the remaining fragments: all of it is headroom
+    _impl->headroom = internal_data_size;
+  }
+  if (how_much) {
+    // partial trim of the new first fragment
+    if (_impl->using_internal_data()) {
+      _impl->headroom += how_much;
+    }
+    _impl->frags[0].base += how_much;
+    _impl->frags[0].size -= how_much;
+  }
+}
+
+// Drops `how_much` bytes from the end of the packet.  Asserts that how_much
+// does not exceed the packet length.
+inline void Packet::trim_back(size_t how_much) {
+  ceph_assert(how_much <= _impl->_len);
+  _impl->_len -= how_much;
+  // Index of the last fragment.  With no fragments, or when every fragment
+  // is consumed below, `i` wraps around and `i + 1` yields 0 again.
+  size_t i = _impl->_nr_frags - 1;
+  // drop the trailing fragments that are removed entirely
+  while (how_much && how_much >= _impl->frags[i].size) {
+    how_much -= _impl->frags[i--].size;
+  }
+  _impl->_nr_frags = i + 1;
+  if (how_much) {
+    // Partial trim of the last remaining fragment.  headroom is left alone
+    // on purpose: it counts the free bytes in front of frags[0].base, and
+    // cutting bytes off the tail frees nothing there.  Increasing it here
+    // would let a later allocate_headroom() move frags[0].base below the
+    // start of impl::data.
+    _impl->frags[i].size -= how_much;
+  }
+}
+
+// Prepends sizeof(Header) + extra_size bytes and value-initializes a Header
+// at the start of that space with placement new.  The extra bytes are left
+// uninitialized.
+template <typename Header>
+Header* Packet::prepend_header(size_t extra_size) {
+  auto h = prepend_uninitialized_header(sizeof(Header) + extra_size);
+  return new (h) Header{};
+}
+
+// Prepends `size` uninitialized bytes to the packet and returns a pointer to
+// them (the start of frags[0]).  The internal headroom is used when
+// possible; otherwise a new[] buffer is inserted as a new first fragment and
+// freed through the deleter chain.
+inline char* Packet::prepend_uninitialized_header(size_t size) {
+  if (!allocate_headroom(size)) {
+    // didn't work out, allocate and copy
+    _impl->unuse_internal_data();
+    // try again, after unuse_internal_data we may have space after all
+    if (!allocate_headroom(size)) {
+      // failed
+      _impl->_len += size;
+      _impl = impl::allocate_if_needed(std::move(_impl), 1);
+      char *buf = new char[size];
+      // shift the existing fragments up by one to free slot 0
+      std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags,
+              _impl->frags + _impl->_nr_frags + 1);
+      ++_impl->_nr_frags;
+      _impl->frags[0] = {buf, size};
+      _impl->_deleter = make_deleter(std::move(_impl->_deleter),
+              [buf] { delete []buf; });
+    }
+  }
+  return _impl->frags[0].base;
+}
+
+// Shares the whole packet; see share(offset, len).
+inline Packet Packet::share() {
+    return share(0, _impl->_len);
+}
+
+// Returns a new packet referring to `len` bytes of this packet starting at
+// byte `offset`.  No payload is copied: the new fragments point into this
+// packet's buffers and the new packet's deleter comes from
+// _deleter.share().  The offload info is copied as well.
+//
+// The fragment walks below are not bound-checked, so the caller must keep
+// offset + len within this packet's length.
+inline Packet Packet::share(size_t offset, size_t len) {
+  _impl->unuse_internal_data(); // FIXME: eliminate?
+  Packet n;
+  n._impl = impl::allocate_if_needed(std::move(n._impl), _impl->_nr_frags);
+  size_t idx = 0;
+  // skip the fragments that lie entirely before `offset`
+  while (offset > 0 && offset >= _impl->frags[idx].size) {
+    offset -= _impl->frags[idx++].size;
+  }
+  // reference fragments until `len` bytes are covered; only the first one
+  // can start at a non-zero offset
+  while (n._impl->_len < len) {
+    auto& f = _impl->frags[idx++];
+    auto fsize = std::min(len - n._impl->_len, f.size - offset);
+    n._impl->frags[n._impl->_nr_frags++] = { f.base + offset, fsize };
+    n._impl->_len += fsize;
+    offset = 0;
+  }
+  n._impl->_offload_info = _impl->_offload_info;
+  ceph_assert(!n._impl->_deleter);
+  n._impl->_deleter = _impl->_deleter.share();
+  return n;
+}
+
+#endif /* CEPH_MSG_PACKET_H_ */
diff --git a/src/msg/async/dpdk/PacketUtil.h b/src/msg/async/dpdk/PacketUtil.h
new file mode 100644
index 00000000..118218e6
--- /dev/null
+++ b/src/msg/async/dpdk/PacketUtil.h
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_UTIL_H_
+#define CEPH_MSG_PACKET_UTIL_H_
+
+#include <map>
+#include <iostream>
+
+#include "Packet.h"
+
+// Collects packets keyed by their starting offset and merges overlapping or
+// touching ones into larger segments.
+//
+// `map` holds the current segments ordered by start offset.  `Tag` is not
+// used in the class body; it only makes different instantiations distinct
+// types, each with its own per-thread linearization counter.
+template <typename Offset, typename Tag>
+class packet_merger {
+ private:
+  // Per-thread counter, zero-initialized, of the linearize() calls made by
+  // merge().
+  static uint64_t& linearizations_ref() {
+    static thread_local uint64_t linearization_count;
+    return linearization_count;
+  }
+ public:
+  std::map<Offset, Packet> map;
+
+  // Current value of the calling thread's linearization counter.
+  static uint64_t linearizations() {
+    return linearizations_ref();
+  }
+
+  // Adds packet `p`, which covers [offset, offset + p.len()), to the map.
+  //
+  // Pass 1 looks for one existing segment the packet overlaps and either
+  // drops the packet, replaces the segment, or joins the two.  Pass 2 then
+  // walks neighbouring segments and joins any that have become contiguous
+  // or overlapping.  Aborts the process if pass 2 meets a pair of segments
+  // it cannot classify.
+  void merge(Offset offset, Packet p) {
+    bool insert = true;
+    auto beg = offset;
+    auto end = beg + p.len();
+    // First, try to merge the packet with existing segment
+    for (auto it = map.begin(); it != map.end();) {
+      auto& seg_pkt = it->second;
+      auto seg_beg = it->first;
+      auto seg_end = seg_beg + seg_pkt.len();
+      // There are 6 cases:
+      if (seg_beg <= beg && end <= seg_end) {
+        // 1) seg_beg beg end seg_end
+        // We already have data in this packet
+        return;
+      } else if (beg <= seg_beg && seg_end <= end) {
+        // 2) beg seg_beg seg_end end
+        // The new segment contains more data than this old segment
+        // Delete the old one, insert the new one
+        it = map.erase(it);
+        insert = true;
+        break;
+      } else if (beg < seg_beg && seg_beg <= end && end <= seg_end) {
+        // 3) beg seg_beg end seg_end
+        // Merge two segments, trim front of old segment
+        auto trim = end - seg_beg;
+        seg_pkt.trim_front(trim);
+        p.append(std::move(seg_pkt));
+        // Delete the old one, insert the new one
+        it = map.erase(it);
+        insert = true;
+        break;
+      } else if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+        // 4) seg_beg beg seg_end end
+        // Merge two segments, trim front of new segment
+        auto trim = seg_end - beg;
+        p.trim_front(trim);
+        // Append new data to the old segment, keep the old segment
+        seg_pkt.append(std::move(p));
+        seg_pkt.linearize();
+        ++linearizations_ref();
+        insert = false;
+        break;
+      } else {
+        // 5) beg end < seg_beg seg_end
+        //   or
+        // 6) seg_beg seg_end < beg end
+        // Can not merge with this segment, keep looking
+        it++;
+        insert = true;
+      }
+    }
+
+    if (insert) {
+      // the packet (possibly extended in case 3) becomes a segment of its own
+      p.linearize();
+      ++linearizations_ref();
+      map.emplace(beg, std::move(p));
+    }
+
+    // Second, merge adjacent segments after this packet has been merged,
+    // because this packet might fill a "hole" and make two adjacent
+    // segments mergeable
+    for (auto it = map.begin(); it != map.end();) {
+      // The first segment
+      auto& seg_pkt = it->second;
+      auto seg_beg = it->first;
+      auto seg_end = seg_beg + seg_pkt.len();
+
+      // The second segment
+      auto it_next = it;
+      it_next++;
+      if (it_next == map.end()) {
+        break;
+      }
+      // these locals shadow the parameter `p` and the outer beg/end
+      auto& p = it_next->second;
+      auto beg = it_next->first;
+      auto end = beg + p.len();
+
+      // Merge the second segment into first segment if possible
+      if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+        // Merge two segments, trim front of second segment
+        auto trim = seg_end - beg;
+        p.trim_front(trim);
+        // Append new data to the first segment, keep the first segment
+        seg_pkt.append(std::move(p));
+
+        // Delete the second segment
+        map.erase(it_next);
+
+        // Keep merging this first segment with its new next packet
+        // So we do not update the iterator: it
+        continue;
+      } else if (end <= seg_end) {
+        // The first segment has all the data in the second segment
+        // Delete the second segment
+        map.erase(it_next);
+        continue;
+      } else if (seg_end < beg) {
+        // Can not merge first segment with second segment
+        it = it_next;
+        continue;
+      } else {
+        // If we reach here, we have a bug with merge.
+        std::cout << "packet_merger: merge error\n";
+        abort();
+      }
+    }
+  }
+};
+
+#endif
diff --git a/src/msg/async/dpdk/TCP-Stack.h b/src/msg/async/dpdk/TCP-Stack.h
new file mode 100644
index 00000000..996ae93c
--- /dev/null
+++ b/src/msg/async/dpdk/TCP-Stack.h
@@ -0,0 +1,40 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+// tcp/network-stack integration
+
+#ifndef CEPH_MSG_DPDK_TCP_STACK_H
+#define CEPH_MSG_DPDK_TCP_STACK_H
+
+class ServerSocket;
+class ConnectedSocket;
+
+class ipv4_traits;
+template <typename InetTraits>
+class tcp;
+
+int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
+                 int type, ServerSocket *sa);
+
+int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
+                  ConnectedSocket *sa);
+
+#endif
diff --git a/src/msg/async/dpdk/TCP.cc b/src/msg/async/dpdk/TCP.cc
new file mode 100644
index 00000000..c6397709
--- /dev/null
+++ b/src/msg/async/dpdk/TCP.cc
@@ -0,0 +1,840 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "align.h"
+#include "TCP.h"
+#include "IP.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "tcp "
+
+void tcp_option::parse(uint8_t* beg, uint8_t* end)
+{
+  // Walk the TCP option bytes in [beg, end) and record the options we
+  // understand.  A malformed option list stops parsing; it never reads
+  // outside the buffer.
+  while (beg < end) {
+    auto kind = option_kind(*beg);
+    uint8_t len = 0;
+    if (kind != option_kind::nop && kind != option_kind::eol) {
+      // Every other option is <kind><len>...: the length byte itself must be
+      // inside the buffer, cover at least kind+len, and fit in what is left.
+      if (end - beg < 2 || (len = beg[1]) < 2 || len > end - beg) {
+        return;
+      }
+    }
+    switch (kind) {
+      case option_kind::mss:
+        if (len != uint8_t(option_len::mss)) return;
+        _mss_received = true;
+        _remote_mss = ntoh(reinterpret_cast<mss*>(beg)->mss);
+        break;
+      case option_kind::win_scale:
+        if (len != uint8_t(option_len::win_scale)) return;
+        _win_scale_received = true;
+        _remote_win_scale = reinterpret_cast<win_scale*>(beg)->shift;
+        // We can turn on win_scale option, 7 is Linux's default win scale size
+        _local_win_scale = 7;
+        break;
+      case option_kind::sack:
+        if (len != uint8_t(option_len::sack)) return;
+        _sack_received = true;
+        break;
+      case option_kind::nop:
+        len = uint8_t(option_len::nop);
+        break;
+      case option_kind::eol:
+        return;
+      default:
+        // Ignore options we do not understand; len >= 2 here, so the loop
+        // always makes progress.
+        break;
+    }
+    beg += len;
+  }
+}
+
+uint8_t tcp_option::fill(tcp_hdr* th, uint8_t options_size)
+{
+  // Serialise our options right behind the fixed header at th and return the
+  // number of option bytes written, which must equal options_size.
+  uint8_t* cursor = reinterpret_cast<uint8_t*>(th) + sizeof(tcp_hdr);
+  uint8_t written = 0;
+  const bool is_syn = th->f_syn;
+  // On a plain SYN we offer every option; on a SYN+ACK only those the peer sent.
+  const bool offer_all = !th->f_ack;
+
+  if (is_syn && (_mss_received || offer_all)) {
+    auto opt = new (cursor) tcp_option::mss;
+    opt->mss = _local_mss;
+    cursor += opt->len;
+    written += opt->len;
+    *opt = opt->hton();
+  }
+  if (is_syn && (_win_scale_received || offer_all)) {
+    auto opt = new (cursor) tcp_option::win_scale;
+    opt->shift = _local_win_scale;
+    cursor += opt->len;
+    written += opt->len;
+  }
+
+  if (written > 0) {
+    // NOP-pad so that, together with the closing EOL, the options end on an align boundary
+    const auto padded = align_up(uint8_t(written + 1), tcp_option::align);
+    for (; written < padded - uint8_t(option_len::eol); written += option_len::nop) {
+      new (cursor) tcp_option::nop;
+      cursor += option_len::nop;
+    }
+    new (cursor) tcp_option::eol;
+    written += option_len::eol;
+  }
+  ceph_assert(written == options_size);
+
+  return written;
+}
+
+uint8_t tcp_option::get_size(bool syn_on, bool ack_on)
+{
+  // Bytes of options for a segment with these flags (same rules as fill()): none
+  // without SYN, else MSS and/or window scale plus EOL, rounded up to tcp_option::align.
+  uint8_t total = 0;
+  const bool offer_all = !ack_on;
+  if (syn_on && (_mss_received || offer_all)) {
+    total += option_len::mss;
+  }
+  if (syn_on && (_win_scale_received || offer_all)) {
+    total += option_len::win_scale;
+  }
+  if (total == 0) {
+    return total;
+  }
+  total += option_len::eol;
+  return align_up(total, tcp_option::align);
+}
+
+ipv4_tcp::ipv4_tcp(ipv4& inet, EventCenter *c)  // builds the tcp<ipv4_traits> instance held in _tcp
+    : _inet_l4(inet), _tcp(std::unique_ptr<tcp<ipv4_traits>>(new tcp<ipv4_traits>(inet.cct, _inet_l4, c)))
+{ }
+
+ipv4_tcp::~ipv4_tcp() { }  // empty body: _tcp was set up as a std::unique_ptr in the constructor
+
+void ipv4_tcp::received(Packet p, ipv4_address from, ipv4_address to)
+{
+  _tcp->received(std::move(p), from, to);  // pure delegation: p is moved into the tcp<ipv4_traits> held in _tcp
+}
+
+bool ipv4_tcp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  return _tcp->forward(out_hash_data, p, off);  // pure delegation: the result of tcp::forward is returned unchanged
+}
+
+int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
+                 int type, ServerSocket *sock)
+{
+  std::unique_ptr<DPDKServerSocketImpl<tcp<ipv4_traits>>> impl(  // sole owner until handed to *sock
+      new DPDKServerSocketImpl<tcp<ipv4_traits>>(tcpv4, port, opts, type));
+  const int r = impl->listen();
+  if (r < 0) {
+    return r;  // negative result of listen(); impl is destroyed, *sock is left untouched
+  }
+  *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(std::move(impl)));
+  return 0;
+}
+
+int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
+                  ConnectedSocket *sock)
+{
+  auto pending = tcpv4.connect(addr);  // connection object for the attempt towards addr
+  std::unique_ptr<ConnectedSocketImpl> impl(new NativeConnectedSocketImpl<tcp<ipv4_traits>>(std::move(pending)));
+  *sock = ConnectedSocket(std::move(impl));
+  return 0;  // unconditionally reports success
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip)
+{
+  // Answer the offending segment rth with a bare RST from local_ip to
+  // foreign_ip, built from rth alone and sent via send_packet_without_tcb().
+  ldout(cct, 20) << __func__ << " tcp header rst=" << bool(rth->f_rst) << " fin=" << bool(rth->f_fin)
+                 << " syn=" << bool(rth->f_syn) << dendl;
+  // Never reply to a reset with another reset.
+  if (rth->f_rst)
+    return;
+  Packet rst_pkt;
+  auto hdr = rst_pkt.prepend_header<tcp_hdr>();
+  // Swap the port pair so the reply travels back to the sender.
+  hdr->src_port = rth->dst_port;
+  hdr->dst_port = rth->src_port;
+  if (rth->f_ack)
+    hdr->seq = rth->ack;
+  if (rth->f_syn) {
+    // The reset answers a SYN: acknowledge the peer's ISN.
+    hdr->ack = rth->seq + 1;
+    hdr->f_ack = true;
+  }
+  hdr->f_rst = true;
+  hdr->data_offset = sizeof(*hdr) / 4;
+  hdr->checksum = 0;
+  *hdr = hdr->hton();
+
+  // With tx_csum_l4_offload only the inverted pseudo-header sum is stored and
+  // needs_csum is flagged; otherwise the whole packet is summed right here.
+  checksummer csum;
+  InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*hdr));
+  const bool hw_csum = get_hw_features().tx_csum_l4_offload;
+  if (!hw_csum)
+    csum.sum(rst_pkt);
+  hdr->checksum = hw_csum ? ~csum.get() : csum.get();
+
+  offload_info offload;
+  offload.needs_csum = hw_csum;
+  offload.protocol = ip_protocol_num::tcp;
+  offload.tcp_hdr_len = sizeof(tcp_hdr);
+  rst_pkt.set_offload_info(offload);
+  send_packet_without_tcb(local_ip, foreign_ip, std::move(rst_pkt));
+}
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+template<typename InetTraits>
+ostream& tcp<InetTraits>::tcb::_prefix(std::ostream *_dout) {  // log prefix: both endpoints, then this tcb's fd and state
+  *_dout << "tcp " << _local_ip << ":" << _local_port << " -> " << _foreign_ip << ":" << _foreign_port;
+  return *_dout << " tcb(" << this << " fd=" << fd << " s=" << _state << ").";
+}
+
+template<typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, Packet p)
+{
+  // Handles the segment th/p that moves this TCB from LISTEN to SYN_RECEIVED.
+  // First find the option bytes behind the fixed header, then drop the header.
+  auto hdr_bytes = th->data_offset * 4;
+  auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, hdr_bytes)) + sizeof(tcp_hdr);
+  auto opt_end = opt_start + (hdr_bytes - sizeof(tcp_hdr));
+  p.trim_front(hdr_bytes);
+
+  // IRS is set to SEG.SEQ, RCV.NXT to SEG.SEQ+1
+  tcp_sequence irs = th->seq;
+  _rcv.initial = irs;
+  _rcv.next = irs + 1;
+
+  // Pick ISS for the reply <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>; SND.UNA is
+  // set to ISS and SND.NXT to ISS+1.
+  // NOTE: earlier code advanced _snd.next to ISS + 1 only once the SYN was
+  // ACKed.  It is now advanced here, and output_one() uses
+  //     th->seq = syn_on ? _snd.initial : _snd.next
+  // so that a retransmitted SYN still carries the right SEQ number.
+  do_setup_isn();
+
+  _rcv.urgent = _rcv.next;
+
+  ldout(_tcp.cct, 10) << __func__ << " listen: LISTEN -> SYN_RECEIVED" << dendl;
+  init_from_options(th, opt_start, opt_end);
+  do_syn_received();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, Packet p)
+{
+  // Processes a segment (header th, packet p) that arrives while this TCB is
+  // in SYN_SENT.  The numbered steps follow the order of the checks below.
+  auto hdr_bytes = th->data_offset * 4;
+  auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, hdr_bytes)) + sizeof(tcp_hdr);
+  auto opt_end = opt_start + (hdr_bytes - sizeof(tcp_hdr));
+  p.trim_front(hdr_bytes);
+  tcp_sequence seg_seq = th->seq;
+  auto seg_ack = th->ack;
+
+  ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+                      << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+  // 3.1 first check the ACK bit
+  bool ack_ok = false;
+  if (th->f_ack) {
+    // SEG.ACK =< ISS or SEG.ACK > SND.NXT: send a reset (unless the RST bit
+    // is set, in which case the segment is dropped) and return.
+    if (seg_ack <= _snd.initial || seg_ack > _snd.next) {
+      respond_with_reset(th);
+      return;
+    }
+    // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
+    ack_ok = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next;
+  }
+
+  // 3.2 second check the RST bit
+  if (th->f_rst) {
+    // With an acceptable ACK the reset is honoured through do_reset()
+    // ("error: connection reset"); without one the segment is just dropped.
+    if (ack_ok) {
+      do_reset();
+    }
+    return;
+  }
+
+  // 3.3 third check the security and precedence
+  // NOTE: Ignored for now
+
+  // 3.4 fourth check the SYN bit
+  // 3.5 fifth, if neither of the SYN or RST bits is set then drop the
+  // segment and return.
+  if (!th->f_syn) {
+    return;
+  }
+
+  // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ.  SND.UNA should
+  // be advanced to equal SEG.ACK (if there is an ACK), and any segments
+  // on the retransmission queue which are thereby acknowledged should be
+  // removed.
+  _rcv.initial = seg_seq;
+  _rcv.next = seg_seq + 1;
+  if (th->f_ack) {
+    // TODO: clean retransmission queue
+    _snd.unacknowledged = seg_ack;
+  }
+  if (_snd.unacknowledged > _snd.initial) {
+    // SND.UNA > ISS, i.e. our SYN has been ACKed: go to ESTABLISHED and
+    // reply with <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+    ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> ESTABLISHED" << dendl;
+    init_from_options(th, opt_start, opt_end);
+    do_established();
+    output();
+    return;
+  }
+
+  // Otherwise enter SYN_RECEIVED, form a SYN,ACK segment
+  // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
+  ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> SYN_RECEIVED" << dendl;
+  do_syn_received();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, Packet p)
+{
+  // Segment processing for the states handled after LISTEN and SYN_SENT:
+  // header th and payload p go through the checks numbered 4.1 - 4.8 below.
+  p.trim_front(th->data_offset * 4);
+  bool do_output = false;
+  bool do_output_data = false;
+  tcp_sequence seg_seq = th->seq;
+  auto seg_ack = th->ack;
+  auto seg_len = p.len();
+  ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+                      << " snd next " << _snd.next.raw << " unack " << _snd.unacknowledged.raw
+                      << " rcv next " << _rcv.next.raw << " len " << seg_len
+                      << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+  // 4.1 first check sequence number
+  if (!segment_acceptable(seg_seq, seg_len)) {
+    //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+    return output();
+  }
+
+  // In the following it is assumed that the segment is the idealized
+  // segment that begins at RCV.NXT and does not exceed the window.
+  if (seg_seq < _rcv.next) {
+    // ignore already acknowledged data
+    auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len);
+    ldout(_tcp.cct, 10) << __func__ << " dup segment len " << dup << dendl;
+    p.trim_front(dup);
+    seg_len -= dup;
+    seg_seq += dup;
+  }
+  // FIXME: We should trim data outside the right edge of the receive window as well
+
+  if (seg_seq != _rcv.next) {
+    ldout(_tcp.cct, 10) << __func__ << " out of order, expect " << _rcv.next.raw
+                        << " actual " << seg_seq.raw
+                        << " out of order size " << _rcv.out_of_order.map.size()
+                        << dendl;
+    insert_out_of_order(seg_seq, std::move(p));
+    // A TCP receiver SHOULD send an immediate duplicate ACK for an out-of-order segment.
+    return output();
+  }
+
+  // 4.2 second check the RST bit
+  if (th->f_rst) {
+    if (in_state(SYN_RECEIVED)) {
+      // If this connection was initiated with a passive OPEN (i.e.,
+      // came from the LISTEN state), then return this connection to
+      // LISTEN state and return.  The user need not be informed.  If
+      // this connection was initiated with an active OPEN (i.e., came
+      // from SYN_SENT state) then the connection was refused, signal
+      // the user "connection refused".  In either case, all segments
+      // on the retransmission queue should be removed.  And in the
+      // active OPEN case, enter the CLOSED state and delete the TCB,
+      // and return.
+      _errno = -ECONNREFUSED;  // per-connection error slot, not the process-wide errno
+      return do_reset();
+    }
+    if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) {
+      // If the RST bit is set then, any outstanding RECEIVEs and SEND
+      // should receive "reset" responses.  All segment queues should be
+      // flushed.  Users should also receive an unsolicited general
+      // "connection reset" signal.  Enter the CLOSED state, delete the
+      // TCB, and return.
+      return do_reset();
+    }
+    if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) {
+      // If the RST bit is set then, enter the CLOSED state, delete the TCB, and return.
+      return do_closed();
+    }
+  }
+
+  // 4.3 third check security and precedence
+  // NOTE: Ignored for now
+
+  // 4.4 fourth, check the SYN bit
+  if (th->f_syn) {
+    // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+    // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+
+    // If the SYN is in the window it is an error, send a reset, any
+    // outstanding RECEIVEs and SEND should receive "reset" responses,
+    // all segment queues should be flushed, the user should also
+    // receive an unsolicited general "connection reset" signal, enter
+    // the CLOSED state, delete the TCB, and return.
+    respond_with_reset(th);
+    return do_reset();
+
+    // If the SYN is not in the window this step would not be reached
+    // and an ack would have been sent in the first step (sequence
+    // number check).
+  }
+
+  // 4.5 fifth check the ACK field
+  if (!th->f_ack) {
+    // if the ACK bit is off drop the segment and return
+    return;
+  } else {
+    // SYN_RECEIVED STATE
+    if (in_state(SYN_RECEIVED)) {
+      // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state
+      // and continue processing.
+      if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) {
+        ldout(_tcp.cct, 20) << __func__ << " SYN_RECEIVED -> ESTABLISHED" << dendl;
+        do_established();
+        if (_tcp.push_listen_queue(_local_port, this)) {
+          ldout(_tcp.cct, 20) << __func__ << " successfully accepting socket" << dendl;
+        } else {
+          ldout(_tcp.cct, 5) << __func__ << " not exist listener or full queue, reset" << dendl;
+          return respond_with_reset(th);
+        }
+      } else {
+        // <SEQ=SEG.ACK><CTL=RST>
+        return respond_with_reset(th);
+      }
+    }
+    auto update_window = [this, th, seg_seq, seg_ack] {
+      ldout(_tcp.cct, 20) << __func__ << " window update seg_seq=" << seg_seq
+                          << " seg_ack=" << seg_ack << " old window=" << th->window
+                          << " new window=" << int(_snd.window_scale) << dendl;
+      _snd.window = th->window << _snd.window_scale;
+      _snd.wl1 = seg_seq;
+      _snd.wl2 = seg_ack;
+      if (_snd.window == 0) {
+        _persist_time_out = _rto;
+        start_persist_timer();
+      } else {
+        stop_persist_timer();
+      }
+    };
+    // ESTABLISHED STATE or
+    // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state.
+    if (in_state(ESTABLISHED | CLOSE_WAIT)) {
+      // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK.
+      if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) {
+        // Remote ACKed data we sent
+        auto acked_bytes = data_segment_acked(seg_ack);
+
+        // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated.
+        if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) {
+          update_window();
+        }
+
+        // some data is acked, try send more data
+        do_output_data = true;
+
+        auto set_retransmit_timer = [this] {
+          if (_snd.data.empty()) {
+            // All outstanding segments are acked, turn off the timer.
+            stop_retransmit_timer();
+            // Signal the waiter of this event
+            signal_all_data_acked();
+          } else {
+            // Restart the timer because new data is acked.
+            start_retransmit_timer();
+          }
+        };
+
+        if (_snd.dupacks >= 3) {
+          // We are in fast retransmit / fast recovery phase
+          uint32_t smss = _snd.mss;
+          if (seg_ack > _snd.recover) {
+            ldout(_tcp.cct, 20) << __func__ << " ack: full_ack" << dendl;
+            // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS)
+            _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss);
+            // Exit the fast recovery procedure
+            exit_fast_recovery();
+            set_retransmit_timer();
+          } else {
+            ldout(_tcp.cct, 20) << __func__ << " ack: partial_ack" << dendl;
+            // Retransmit the first unacknowledged segment
+            fast_retransmit();
+            // Deflate the congestion window by the amount of new data
+            // acknowledged by the Cumulative Acknowledgment field
+            _snd.cwnd -= acked_bytes;
+            // If the partial ACK acknowledges at least one SMSS of new
+            // data, then add back SMSS bytes to the congestion window
+            if (acked_bytes >= smss) {
+              _snd.cwnd += smss;
+            }
+            // Send a new segment if permitted by the new value of
+            // cwnd.  Do not exit the fast recovery procedure.  For
+            // the first partial ACK that arrives during fast
+            // recovery, also reset the retransmit timer.
+            if (++_snd.partial_ack == 1) {
+              start_retransmit_timer();
+            }
+          }
+        } else {
+          // RFC5681: The fast retransmit algorithm uses the arrival
+          // of 3 duplicate ACKs (as defined in section 2, without
+          // any intervening ACKs which move SND.UNA) as an
+          // indication that a segment has been lost.
+          //
+          // So, here we reset dupacks to zero because this ACK moves
+          // SND.UNA.
+          exit_fast_recovery();
+          set_retransmit_timer();
+        }
+      } else if (!_snd.data.empty() && seg_len == 0 &&
+                 th->f_fin == 0 && th->f_syn == 0 &&
+                 th->ack == _snd.unacknowledged &&
+                 uint32_t(th->window << _snd.window_scale) == _snd.window) {
+        // Note:
+        // RFC793 states:
+        // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored
+        // RFC5681 states:
+        // The TCP sender SHOULD use the "fast retransmit" algorithm to detect
+        // and repair loss, based on incoming duplicate ACKs.
+        // Here, We follow RFC5681.
+        _snd.dupacks++;
+        uint32_t smss = _snd.mss;
+        // 3 duplicated ACKs trigger a fast retransmit
+        if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+          // RFC5681 Step 3.1
+          // Send cwnd + 2 * smss per RFC3042
+          do_output_data = true;
+        } else if (_snd.dupacks == 3) {
+          // RFC6582 Step 3.2
+          if (seg_ack - 1 > _snd.recover) {
+            _snd.recover = _snd.next - 1;
+            // RFC5681 Step 3.2
+            _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss);
+            fast_retransmit();
+          } else {
+            // Do not enter fast retransmit and do not reset ssthresh
+          }
+          // RFC5681 Step 3.3
+          _snd.cwnd = _snd.ssthresh + 3 * smss;
+        } else if (_snd.dupacks > 3) {
+          // RFC5681 Step 3.4
+          _snd.cwnd += smss;
+          // RFC5681 Step 3.5
+          do_output_data = true;
+        }
+      } else if (seg_ack > _snd.next) {
+        // If the ACK acks something not yet sent (SEG.ACK > SND.NXT)
+        // then send an ACK, drop the segment, and return
+        return output();
+      } else if (_snd.window == 0 && th->window > 0) {
+        update_window();
+        do_output_data = true;
+      }
+    }
+    // FIN_WAIT_1 STATE
+    if (in_state(FIN_WAIT_1)) {
+      // In addition to the processing for the ESTABLISHED state, if
+      // our FIN is now acknowledged then enter FIN-WAIT-2 and continue
+      // processing in that state.
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl;
+        _state = FIN_WAIT_2;
+        do_local_fin_acked();
+      }
+    }
+    // FIN_WAIT_2 STATE
+    if (in_state(FIN_WAIT_2)) {
+      // In addition to the processing for the ESTABLISHED state, if
+      // the retransmission queue is empty, the user's CLOSE can be
+      // acknowledged ("ok") but do not delete the TCB.
+      // TODO
+    }
+    // CLOSING STATE
+    if (in_state(CLOSING)) {
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: CLOSING -> TIME_WAIT" << dendl;
+        do_local_fin_acked();
+        return do_time_wait();
+      } else {
+        return;
+      }
+    }
+    // LAST_ACK STATE
+    if (in_state(LAST_ACK)) {
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: LAST_ACK -> CLOSED" << dendl;
+        do_local_fin_acked();
+        return do_closed();
+      }
+    }
+    // TIME_WAIT STATE
+    if (in_state(TIME_WAIT)) {
+      // The only thing that can arrive in this state is a
+      // retransmission of the remote FIN. Acknowledge it, and restart
+      // the 2 MSL timeout.
+      // TODO
+    }
+  }
+
+  // 4.6 sixth, check the URG bit
+  if (th->f_urg) {
+    // TODO
+  }
+
+  // 4.7 seventh, process the segment text
+  if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) {
+    if (p.len()) {
+      // Once the TCP takes responsibility for the data it advances
+      // RCV.NXT over the data accepted, and adjusts RCV.WND as
+      // appropriate to the current buffer availability.  The total of
+      // RCV.NXT and RCV.WND should not be reduced.
+      _rcv.data.push_back(std::move(p));
+      _rcv.next += seg_len;
+      auto merged = merge_out_of_order();
+      signal_data_received();
+      // Send an acknowledgment of the form:
+      // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+      // This acknowledgment should be piggybacked on a segment being
+      // transmitted if possible without incurring undue delay.
+      if (merged) {
+        // TCP receiver SHOULD send an immediate ACK when the
+        // incoming segment fills in all or part of a gap in the
+        // sequence space.
+        do_output = true;
+      } else {
+        do_output = should_send_ack(seg_len);
+      }
+      ldout(_tcp.cct, 20) << __func__ << " merged=" << merged << " do_output=" << do_output << dendl;
+    }
+  } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) {
+    // This should not occur, since a FIN has been received from the
+    // remote side. Ignore the segment text.
+    return;
+  }
+
+  // 4.8 eighth, check the FIN bit
+  if (th->f_fin) {
+    if (in_state(CLOSED | LISTEN | SYN_SENT)) {
+      // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT
+      // since the SEG.SEQ cannot be validated; drop the segment and return.
+      return;
+    }
+    auto fin_seq = seg_seq + seg_len;
+    if (fin_seq == _rcv.next) {
+      _rcv.next = fin_seq + 1;
+
+      // If this <FIN> packet contains data as well, we can ACK both data
+      // and <FIN> in a single packet, so cancel the previous ACK.
+      clear_delayed_ack();
+      do_output = false;
+      // Send ACK for the FIN!
+      output();
+      signal_data_received();
+      _errno = 0;
+
+      if (in_state(SYN_RECEIVED | ESTABLISHED)) {
+        ldout(_tcp.cct, 20) << __func__ << " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl;
+        _state = CLOSE_WAIT;
+        // EOF
+      }
+      if (in_state(FIN_WAIT_1)) {
+        // If our FIN has been ACKed (perhaps in this segment), then
+        // enter TIME-WAIT, start the time-wait timer, turn off the other
+        // timers; otherwise enter the CLOSING state.
+        // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2
+        // not FIN_WAIT_1 if we reach here.
+        ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_1 -> CLOSING" << dendl;
+        _state = CLOSING;
+      }
+      if (in_state(FIN_WAIT_2)) {
+        ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl;
+        return do_time_wait();
+      }
+    }
+  }
+  if (do_output || (do_output_data && can_send())) {
+    // Since we will do output, we can cancel the scheduled delayed ACK.
+    clear_delayed_ack();
+    output();
+  }
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::connect()
+{
+  ldout(_tcp.cct, 20) << __func__ << dendl;
+  // Active open: select the ISS (SND.UNA = ISS, SND.NXT = ISS+1), publish our
+  // receive parameters, then send <SEQ=ISS><CTL=SYN> and enter SYN-SENT.
+  do_setup_isn();
+  // Window scale factor applied to our receive window
+  _option._local_win_scale = 7;
+  _rcv.window_scale = _option._local_win_scale;
+  // Largest segment we are able to receive
+  _option._local_mss = local_mss();
+  _rcv.mss = _option._local_mss;
+  // Linux's default window size
+  _rcv.window = 29200 << _rcv.window_scale;
+
+  do_syn_sent();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::close_final_cleanup()
+{  // Marks the send side closed, advances the state machine and emits our FIN.
+  if (_snd._all_data_acked_fd >= 0) {  // unregister and close that descriptor if one is set
+    center->delete_file_event(_snd._all_data_acked_fd, EVENT_READABLE);
+    _tcp.manager.close(_snd._all_data_acked_fd);
+    _snd._all_data_acked_fd = -1;
+  }
+
+  _snd.closed = true;
+  signal_data_received();
+  ldout(_tcp.cct, 20) << __func__ << " unsent_len=" << _snd.unsent_len << dendl;
+  if (in_state(CLOSE_WAIT)) {  // any state other than these two is left unchanged
+    ldout(_tcp.cct, 20) << __func__ << " CLOSE_WAIT -> LAST_ACK" << dendl;
+    _state = LAST_ACK;
+  } else if (in_state(ESTABLISHED)) {
+    ldout(_tcp.cct, 20) << __func__ << " ESTABLISHED -> FIN_WAIT_1" << dendl;
+    _state = FIN_WAIT_1;
+  }
+  // Send <FIN> to remote
+  // Note: output_one() is called first to make sure a packet carrying FIN is
+  // really generated.  With output() alone, if _packetq is not empty,
+  // tcp::tcb::get_packet() would not generate the packet with FIN.
+  output_one();
+  output();
+  center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);  // finally drop this tcb's own fd events
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::retransmit()
+{
+  // Resends whatever is still outstanding: a SYN, a FIN and/or the oldest
+  // unacked data segment, doubling the RTO each time.  Once any of them has
+  // been retried _max_nr_retransmit times, _errno becomes -ETIMEDOUT and
+  // cleanup() is called.
+  auto output_update_rto = [this] {
+    output();
+    // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off
+    this->_rto = std::min(this->_rto * 2, this->_rto_max);
+    start_retransmit_timer();
+  };
+
+  // Retransmit SYN
+  if (syn_needs_on()) {
+    if (_snd.syn_retransmit++ < _max_nr_retransmit) {
+      output_update_rto();
+    } else {
+      ldout(_tcp.cct, 5) << __func__ << " syn retransmit exceed max " << _max_nr_retransmit << dendl;
+      // Reported as a timeout, the same error as the FIN and data paths below.
+      _errno = -ETIMEDOUT;
+      cleanup();
+      return;
+    }
+  }
+
+  // Retransmit FIN
+  if (fin_needs_on()) {
+    if (_snd.fin_retransmit++ < _max_nr_retransmit) {
+      output_update_rto();
+    } else {
+      ldout(_tcp.cct, 5) << __func__ << " fin retransmit exceed max " << _max_nr_retransmit << dendl;
+      _errno = -ETIMEDOUT;
+      cleanup();
+      return;
+    }
+  }
+
+  // Retransmit Data
+  if (_snd.data.empty()) {
+    return;
+  }
+
+  // If there are unacked data, retransmit the earliest segment
+  auto& unacked_seg = _snd.data.front();
+
+  // According to RFC5681
+  // Update ssthresh only for the first retransmit
+  uint32_t smss = _snd.mss;
+  if (unacked_seg.nr_transmits == 0) {
+    _snd.ssthresh = std::max(flight_size() / 2, 2 * smss);
+  }
+  // RFC6582 Step 4
+  _snd.recover = _snd.next - 1;
+  // Start the slow start process
+  _snd.cwnd = smss;
+  // End fast recovery
+  exit_fast_recovery();
+
+  ldout(_tcp.cct, 20) << __func__ << " unack data size " << _snd.data.size()
+                      << " nr=" << unacked_seg.nr_transmits << dendl;
+  if (unacked_seg.nr_transmits < _max_nr_retransmit) {
+    unacked_seg.nr_transmits++;
+  } else {
+    // Delete connection when max num of retransmission is reached
+    ldout(_tcp.cct, 5) << __func__ << " seg retransmit exceed max " << _max_nr_retransmit << dendl;
+    _errno = -ETIMEDOUT;
+    cleanup();
+    return;
+  }
+  retransmit_one();
+
+  output_update_rto();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::persist() {  // sends a window probe, then re-arms the persist timer with a doubled timeout
+  ldout(_tcp.cct, 20) << __func__ << " persist timer fired" << dendl;
+  // Send 1 byte packet to probe peer's window size
+  _snd.window_probe = true;  // flag is set only for the duration of this output_one() call
+  output_one();
+  _snd.window_probe = false;
+
+  output();
+  // Perform binary exponential back-off per RFC1122; the timeout is capped at _rto_max
+  _persist_time_out = std::min(_persist_time_out * 2, _rto_max);
+  start_persist_timer();
+}
diff --git a/src/msg/async/dpdk/TCP.h b/src/msg/async/dpdk/TCP.h
new file mode 100644
index 00000000..b7bd7132
--- /dev/null
+++ b/src/msg/async/dpdk/TCP.h
@@ -0,0 +1,1503 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_DPDK_TCP_H_
+#define CEPH_DPDK_TCP_H_
+
+#include <unordered_map>
+#include <map>
+#include <queue>
+#include <functional>
+#include <deque>
+#include <chrono>
+#include <stdexcept>
+#include <system_error>
+
+#include "msg/async/dpdk/EventDPDK.h"
+
+#include "include/utime.h"
+#include "common/Throttle.h"
+#include "common/ceph_time.h"
+#include "common/ceph_crypto.h"
+#include "msg/async/Event.h"
+#include "IPChecksum.h"
+#include "IP.h"
+#include "const.h"
+#include "byteorder.h"
+#include "shared_ptr.h"
+#include "PacketUtil.h"
+
+#include "include/random.h"
+
+struct tcp_hdr;
+
+// TCP connection states.  Each state owns a distinct bit so that several
+// states can be OR-ed into one mask and tested with a single AND.
+enum class tcp_state : uint16_t {
+  CLOSED          = 0x0001,
+  LISTEN          = 0x0002,
+  SYN_SENT        = 0x0004,
+  SYN_RECEIVED    = 0x0008,
+  ESTABLISHED     = 0x0010,
+  FIN_WAIT_1      = 0x0020,
+  FIN_WAIT_2      = 0x0040,
+  CLOSE_WAIT      = 0x0080,
+  CLOSING         = 0x0100,
+  LAST_ACK        = 0x0200,
+  TIME_WAIT       = 0x0400
+};
+
+// Merge two states (or state masks) into one bit mask.
+inline tcp_state operator|(tcp_state s1, tcp_state s2) {
+  const auto mask = static_cast<uint16_t>(uint16_t(s1) | uint16_t(s2));
+  return tcp_state(mask);
+}
+
+// Print the symbolic name of a single state; anything that is not exactly
+// one of the enumerators (e.g. an OR-ed mask) prints as "UNKNOWN".
+inline std::ostream & operator<<(std::ostream & str, const tcp_state& s) {
+  const char *name = "UNKNOWN";
+  switch (s) {
+    case tcp_state::CLOSED:       name = "CLOSED";       break;
+    case tcp_state::LISTEN:       name = "LISTEN";       break;
+    case tcp_state::SYN_SENT:     name = "SYN_SENT";     break;
+    case tcp_state::SYN_RECEIVED: name = "SYN_RECEIVED"; break;
+    case tcp_state::ESTABLISHED:  name = "ESTABLISHED";  break;
+    case tcp_state::FIN_WAIT_1:   name = "FIN_WAIT_1";   break;
+    case tcp_state::FIN_WAIT_2:   name = "FIN_WAIT_2";   break;
+    case tcp_state::CLOSE_WAIT:   name = "CLOSE_WAIT";   break;
+    case tcp_state::CLOSING:      name = "CLOSING";      break;
+    case tcp_state::LAST_ACK:     name = "LAST_ACK";     break;
+    case tcp_state::TIME_WAIT:    name = "TIME_WAIT";    break;
+    default:                                             break;
+  }
+  return str << name;
+}
+
+// TCP option support: the wire layout of each supported option plus the
+// per-connection option state (what was received, negotiated values).
+struct tcp_option {
+  // The kind and len field are fixed and defined in TCP protocol
+  enum class option_kind: uint8_t { mss = 2, win_scale = 3, sack = 4, timestamps = 8,  nop = 1, eol = 0 };
+  enum class option_len:  uint8_t { mss = 4, win_scale = 3, sack = 2, timestamps = 10, nop = 1, eol = 1 };
+  // Maximum segment size option
+  struct mss {
+    option_kind kind = option_kind::mss;
+    option_len len = option_len::mss;
+    uint16_t mss;
+    // Returns a copy whose mss value is in network byte order
+    struct mss hton() {
+      struct mss m = *this;
+      m.mss = ::hton(m.mss);
+      return m;
+    }
+  } __attribute__((packed));
+  // Window scale option
+  struct win_scale {
+    option_kind kind = option_kind::win_scale;
+    option_len len = option_len::win_scale;
+    uint8_t shift;
+  } __attribute__((packed));
+  // SACK option header (kind and length only)
+  struct sack {
+    option_kind kind = option_kind::sack;
+    option_len len = option_len::sack;
+  } __attribute__((packed));
+  // Timestamps option carrying two 32-bit values
+  struct timestamps {
+    option_kind kind = option_kind::timestamps;
+    option_len len = option_len::timestamps;
+    uint32_t t1;
+    uint32_t t2;
+  } __attribute__((packed));
+  // Single-byte no-operation option
+  struct nop {
+    option_kind kind = option_kind::nop;
+  } __attribute__((packed));
+  // Single-byte end-of-option-list marker
+  struct eol {
+    option_kind kind = option_kind::eol;
+  } __attribute__((packed));
+  // Alignment unit, in bytes, for the option area
+  // (NOTE(review): presumably used by fill()/get_size() for padding - confirm)
+  static const uint8_t align = 4;
+
+  // Parse the option bytes in [beg, end)
+  void parse(uint8_t* beg, uint8_t* end);
+  uint8_t fill(tcp_hdr* th, uint8_t option_size);
+  uint8_t get_size(bool syn_on, bool ack_on);
+
+  // For option negotiation
+  bool _mss_received = false;
+  bool _win_scale_received = false;
+  bool _timestamps_received = false;
+  bool _sack_received = false;
+
+  // Option data
+  uint16_t _remote_mss = 536;
+  // Has no initializer here; tcb::init_from_options() assigns it from local_mss()
+  uint16_t _local_mss;
+  uint8_t _remote_win_scale = 0;
+  uint8_t _local_win_scale = 0;
+};
+// Advance a byte pointer by the length value of an option.
+inline uint8_t*& operator+=(uint8_t*& x, tcp_option::option_len len) {
+  x = x + static_cast<uint8_t>(len);
+  return x;
+}
+// Add the length value of an option to a byte-sized counter.
+inline uint8_t& operator+=(uint8_t& x, tcp_option::option_len len) {
+  x = static_cast<uint8_t>(x + static_cast<uint8_t>(len));
+  return x;
+}
+
+// Strongly typed 32-bit TCP sequence number.  Arithmetic and ordering are
+// provided by the free operators declared after this struct.
+struct tcp_sequence {
+  uint32_t raw;
+};
+
+// Convert a sequence number from network to host byte order.
+// Declared inline: this function is defined in a header, so without it every
+// translation unit including the header would emit its own external
+// definition and the link would fail with duplicate symbols.
+inline tcp_sequence ntoh(tcp_sequence ts) {
+  return tcp_sequence { ::ntoh(ts.raw) };
+}
+
+// Convert a sequence number from host to network byte order.
+// Declared inline: this function is defined in a header, so without it every
+// translation unit including the header would emit its own external
+// definition and the link would fail with duplicate symbols.
+inline tcp_sequence hton(tcp_sequence ts) {
+  return tcp_sequence { ::hton(ts.raw) };
+}
+
+// Stream a sequence number as its raw 32-bit value.
+inline std::ostream& operator<<(std::ostream& os, const tcp_sequence& s) {
+  os << s.raw;
+  return os;
+}
+
+// Build a sequence number from its raw value.
+inline tcp_sequence make_seq(uint32_t raw) {
+  tcp_sequence seq;
+  seq.raw = raw;
+  return seq;
+}
+// Offset arithmetic; the raw value wraps modulo 2^32.
+inline tcp_sequence& operator+=(tcp_sequence& s, int32_t n) {
+  s.raw += n;
+  return s;
+}
+inline tcp_sequence& operator-=(tcp_sequence& s, int32_t n) {
+  s.raw -= n;
+  return s;
+}
+inline tcp_sequence operator+(tcp_sequence s, int32_t n) {
+  s += n;
+  return s;
+}
+inline tcp_sequence operator-(tcp_sequence s, int32_t n) {
+  s -= n;
+  return s;
+}
+// Signed distance from q to s (wrapped 32-bit difference read as int32_t).
+inline int32_t operator-(tcp_sequence s, tcp_sequence q) {
+  const uint32_t diff = s.raw - q.raw;
+  return diff;
+}
+inline bool operator==(tcp_sequence s, tcp_sequence q) {
+  return s.raw == q.raw;
+}
+inline bool operator!=(tcp_sequence s, tcp_sequence q) {
+  return s.raw != q.raw;
+}
+// Ordering is defined through the signed distance, so it stays meaningful
+// across wrap-around of the raw value.
+inline bool operator<(tcp_sequence s, tcp_sequence q) {
+  return (s - q) < 0;
+}
+inline bool operator>(tcp_sequence s, tcp_sequence q) {
+  return (q - s) < 0;
+}
+inline bool operator<=(tcp_sequence s, tcp_sequence q) {
+  return !((q - s) < 0);
+}
+inline bool operator>=(tcp_sequence s, tcp_sequence q) {
+  return !((s - q) < 0);
+}
+
+// Fixed part of the TCP header (no options), packed so that it can be laid
+// directly over packet bytes.
+// NOTE(review): the bit-field order (rsvd1 before data_offset, f_fin first
+// among the flags) presumably relies on the compiler allocating bit-fields
+// starting from the least significant bit - confirm before porting.
+struct tcp_hdr {
+  uint16_t src_port;
+  uint16_t dst_port;
+  tcp_sequence seq;
+  tcp_sequence ack;
+  uint8_t rsvd1 : 4;
+  // Header length in 32-bit words (tcp::received() multiplies it by 4)
+  uint8_t data_offset : 4;
+  uint8_t f_fin : 1;
+  uint8_t f_syn : 1;
+  uint8_t f_rst : 1;
+  uint8_t f_psh : 1;
+  uint8_t f_ack : 1;
+  uint8_t f_urg : 1;
+  uint8_t rsvd2 : 2;
+  uint16_t window;
+  uint16_t checksum;
+  uint16_t urgent;
+
+  // Returns a copy with every multi-byte field converted to network byte
+  // order; the single-byte bit-fields are copied unchanged.
+  tcp_hdr hton() {
+    tcp_hdr hdr = *this;
+    hdr.src_port = ::hton(src_port);
+    hdr.dst_port = ::hton(dst_port);
+    hdr.seq = ::hton(seq);
+    hdr.ack = ::hton(ack);
+    hdr.window = ::hton(window);
+    hdr.checksum = ::hton(checksum);
+    hdr.urgent = ::hton(urgent);
+    return hdr;
+  }
+
+  // Returns a copy with every multi-byte field converted to host byte
+  // order; the single-byte bit-fields are copied unchanged.
+  tcp_hdr ntoh() {
+    tcp_hdr hdr = *this;
+    hdr.src_port = ::ntoh(src_port);
+    hdr.dst_port = ::ntoh(dst_port);
+    hdr.seq = ::ntoh(seq);
+    hdr.ack = ::ntoh(ack);
+    hdr.window = ::ntoh(window);
+    hdr.checksum = ::ntoh(checksum);
+    hdr.urgent = ::ntoh(urgent);
+    return hdr;
+  }
+} __attribute__((packed));
+
+// Empty tag type used only to instantiate packet_merger for TCP.
+struct tcp_tag {};
+// Packet merger keyed by TCP sequence number (holds out-of-order data in
+// tcb::receive).
+using tcp_packet_merger = packet_merger<tcp_sequence, tcp_tag>;
+
+template <typename InetTraits>
+class tcp {
+ public:
+  using ipaddr = typename InetTraits::address_type;
+  using inet_type = typename InetTraits::inet_type;
+  using connid = l4connid<InetTraits>;
+  using connid_hash = typename connid::connid_hash;
+  class connection;
+  class listener;
+ private:
+  class tcb;
+
+  // Event callback for the delayed-ACK event of a tcb: resets the count of
+  // full segments received and requests output.
+  class C_handle_delayed_ack : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_handle_delayed_ack(tcb *t): tc(t) { }
+    void do_request(uint64_t r) {
+      tc->_nr_full_seg_received = 0;
+      tc->output();
+    }
+  };
+
+  // Event callback for the retransmission timer: forwards to
+  // tcb::retransmit().
+  class C_handle_retransmit : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_handle_retransmit(tcb *t): tc(t) { }
+    void do_request(uint64_t r) {
+      tc->retransmit();
+    }
+  };
+
+  // Event callback for the persist timer: forwards to tcb::persist().
+  class C_handle_persist : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_handle_persist(tcb *t): tc(t) { }
+    void do_request(uint64_t r) {
+      tc->persist();
+    }
+  };
+
+  // Event callback that runs tcb::close_final_cleanup().
+  // NOTE(review): presumably triggered through _snd._all_data_acked_fd once
+  // all queued data has been acknowledged - confirm in close().
+  class C_all_data_acked : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_all_data_acked(tcb *t): tc(t) {}
+    void do_request(uint64_t fd_or_id) {
+      tc->close_final_cleanup();
+    }
+  };
+
+  // Self-deleting event callback that holds a shared reference to a tcb.
+  // The reference is dropped when the callback runs and deletes itself, so
+  // the tcb is kept alive at least until then.
+  class C_actual_remove_tcb : public EventCallback {
+    lw_shared_ptr<tcb> tc;
+   public:
+    C_actual_remove_tcb(tcb *t): tc(t->shared_from_this()) {}
+    void do_request(uint64_t r) {
+      delete this;
+    }
+  };
+
+  class tcb : public enable_lw_shared_from_this<tcb> {
+    using clock_type = ceph::coarse_real_clock;
+    static constexpr tcp_state CLOSED         = tcp_state::CLOSED;
+    static constexpr tcp_state LISTEN         = tcp_state::LISTEN;
+    static constexpr tcp_state SYN_SENT       = tcp_state::SYN_SENT;
+    static constexpr tcp_state SYN_RECEIVED   = tcp_state::SYN_RECEIVED;
+    static constexpr tcp_state ESTABLISHED    = tcp_state::ESTABLISHED;
+    static constexpr tcp_state FIN_WAIT_1     = tcp_state::FIN_WAIT_1;
+    static constexpr tcp_state FIN_WAIT_2     = tcp_state::FIN_WAIT_2;
+    static constexpr tcp_state CLOSE_WAIT     = tcp_state::CLOSE_WAIT;
+    static constexpr tcp_state CLOSING        = tcp_state::CLOSING;
+    static constexpr tcp_state LAST_ACK       = tcp_state::LAST_ACK;
+    static constexpr tcp_state TIME_WAIT      = tcp_state::TIME_WAIT;
+    tcp_state _state = CLOSED;
+    tcp& _tcp;
+    UserspaceEventManager &manager;
+    connection* _conn = nullptr;
+    bool _connect_done = false;
+    ipaddr _local_ip;
+    ipaddr _foreign_ip;
+    uint16_t _local_port;
+    uint16_t _foreign_port;
+    // A segment that has been sent but not yet fully acknowledged.
+    struct unacked_segment {
+      // Packet kept for retransmission; its length drives ACK accounting
+      Packet p;
+      // Amount returned to user_queue_space once the segment is fully acked
+      uint16_t data_len;
+      // Retransmission count; 0 means the segment was sent only once
+      unsigned nr_transmits;
+      // Send time, used as an RTO sample when the segment was never retransmitted
+      clock_type::time_point tx_time;
+    };
+    // Send-side state of the connection.
+    struct send {
+      // Start of the unacknowledged range; advanced as ACKs arrive
+      tcp_sequence unacknowledged;
+      tcp_sequence next;
+      uint32_t window;
+      uint8_t window_scale;
+      uint16_t mss;
+      tcp_sequence urgent;
+      // Segment sequence number used for last window update
+      tcp_sequence wl1;
+      // Segment acknowledgment number used for last window update
+      tcp_sequence wl2;
+      tcp_sequence initial;
+      // Sent but not yet acknowledged segments, earliest first
+      std::deque<unacked_segment> data;
+      std::deque<Packet> unsent;
+      uint32_t unsent_len = 0;
+      uint32_t queued_len = 0;
+      bool closed = false;
+      // Event fd notified when no unsent or queued data remains (-1: unused)
+      int _all_data_acked_fd = -1;
+      // Limit number of data queued into send queue
+      Throttle user_queue_space;
+      // Round-trip time variation
+      std::chrono::microseconds rttvar;
+      // Smoothed round-trip time
+      std::chrono::microseconds srtt;
+      bool first_rto_sample = true;
+      clock_type::time_point syn_tx_time;
+      // Congestion window
+      uint32_t cwnd;
+      // Slow start threshold
+      uint32_t ssthresh;
+      // Duplicated ACKs
+      uint16_t dupacks = 0;
+      unsigned syn_retransmit = 0;
+      unsigned fin_retransmit = 0;
+      uint32_t limited_transfer = 0;
+      uint32_t partial_ack = 0;
+      tcp_sequence recover;
+      // Set only while persist() sends its one-byte window probe
+      bool window_probe = false;
+      send(CephContext *c): user_queue_space(c, "DPDK::tcp::tcb::user_queue_space", 81920) {}
+    } _snd;
+    // Receive-side state of the connection.
+    struct receive {
+      // Next sequence number expected from the peer (RCV.NXT)
+      tcp_sequence next;
+      // Receive window (RCV.WND)
+      uint32_t window;
+      uint8_t window_scale;
+      // Maximum segment size local can receive
+      uint16_t mss;
+      tcp_sequence urgent;
+      tcp_sequence initial;
+      std::deque<Packet> data;
+      // Segments that arrived out of order, keyed by sequence number
+      tcp_packet_merger out_of_order;
+    } _rcv;
+    EventCenter *center;
+    int fd;
+    // positive means no errno, 0 means eof, negative means error
+    int16_t _errno = 1;
+    tcp_option _option;
+    EventCallbackRef delayed_ack_event;
+    Tub<uint64_t> _delayed_ack_fd;
+    // Retransmission timeout
+    std::chrono::microseconds _rto{1000*1000};
+    std::chrono::microseconds _persist_time_out{1000*1000};
+    static constexpr std::chrono::microseconds _rto_min{1000*1000};
+    static constexpr std::chrono::microseconds _rto_max{60000*1000};
+    // Clock granularity
+    static constexpr std::chrono::microseconds _rto_clk_granularity{1000};
+    static constexpr uint16_t _max_nr_retransmit{5};
+    EventCallbackRef retransmit_event;
+    Tub<uint64_t> retransmit_fd;
+    EventCallbackRef persist_event;
+    EventCallbackRef all_data_ack_event;
+    Tub<uint64_t> persist_fd;
+    uint16_t _nr_full_seg_received = 0;
+    // Secret key for initial sequence number generation; every word is
+    // filled with a random 32-bit value at construction.
+    struct isn_secret {
+      // 512 bits secretkey for ISN generating
+      uint32_t key[16];
+      isn_secret () {
+        for (auto& k : key) {
+          k = ceph::util::generate_random_number<uint32_t>(0, std::numeric_limits<uint32_t>::max());
+        }
+      }
+    };
+    static isn_secret _isn_secret;
+    tcp_sequence get_isn();
+    circular_buffer<typename InetTraits::l4packet> _packetq;
+    bool _poll_active = false;
+   public:
+    // callback
+    void close_final_cleanup();
+    ostream& _prefix(std::ostream *_dout);
+
+   public:
+    tcb(tcp& t, connid id);
+    ~tcb();
+    void input_handle_listen_state(tcp_hdr* th, Packet p);
+    void input_handle_syn_sent_state(tcp_hdr* th, Packet p);
+    void input_handle_other_state(tcp_hdr* th, Packet p);
+    void output_one(bool data_retransmit = false);
+    bool is_all_data_acked();
+    int send(Packet p);
+    void connect();
+    Tub<Packet> read();
+    void close();
+    // Erase this connection's entry from the owning tcp instance's tcb table.
+    void remove_from_tcbs() {
+      const connid self_id{_local_ip, _foreign_ip, _local_port, _foreign_port};
+      _tcp._tcbs.erase(self_id);
+    }
+    Tub<typename InetTraits::l4packet> get_packet();
+    // Request transmission for this tcb.  Unless a request is already
+    // pending (_poll_active), ask the network layer for the peer's L2
+    // address; on success the tcb is queued on the tcp instance's poll list.
+    void output() {
+      if (!_poll_active) {
+        _poll_active = true;
+
+        // The callback holds a shared reference to this tcb
+        auto tcb = this->shared_from_this();
+        _tcp._inet.wait_l2_dst_address(_foreign_ip, Packet(), [tcb] (const ethernet_address &dst, Packet p, int r) {
+          if (r == 0) {
+            tcb->_tcp.poll_tcb(dst, std::move(tcb));
+          } else if (r == -ETIMEDOUT) {
+            // in other states connection should time out
+            if (tcb->in_state(SYN_SENT)) {
+              tcb->_errno = -ETIMEDOUT;
+              tcb->cleanup();
+            }
+          } else if (r == -EBUSY) {
+            // retry later
+            tcb->_poll_active = false;
+            tcb->start_retransmit_timer();
+          }
+        });
+      }
+    }
+
+    // Current error status: positive means no error, 0 means eof, negative
+    // is an error code.
+    int16_t get_errno() const {
+      return _errno;
+    }
+
+    // Mutable reference to the current connection state.
+    tcp_state& state() {
+      return _state;
+    }
+
+    // Remaining room in the user send queue; always 0 unless the
+    // connection is ESTABLISHED.
+    uint64_t peek_sent_available() {
+      if (in_state(ESTABLISHED)) {
+        auto& space = _snd.user_queue_space;
+        uint64_t remaining = space.get_max() - space.get_current();
+        return remaining;
+      }
+      return 0;
+    }
+
+    // Returns _errno when it signals eof/error (<= 0); otherwise 1 once the
+    // connection has been established and 0 before that.
+    int is_connected() const {
+      return _errno <= 0 ? _errno : _connect_done;
+    }
+
+   private:
+    void respond_with_reset(tcp_hdr* th);
+    bool merge_out_of_order();
+    void insert_out_of_order(tcp_sequence seq, Packet p);
+    void trim_receive_data_after_window();
+    bool should_send_ack(uint16_t seg_len);
+    void clear_delayed_ack();
+    Packet get_transmit_packet();
+    // Send one packet with the data_retransmit flag set.
+    void retransmit_one() {
+      output_one(/* data_retransmit = */ true);
+    }
+    // (Re)arm the retransmission timer: any pending time event is deleted
+    // first, then a new one is created with the current _rto (microseconds).
+    void start_retransmit_timer() {
+      if (retransmit_fd)
+        center->delete_time_event(*retransmit_fd);
+      retransmit_fd.construct(center->create_time_event(_rto.count(), retransmit_event));
+    };
+    // Cancel the retransmission timer if it is armed and forget its id.
+    void stop_retransmit_timer() {
+      if (retransmit_fd) {
+        center->delete_time_event(*retransmit_fd);
+        retransmit_fd.destroy();
+      }
+    };
+    // (Re)arm the persist timer: any pending time event is deleted first,
+    // then a new one is created with the current _persist_time_out
+    // (microseconds).
+    void start_persist_timer() {
+      if (persist_fd)
+        center->delete_time_event(*persist_fd);
+      persist_fd.construct(center->create_time_event(_persist_time_out.count(), persist_event));
+    };
+    // Cancel the persist timer if it is armed and forget its id.
+    void stop_persist_timer() {
+      if (persist_fd) {
+        center->delete_time_event(*persist_fd);
+        persist_fd.destroy();
+      }
+    };
+    void persist();
+    void retransmit();
+    void fast_retransmit();
+    void update_rto(clock_type::time_point tx_time);
+    void update_cwnd(uint32_t acked_bytes);
+    void cleanup();
+    // Number of bytes that may be sent right now, bounded by the peer's
+    // advertised window, the amount of unsent data and the congestion
+    // window.  Not a pure query: while one or two duplicate ACKs are
+    // outstanding the result is also added to _snd.limited_transfer.
+    uint32_t can_send() {
+      // A window probe always carries exactly one byte
+      if (_snd.window_probe) {
+        return 1;
+      }
+      // Can not send more than advertised window allows
+      auto x = std::min(uint32_t(_snd.unacknowledged + _snd.window - _snd.next), _snd.unsent_len);
+      // Can not send more than congestion window allows
+      x = std::min(_snd.cwnd, x);
+      if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+        // RFC5681 Step 3.1
+        // Send cwnd + 2 * smss per RFC3042
+        auto flight = flight_size();
+        auto max = _snd.cwnd + 2 * _snd.mss;
+        // Nothing may be sent once the data in flight exceeds that bound
+        x = flight <= max ? std::min(x, max - flight) : 0;
+        _snd.limited_transfer += x;
+      } else if (_snd.dupacks >= 3) {
+        // RFC5681 Step 3.5
+        // Sent 1 full-sized segment at most
+        x = std::min(uint32_t(_snd.mss), x);
+      }
+      return x;
+    }
+    // Total length of all segments that are sent but not yet acknowledged.
+    uint32_t flight_size() {
+      uint32_t in_flight = 0;
+      for (unacked_segment& seg : _snd.data) {
+        in_flight += seg.p.len();
+      }
+      return in_flight;
+    }
+    // Largest segment the local side can receive: the device MTU minus the
+    // minimum TCP and IP header lengths.
+    uint16_t local_mss() {
+      const auto headers = tcp_hdr_len_min + InetTraits::ip_hdr_len_min;
+      return _tcp.get_hw_features().mtu - headers;
+    }
+    // Append p, addressed to the foreign IP, to this tcb's packet queue.
+    void queue_packet(Packet p) {
+      typename InetTraits::l4packet outgoing{_foreign_ip, std::move(p)};
+      _packetq.emplace_back(std::move(outgoing));
+    }
+    // Mark this connection's event fd readable.
+    void signal_data_received() {
+      manager.notify(fd, EVENT_READABLE);
+    }
+    // Notify the "all data acked" event fd, provided one is registered and
+    // nothing is left unsent or queued.
+    void signal_all_data_acked() {
+      if (_snd._all_data_acked_fd < 0)
+        return;
+      if (_snd.unsent_len != 0 || _snd.queued_len != 0)
+        return;
+      manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+    }
+    // Enter SYN_SENT and record the time so that the handshake can later
+    // provide an RTO sample (see do_established()).
+    void do_syn_sent() {
+      _state = SYN_SENT;
+      _snd.syn_tx_time = clock_type::now();
+      // Send <SYN> to remote
+      output();
+    }
+    // Enter SYN_RECEIVED and record the time so that the handshake can later
+    // provide an RTO sample (see do_established()).
+    void do_syn_received() {
+      _state = SYN_RECEIVED;
+      _snd.syn_tx_time = clock_type::now();
+      // Send <SYN,ACK> to remote
+      output();
+    }
+    // Enter ESTABLISHED: feed the SYN send time into the RTO estimate, mark
+    // the connect as done and signal the event fd readable and writable.
+    void do_established() {
+      _state = ESTABLISHED;
+      update_rto(_snd.syn_tx_time);
+      _connect_done = true;
+      manager.notify(fd, EVENT_READABLE|EVENT_WRITABLE);
+    }
+    // Handle a connection reset: go to CLOSED, release resources, report
+    // -ECONNRESET and notify the fds.
+    void do_reset() {
+      _state = CLOSED;
+      // Free packets to be sent which are waiting for user_queue_space
+      _snd.user_queue_space.reset();
+      cleanup();
+      // Assigned after cleanup(), so this is the value left in _errno
+      _errno = -ECONNRESET;
+      manager.notify(fd, EVENT_READABLE);
+
+      // Unlike signal_all_data_acked(), notify regardless of pending data
+      if (_snd._all_data_acked_fd >= 0)
+        manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+    }
+    // Enter TIME_WAIT.  No timer is run for this state yet, so cleanup()
+    // is called immediately.
+    void do_time_wait() {
+      // FIXME: Implement TIME_WAIT state timer
+      _state = TIME_WAIT;
+      cleanup();
+    }
+    // Enter CLOSED and run cleanup().
+    void do_closed() {
+      _state = CLOSED;
+      cleanup();
+    }
+    // Pick the initial send sequence number and derive the send pointers
+    // from it: unacknowledged and recover start at the ISN, next at ISN + 1.
+    void do_setup_isn() {
+      _snd.initial = get_isn();
+      _snd.unacknowledged = _snd.initial;
+      _snd.next = _snd.initial + 1;
+      _snd.recover = _snd.initial;
+    }
+    // Advance both send pointers by one
+    // (presumably the sequence number taken by our FIN - confirm at callers).
+    void do_local_fin_acked() {
+      _snd.unacknowledged += 1;
+      _snd.next += 1;
+    }
+    // True while in SYN_SENT or SYN_RECEIVED.
+    bool syn_needs_on() {
+      return in_state(SYN_SENT | SYN_RECEIVED);
+    }
+    // True when the local side has closed, is in FIN_WAIT_1, CLOSING or
+    // LAST_ACK, and has no unsent or queued data left.
+    bool fin_needs_on() {
+      if (!in_state(FIN_WAIT_1 | CLOSING | LAST_ACK))
+        return false;
+      if (!_snd.closed)
+        return false;
+      return _snd.unsent_len == 0 && _snd.queued_len == 0;
+    }
+    // True in every state except CLOSED, LISTEN and SYN_SENT.
+    bool ack_needs_on() {
+      return !in_state(CLOSED | LISTEN | SYN_SENT);
+    }
+    // True in CLOSING, TIME_WAIT, CLOSE_WAIT, LAST_ACK and CLOSED.
+    bool foreign_will_not_send() {
+      return in_state(CLOSING | TIME_WAIT | CLOSE_WAIT | LAST_ACK | CLOSED);
+    }
+    // True when the current state is one of the states contained in the
+    // given mask (states are single bits, masks are built with operator|).
+    bool in_state(tcp_state state) {
+      const uint16_t current = uint16_t(_state);
+      const uint16_t mask = uint16_t(state);
+      return (current & mask) != 0;
+    }
+    // Leave fast recovery: clear the duplicate-ACK counter and the
+    // limited-transfer and partial-ACK accounting.
+    void exit_fast_recovery() {
+      _snd.dupacks = 0;
+      _snd.limited_transfer = 0;
+      _snd.partial_ack = 0;
+    }
+    uint32_t data_segment_acked(tcp_sequence seg_ack);
+    bool segment_acceptable(tcp_sequence seg_seq, unsigned seg_len);
+    void init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end);
+    friend class connection;
+
+    friend class C_handle_delayed_ack;
+    friend class C_handle_retransmit;
+    friend class C_handle_persist;
+    friend class C_all_data_acked;
+  };
+
+  CephContext *cct;
+  // ipv4_l4<ip_protocol_num::tcp>
+  inet_type& _inet;
+  EventCenter *center;
+  UserspaceEventManager &manager;
+  std::unordered_map<connid, lw_shared_ptr<tcb>, connid_hash> _tcbs;
+  std::unordered_map<uint16_t, listener*> _listening;
+  std::random_device _rd;
+  std::default_random_engine _e;
+  std::uniform_int_distribution<uint16_t> _port_dist{41952, 65535};
+  circular_buffer<std::pair<lw_shared_ptr<tcb>, ethernet_address>> _poll_tcbs;
+  // queue for packets that do not belong to any tcb
+  circular_buffer<ipv4_traits::l4packet> _packetq;
+  Throttle _queue_space;
+  // Limit number of data queued into send queue
+ public:
+  /**
+   * Move-only user handle for one TCP connection.
+   *
+   * Holds a shared reference to the tcb and keeps the tcb's back-pointer
+   * (_conn) aimed at the live handle.  A moved-from handle owns no tcb; it
+   * may only be destroyed, assigned to, or moved from again.
+   */
+  class connection {
+    lw_shared_ptr<tcb> _tcb;
+   public:
+    explicit connection(lw_shared_ptr<tcb> tcbp) : _tcb(std::move(tcbp)) { _tcb->_conn = this; }
+    connection(const connection&) = delete;
+    // Take over the tcb reference of x.  x may itself be a moved-from
+    // handle (null _tcb); in that case there is no back-pointer to update,
+    // so the dereference must be guarded just like in the destructor.
+    connection(connection&& x) noexcept : _tcb(std::move(x._tcb)) {
+      if (_tcb)
+        _tcb->_conn = this;
+    }
+    ~connection();
+    void operator=(const connection&) = delete;
+    // Move assignment: tear down the current connection (destructor closes
+    // both directions) and move-construct from x in place.
+    connection& operator=(connection&& x) {
+      if (this != &x) {
+        this->~connection();
+        new (this) connection(std::move(x));
+      }
+      return *this;
+    }
+    // Event fd of the underlying tcb
+    int fd() const {
+      return _tcb->fd;
+    }
+    // Queue a packet for sending; returns the tcb's result code
+    int send(Packet p) {
+      return _tcb->send(std::move(p));
+    }
+    // Fetch received data, if any
+    Tub<Packet> read() {
+      return _tcb->read();
+    }
+    // positive means no errno, 0 means eof, negative means error
+    int16_t get_errno() const {
+      return _tcb->get_errno();
+    }
+    void close_read();
+    void close_write();
+    // Peer address as an AF_INET entity_addr_t; only the IPv4 address is
+    // filled in here (the port is not set by this function).
+    entity_addr_t remote_addr() const {
+      entity_addr_t addr;
+      auto net_ip = _tcb->_foreign_ip.hton();
+      memcpy((void*)&addr.in4_addr().sin_addr.s_addr,
+             &net_ip, sizeof(addr.in4_addr().sin_addr.s_addr));
+      addr.set_family(AF_INET);
+      return addr;
+    }
+    // Remaining room in the send queue (0 unless ESTABLISHED)
+    uint64_t peek_sent_available() {
+      return _tcb->peek_sent_available();
+    }
+    int is_connected() const { return _tcb->is_connected(); }
+  };
+  /**
+   * A listening TCP endpoint on one local port.
+   *
+   * listen() registers the object in tcp::_listening and obtains an event
+   * fd; connections pushed by tcp::push_listen_queue() wait in _q until
+   * accept() hands them out.  A registered listener has _fd >= 0.
+   */
+  class listener {
+    tcp& _tcp;
+    uint16_t _port;
+    int _fd = -1;
+    int16_t _errno;
+    queue<connection> _q;
+    size_t _q_max_length;
+
+   private:
+    listener(tcp& t, uint16_t port, size_t queue_length)
+        : _tcp(t), _port(port), _errno(0), _q(), _q_max_length(queue_length) {
+    }
+   public:
+    listener(const listener&) = delete;
+    void operator=(const listener&) = delete;
+    // Move constructor: takes over x's queue, limit, fd and registration.
+    // x is left unregistered (_fd == -1) so that its destructor neither
+    // erases the port entry now pointing at *this nor closes the fd that
+    // *this owns.
+    listener(listener&& x)
+        : _tcp(x._tcp), _port(x._port), _fd(x._fd), _errno(x._errno),
+          _q(std::move(x._q)), _q_max_length(x._q_max_length) {
+      x._fd = -1;
+      if (_fd >= 0)
+        _tcp._listening[_port] = this;
+    }
+    ~listener() {
+      abort_accept();
+    }
+    // Register this listener for its port.
+    // Returns 0 on success or -EADDRINUSE if the port already has a listener.
+    int listen() {
+      if (_tcp._listening.find(_port) != _tcp._listening.end())
+        return -EADDRINUSE;
+      _tcp._listening.emplace(_port, this);
+      _fd = _tcp.manager.get_eventfd();
+      return 0;
+    }
+    // Hand out the oldest queued connection; the result is empty when no
+    // connection is waiting.
+    Tub<connection> accept() {
+      Tub<connection> c;
+      if (!_q.empty()) {
+        c = std::move(_q.front());
+        _q.pop();
+      }
+      return c;
+    }
+    // Drop all queued connections and, if registered, unregister the port
+    // and close the event fd.  Safe to call more than once.
+    void abort_accept() {
+      while (!_q.empty())
+        _q.pop();
+      if (_fd >= 0) {
+        _tcp._listening.erase(_port);
+        _tcp.manager.close(_fd);
+        _fd = -1;
+      }
+    }
+    int16_t get_errno() const {
+      return _errno;
+    }
+    // True when the accept queue has reached its configured limit.
+    bool full() const {
+      return _q.size() >= _q_max_length;
+    }
+    int fd() const {
+      return _fd;
+    }
+    friend class tcp;
+  };
+ public:
+  explicit tcp(CephContext *c, inet_type& inet, EventCenter *cen);
+  void received(Packet p, ipaddr from, ipaddr to);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  listener listen(uint16_t port, size_t queue_length = 100);
+  connection connect(const entity_addr_t &addr);
+  // Hardware feature set reported by the underlying network stack
+  const hw_features& get_hw_features() const { return _inet._inet.get_hw_features(); }
+  // Queue a tcb, together with the resolved L2 destination address, for the
+  // packet provider registered in the constructor to poll.
+  void poll_tcb(const ethernet_address &dst, lw_shared_ptr<tcb> tcb) {
+    _poll_tcbs.emplace_back(std::move(tcb), dst);
+  }
+  // Hand a tcb to the listener on the given port as a new connection and
+  // mark the listener's fd readable.  Returns false when nobody listens on
+  // the port or the accept queue is full.
+  bool push_listen_queue(uint16_t port, tcb *t) {
+    auto entry = _listening.find(port);
+    if (entry == _listening.end())
+      return false;
+    auto *target = entry->second;
+    if (target->full())
+      return false;
+    target->_q.push(connection(t->shared_from_this()));
+    manager.notify(target->_fd, EVENT_READABLE);
+    return true;
+  }
+
+ private:
+  void send_packet_without_tcb(ipaddr from, ipaddr to, Packet p);
+  void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip);
+  friend class listener;
+};
+
+/**
+ * Construct the TCP layer on top of the given L4 interface and register the
+ * packet provider the lower layer pulls outgoing packets from.
+ *
+ * The provider serves two sources: _packetq (packets that belong to no tcb)
+ * and the tcbs queued in _poll_tcbs.  _packetq is served when no tcb is
+ * waiting, or whenever the running count of polled tcbs is a multiple of 128.
+ */
+template <typename InetTraits>
+tcp<InetTraits>::tcp(CephContext *c, inet_type& inet, EventCenter *cen)
+    : cct(c), _inet(inet), center(cen),
+      manager(static_cast<DPDKDriver*>(cen->get_driver())->manager),
+      _e(_rd()), _queue_space(cct, "DPDK::tcp::queue_space", 81920) {
+  // Unsigned on purpose: the counter is incremented without bound and must
+  // wrap around instead of running into signed overflow.  2^32 is a
+  // multiple of 128, so the "% 128" schedule survives the wrap.
+  unsigned tcb_polled = 0u;
+  _inet.register_packet_provider([this, tcb_polled] () mutable {
+    Tub<typename InetTraits::l4packet> l4p;
+    auto c = _poll_tcbs.size();
+    if (!_packetq.empty() && (!(tcb_polled % 128) || c == 0)) {
+      l4p = std::move(_packetq.front());
+      _packetq.pop_front();
+      // Return the budget taken in send_packet_without_tcb()
+      _queue_space.put(l4p->p.len());
+    } else {
+      // Poll at most the tcbs that were queued on entry; stop at the first
+      // one that yields a packet.
+      while (c--) {
+        tcb_polled++;
+        lw_shared_ptr<tcb> tcb;
+        ethernet_address dst;
+        std::tie(tcb, dst) = std::move(_poll_tcbs.front());
+        _poll_tcbs.pop_front();
+        l4p = tcb->get_packet();
+        if (l4p) {
+          l4p->e_dst = dst;
+          break;
+        }
+      }
+    }
+    return l4p;
+  });
+}
+
+// Create a listener object for the given port.  Nothing is registered
+// here; registration happens in listener::listen().
+template <typename InetTraits>
+auto tcp<InetTraits>::listen(uint16_t port, size_t queue_length) -> listener {
+  return listener(*this, port, queue_length);
+}
+
+/**
+ * Open an active connection to addr.
+ *
+ * Random local ports are tried until one yields a connection id that is not
+ * in use and, when the NIC has more than one hardware queue, hashes to this
+ * event center.  A tcb is then created, registered and told to connect.
+ */
+template <typename InetTraits>
+typename tcp<InetTraits>::connection tcp<InetTraits>::connect(const entity_addr_t &addr) {
+  auto local_ip = _inet._inet.host_address();
+  auto remote_ip = ipv4_address(addr);
+  const uint16_t remote_port = (uint16_t)addr.get_port();
+  connid id;
+
+  for (;;) {
+    uint16_t local_port = _port_dist(_e);
+    id = connid{local_ip, remote_ip, local_port, remote_port};
+    if (_tcbs.find(id) != _tcbs.end())
+      continue;  // connection id already taken, draw another port
+    if (_inet._inet.netif()->hw_queues_count() == 1)
+      break;
+    if (_inet._inet.netif()->hash2cpu(
+            id.hash(_inet._inet.netif()->rss_key())) == center->get_id())
+      break;
+  }
+
+  auto new_tcb = make_lw_shared<tcb>(*this, id);
+  _tcbs.insert({id, new_tcb});
+  new_tcb->connect();
+  return connection(new_tcb);
+}
+
+// Contribute the TCP ports of packet p (header at offset off) to the
+// forwarding hash input.  Always returns true, even when the header cannot
+// be obtained.
+template <typename InetTraits>
+bool tcp<InetTraits>::forward(forward_hash& out_hash_data, Packet& p, size_t off) {
+  if (auto hdr = p.get_header<tcp_hdr>(off)) {
+    out_hash_data.push_back(hdr->src_port);
+    out_hash_data.push_back(hdr->dst_port);
+  }
+  return true;
+}
+
+/**
+ * Entry point for an incoming TCP segment.
+ *
+ * Validates the header (presence, data offset, checksum unless the hardware
+ * already verified it), then dispatches on the connection id: segments
+ * without a tcb are handled as CLOSED/LISTEN, the others are passed to the
+ * matching tcb.  Invalid segments are dropped silently.
+ *
+ * @param p    the segment, starting at the TCP header
+ * @param from source IP address of the segment
+ * @param to   destination (local) IP address of the segment
+ */
+template <typename InetTraits>
+void tcp<InetTraits>::received(Packet p, ipaddr from, ipaddr to) {
+  auto th = p.get_header<tcp_hdr>(0);
+  if (!th) {
+    return;
+  }
+  // th->data_offset is correct even before ntoh()
+  // Drop segments whose declared header length is shorter than the fixed header
+  if (unsigned(th->data_offset * 4) < sizeof(*th)) {
+    return;
+  }
+
+  // Verify the checksum in software when the NIC did not do it
+  if (!get_hw_features().rx_csum_offload) {
+    checksummer csum;
+    InetTraits::tcp_pseudo_header_checksum(csum, from, to, p.len());
+    csum.sum(p);
+    if (csum.get() != 0) {
+      return;
+    }
+  }
+  // Host byte order copy of the header; local/foreign are seen from our side
+  auto h = th->ntoh();
+  auto id = connid{to, from, h.dst_port, h.src_port};
+  auto tcbi = _tcbs.find(id);
+  lw_shared_ptr<tcb> tcbp;
+  if (tcbi == _tcbs.end()) {
+    auto listener = _listening.find(id.local_port);
+    // A listener with a full accept queue is treated like no listener
+    if (listener == _listening.end() || listener->second->full()) {
+      // 1) In CLOSE state
+      // 1.1 all data in the incoming segment is discarded.  An incoming
+      // segment containing a RST is discarded. An incoming segment not
+      // containing a RST causes a RST to be sent in response.
+      // FIXME:
+      //      if ACK off: <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>
+      //      if ACK on:  <SEQ=SEG.ACK><CTL=RST>
+      return respond_with_reset(&h, id.local_ip, id.foreign_ip);
+    } else {
+      // 2) In LISTEN state
+      // 2.1 first check for an RST
+      if (h.f_rst) {
+        // An incoming RST should be ignored
+        return;
+      }
+      // 2.2 second check for an ACK
+      if (h.f_ack) {
+        // Any acknowledgment is bad if it arrives on a connection
+        // still in the LISTEN state.
+        // <SEQ=SEG.ACK><CTL=RST>
+        return respond_with_reset(&h, id.local_ip, id.foreign_ip);
+      }
+      // 2.3 third check for a SYN
+      if (h.f_syn) {
+        // check the security
+        // NOTE: Ignored for now
+        // Create and register the tcb for the new passive connection
+        tcbp = make_lw_shared<tcb>(*this, id);
+        _tcbs.insert({id, tcbp});
+        return tcbp->input_handle_listen_state(&h, std::move(p));
+      }
+      // 2.4 fourth other text or control
+      // So you are unlikely to get here, but if you do, drop the
+      // segment, and return.
+      return;
+    }
+  } else {
+    tcbp = tcbi->second;
+    if (tcbp->state() == tcp_state::SYN_SENT) {
+      // 3) In SYN_SENT State
+      return tcbp->input_handle_syn_sent_state(&h, std::move(p));
+    } else {
+      // 4) In other state, can be one of the following:
+      // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+      // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+      return tcbp->input_handle_other_state(&h, std::move(p));
+    }
+  }
+}
+
+/**
+ * Send a packet that does not belong to any tcb (e.g. a reset).
+ *
+ * The packet is charged against _queue_space and dropped if it does not
+ * fit.  After the L2 address of `to` has been resolved it is appended to
+ * _packetq; the packet provider returns the budget when it dequeues the
+ * packet.  If resolution fails the packet is dropped and the budget is
+ * returned here, otherwise every failure would shrink the queue for good.
+ */
+template <typename InetTraits>
+void tcp<InetTraits>::send_packet_without_tcb(ipaddr from, ipaddr to, Packet p) {
+  // Remember the charged amount: p is moved away below
+  const auto charged = p.len();
+  if (!_queue_space.get_or_fail(charged)) {
+    // drop packets that do not fit the queue
+    return;
+  }
+  _inet.wait_l2_dst_address(to, std::move(p), [this, to, charged] (const ethernet_address &e_dst, Packet p, int r) mutable {
+    if (r == 0) {
+      _packetq.emplace_back(ipv4_traits::l4packet{to, std::move(p), e_dst, ip_protocol_num::tcp});
+    } else {
+      // Packet is dropped: give the queue budget back
+      _queue_space.put(charged);
+    }
+  });
+}
+
+// Detach from the tcb and close both directions.  A handle without a tcb
+// (moved-from) has nothing to do.
+template <typename InetTraits>
+tcp<InetTraits>::connection::~connection() {
+  if (!_tcb) {
+    return;
+  }
+  _tcb->_conn = nullptr;
+  close_read();
+  close_write();
+}
+
+// Construct the control block for connection `id`: copies the addresses and
+// ports, obtains an event fd from the manager and allocates the four event
+// callbacks (released again in the destructor).
+template <typename InetTraits>
+tcp<InetTraits>::tcb::tcb(tcp& t, connid id)
+    : _tcp(t), manager(t.manager), _local_ip(id.local_ip) , _foreign_ip(id.foreign_ip),
+      _local_port(id.local_port), _foreign_port(id.foreign_port),
+      _snd(_tcp.cct),
+      center(t.center),
+      fd(t.manager.get_eventfd()),
+      delayed_ack_event(new tcp<InetTraits>::C_handle_delayed_ack(this)),
+      retransmit_event(new tcp<InetTraits>::C_handle_retransmit(this)),
+      persist_event(new tcp<InetTraits>::C_handle_persist(this)),
+      all_data_ack_event(new tcp<InetTraits>::C_all_data_acked(this)) {}
+
+// Tear down the control block: cancel any time event that is still armed
+// (the callbacks hold a raw pointer to this tcb), free the callback objects
+// and close the event fd.
+template <typename InetTraits>
+tcp<InetTraits>::tcb::~tcb()
+{
+  if (_delayed_ack_fd)
+    center->delete_time_event(*_delayed_ack_fd);
+  if (retransmit_fd)
+    center->delete_time_event(*retransmit_fd);
+  if (persist_fd)
+    center->delete_time_event(*persist_fd);
+  delete delayed_ack_event;
+  delete retransmit_event;
+  delete persist_event;
+  delete all_data_ack_event;
+  manager.close(fd);
+  fd = -1;
+}
+
+// Answer the received header rth with a reset, using this connection's
+// local and foreign IP addresses.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::respond_with_reset(tcp_hdr* rth)
+{
+  _tcp.respond_with_reset(rth, _local_ip, _foreign_ip);
+}
+
+/**
+ * Account for a cumulative acknowledgment up to seg_ack.
+ *
+ * Segments at the front of the unacked queue that are covered completely
+ * are removed; a partially covered front segment is trimmed.  The send
+ * pointer, RTO estimate, congestion window and user queue space are
+ * updated along the way.
+ *
+ * @return total number of bytes newly acknowledged
+ */
+template <typename InetTraits>
+uint32_t tcp<InetTraits>::tcb::data_segment_acked(tcp_sequence seg_ack) {
+  uint32_t total_acked_bytes = 0;
+  // Full ACK of segment
+  while (!_snd.data.empty()
+         && (_snd.unacknowledged + _snd.data.front().p.len() <= seg_ack)) {
+    auto acked_bytes = _snd.data.front().p.len();
+    _snd.unacknowledged += acked_bytes;
+    // Ignore retransmitted segments when setting the RTO
+    if (_snd.data.front().nr_transmits == 0) {
+      update_rto(_snd.data.front().tx_time);
+    }
+    update_cwnd(acked_bytes);
+    total_acked_bytes += acked_bytes;
+    // Release the send-queue budget of this segment and tell the user that
+    // the connection is writable again
+    _snd.user_queue_space.put(_snd.data.front().data_len);
+    manager.notify(fd, EVENT_WRITABLE);
+    _snd.data.pop_front();
+  }
+  // Partial ACK of segment
+  if (_snd.unacknowledged < seg_ack) {
+    auto acked_bytes = seg_ack - _snd.unacknowledged;
+    // Drop the acknowledged prefix from the segment kept for retransmission
+    if (!_snd.data.empty()) {
+      auto& unacked_seg = _snd.data.front();
+      unacked_seg.p.trim_front(acked_bytes);
+    }
+    _snd.unacknowledged = seg_ack;
+    update_cwnd(acked_bytes);
+    total_acked_bytes += acked_bytes;
+  }
+  return total_acked_bytes;
+}
+
+// Receive-window acceptability test for an incoming segment, following the
+// four SEG.LEN / RCV.WND cases spelled out in the comments below.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::segment_acceptable(tcp_sequence seg_seq, unsigned seg_len) {
+  // RCV.NXT =< seq < RCV.NXT+RCV.WND
+  auto in_window = [this](const auto& seq) {
+    return (_rcv.next <= seq) && (seq < _rcv.next + _rcv.window);
+  };
+  const bool empty_segment = (seg_len == 0);
+  const bool window_closed = (_rcv.window == 0);
+  const bool window_open = (_rcv.window > 0);
+
+  if (empty_segment && window_closed) {
+    // SEG.SEQ = RCV.NXT
+    return seg_seq == _rcv.next;
+  }
+  if (empty_segment && window_open) {
+    return in_window(seg_seq);
+  }
+  if (seg_len > 0 && window_open) {
+    // Acceptable when either the first or the last byte falls in the window.
+    return in_window(seg_seq) || in_window(seg_seq + seg_len - 1);
+  }
+  // SEG.LEN > 0 RCV.WND = 0, not acceptable
+  return false;
+}
+
+// Initialise the send/receive parameters of this connection from the TCP
+// options found in [opt_start, opt_end) and from the header th.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end) {
+  _option.parse(opt_start, opt_end);
+
+  // Window scale factors: the remote one applies to what we send, the local
+  // one to what we receive.
+  _snd.window_scale = _option._remote_win_scale;
+  _rcv.window_scale = _option._local_win_scale;
+
+  // Maximum segment sizes: what the remote can receive / what we can receive.
+  _snd.mss = _option._remote_mss;
+  _rcv.mss = _option._local_mss = local_mss();
+
+  // Receive window starts from Linux's default window size.
+  _rcv.window = 29200 << _rcv.window_scale;
+  _snd.window = th->window << _snd.window_scale;
+
+  // Sequence / acknowledgment numbers used for the last window update.
+  _snd.wl1 = th->seq;
+  _snd.wl2 = th->ack;
+
+  // Initial congestion window: a larger MSS gets fewer initial segments.
+  int initial_segments;
+  if (2190 < _snd.mss) {
+    initial_segments = 2;
+  } else if (1095 < _snd.mss) {
+    initial_segments = 3;
+  } else {
+    initial_segments = 4;
+  }
+  _snd.cwnd = initial_segments * _snd.mss;
+
+  // Initial slow start threshold is the advertised (scaled) window.
+  _snd.ssthresh = th->window << _snd.window_scale;
+}
+
+// Take the next chunk of unsent user data off _snd.unsent and return it as
+// one Packet. The chunk is limited to min(can_send(), len), where len is the
+// largest TCP payload that may be handed to the NIC. Returns an empty Packet
+// when nothing is queued. _snd.unsent_len is reduced by the returned length.
+template <typename InetTraits>
+Packet tcp<InetTraits>::tcb::get_transmit_packet() {
+  // easy case: empty queue
+  if (_snd.unsent.empty()) {
+    return Packet();
+  }
+  auto can_send = this->can_send();
+  // Max number of TCP payloads we can pass to NIC
+  uint32_t len;
+  if (_tcp.get_hw_features().tx_tso) {
+    // FIXME: Inform the tap device of the size of the split packet
+    len = _tcp.get_hw_features().max_packet_len - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+  } else {
+    len = std::min(uint16_t(_tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min), _snd.mss);
+  }
+  can_send = std::min(can_send, len);
+  // easy case: the packet at the head of the queue fits as a whole
+  if (_snd.unsent.front().len() <= can_send) {
+    auto p = std::move(_snd.unsent.front());
+    _snd.unsent.pop_front();
+    _snd.unsent_len -= p.len();
+    return p;
+  }
+  // moderate case: need to split one packet; the first can_send bytes are
+  // shared into the result and trimmed off the queued packet
+  if (_snd.unsent.front().len() > can_send) {
+    auto p = _snd.unsent.front().share(0, can_send);
+    _snd.unsent.front().trim_front(can_send);
+    _snd.unsent_len -= p.len();
+    return p;
+  }
+  // NOTE: the two conditions above (<= and >) are exhaustive, so everything
+  // below this point is currently unreachable.
+  // hard case: merge some packets, possibly split last
+  auto p = std::move(_snd.unsent.front());
+  _snd.unsent.pop_front();
+  can_send -= p.len();
+  while (!_snd.unsent.empty()
+         && _snd.unsent.front().len() <= can_send) {
+    can_send -= _snd.unsent.front().len();
+    p.append(std::move(_snd.unsent.front()));
+    _snd.unsent.pop_front();
+  }
+  // FIXME: this would invoke the packet's "deleter", which frees managed
+  // objects that are still used later
+  // if (!_snd.unsent.empty() && can_send) {
+  //   auto& q = _snd.unsent.front();
+  //   p.append(q.share(0, can_send));
+  //   q.trim_front(can_send);
+  // }
+  _snd.unsent_len -= p.len();
+  return p;
+}
+
+// Build one outgoing TCP segment (header, options, checksum, offload info)
+// and hand it to queue_packet(). Does nothing when the connection is CLOSED.
+//
+// data_retransmit: when true, a share of the oldest unacknowledged segment
+// (_snd.data.front()) is re-sent with sequence number _snd.unacknowledged;
+// otherwise the payload comes from get_transmit_packet() and _snd.next is
+// advanced by its length.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::output_one(bool data_retransmit) {
+  if (in_state(CLOSED)) {
+    return;
+  }
+
+  Packet p = data_retransmit ? _snd.data.front().p.share() : get_transmit_packet();
+  Packet clone = p.share();  // early clone to prevent share() from calling packet::unuse_internal_data() on header.
+  // NOTE(review): len is 16 bits wide; confirm p.len() cannot exceed 65535
+  // on the TSO path.
+  uint16_t len = p.len();
+  bool syn_on = syn_needs_on();
+  bool ack_on = ack_needs_on();
+
+  auto options_size = _option.get_size(syn_on, ack_on);
+  auto th = p.prepend_header<tcp_hdr>(options_size);
+
+  th->src_port = _local_port;
+  th->dst_port = _foreign_port;
+
+  th->f_syn = syn_on;
+  th->f_ack = ack_on;
+  if (ack_on) {
+    // This segment carries the ACK, so a pending delayed ACK is not needed.
+    clear_delayed_ack();
+  }
+  th->f_urg = false;
+  th->f_psh = false;
+
+  tcp_sequence seq;
+  if (data_retransmit) {
+    seq = _snd.unacknowledged;
+  } else {
+    // A SYN uses the initial sequence number, anything else the next one.
+    seq = syn_on ? _snd.initial : _snd.next;
+    _snd.next += len;
+  }
+  th->seq = seq;
+  th->ack = _rcv.next;
+  // data_offset is expressed in 32-bit words
+  th->data_offset = (sizeof(*th) + options_size) / 4;
+  th->window = _rcv.window >> _rcv.window_scale;
+  th->checksum = 0;
+
+  // FIXME: does the FIN have to fit in the window?
+  bool fin_on = fin_needs_on();
+  th->f_fin = fin_on;
+
+  // Add tcp options
+  _option.fill(th, options_size);
+  *th = th->hton();
+
+  offload_info oi;
+  checksummer csum;
+  uint16_t pseudo_hdr_seg_len = 0;
+
+  oi.tcp_hdr_len = sizeof(tcp_hdr) + options_size;
+
+  if (_tcp.get_hw_features().tx_csum_l4_offload) {
+    oi.needs_csum = true;
+
+    //
+    // tx checksum offloading: both virtio-net's VIRTIO_NET_F_CSUM dpdk's
+    // PKT_TX_TCP_CKSUM - requires th->checksum to be initialized to ones'
+    // complement sum of the pseudo header.
+    //
+    // For TSO the csum should be calculated for a pseudo header with
+    // segment length set to 0. All the rest is the same as for a TCP Tx
+    // CSUM offload case.
+    //
+    if (_tcp.get_hw_features().tx_tso && len > _snd.mss) {
+      oi.tso_seg_size = _snd.mss;
+    } else {
+      pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
+    }
+  } else {
+    pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
+    oi.needs_csum = false;
+  }
+
+  InetTraits::tcp_pseudo_header_checksum(csum, _local_ip, _foreign_ip,
+                                         pseudo_hdr_seg_len);
+
+  if (_tcp.get_hw_features().tx_csum_l4_offload) {
+    // Hardware finishes the checksum; seed it with the pseudo-header sum.
+    th->checksum = ~csum.get();
+  } else {
+    // Software path: sum the whole packet on top of the pseudo header.
+    csum.sum(p);
+    th->checksum = csum.get();
+  }
+
+  oi.protocol = ip_protocol_num::tcp;
+
+  p.set_offload_info(oi);
+
+  if (!data_retransmit && (len || syn_on || fin_on)) {
+    auto now = clock_type::now();
+    if (len) {
+      // Keep the header-less clone on the unacknowledged list.
+      unsigned nr_transmits = 0;
+      _snd.data.emplace_back(unacked_segment{std::move(clone),
+                                             len, nr_transmits, now});
+    }
+    if (!retransmit_fd) {
+      start_retransmit_timer();
+    }
+  }
+
+  queue_packet(std::move(p));
+}
+
+// True when nothing is left on the send side: no unacknowledged segments, no
+// unsent bytes and no queued bytes.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::is_all_data_acked() {
+  return _snd.data.empty() && _snd.unsent_len == 0 && _snd.queued_len == 0;
+}
+
+// Drain the receive queue. Returns an empty Tub when no data is pending,
+// otherwise a single Packet made of all queued packets appended in order.
+template <typename InetTraits>
+Tub<Packet> tcp<InetTraits>::tcb::read() {
+  Tub<Packet> p;
+  if (!_rcv.data.empty()) {
+    p.construct();
+    for (auto&& chunk : _rcv.data) {
+      p->append(std::move(chunk));
+    }
+    _rcv.data.clear();
+  }
+  return p;
+}
+
+// Queue p for transmission and start output if the connection is currently
+// allowed to send. Returns the number of bytes queued, or -ECONNRESET when
+// the connection is in CLOSED state. Aborts if the user queue has no room.
+template <typename InetTraits>
+int tcp<InetTraits>::tcb::send(Packet p) {
+  // Sending after the connection was closed for writing is not allowed.
+  ceph_assert(!_snd.closed);
+
+  if (in_state(CLOSED)) {
+    return -ECONNRESET;
+  }
+
+  const auto len = p.len();
+  // note: caller must ensure enough queue space to send
+  if (!_snd.user_queue_space.get_or_fail(len)) {
+    ceph_abort();
+  }
+  // TODO: Handle p.len() > max user_queue_space case
+  // queued_len is raised and lowered by the same amount here, so only
+  // unsent_len changes overall.
+  _snd.queued_len += len;
+  _snd.unsent_len += len;
+  _snd.queued_len -= len;
+  _snd.unsent.push_back(std::move(p));
+
+  if (can_send() > 0) {
+    output();
+  }
+  return len;
+}
+
+// Close the connection for writing. A no-op when already CLOSED or closed.
+// If all data has been acknowledged the final cleanup runs right away;
+// otherwise an eventfd is registered so all_data_ack_event can run later.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::close() {
+  if (in_state(CLOSED) || _snd.closed) {
+    return;
+  }
+  // TODO: We should make this asynchronous
+
+  _errno = -EPIPE;
+  center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);
+
+  if (is_all_data_acked()) {
+    close_final_cleanup();
+    return;
+  }
+
+  // Data is still outstanding: wait for it on a dedicated descriptor.
+  _snd._all_data_acked_fd = manager.get_eventfd();
+  center->create_file_event(_snd._all_data_acked_fd, EVENT_READABLE, all_data_ack_event);
+}
+
+// Decide whether a received segment of seg_len bytes must be acknowledged
+// immediately (returns true) or can wait for the delayed-ACK timer (false).
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::should_send_ack(uint16_t seg_len) {
+  // Larger than our MSS: we've received a TSO packet, ack immediately.
+  if (seg_len > _rcv.mss) {
+    _nr_full_seg_received = 0;
+    clear_delayed_ack();
+    return true;
+  }
+
+  // Full sized segment: ack every second one. The counter is only advanced
+  // for full sized segments.
+  if (seg_len == _rcv.mss && _nr_full_seg_received++ >= 1) {
+    _nr_full_seg_received = 0;
+    clear_delayed_ack();
+    return true;
+  }
+
+  // Otherwise rely on the delayed-ACK timer, arming it unless it is already
+  // pending. The maximum delayed ack timer allowed by RFC1122 is 500ms, most
+  // implementations use 200ms.
+  if (!_delayed_ack_fd) {
+    _delayed_ack_fd.construct(center->create_time_event(200*1000, delayed_ack_event));
+  }
+  return false;
+}
+
+// Cancel the delayed-ACK timer if one is armed and drop its handle.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::clear_delayed_ack() {
+  if (!_delayed_ack_fd) {
+    return;
+  }
+  center->delete_time_event(*_delayed_ack_fd);
+  _delayed_ack_fd.destroy();
+}
+
+// Move segments from the out-of-order map into the in-order receive queue
+// once _rcv.next has reached them. Returns true if at least one segment was
+// merged.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::merge_out_of_order() {
+  bool merged = false;
+  auto& pending = _rcv.out_of_order.map;
+  auto it = pending.begin();
+  while (it != pending.end()) {
+    auto& p = it->second;
+    auto seg_beg = it->first;
+    auto seg_len = p.len();
+    auto seg_end = seg_beg + seg_len;
+    if (seg_beg <= _rcv.next && seg_end > _rcv.next) {
+      // The data preceding this segment has arrived: drop whatever part of
+      // it we already have and append the remainder to the receive queue.
+      auto trim = _rcv.next - seg_beg;
+      if (trim) {
+        p.trim_front(trim);
+        seg_len -= trim;
+      }
+      _rcv.next += seg_len;
+      _rcv.data.push_back(std::move(p));
+      // Since c++11, erase() always returns the iterator to the next element
+      it = pending.erase(it);
+      merged = true;
+    } else if (_rcv.next >= seg_end) {
+      // Everything in this segment was received already, drop it
+      it = pending.erase(it);
+    } else {
+      // seg_beg > _rcv.next, can not merge. seg_beg only grows along the
+      // map, so there is no point in looking further.
+      break;
+    }
+  }
+  return merged;
+}
+
+// Store packet p, which starts at sequence number seg, in the out-of-order
+// receive store by merging it into _rcv.out_of_order.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::insert_out_of_order(tcp_sequence seg, Packet p) {
+  _rcv.out_of_order.merge(seg, std::move(p));
+}
+
+// Not implemented: calling this terminates the process via abort().
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::trim_receive_data_after_window() {
+  abort();
+}
+
+// Retransmit the oldest unacknowledged segment right away. Does nothing when
+// no segment is waiting for an acknowledgment.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::fast_retransmit() {
+  if (_snd.data.empty()) {
+    return;
+  }
+  // Count the retransmission on the segment at the head of the queue.
+  _snd.data.front().nr_transmits++;
+  retransmit_one();
+  output();
+}
+
+// Update the retransmission timeout according to RFC6298, taking the time
+// elapsed since tx_time as the new round-trip sample R.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_rto(clock_type::time_point tx_time) {
+  const auto R = std::chrono::duration_cast<std::chrono::microseconds>(clock_type::now() - tx_time);
+  if (!_snd.first_rto_sample) {
+    // Subsequent samples, with alpha = 1/8 and beta = 1/4:
+    //   RTTVAR <- (1 - beta) * RTTVAR + beta * |SRTT - R'|
+    //   SRTT <- (1 - alpha) * SRTT + alpha * R'
+    auto delta = _snd.srtt > R ? (_snd.srtt - R) : (R - _snd.srtt);
+    _snd.rttvar = _snd.rttvar * 3 / 4 + delta / 4;
+    _snd.srtt = _snd.srtt * 7 / 8 +  R / 8;
+  } else {
+    // First sample:
+    //   RTTVAR <- R/2
+    //   SRTT <- R
+    _snd.first_rto_sample = false;
+    _snd.rttvar = R / 2;
+    _snd.srtt = R;
+  }
+  // RTO <- SRTT + max(G, K * RTTVAR)
+  _rto =  _snd.srtt + std::max(_rto_clk_granularity, 4 * _snd.rttvar);
+
+  // Keep the result within [_rto_min, _rto_max]
+  _rto = std::min(std::max(_rto, _rto_min), _rto_max);
+}
+
+// Grow the congestion window after acked_bytes were acknowledged.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_cwnd(uint32_t acked_bytes) {
+  const uint32_t smss = _snd.mss;
+  uint32_t increment;
+  if (_snd.cwnd < _snd.ssthresh) {
+    // Slow start: grow by the acknowledged bytes, at most one SMSS.
+    increment = std::min(acked_bytes, smss);
+  } else {
+    // Congestion avoidance: grow by SMSS*SMSS/cwnd, but by at least 1.
+    const uint32_t round_up = 1;
+    increment = std::max(round_up, smss * smss / _snd.cwnd);
+  }
+  _snd.cwnd += increment;
+}
+
+
+// Tear down the connection state: signal readability on fd, mark the send
+// side closed, drop all queued send and receive data, stop the retransmit
+// and delayed-ACK timers, schedule a C_actual_remove_tcb event and remove
+// this tcb from the tcb table.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::cleanup() {
+  manager.notify(fd, EVENT_READABLE);
+  _snd.closed = true;
+  _snd.unsent.clear();
+  _snd.data.clear();
+  _rcv.out_of_order.map.clear();
+  _rcv.data.clear();
+  stop_retransmit_timer();
+  clear_delayed_ack();
+  // The event is dispatched through the event center instead of acting on
+  // this object inline.
+  center->dispatch_event_external(new tcp<InetTraits>::C_actual_remove_tcb(this));
+  remove_from_tcbs();
+}
+
+// Generate the initial sequence number for this connection.
+//
+// Per RFC6528, TCP SHOULD generate its Initial Sequence Numbers
+// with the expression:
+//   ISN = M + F(localip, localport, remoteip, remoteport, secretkey)
+//   M is the 4 microsecond timer
+// F is the first 32 bits of the MD5 digest over the connection identifiers
+// followed by the secret key.
+template <typename InetTraits>
+tcp_sequence tcp<InetTraits>::tcb::get_isn() {
+  using namespace std::chrono;
+  // Connection identifiers. These must be part of the digest input;
+  // hashing the secret alone would make F identical for every connection.
+  uint32_t conn_id[3];
+  conn_id[0] = _local_ip.ip;
+  conn_id[1] = _foreign_ip.ip;
+  // Widen before shifting so a port >= 0x8000 does not overflow a signed int.
+  conn_id[2] = (uint32_t(_local_port) << 16) + _foreign_port;
+
+  // An MD5 digest is 16 bytes long, i.e. exactly four 32-bit words.
+  uint32_t hash[4];
+  ceph::crypto::MD5 md5;
+  md5.Update((const unsigned char*)conn_id, sizeof(conn_id));
+  md5.Update((const unsigned char*)_isn_secret.key, sizeof(_isn_secret.key));
+  md5.Final((unsigned char*)hash);
+
+  auto seq = hash[0];
+  auto m = duration_cast<microseconds>(clock_type::now().time_since_epoch());
+  seq += m.count() / 4;
+  return make_seq(seq);
+}
+
+// Hand the next queued L4 packet to the caller, producing one first if the
+// queue is empty. Returns an empty Tub when the connection is CLOSED.
+template <typename InetTraits>
+Tub<typename InetTraits::l4packet> tcp<InetTraits>::tcb::get_packet() {
+  _poll_active = false;
+  if (_packetq.empty()) {
+    output_one();
+  }
+
+  Tub<typename InetTraits::l4packet> next;
+  if (in_state(CLOSED)) {
+    return next;
+  }
+
+  ceph_assert(!_packetq.empty());
+
+  next = std::move(_packetq.front());
+  _packetq.pop_front();
+
+  // Keep sending while packets remain queued or the tcb may send more.
+  // dupacks >= 3 is an indication that a segment was lost, so no further
+  // output is requested on behalf of can_send() in that case.
+  if (!_packetq.empty() || (_snd.dupacks < 3 && can_send() > 0)) {
+    output();
+  }
+  return next;
+}
+
+// Closing the read side is currently a no-op; the notification below is
+// left disabled.
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_read() {
+  // do nothing
+  // _tcb->manager.notify(_tcb->fd, EVENT_READABLE);
+}
+
+// Closing the write side delegates to tcb::close() on the underlying tcb.
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_write() {
+  _tcb->close();
+}
+
+// Out-of-class definitions of tcb's static data members. The constexpr ones
+// need a namespace-scope definition when they are odr-used before C++17
+// (std::max/std::min in update_rto() bind them to references).
+template <typename InetTraits>
+constexpr uint16_t tcp<InetTraits>::tcb::_max_nr_retransmit;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_min;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_max;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_clk_granularity;
+
+// Secret key that get_isn() feeds into the initial sequence number hash.
+template <typename InetTraits>
+typename tcp<InetTraits>::tcb::isn_secret tcp<InetTraits>::tcb::_isn_secret;
+
+
+#endif /* TCP_HH_ */
diff --git a/src/msg/async/dpdk/UserspaceEvent.cc b/src/msg/async/dpdk/UserspaceEvent.cc
new file mode 100644
index 00000000..282dcef1
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "UserspaceEvent.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+// Allocate a userspace descriptor: reuse a previously released number when
+// one is available, otherwise extend the table by one. Returns the new fd.
+int UserspaceEventManager::get_eventfd()
+{
+  int fd;
+  if (unused_fds.empty()) {
+    // Nothing to recycle: take the next number and grow the table to fit.
+    fd = ++max_fd;
+    fds.resize(fd + 1);
+  } else {
+    fd = unused_fds.front();
+    unused_fds.pop_front();
+  }
+
+  // The chosen slot must be free before its state is constructed.
+  Tub<UserspaceFDImpl> &slot = fds[fd];
+  ceph_assert(!slot);
+  slot.construct();
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  return fd;
+}
+
+// Raise the event bits in mask on fd. If the fd is listening for any of them
+// and is not queued yet, it is appended to waiting_fds so that poll() will
+// report it. Returns 0 on success, -ENOENT if fd is not an open descriptor.
+int UserspaceEventManager::notify(int fd, int mask)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << mask << dendl;
+  if ((size_t)fd >= fds.size())
+    return -ENOENT;
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  if (!impl)
+    return -ENOENT;
+
+  ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << dendl;
+
+  impl->activating_mask |= mask;
+  // Already queued for poll(): nothing more to do.
+  if (impl->waiting_idx)
+    return 0;
+
+  if (impl->listening_mask & mask) {
+    // The entry is stored at index max_wait_idx + 1, so that index (not just
+    // max_wait_idx) has to be inside the vector before it is written.
+    if (waiting_fds.size() <= max_wait_idx + 1)
+      waiting_fds.resize(waiting_fds.size()*2);
+    impl->waiting_idx = ++max_wait_idx;
+    waiting_fds[max_wait_idx] = fd;
+  }
+
+  ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << " done " << dendl;
+  return 0;
+}
+
+// Release fd: make its number available again, blank its waiting_fds entry
+// and destroy its per-descriptor state. Unknown or already closed
+// descriptors are ignored.
+void UserspaceEventManager::close(int fd)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  if ((size_t)fd >= fds.size()) {
+    return;
+  }
+
+  Tub<UserspaceFDImpl> &slot = fds[fd];
+  if (!slot) {
+    return;
+  }
+
+  // The highest number simply shrinks the range; any other one is kept for
+  // reuse by get_eventfd().
+  if (fd != max_fd) {
+    unused_fds.push_back(fd);
+  } else {
+    --max_fd;
+  }
+
+  if (slot->activating_mask) {
+    const bool is_last_waiter = (waiting_fds[max_wait_idx] == fd);
+    if (is_last_waiter) {
+      ceph_assert(slot->waiting_idx == max_wait_idx);
+      --max_wait_idx;
+    }
+    // -1 marks the entry as one that poll() must skip.
+    waiting_fds[slot->waiting_idx] = -1;
+  }
+  slot.destroy();
+}
+
+// Collect up to num_events ready descriptors from waiting_fds.
+//
+// For each reported descriptor, events[k] receives the fd and masks[k] the
+// event bits that are both listened for and raised; those bits are cleared
+// from the descriptor's activating mask. Entries that were not consumed are
+// moved to the front of waiting_fds. tp is not used. Returns the number of
+// descriptors reported.
+int UserspaceEventManager::poll(int *events, int *masks, int num_events, struct timeval *tp)
+{
+  int fd;
+  uint32_t i = 0;
+  int count = 0;
+  ceph_assert(num_events);
+  // leave zero slot for waiting_fds
+  while (i < max_wait_idx) {
+    fd = waiting_fds[++i];
+    // -1 marks an entry that close()/unlisten() removed
+    if (fd == -1)
+      continue;
+
+    events[count] = fd;
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    ceph_assert(impl);
+    masks[count] = impl->listening_mask & impl->activating_mask;
+    ceph_assert(masks[count]);
+    ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << masks[count] << dendl;
+    impl->activating_mask &= (~masks[count]);
+    impl->waiting_idx = 0;
+    if (++count >= num_events)
+      break;
+  }
+  if (i < max_wait_idx) {
+    const uint32_t remaining = max_wait_idx - i;
+    memmove(&waiting_fds[1], &waiting_fds[i+1], sizeof(int)*remaining);
+    // The surviving entries changed position; refresh the index each
+    // descriptor remembers so unlisten()/close() clear the right slot.
+    for (uint32_t idx = 1; idx <= remaining; ++idx) {
+      const int moved_fd = waiting_fds[idx];
+      if (moved_fd == -1)
+        continue;
+      Tub<UserspaceFDImpl> &moved = fds[moved_fd];
+      if (moved)
+        moved->waiting_idx = idx;
+    }
+  }
+  max_wait_idx -= i;
+  return count;
+}
diff --git a/src/msg/async/dpdk/UserspaceEvent.h b/src/msg/async/dpdk/UserspaceEvent.h
new file mode 100644
index 00000000..7e89517d
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_USERSPACEEVENT_H
+#define CEPH_USERSPACEEVENT_H
+
+#include <cstddef>
+#include <errno.h>
+#include <string.h>
+
+#include <vector>
+#include <list>
+
+#include "include/ceph_assert.h"
+#include "include/int_types.h"
+#include "common/Tub.h"
+
+class CephContext;
+
+// Bookkeeping for userspace "file descriptors" and their pending events.
+//
+// Descriptors are small integers handed out by get_eventfd(). notify()
+// raises event bits on a descriptor, listen()/unlisten() select which bits
+// its owner is interested in, and poll() reports the descriptors that have
+// a raised bit they are listening for.
+class UserspaceEventManager {
+  // Per-descriptor state.
+  struct UserspaceFDImpl {
+    // Index of this descriptor in waiting_fds, or 0 when it is not queued.
+    uint32_t waiting_idx = 0;
+    int16_t read_errno = 0;
+    int16_t write_errno = 0;
+    // Event bits selected through listen()/unlisten().
+    int8_t listening_mask = 0;
+    // Event bits raised by notify() and not yet reported by poll().
+    int8_t activating_mask = 0;
+    // Sentinel value verified by check().
+    uint32_t magic = 4921;
+  };
+  CephContext *cct;
+  // Highest descriptor number currently handed out.
+  int max_fd = 0;
+  // Index of the last used entry of waiting_fds; entry 0 is never used.
+  uint32_t max_wait_idx = 0;
+  // Descriptor table indexed by fd; an empty Tub means "not open".
+  std::vector<Tub<UserspaceFDImpl> > fds;
+  // Descriptors queued for poll(); -1 marks a removed entry.
+  std::vector<int> waiting_fds;
+  // Released descriptor numbers available for reuse.
+  std::list<uint32_t> unused_fds;
+
+ public:
+  explicit UserspaceEventManager(CephContext *c): cct(c) {
+    waiting_fds.resize(1024);
+  }
+
+  int get_eventfd();
+
+  // Add the bits in mask to the events fd listens for. If one of them is
+  // already raised, fd is queued for poll(). Returns 0, or -ENOENT if fd is
+  // not an open descriptor.
+  int listen(int fd, int mask) {
+    if ((size_t)fd >= fds.size())
+      return -ENOENT;
+
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    if (!impl)
+      return -ENOENT;
+
+    impl->listening_mask |= mask;
+    if (impl->activating_mask & impl->listening_mask && !impl->waiting_idx) {
+      // The entry is stored at index max_wait_idx + 1, so that index (not
+      // just max_wait_idx) has to be inside the vector before it is written.
+      if (waiting_fds.size() <= max_wait_idx + 1)
+        waiting_fds.resize(waiting_fds.size()*2);
+      impl->waiting_idx = ++max_wait_idx;
+      waiting_fds[max_wait_idx] = fd;
+    }
+    return 0;
+  }
+
+  // Remove the bits in mask from the events fd listens for. If no raised
+  // bit is listened for any more, fd is taken out of the poll() queue.
+  // Returns 0, or -ENOENT if fd is not an open descriptor.
+  int unlisten(int fd, int mask) {
+    if ((size_t)fd >= fds.size())
+      return -ENOENT;
+
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    if (!impl)
+      return -ENOENT;
+
+    impl->listening_mask &= (~mask);
+    if (!(impl->activating_mask & impl->listening_mask) && impl->waiting_idx) {
+      if (waiting_fds[max_wait_idx] == fd) {
+        ceph_assert(impl->waiting_idx == max_wait_idx);
+        --max_wait_idx;
+      }
+      // -1 marks the entry as one that poll() must skip.
+      waiting_fds[impl->waiting_idx] = -1;
+      impl->waiting_idx = 0;
+    }
+    return 0;
+  }
+
+  int notify(int fd, int mask);
+  void close(int fd);
+  int poll(int *events, int *masks, int num_events, struct timeval *tp);
+
+  // Sanity check: every open descriptor must still carry the magic value.
+  bool check() {
+    for (auto &&m : fds) {
+      if (m && m->magic != 4921)
+        return false;
+    }
+    return true;
+  }
+};
+
+#endif //CEPH_USERSPACEEVENT_H
diff --git a/src/msg/async/dpdk/align.h b/src/msg/async/dpdk/align.h
new file mode 100644
index 00000000..3b48f789
--- /dev/null
+++ b/src/msg/async/dpdk/align.h
@@ -0,0 +1,50 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_ALIGN_HH_
+#define CEPH_MSG_DPDK_ALIGN_HH_
+
+#include <cstdint>
+#include <cstdlib>
+
+// Round v up to the nearest multiple of align. The mask arithmetic is only
+// correct when align is a power of two.
+template <typename T>
+inline constexpr T align_up(T v, T align) {
+  return ~(align - 1) & (v + align - 1);
+}
+
+// Pointer flavour of align_up(); restricted to pointers to one-byte types.
+template <typename T>
+inline constexpr T* align_up(T* v, size_t align) {
+  static_assert(sizeof(T) == 1, "align byte pointers only");
+  return reinterpret_cast<T*>(
+      align_up(reinterpret_cast<uintptr_t>(v), align));
+}
+
+// Round v down to the nearest multiple of align. The mask arithmetic is only
+// correct when align is a power of two.
+template <typename T>
+inline constexpr T align_down(T v, T align) {
+  return ~(align - 1) & v;
+}
+
+// Pointer flavour of align_down(); restricted to pointers to one-byte types.
+template <typename T>
+inline constexpr T* align_down(T* v, size_t align) {
+  static_assert(sizeof(T) == 1, "align byte pointers only");
+  return reinterpret_cast<T*>(
+      align_down(reinterpret_cast<uintptr_t>(v), align));
+}
+
+#endif /* CEPH_MSG_DPDK_ALIGN_HH_ */
diff --git a/src/msg/async/dpdk/array_map.h b/src/msg/async/dpdk/array_map.h
new file mode 100644
index 00000000..40f7728d
--- /dev/null
+++ b/src/msg/async/dpdk/array_map.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_ARRAY_MAP_HH_
+#define CEPH_ARRAY_MAP_HH_
+
+#include <array>
+
+// unordered_map implemented as a simple array
+
+// Fixed-capacity map from size_t keys in [0, Max) to Value, stored as a
+// plain array. Keys that were never assigned hold a value-initialized Value.
+template <typename Value, size_t Max>
+class array_map {
+  std::array<Value, Max> _a {};
+ public:
+  // Populate the map from {key, value} pairs. Keys are not range-checked.
+  array_map(std::initializer_list<std::pair<size_t, Value>> i) {
+    for (const auto& entry : i) {
+      _a[entry.first] = entry.second;
+    }
+  }
+
+  // Unchecked element access.
+  Value& operator[](size_t key) { return _a[key]; }
+  const Value& operator[](size_t key) const { return _a[key]; }
+
+  // Checked element access; throws std::out_of_range when key >= Max.
+  Value& at(size_t key) {
+    if (key < Max) {
+      return _a[key];
+    }
+    throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max));
+  }
+};
+
+#endif /* CEPH_ARRAY_MAP_HH_ */
diff --git a/src/msg/async/dpdk/byteorder.h b/src/msg/async/dpdk/byteorder.h
new file mode 100644
index 00000000..a996ec07
--- /dev/null
+++ b/src/msg/async/dpdk/byteorder.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_BYTEORDER_H_
+#define CEPH_MSG_BYTEORDER_H_
+
+#include <arpa/inet.h>  // for ntohs() and friends
+#include <iosfwd>
+#include <utility>
+
+inline uint64_t ntohq(uint64_t v) {
+  return __builtin_bswap64(v);
+}
+inline uint64_t htonq(uint64_t v) {
+  return __builtin_bswap64(v);
+}
+
+// Overload set that lets callers convert any fixed-width integer between
+// network and host byte order with the single names ntoh()/hton().
+
+// Zero-argument forms do nothing (presumably the base case for variadic
+// helpers defined elsewhere - TODO confirm).
+inline void ntoh() {}
+inline void hton() {}
+
+// Unsigned overloads: 8-bit values pass through unchanged, wider ones
+// delegate to ntohs/ntohl/ntohq and htons/htonl/htonq.
+inline uint8_t ntoh(uint8_t x) { return x; }
+inline uint8_t hton(uint8_t x) { return x; }
+inline uint16_t ntoh(uint16_t x) { return ntohs(x); }
+inline uint16_t hton(uint16_t x) { return htons(x); }
+inline uint32_t ntoh(uint32_t x) { return ntohl(x); }
+inline uint32_t hton(uint32_t x) { return htonl(x); }
+inline uint64_t ntoh(uint64_t x) { return ntohq(x); }
+inline uint64_t hton(uint64_t x) { return htonq(x); }
+
+// Signed overloads: same conversions, going through the unsigned helpers.
+inline int8_t ntoh(int8_t x) { return x; }
+inline int8_t hton(int8_t x) { return x; }
+inline int16_t ntoh(int16_t x) { return ntohs(x); }
+inline int16_t hton(int16_t x) { return htons(x); }
+inline int32_t ntoh(int32_t x) { return ntohl(x); }
+inline int32_t hton(int32_t x) { return htonl(x); }
+inline int64_t ntoh(int64_t x) { return ntohq(x); }
+inline int64_t hton(int64_t x) { return htonq(x); }
+
+#endif /* CEPH_MSG_BYTEORDER_H_ */
diff --git a/src/msg/async/dpdk/capture.h b/src/msg/async/dpdk/capture.h
new file mode 100644
index 00000000..1ace8eeb
--- /dev/null
+++ b/src/msg/async/dpdk/capture.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_DPDK_CAPTURE_H
+#define CEPH_MSG_DPDK_CAPTURE_H
+
+#include <utility>
+
+// Callable that stores a value x together with a callable f and, when
+// invoked, calls f with x as the first argument followed by the caller's
+// arguments.
+template <typename T, typename F>
+class capture_impl {
+  T x;
+  F f;
+ public:
+  capture_impl(capture_impl &) = delete;
+  capture_impl( T && value, F && func )
+      : x{std::forward<T>(value)}, f{std::forward<F>(func)}
+  {}
+
+  // Invocation on a non-const object: f receives x as a mutable lvalue.
+  template <typename ...Args>
+  auto operator()( Args&&...args )
+  -> decltype(f( x, std::forward<Args>(args)... ))
+  {
+    return f( x, std::forward<Args>(args)... );
+  }
+
+  // Invocation on a const object.
+  template <typename ...Args>
+  auto operator()( Args&&...args ) const
+  -> decltype(f( x, std::forward<Args>(args)... ))
+  {
+    return f( x, std::forward<Args>(args)... );
+  }
+};
+
+// Build a capture_impl that bundles x with f; T and F are deduced from the
+// arguments and both are perfectly forwarded into the result.
+template <typename T, typename F>
+capture_impl<T,F> capture( T && x, F && f ) {
+  using impl_type = capture_impl<T,F>;
+  return impl_type( std::forward<T>(x), std::forward<F>(f) );
+}
+
+#endif //CEPH_MSG_DPDK_CAPTURE_H
diff --git a/src/msg/async/dpdk/circular_buffer.h b/src/msg/async/dpdk/circular_buffer.h
new file mode 100644
index 00000000..2c92c120
--- /dev/null
+++ b/src/msg/async/dpdk/circular_buffer.h
@@ -0,0 +1,347 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_CIRCULAR_BUFFER_HH_
+#define CEPH_CIRCULAR_BUFFER_HH_
+
+// A growable double-ended queue container that can be efficiently
+// extended (and shrunk) from both ends.  Implementation is a single
+// storage vector.
+//
+// Similar to libstdc++'s std::deque, except that it uses a single level
+// store, and so is more efficient for simple stored items.
+// Similar to boost::circular_buffer_space_optimized, except it uses
+// uninitialized storage for unoccupied elements (and thus move/copy
+// constructors instead of move/copy assignments, which are less efficient).
+
+#include <memory>
+#include <algorithm>
+
+#include "transfer.h"
+
+// Growable double-ended queue backed by one contiguous allocation.
+// Positions are tracked with free-running indices (begin/end) that are
+// reduced to a storage slot with mask() on every access.
+template <typename T, typename Alloc = std::allocator<T>>
+class circular_buffer {
+  // Bookkeeping state.  Deriving from Alloc lets the member functions call
+  // the allocator through _impl.
+  struct impl : Alloc {
+    T* storage = nullptr;
+    // begin, end interpreted (mod capacity)
+    size_t begin = 0;
+    size_t end = 0;
+    size_t capacity = 0;
+  };
+  impl _impl;
+ public:
+  using value_type = T;
+  using size_type = size_t;
+  using reference = T&;
+  using pointer = T*;
+  using const_reference = const T&;
+  using const_pointer = const T*;
+ public:
+  circular_buffer() = default;
+  circular_buffer(circular_buffer&& X);
+  circular_buffer(const circular_buffer& X) = delete;
+  ~circular_buffer();
+  circular_buffer& operator=(const circular_buffer&) = delete;
+  circular_buffer& operator=(circular_buffer&&) = delete;
+  void push_front(const T& data);
+  void push_front(T&& data);
+  template <typename... A>
+  void emplace_front(A&&... args);
+  void push_back(const T& data);
+  void push_back(T&& data);
+  template <typename... A>
+  void emplace_back(A&&... args);
+  T& front();
+  T& back();
+  void pop_front();
+  void pop_back();
+  bool empty() const;
+  size_t size() const;
+  size_t capacity() const;
+  T& operator[](size_t idx);
+  template <typename Func>
+  void for_each(Func func);
+  // access an element, may return wrong or destroyed element
+  // only useful if you do not rely on data accuracy (e.g. prefetch)
+  T& access_element_unsafe(size_t idx);
+ private:
+  void expand();
+  void maybe_expand(size_t nr = 1);
+  size_t mask(size_t idx) const;
+
+  // Random-access iterator over a circular_buffer.  It stores the owning
+  // buffer and a free-running index; the index is masked on dereference.
+  template<typename CB, typename ValueType>
+  struct cbiterator : std::iterator<std::random_access_iterator_tag, ValueType> {
+    typedef std::iterator<std::random_access_iterator_tag, ValueType> super_t;
+
+    ValueType& operator*() const { return cb->_impl.storage[cb->mask(idx)]; }
+    ValueType* operator->() const { return &cb->_impl.storage[cb->mask(idx)]; }
+    // prefix
+    cbiterator& operator++() {
+      idx++;
+      return *this;
+    }
+    // postfix
+    cbiterator operator++(int) {
+      auto v = *this;
+      idx++;
+      return v;
+    }
+    // prefix
+    cbiterator& operator--() {
+      idx--;
+      return *this;
+    }
+    // postfix
+    cbiterator operator--(int) {
+      auto v = *this;
+      idx--;
+      return v;
+    }
+    cbiterator operator+(typename super_t::difference_type n) const {
+      return cbiterator(cb, idx + n);
+    }
+    cbiterator operator-(typename super_t::difference_type n) const {
+      return cbiterator(cb, idx - n);
+    }
+    cbiterator& operator+=(typename super_t::difference_type n) {
+      idx += n;
+      return *this;
+    }
+    cbiterator& operator-=(typename super_t::difference_type n) {
+      idx -= n;
+      return *this;
+    }
+    bool operator==(const cbiterator& rhs) const {
+      return idx == rhs.idx;
+    }
+    bool operator!=(const cbiterator& rhs) const {
+      return idx != rhs.idx;
+    }
+    // Ordering uses the signed distance between the two positions instead of
+    // the raw indices.  The indices are free-running and may wrap around
+    // (push_front() on a fresh buffer decrements begin from 0), in which case
+    // a raw comparison would report begin() > end().
+    bool operator<(const cbiterator& rhs) const {
+      return (*this - rhs) < 0;
+    }
+    bool operator>(const cbiterator& rhs) const {
+      return (*this - rhs) > 0;
+    }
+    bool operator>=(const cbiterator& rhs) const {
+      return (*this - rhs) >= 0;
+    }
+    bool operator<=(const cbiterator& rhs) const {
+      return (*this - rhs) <= 0;
+    }
+    // Signed distance from rhs to *this; the unsigned difference is computed
+    // modulo 2^N and then reinterpreted as a signed value.
+    typename super_t::difference_type operator-(const cbiterator& rhs) const {
+      return static_cast<typename super_t::difference_type>(idx - rhs.idx);
+    }
+   private:
+    CB* cb;
+    size_t idx;
+    // Only circular_buffer creates iterators (see begin()/end()).
+    cbiterator(CB* b, size_t i) : cb(b), idx(i) {}
+    friend class circular_buffer;
+  };
+  friend class iterator;
+
+ public:
+  typedef cbiterator<circular_buffer, T> iterator;
+  typedef cbiterator<const circular_buffer, const T> const_iterator;
+
+  iterator begin() {
+    return iterator(this, _impl.begin);
+  }
+  const_iterator begin() const {
+    return const_iterator(this, _impl.begin);
+  }
+  iterator end() {
+    return iterator(this, _impl.end);
+  }
+  const_iterator end() const {
+    return const_iterator(this, _impl.end);
+  }
+  const_iterator cbegin() const {
+    return const_iterator(this, _impl.begin);
+  }
+  const_iterator cend() const {
+    return const_iterator(this, _impl.end);
+  }
+};
+
+// Reduces a free-running index to a slot in storage.  The bitwise AND only
+// acts as a modulo when capacity is a power of two.
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::mask(size_t idx) const {
+  const size_t slot_mask = _impl.capacity - 1;
+  return idx & slot_mask;
+}
+
+// True when the buffer holds no elements.
+template <typename T, typename Alloc>
+inline bool circular_buffer<T, Alloc>::empty() const {
+  return _impl.end == _impl.begin;
+}
+
+// Number of stored elements (distance between the free-running indices).
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::size() const {
+  const size_t first = _impl.begin;
+  const size_t last = _impl.end;
+  return last - first;
+}
+
+// Number of element slots currently allocated.
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::capacity() const {
+  const impl& state = _impl;
+  return state.capacity;
+}
+
+// Move constructor: takes over the source's bookkeeping state and then resets
+// the source to a default impl, so the storage has exactly one owner.
+template <typename T, typename Alloc>
+inline circular_buffer<T, Alloc>::circular_buffer(circular_buffer&& x)
+    : _impl(std::move(x._impl)) {
+  x._impl = impl{};
+}
+
+// Calls func on every stored element, from the front to the back.
+template <typename T, typename Alloc>
+template <typename Func>
+inline void circular_buffer<T, Alloc>::for_each(Func func) {
+  T* const slots = _impl.storage;
+  const size_t slot_mask = _impl.capacity - 1;
+  size_t pos = _impl.begin;
+  while (pos != _impl.end) {
+    func(slots[pos & slot_mask]);
+    ++pos;
+  }
+}
+
+// Destroys every stored element and releases the storage.
+template <typename T, typename Alloc>
+inline circular_buffer<T, Alloc>::~circular_buffer() {
+  // Go through allocator_traits: the allocator's own destroy() member is
+  // deprecated in C++17 and need not exist on custom allocators.
+  using traits = std::allocator_traits<Alloc>;
+  for_each([this] (T& obj) {
+    traits::destroy(_impl, &obj);
+  });
+  // A buffer that never grew (or was moved from) owns no storage; only hand
+  // back pointers that were actually obtained from allocate().
+  if (_impl.storage) {
+    traits::deallocate(_impl, _impl.storage, _impl.capacity);
+  }
+}
+
+// Doubles the capacity (a buffer with no storage grows to one slot) and
+// relocates the elements into the new storage starting at slot 0.
+// If relocation throws, the new storage is cleaned up and the exception is
+// rethrown.
+template <typename T, typename Alloc>
+void circular_buffer<T, Alloc>::expand() {
+  // allocator_traits instead of the allocator's deprecated members.
+  using traits = std::allocator_traits<Alloc>;
+  auto new_cap = std::max<size_t>(_impl.capacity * 2, 1);
+  auto new_storage = traits::allocate(_impl, new_cap);
+  auto p = new_storage;
+  try {
+    // First pass of the two-pass transfer (see transfer.h); this is the only
+    // pass wrapped in the try block.
+    for_each([this, &p] (T& obj) {
+      transfer_pass1(_impl, &obj, p);
+      p++;
+    });
+  } catch (...) {
+    // Undo the elements already placed in the new storage, newest first.
+    while (p != new_storage) {
+      traits::destroy(_impl, --p);
+    }
+    traits::deallocate(_impl, new_storage, new_cap);
+    throw;
+  }
+  p = new_storage;
+  // Second pass of the transfer over the same (old, new) slot pairs.
+  for_each([this, &p] (T& obj) {
+    transfer_pass2(_impl, &obj, p++);
+  });
+  // After the swaps new_storage/new_cap name the OLD storage and capacity.
+  std::swap(_impl.storage, new_storage);
+  std::swap(_impl.capacity, new_cap);
+  _impl.begin = 0;
+  _impl.end = p - _impl.storage;
+  // The first expansion has no old storage to release.
+  if (new_storage) {
+    traits::deallocate(_impl, new_storage, new_cap);
+  }
+}
+
+// Grows the storage until there is room for nr additional elements.
+// A single doubling is enough for nr == 1 but not for larger requests, so
+// keep expanding until the request fits.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::maybe_expand(size_t nr) {
+  while (_impl.end - _impl.begin + nr > _impl.capacity) {
+    expand();
+  }
+}
+
+// Inserts a copy of data before the current front element.
+// Construction goes through allocator_traits because the allocator's own
+// construct() member is deprecated in C++17.  begin is decremented only after
+// the element has been constructed, so a throwing constructor leaves the
+// indices untouched.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_front(const T& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.begin - 1)];
+  std::allocator_traits<Alloc>::construct(_impl, p, data);
+  --_impl.begin;
+}
+
+// Inserts data before the current front element by moving from it.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_front(T&& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.begin - 1)];
+  std::allocator_traits<Alloc>::construct(_impl, p, std::move(data));
+  --_impl.begin;
+}
+
+// Constructs a new front element in place from args.
+template <typename T, typename Alloc>
+template <typename... Args>
+inline void circular_buffer<T, Alloc>::emplace_front(Args&&... args) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.begin - 1)];
+  std::allocator_traits<Alloc>::construct(_impl, p, std::forward<Args>(args)...);
+  --_impl.begin;
+}
+
+// Appends a copy of data after the current back element.
+// Construction goes through allocator_traits because the allocator's own
+// construct() member is deprecated in C++17.  end is advanced only after the
+// element has been constructed, so a throwing constructor leaves the indices
+// untouched.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_back(const T& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.end)];
+  std::allocator_traits<Alloc>::construct(_impl, p, data);
+  ++_impl.end;
+}
+
+// Appends data after the current back element by moving from it.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_back(T&& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.end)];
+  std::allocator_traits<Alloc>::construct(_impl, p, std::move(data));
+  ++_impl.end;
+}
+
+// Constructs a new back element in place from args.
+template <typename T, typename Alloc>
+template <typename... Args>
+inline void circular_buffer<T, Alloc>::emplace_back(Args&&... args) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.end)];
+  std::allocator_traits<Alloc>::construct(_impl, p, std::forward<Args>(args)...);
+  ++_impl.end;
+}
+
+// Reference to the first element.  No emptiness check is performed.
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::front() {
+  const size_t slot = mask(_impl.begin);
+  return _impl.storage[slot];
+}
+
+// Reference to the last element.  No emptiness check is performed.
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::back() {
+  const size_t last = _impl.end - 1;
+  return _impl.storage[mask(last)];
+}
+
+// Destroys the first element and advances begin past it.
+// Destruction goes through allocator_traits because the allocator's own
+// destroy() member is deprecated in C++17.  No emptiness check is performed.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::pop_front() {
+  std::allocator_traits<Alloc>::destroy(_impl, &front());
+  ++_impl.begin;
+}
+
+// Destroys the last element and moves end back over it.
+// No emptiness check is performed.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::pop_back() {
+  std::allocator_traits<Alloc>::destroy(_impl, &back());
+  --_impl.end;
+}
+
+// Element at offset idx from the front.  idx is not range-checked.
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::operator[](size_t idx) {
+  const size_t slot = mask(_impl.begin + idx);
+  return _impl.storage[slot];
+}
+
+// Same slot computation as operator[]; kept as a separate entry point for
+// callers that tolerate reading a wrong or destroyed element.
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::access_element_unsafe(size_t idx) {
+  const size_t slot = mask(_impl.begin + idx);
+  return _impl.storage[slot];
+}
+
+#endif /* CEPH_CIRCULAR_BUFFER_HH_ */
diff --git a/src/msg/async/dpdk/const.h b/src/msg/async/dpdk/const.h
new file mode 100644
index 00000000..ea5dc49e
--- /dev/null
+++ b/src/msg/async/dpdk/const.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_CONST_H_
+#define CEPH_MSG_CONST_H_
+
+#include <stdint.h>
+
+// Protocol identifiers used at the IP layer, stored in one byte.
+enum class ip_protocol_num : uint8_t {
+  icmp = 1,
+  tcp = 6,
+  unused = 255
+};
+
+// Protocol identifiers carried in the Ethernet header, stored in two bytes.
+enum class eth_protocol_num : uint16_t {
+  ipv4 = 0x0800,
+  arp = 0x0806,
+  ipv6 = 0x86dd
+};
+
+// Header lengths and the largest IP packet length, all in bytes.
+constexpr uint8_t eth_hdr_len = 14;
+constexpr uint8_t tcp_hdr_len_min = 20;
+constexpr uint8_t ipv4_hdr_len_min = 20;
+constexpr uint8_t ipv6_hdr_len_min = 40;
+constexpr uint16_t ip_packet_len_max = 65535;
+
+#endif
diff --git a/src/msg/async/dpdk/dpdk_rte.cc b/src/msg/async/dpdk/dpdk_rte.cc
new file mode 100644
index 00000000..9f9d343b
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.cc
@@ -0,0 +1,154 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <bitset>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_version.h>
+
+#include "DPDK.h"
+#include "dpdk_rte.h"
+
+namespace dpdk {
+
+  // Copies str into a char vector and appends a terminating NUL, giving a
+  // mutable, C-string-compatible buffer.
+  static inline std::vector<char> string2vector(std::string str) {
+    std::vector<char> buf;
+    buf.reserve(str.size() + 1);
+    buf.assign(str.begin(), str.end());
+    buf.push_back('\0');
+    return buf;
+  }
+
+  // Definitions of eal's static data members.
+  // initialized is set by the thread started in eal::init() once
+  // rte_eal_init() has succeeded.
+  bool eal::initialized = false;
+  // Thread that runs rte_eal_init() and then executes the queued functions.
+  std::thread eal::t;
+  // lock and cond guard funcs and signal both new work and completed work.
+  std::mutex eal::lock;
+  std::condition_variable eal::cond;
+  // Functions waiting to be run on the EAL thread.
+  std::list<std::function<void()>> eal::funcs;
+
+  // Returns the number of set bits in n.
+  static int bitcount(unsigned long long n)
+  {
+    int set_bits = 0;
+    // n &= n - 1 clears the lowest set bit, so the loop runs once per set bit.
+    for (; n != 0; n &= n - 1) {
+      ++set_bits;
+    }
+    return set_bits;
+  }
+
+  /**
+   * Initialises the DPDK EAL on a dedicated, detached thread and blocks until
+   * the outcome of rte_eal_init() is known.  On success that thread stays
+   * alive and runs the functions queued on `funcs`.
+   *
+   * @param c Ceph context supplying the ms_dpdk_* configuration values
+   * @return 1 if the EAL was already initialised, 0 on success, or the
+   *         negative value returned by rte_eal_init() on failure
+   */
+  int eal::init(CephContext *c)
+  {
+    if (initialized) {
+      return 1;
+    }
+
+    bool done = false;
+    // Result of rte_eal_init(), published by the EAL thread under `lock`.
+    int init_result = 0;
+    auto num = std::stoull(c->_conf.get_val<std::string>("ms_dpdk_coremask"),
+                           nullptr, 16);
+    unsigned int coremaskbit = bitcount(num);
+
+    ceph_assert(coremaskbit > c->_conf->ms_async_op_threads);
+
+    t = std::thread([&]() {
+      // TODO: Inherit these from the app parameters - "opts"
+      std::vector<std::vector<char>> args {
+          string2vector(std::string("ceph")),
+          string2vector("-c"), string2vector(c->_conf.get_val<std::string>("ms_dpdk_coremask")),
+          string2vector("-n"), string2vector(c->_conf->ms_dpdk_memory_channel),
+      };
+
+      Tub<std::string> hugepages_path;
+      if (!c->_conf->ms_dpdk_hugepages.empty()) {
+        hugepages_path.construct(c->_conf->ms_dpdk_hugepages);
+      }
+
+      // If a hugepages directory is configured, pass it to DPDK together with
+      // an explicit memory size; otherwise, when a PMD driver is requested,
+      // run without hugepages.
+      if (hugepages_path) {
+        args.push_back(string2vector("--huge-dir"));
+        args.push_back(string2vector(*hugepages_path));
+
+        //
+        // We don't know what is going to be our networking configuration so we
+        // assume there is going to be a queue per-CPU. Plus we'll give a DPDK
+        // 64MB for "other stuff".
+        //
+        // NOTE(review): the CPU count is taken from the fixed mask "fffefffe"
+        // rather than from ms_dpdk_coremask - confirm this is intended.
+        unsigned int x;
+        std::stringstream ss;
+        ss << std::hex << "fffefffe";
+        ss >> x;
+        size_t size_MB = mem_size(bitcount(x)) >> 20;
+        std::stringstream size_MB_str;
+        size_MB_str << size_MB;
+
+        args.push_back(string2vector("-m"));
+        args.push_back(string2vector(size_MB_str.str()));
+      } else if (!c->_conf->ms_dpdk_pmd.empty()) {
+        args.push_back(string2vector("--no-huge"));
+      }
+
+      std::string rte_file_prefix;
+      rte_file_prefix = "rte_";
+      rte_file_prefix += c->_conf->name.to_str();
+      args.push_back(string2vector("--file-prefix"));
+      args.push_back(string2vector(rte_file_prefix));
+
+      // rte_eal_init() takes a mutable argv, hence the char buffers above.
+      std::vector<char*> cargs;
+
+      for (auto&& a: args) {
+        cargs.push_back(a.data());
+      }
+      /* initialise the EAL for all */
+      int ret = rte_eal_init(cargs.size(), cargs.data());
+
+      std::unique_lock<std::mutex> l(lock);
+      if (ret < 0) {
+        // Report the failure and wake the caller; without this the caller
+        // would wait on `cond` forever.
+        init_result = ret;
+        done = true;
+        cond.notify_all();
+        return;
+      }
+      initialized = true;
+      done = true;
+      cond.notify_all();
+      // Serve queued functions for the rest of the process lifetime.  The
+      // lock is held while a function runs.
+      while (true) {
+        if (!funcs.empty()) {
+          auto f = std::move(funcs.front());
+          funcs.pop_front();
+          f();
+          cond.notify_all();
+        } else {
+          cond.wait(l);
+        }
+      }
+    });
+    t.detach();
+    // The locals captured by reference are only touched by the thread before
+    // it sets `done`, so they stay valid for as long as they are used.
+    std::unique_lock<std::mutex> l(lock);
+    while (!done)
+      cond.wait(l);
+    return init_result;
+  }
+
+  // Memory to reserve for DPDK, in bytes: one PMD mempool per CPU plus a
+  // fixed 64MB allowance.
+  size_t eal::mem_size(int num_cpus)
+  {
+    // We'll give a DPDK 64MB for "other stuff".
+    const size_t misc_bytes = (64UL << 20);
+
+    //
+    // PMD mempool memory:
+    //
+    // We don't know what is going to be our networking configuration so we
+    // assume there is going to be a queue per-CPU.
+    //
+    const size_t mempool_bytes = num_cpus * qp_mempool_obj_size();
+
+    return mempool_bytes + misc_bytes;
+  }
+
+} // namespace dpdk
diff --git a/src/msg/async/dpdk/dpdk_rte.h b/src/msg/async/dpdk/dpdk_rte.h
new file mode 100644
index 00000000..4aa83899
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.h
@@ -0,0 +1,74 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef CEPH_DPDK_RTE_H_
+#define CEPH_DPDK_RTE_H_
+
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+#include <bitset>
+#include <rte_config.h>
+#include <rte_version.h>
+#include <boost/program_options.hpp>
+
+/*********************** Compat section ***************************************/
+// We currently support only versions 2.0 and above.
+#if (RTE_VERSION < RTE_VERSION_NUM(2,0,0,0))
+#error "DPDK version above 2.0.0 is required"
+#endif
+
+#if defined(RTE_MBUF_REFCNT_ATOMIC)
+#warning "CONFIG_RTE_MBUF_REFCNT_ATOMIC should be disabled in DPDK's " \
+         "config/common_linuxapp"
+#endif
+/******************************************************************************/
+
+namespace dpdk {
+
+// DPDK Environment Abstraction Layer
+// All state is static: one EAL thread per process, started by init().
+class eal {
+ public:
+  using cpuset = std::bitset<RTE_MAX_LCORE>;
+
+  // lock and cond protect funcs and signal queued as well as finished work.
+  static std::mutex lock;
+  static std::condition_variable cond;
+  static std::list<std::function<void()>> funcs;
+  static int init(CephContext *c);
+  /**
+   * Queues f on funcs, wakes the waiters on cond and blocks until the
+   * queued wrapper has run f.
+   */
+  static void execute_on_master(std::function<void()> &&f) {
+    bool finished = false;
+    std::unique_lock<std::mutex> guard(lock);
+    funcs.emplace_back([&]() { f(); finished = true; });
+    cond.notify_all();
+    cond.wait(guard, [&finished] { return finished; });
+  }
+  /**
+   * Returns the amount of memory needed for DPDK
+   * @param num_cpus Number of CPUs the application is going to use
+   *
+   * @return the required memory size in bytes
+   */
+  static size_t mem_size(int num_cpus);
+  static bool initialized;
+  static std::thread t;
+};
+
+} // namespace dpdk
+#endif // CEPH_DPDK_RTE_H_
diff --git a/src/msg/async/dpdk/ethernet.cc b/src/msg/async/dpdk/ethernet.cc
new file mode 100644
index 00000000..9aca5078
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.cc
@@ -0,0 +1,16 @@
+#include <iomanip>
+
+#include "ethernet.h"
+
+// Writes ea as six colon-separated, zero-padded, two-digit hex octets
+// (e.g. "00:1b:21:0a:ff:03").  The stream's format flags and fill character
+// are restored before returning, so the caller's number base is unchanged.
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea) {
+  const std::ios_base::fmtflags saved_flags = os.flags();
+  const char saved_fill = os.fill('0');
+  os << std::hex;
+  for (size_t i = 0; i < ea.mac.size(); ++i) {
+    if (i != 0) {
+      os << ':';
+    }
+    // setw() applies to the next insertion only, so set it for every octet.
+    // Widen to uint32_t so the octet prints as a number, not as a character.
+    os << std::setw(2) << uint32_t(ea.mac[i]);
+  }
+  os.fill(saved_fill);
+  os.flags(saved_flags);
+  return os;
+}
diff --git a/src/msg/async/dpdk/ethernet.h b/src/msg/async/dpdk/ethernet.h
new file mode 100644
index 00000000..b007425f
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_ETHERNET_H_
+#define CEPH_MSG_ETHERNET_H_
+
+#include <array>
+#include <sstream>
+
+#include "include/ceph_assert.h"
+#include "byteorder.h"
+
+// Ethernet hardware (MAC) address held as six octets.
+struct ethernet_address {
+  // Zero-fill so that a default-constructed address has a defined value
+  // when it is later compared or printed.
+  ethernet_address() : mac{} {}
+
+  // Copies six octets starting at eaddr; the caller must supply at least
+  // that many readable bytes.
+  ethernet_address(const uint8_t *eaddr) {
+    std::copy(eaddr, eaddr + 6, mac.begin());
+  }
+
+  // Builds an address from exactly six octets; any other count asserts.
+  ethernet_address(std::initializer_list<uint8_t> eaddr) {
+    ceph_assert(eaddr.size() == mac.size());
+    std::copy(eaddr.begin(), eaddr.end(), mac.begin());
+  }
+
+  // Byte-order conversions return the address unchanged.  They are const so
+  // they can be called through const references.
+  ethernet_address ntoh() const {
+    return *this;
+  }
+  ethernet_address hton() const {
+    return *this;
+  }
+  std::array<uint8_t, 6> mac;
+} __attribute__((packed));
+
+// Two addresses are equal when all six octets match.
+inline bool operator==(const ethernet_address& a, const ethernet_address& b) {
+  return std::equal(a.mac.begin(), a.mac.end(), b.mac.begin());
+}
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea);
+
+// Link-layer traits for Ethernet: the address type, the broadcast address
+// and the hardware type value used for ARP.
+struct ethernet {
+  using address = ethernet_address;
+  // All six octets set to 0xff.
+  static address broadcast_address() {
+      return address{0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+  }
+  static constexpr uint16_t arp_hardware_type() { return 1; }
+};
+
+// Ethernet frame header: destination address, source address and the
+// two-byte protocol field.  Packed so the in-memory layout has no padding.
+struct eth_hdr {
+  ethernet_address dst_mac;
+  ethernet_address src_mac;
+  uint16_t eth_proto;
+  // Returns a copy with eth_proto converted to network byte order.
+  // const-qualified: the header itself is not modified.
+  eth_hdr hton() const {
+    eth_hdr hdr = *this;
+    hdr.eth_proto = ::hton(eth_proto);
+    return hdr;
+  }
+  // Returns a copy with eth_proto converted to host byte order.
+  eth_hdr ntoh() const {
+    eth_hdr hdr = *this;
+    hdr.eth_proto = ::ntoh(eth_proto);
+    return hdr;
+  }
+} __attribute__((packed));
+
+ethernet_address parse_ethernet_address(std::string addr);
+
+#endif /* CEPH_MSG_ETHERNET_H_ */
diff --git a/src/msg/async/dpdk/ip_types.h b/src/msg/async/dpdk/ip_types.h
new file mode 100644
index 00000000..356d8fd6
--- /dev/null
+++ b/src/msg/async/dpdk/ip_types.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_IP_TYPES_H_H
+#define CEPH_IP_TYPES_H_H
+
+#include <boost/asio/ip/address_v4.hpp>
+#include <string>
+
+class Packet;
+// Declared with the same class-key as its definition in ethernet.h, which
+// avoids mismatched-tag diagnostics.
+struct ethernet_address;
+// Callback invoked with a link-layer address, a packet and an int argument.
+using resolution_cb = std::function<void (const ethernet_address&, Packet, int)>;
+
+// IPv4 endpoint: a 32-bit address and a 16-bit port.
+struct ipv4_addr {
+  uint32_t ip;
+  uint16_t port;
+
+  // Address 0, port 0.
+  ipv4_addr() : ipv4_addr(uint32_t(0), uint16_t(0)) {}
+  ipv4_addr(uint32_t ip, uint16_t port) : ip(ip), port(port) {}
+  // Address 0 with the given port.
+  ipv4_addr(uint16_t port) : ipv4_addr(uint32_t(0), port) {}
+  ipv4_addr(const std::string &addr);
+  ipv4_addr(const std::string &addr, uint16_t port);
+
+  // Takes the IPv4 address (passed through ntoh()) and the port from a Ceph
+  // entity address.
+  ipv4_addr(const entity_addr_t &ad)
+      : ip(ntoh(ad.in4_addr().sin_addr.s_addr)),
+        port(ad.get_port()) {}
+
+  // Inside the constructor addr is an lvalue, so this delegates to the
+  // const-reference overload.
+  ipv4_addr(entity_addr_t &&addr) : ipv4_addr(addr) {}
+};
+
+// IPv4 address stored as a single 32-bit integer.
+struct ipv4_address {
+  ipv4_address() : ip(0) {}
+  explicit ipv4_address(uint32_t ip) : ip(ip) {}
+  // Parses a dotted-decimal string with boost::asio.
+  explicit ipv4_address(const std::string& addr) {
+    ip = static_cast<uint32_t>(boost::asio::ip::address_v4::from_string(addr).to_ulong());
+  }
+  // Keeps the address part of an endpoint and drops the port.
+  ipv4_address(ipv4_addr addr) {
+    ip = addr.ip;
+  }
+
+  uint32_t ip;
+
+  // Returns a copy with ip converted to network byte order.
+  // const-qualified so it can be called on const objects.
+  ipv4_address hton() const {
+    ipv4_address addr;
+    addr.ip = ::hton(ip);
+    return addr;
+  }
+  // Returns a copy with ip converted to host byte order.
+  ipv4_address ntoh() const {
+    ipv4_address addr;
+    addr.ip = ::ntoh(ip);
+    return addr;
+  }
+
+  friend bool operator==(ipv4_address x, ipv4_address y) {
+    return x.ip == y.ip;
+  }
+  friend bool operator!=(ipv4_address x, ipv4_address y) {
+    return x.ip != y.ip;
+  }
+} __attribute__((packed));
+
+// True for the all-zero address.
+static inline bool is_unspecified(ipv4_address addr) { return !addr.ip; }
+
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a);
+
+namespace std {
+
+  // Hash support so ipv4_address can be used as an unordered-container key;
+  // the 32-bit address value is used directly as the hash.
+  template <>
+  struct hash<ipv4_address> {
+    size_t operator()(ipv4_address a) const {
+      return static_cast<size_t>(a.ip);
+    }
+  };
+
+}
+
+#endif //CEPH_IP_TYPES_H_H
diff --git a/src/msg/async/dpdk/net.cc b/src/msg/async/dpdk/net.cc
new file mode 100644
index 00000000..6e361f18
--- /dev/null
+++ b/src/msg/async/dpdk/net.cc
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ */
+
+#include "net.h"
+#include "DPDK.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "net "
+
+// Binds the interface to a device and to the queue of the given event
+// center: subscribes dispatch_packet() to received packets and registers a
+// packet provider that hands L3 packets to the queue for transmission.
+interface::interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center)
+    : cct(cct), _dev(dev),
+      _rx(_dev->receive(
+          center->get_id(),
+          [center, this] (Packet p) {
+            return dispatch_packet(center, std::move(p));
+          }
+      )),
+      _hw_address(_dev->hw_address()),
+      _hw_features(_dev->get_hw_features()) {
+  // idx lives inside the (mutable) lambda and remembers which provider to
+  // poll first on the next call, so the providers are served round-robin.
+  auto idx = 0u;
+  dev->queue_for_cpu(center->get_id()).register_packet_provider([this, idx] () mutable {
+    Tub<Packet> p;
+    // Poll each provider at most once per call.
+    for (size_t i = 0; i < _pkt_providers.size(); i++) {
+      auto l3p = _pkt_providers[idx++]();
+      if (idx == _pkt_providers.size())
+        idx = 0;
+      if (l3p) {
+        auto l3pv = std::move(*l3p);
+        // Fill the Ethernet header in host order, then convert it in place.
+        auto eh = l3pv.p.prepend_header<eth_hdr>();
+        eh->dst_mac = l3pv.to;
+        eh->src_mac = _hw_address;
+        eh->eth_proto = uint16_t(l3pv.proto_num);
+        *eh = eh->hton();
+        ldout(this->cct, 10) << "=== tx === proto " << std::hex << uint16_t(l3pv.proto_num)
+                       << " " << _hw_address << " -> " << l3pv.to
+                       << " length " << std::dec << l3pv.p.len() << dendl;
+        p = std::move(l3pv.p);
+        return p;
+      }
+    }
+    // No provider had anything to send: return the empty Tub.
+    return p;
+  });
+}
+
+// Registers an L3 protocol: stores its forward function under the protocol
+// number and subscribes `next` to the packet stream of that protocol.
+subscription<Packet, ethernet_address> interface::register_l3(
+    eth_protocol_num proto_num,
+    std::function<int (Packet p, ethernet_address from)> next,
+    std::function<bool (forward_hash&, Packet& p, size_t)> forward)
+{
+  auto inserted = _proto_map.emplace(
+      std::piecewise_construct,
+      std::make_tuple(uint16_t(proto_num)),
+      std::forward_as_tuple(std::move(forward)));
+  // A protocol number may be registered only once.
+  ceph_assert(inserted.second);
+  l3_rx_stream& rx_stream = inserted.first->second;
+  return rx_stream.packet_stream.listen(std::move(next));
+}
+
+// The three accessors below forward to the underlying device.
+
+// Maps a hash value to a CPU id as decided by the device.
+unsigned interface::hash2cpu(uint32_t hash) {
+  return _dev->hash2cpu(hash);
+}
+
+// RSS key of the device (returned by reference, no copy).
+const rss_key_type& interface::rss_key() const {
+  return _dev->rss_key();
+}
+
+// Hardware queue count reported by the device.
+uint16_t interface::hw_queues_count() const {
+  return _dev->hw_queues_count();
+}
+
+// One-shot event that delivers a forwarded packet to the device's L2 receive
+// path, decrements the forwarder's in-flight counter and then deletes itself.
+class C_handle_l2forward : public EventCallback {
+  std::shared_ptr<DPDKDevice> sdev;
+  // Refers to the counter owned by interface::forward().
+  // NOTE(review): that counter is thread-local to the forwarding thread
+  // while this callback is dispatched to another event center - confirm the
+  // unsynchronised decrement is acceptable.
+  unsigned &queue_depth;
+  Packet p;
+  unsigned dst;
+
+ public:
+  C_handle_l2forward(std::shared_ptr<DPDKDevice> &dev, unsigned &qd, Packet pkt, unsigned target)
+      : sdev(dev), queue_depth(qd), p(std::move(pkt)), dst(target) {}
+  void do_request(uint64_t fd) override {
+    sdev->l2receive(dst, std::move(p));
+    queue_depth--;
+    // The event owns itself: it was allocated with new by the dispatcher.
+    delete this;
+  }
+};
+
+// Hands packet p over to the event center of worker `target`.
+// At most 1000 forwards may be outstanding per calling thread; beyond that
+// the packet is silently dropped.
+void interface::forward(EventCenter *source, unsigned target, Packet p) {
+  static __thread unsigned queue_depth;
+
+  if (queue_depth >= 1000) {
+    return;
+  }
+  // Decremented again by C_handle_l2forward::do_request().
+  queue_depth++;
+  // FIXME: need ensure this event not be called after EventCenter destruct
+  auto *handler = new C_handle_l2forward(
+      _dev, queue_depth, std::move(p.free_on_cpu(source)), target);
+  _dev->workers[target]->center.dispatch_event_external(handler);
+}
+
+// Entry point for a received frame: looks up the L3 protocol from the
+// Ethernet header, then either forwards the packet to the CPU chosen by the
+// device or strips the Ethernet header and feeds the packet into the L3
+// stream.  Returns the result of the L3 stream, or 0 when the packet was
+// forwarded or dropped.
+int interface::dispatch_packet(EventCenter *center, Packet p) {
+  auto eh = p.get_header<eth_hdr>();
+  if (!eh) {
+    return 0;
+  }
+
+  auto proto_it = _proto_map.find(ntoh(eh->eth_proto));
+  auto hwrss = p.rss_hash();
+  if (!hwrss) {
+    ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+                   << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+                   << " length " << std::dec << p.len() << dendl;
+  } else {
+    ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+                   << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+                   << " length " << std::dec << p.len() << " rss_hash " << *p.rss_hash() << dendl;
+  }
+
+  // No handler registered for this protocol: drop.
+  if (proto_it == _proto_map.end()) {
+    return 0;
+  }
+
+  l3_rx_stream& l3 = proto_it->second;
+  // The hash callback prefers the hash attached to the packet and otherwise
+  // computes a Toeplitz hash over the fields selected by the L3 protocol.
+  auto target_cpu = _dev->forward_dst(center->get_id(), [&p, &l3, this] () {
+    auto hwrss = p.rss_hash();
+    if (hwrss) {
+      return *hwrss;
+    } else {
+      forward_hash data;
+      if (l3.forward(data, p, sizeof(eth_hdr))) {
+        return toeplitz_hash(rss_key(), data);
+      }
+      return 0u;
+    }
+  });
+  if (target_cpu != center->get_id()) {
+    ldout(cct, 1) << __func__ << " forward to " << target_cpu << dendl;
+    forward(center, target_cpu, std::move(p));
+    return 0;
+  }
+
+  auto host_hdr = eh->ntoh();
+  auto from = host_hdr.src_mac;
+  p.trim_front(sizeof(*eh));
+  // avoid chaining, since queue length is unlimited
+  // drop instead.
+  if (!l3.ready()) {
+    return 0;
+  }
+  return l3.packet_stream.produce(std::move(p), from);
+}
+
+// One-shot event that passes an (L2 address, L3 address) pair to a worker's
+// arp_learn() and then deletes itself.
+class C_arp_learn : public EventCallback {
+  DPDKWorker *worker;
+  ethernet_address l2_addr;
+  ipv4_address l3_addr;
+
+ public:
+  C_arp_learn(DPDKWorker *w, ethernet_address l2, ipv4_address l3)
+      : worker(w), l2_addr(l2), l3_addr(l3) {}
+  void do_request(uint64_t id) override {
+    worker->arp_learn(l2_addr, l3_addr);
+    // The event owns itself: it was allocated with new by the dispatcher.
+    delete this;
+  }
+};
+
+// Broadcasts a learned (L2, L3) address pair to every worker of the device
+// by queueing one C_arp_learn event per worker.
+void interface::arp_learn(ethernet_address l2, ipv4_address l3)
+{
+  for (auto &&w : _dev->workers) {
+    auto *learn_event = new C_arp_learn(w, l2, l3);
+    w->center.dispatch_event_external(learn_event);
+  }
+}
+
+// Remembers the interface and protocol number, and registers func with the
+// interface as a source of outgoing packets.
+l3_protocol::l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func)
+    : _netif(netif),
+      _proto_num(proto_num)
+{
+  _netif->register_packet_provider(std::move(func));
+}
+
+// Subscribes rx_fn to packets of this protocol on the interface; forward is
+// handed to the interface together with it (see interface::register_l3).
+subscription<Packet, ethernet_address> l3_protocol::receive(
+    std::function<int (Packet, ethernet_address)> rx_fn,
+    std::function<bool (forward_hash &h, Packet &p, size_t s)> forward) {
+  return _netif->register_l3(_proto_num, std::move(rx_fn), std::move(forward));
+}
diff --git a/src/msg/async/dpdk/net.h b/src/msg/async/dpdk/net.h
new file mode 100644
index 00000000..63f0422b
--- /dev/null
+++ b/src/msg/async/dpdk/net.h
@@ -0,0 +1,138 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_NET_H
+#define CEPH_MSG_DPDK_NET_H
+
+#include "const.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "toeplitz.h"
+
+struct hw_features {
+  // Enable tx ip header checksum offload
+  bool tx_csum_ip_offload = false;
+  // Enable tx l4 (TCP or UDP) checksum offload
+  bool tx_csum_l4_offload = false;
+  // Enable rx checksum offload
+  bool rx_csum_offload = false;
+  // LRO is enabled
+  bool rx_lro = false;
+  // Enable tx TCP segment offload
+  bool tx_tso = false;
+  // Enable tx UDP fragmentation offload
+  bool tx_ufo = false;
+  // Maximum Transmission Unit
+  uint16_t mtu = 1500;
+  // Maximum packet len when TCP/UDP offload is enabled
+  uint16_t max_packet_len = ip_packet_len_max - eth_hdr_len;
+};
+
+// Fixed-capacity byte buffer collecting the fields that feed the RSS hash.
+class forward_hash {
+  uint8_t data[64];
+  size_t end_idx = 0;
+ public:
+  size_t size() const { return end_idx; }
+  // Appends one byte; asserts when all 64 slots are already used.
+  void push_back(uint8_t b) {
+    ceph_assert(end_idx < sizeof(data));
+    data[end_idx] = b;
+    ++end_idx;
+  }
+  // Wider values are appended least-significant byte first.
+  void push_back(uint16_t b) {
+    push_back(static_cast<uint8_t>(b & 0xff));
+    push_back(static_cast<uint8_t>(b >> 8));
+  }
+  void push_back(uint32_t b) {
+    push_back(static_cast<uint16_t>(b & 0xffff));
+    push_back(static_cast<uint16_t>(b >> 16));
+  }
+  const uint8_t& operator[](size_t idx) const { return data[idx]; }
+};
+
+class interface;
+
+// Per-protocol handle that registers an L3 protocol with an interface.
+class l3_protocol {
+ public:
+  struct l3packet {
+    eth_protocol_num proto_num;
+    ethernet_address to;
+    Packet p;
+  };
+  using packet_provider_type = std::function<Tub<l3packet> ()>;
+ private:
+  interface* _netif;
+  eth_protocol_num _proto_num;
+
+ public:
+  explicit l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func);
+  // Forwards rx_fn and forward to interface::register_l3() for this protocol.
+  subscription<Packet, ethernet_address> receive(
+      std::function<int (Packet, ethernet_address)> rx_fn,
+      std::function<bool (forward_hash &h, Packet &p, size_t s)> forward);
+ private:
+  friend class interface;
+};
+
+class DPDKDevice;
+struct ipv4_address;
+
+// Ethernet interface on a DPDK device; feeds received packets to registered L3 protocols.
+class interface {
+  CephContext *cct;
+  struct l3_rx_stream {
+    stream<Packet, ethernet_address> packet_stream;
+    std::function<bool (forward_hash&, Packet&, size_t)> forward;
+    bool ready() { return packet_stream.started(); }
+    // fw is a named rvalue reference, i.e. an lvalue here: move it instead of copying.
+    explicit l3_rx_stream(std::function<bool (forward_hash&, Packet&, size_t)>&& fw) : forward(std::move(fw)) {}
+  };
+  std::unordered_map<uint16_t, l3_rx_stream> _proto_map;
+  std::shared_ptr<DPDKDevice> _dev;
+  subscription<Packet> _rx;
+  ethernet_address _hw_address;
+  struct hw_features _hw_features;
+  std::vector<l3_protocol::packet_provider_type> _pkt_providers;
+  int dispatch_packet(EventCenter *c, Packet p);
+ public:
+  explicit interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center);
+  ethernet_address hw_address() { return _hw_address; }
+  const struct hw_features& get_hw_features() const { return _hw_features; }
+  subscription<Packet, ethernet_address> register_l3(
+      eth_protocol_num proto_num,
+      std::function<int (Packet, ethernet_address)> next,
+      std::function<bool (forward_hash&, Packet&, size_t)> forward);
+  void forward(EventCenter *source, unsigned target, Packet p);
+  unsigned hash2cpu(uint32_t hash);
+  void register_packet_provider(l3_protocol::packet_provider_type func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  const rss_key_type& rss_key() const;
+  uint16_t hw_queues_count() const;
+  void arp_learn(ethernet_address l2, ipv4_address l3);
+  friend class l3_protocol;
+};
+
+#endif //CEPH_MSG_DPDK_NET_H
diff --git a/src/msg/async/dpdk/queue.h b/src/msg/async/dpdk/queue.h
new file mode 100644
index 00000000..984ddca1
--- /dev/null
+++ b/src/msg/async/dpdk/queue.h
@@ -0,0 +1,96 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_QUEUE_H_
+#define CEPH_MSG_DPDK_QUEUE_H_
+
+#include <queue>
+
+#include "circular_buffer.h"
+
+// Bounded FIFO of T: push() refuses new items once _max are queued.
+template <typename T>
+class queue {
+  std::queue<T, circular_buffer<T>> _q;
+  size_t _max;
+ public:
+  explicit queue(size_t size): _max(size) {}
+
+  // Push an item.
+  //
+  // Returns false if the queue was full and the item was not pushed.
+  bool push(T&& a);
+
+  // pops an item.
+  T pop();
+
+  // Consumes items from the queue, passing them to @func, until @func
+  // returns false or the queue is empty
+  //
+  // Returns false if func returned false.
+  template <typename Func>
+  bool consume(Func&& func);
+
+  // Returns true when the queue is empty.
+  bool empty() const;
+
+  // Returns true when the queue is full.
+  bool full() const;
+
+  size_t size() const { return _q.size(); }
+
+  // Destroy any items in the queue
+  void clear() {
+    while (!_q.empty()) {
+      _q.pop();
+    }
+  }
+};
+
+// Appends data unless _max items are already queued; returns whether it was
+// stored.  On rejection, data is left untouched (it is not moved from).
+template <typename T>
+inline bool queue<T>::push(T&& data) {
+  if (_q.size() >= _max) {
+    return false;
+  }
+  _q.push(std::move(data));
+  return true;
+}
+
+template <typename T>
+inline T queue<T>::pop() {
+  T data = std::move(_q.front());
+  _q.pop();
+  return data;
+}
+
+template <typename T>
+inline bool queue<T>::empty() const {
+  return _q.empty();
+}
+
+template <typename T>
+inline bool queue<T>::full() const {
+  return _q.size() == _max;
+}
+
+#endif /* CEPH_MSG_DPDK_QUEUE_H_ */
diff --git a/src/msg/async/dpdk/shared_ptr.h b/src/msg/async/dpdk/shared_ptr.h
new file mode 100644
index 00000000..d078063b
--- /dev/null
+++ b/src/msg/async/dpdk/shared_ptr.h
@@ -0,0 +1,391 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:4; indent-tabs-mode:nil -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_LW_SHARED_PTR_H_
+#define CEPH_LW_SHARED_PTR_H_
+
+#include <utility>
+#include <type_traits>
+#include <functional>
+#include <iostream>
+
+// This header defines a shared pointer facility, lw_shared_ptr<>,
+// modeled after std::shared_ptr<>.
+//
+// Unlike std::shared_ptr<>, this implementation is not thread
+// safe, and two pointers sharing the same object must not be used in
+// different threads.
+//
+// lw_shared_ptr<> is the more lightweight variant, with a lw_shared_ptr<>
+// occupying just one machine word, and adding just one word to the shared
+// object.  However, it does not support polymorphism.
+//
+// It supports shared_from_this() via the enable_lw_shared_from_this<>
+// CRTP base class.
+//
+
+template <typename T>
+class lw_shared_ptr;
+
+template <typename T>
+class enable_lw_shared_from_this;
+
+template <typename T>
+class enable_shared_from_this;
+
+template <typename T, typename... A>
+lw_shared_ptr<T> make_lw_shared(A&&... a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T&& a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T& a);
+
+struct lw_shared_ptr_counter_base {
+    long _count = 0;
+};
+
+
+namespace internal {
+
+template <class T, class U>
+struct lw_shared_ptr_accessors;
+
+template <class T>
+struct lw_shared_ptr_accessors_esft;
+
+template <class T>
+struct lw_shared_ptr_accessors_no_esft;
+
+}
+
+
+// We want to support two use cases for lw_shared_ptr<T>:
+//
+//   1. T is any type (primitive or class type)
+//
+//   2. T is a class type that inherits from enable_lw_shared_from_this<T>.
+//
+// In the first case, we must wrap T in an object containing the counter,
+// since T may be a primitive type and cannot be a base class.
+//
+// In the second case, we want T to reach the counter through its
+// enable_lw_shared_from_this<> base class, so that we can implement
+// shared_from_this().
+//
+// To implement those two conflicting requirements (T alongside its counter;
+// T inherits from an object containing the counter) we use std::conditional<>
+// and some accessor functions to select between two implementations.
+
+
+// CRTP from this to enable shared_from_this:
+template <typename T>
+class enable_lw_shared_from_this : private lw_shared_ptr_counter_base {
+    using ctor = T;
+protected:
+    enable_lw_shared_from_this() noexcept {}
+    enable_lw_shared_from_this(enable_lw_shared_from_this&&) noexcept {}
+    enable_lw_shared_from_this(const enable_lw_shared_from_this&) noexcept {}
+    enable_lw_shared_from_this& operator=(const enable_lw_shared_from_this&) noexcept { return *this; }
+    enable_lw_shared_from_this& operator=(enable_lw_shared_from_this&&) noexcept { return *this; }
+public:
+    lw_shared_ptr<T> shared_from_this();
+    lw_shared_ptr<const T> shared_from_this() const;
+
+    template <typename X>
+    friend class lw_shared_ptr;
+    template <typename X>
+    friend class ::internal::lw_shared_ptr_accessors_esft;
+    template <typename X, class Y>
+    friend class ::internal::lw_shared_ptr_accessors;
+};
+
+template <typename T>
+struct shared_ptr_no_esft : private lw_shared_ptr_counter_base {
+    T _value;
+
+    shared_ptr_no_esft() = default;
+    shared_ptr_no_esft(const T& x) : _value(x) {}
+    shared_ptr_no_esft(T&& x) : _value(std::move(x)) {}
+    template <typename... A>
+    shared_ptr_no_esft(A&&... a) : _value(std::forward<A>(a)...) {}
+
+    template <typename X>
+    friend class lw_shared_ptr;
+    template <typename X>
+    friend class ::internal::lw_shared_ptr_accessors_no_esft;
+    template <typename X, class Y>
+    friend class ::internal::lw_shared_ptr_accessors;
+};
+
+
+/// Extension point: the user may override this to change how \ref lw_shared_ptr objects are destroyed,
+/// primarily so that incomplete classes can be used.
+///
+/// Customizing the deleter requires that \c T be derived from \c enable_lw_shared_from_this<T>.
+/// The specialization must be visible for all uses of \c lw_shared_ptr<T>.
+///
+/// To customize, the specialization must provide a `static void dispose(T*)`
+/// member function that disposes of the object.
+template <typename T>
+struct lw_shared_ptr_deleter;  // No generic implementation
+
+namespace internal {
+
+template <typename T>
+struct lw_shared_ptr_accessors_esft {
+    using concrete_type = std::remove_const_t<T>;
+    static T* to_value(lw_shared_ptr_counter_base* counter) {
+        return static_cast<T*>(counter);
+    }
+    static void dispose(lw_shared_ptr_counter_base* counter) {
+        delete static_cast<T*>(counter);
+    }
+    static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+        // since to_value() is defined above, we don't need to do anything special
+        // to force-instantiate it
+    }
+};
+
+template <typename T>
+struct lw_shared_ptr_accessors_no_esft {
+    using concrete_type = shared_ptr_no_esft<T>;
+    static T* to_value(lw_shared_ptr_counter_base* counter) {
+        return &static_cast<concrete_type*>(counter)->_value;
+    }
+    static void dispose(lw_shared_ptr_counter_base* counter) {
+        delete static_cast<concrete_type*>(counter);
+    }
+    static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+        // since to_value() is defined above, we don't need to do anything special
+        // to force-instantiate it
+    }
+};
+
+// Generic case: lw_shared_ptr_deleter<T> is not specialized, select
+// implementation based on whether T inherits from enable_lw_shared_from_this<T>.
+template <typename T, typename U = void>
+struct lw_shared_ptr_accessors : std::conditional_t<
+         std::is_base_of<enable_lw_shared_from_this<T>, T>::value,
+         lw_shared_ptr_accessors_esft<T>,
+         lw_shared_ptr_accessors_no_esft<T>> {
+};
+
+// Overload when lw_shared_ptr_deleter<T> specialized
+template <typename T>
+struct lw_shared_ptr_accessors<T, std::void_t<decltype(lw_shared_ptr_deleter<T>{})>> {
+    using concrete_type = T;
+    static T* to_value(lw_shared_ptr_counter_base* counter);
+    static void dispose(lw_shared_ptr_counter_base* counter) {
+        lw_shared_ptr_deleter<T>::dispose(to_value(counter));
+    }
+    static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+        // instantiate to_value(); must be defined by shared_ptr_incomplete.hh
+        to_value(p);
+    }
+};
+
+}
+
+// Single-word, non-atomic reference-counted pointer to T.  The counter lives
+// in the lw_shared_ptr_counter_base object that _p points at.
+template <typename T>
+class lw_shared_ptr {
+    using accessors = ::internal::lw_shared_ptr_accessors<std::remove_const_t<T>>;
+    using concrete_type = typename accessors::concrete_type;
+    mutable lw_shared_ptr_counter_base* _p = nullptr;
+private:
+    lw_shared_ptr(lw_shared_ptr_counter_base* p) noexcept : _p(p) {
+        if (_p) {
+            ++_p->_count;
+        }
+    }
+    template <typename... A>
+    static lw_shared_ptr make(A&&... a) {
+        auto p = new concrete_type(std::forward<A>(a)...);
+        accessors::instantiate_to_value(p);
+        return lw_shared_ptr(p);
+    }
+public:
+    using element_type = T;
+
+    lw_shared_ptr() noexcept = default;
+    lw_shared_ptr(std::nullptr_t) noexcept : lw_shared_ptr() {}
+    lw_shared_ptr(const lw_shared_ptr& x) noexcept : _p(x._p) {
+        if (_p) {
+            ++_p->_count;
+        }
+    }
+    lw_shared_ptr(lw_shared_ptr&& x) noexcept  : _p(x._p) {
+        x._p = nullptr;
+    }
+    [[gnu::always_inline]]
+    ~lw_shared_ptr() {
+        if (_p && !--_p->_count) {
+            accessors::dispose(_p);
+        }
+    }
+    lw_shared_ptr& operator=(const lw_shared_ptr& x) noexcept {
+        if (_p != x._p) {
+            this->~lw_shared_ptr();
+            new (this) lw_shared_ptr(x);
+        }
+        return *this;
+    }
+    lw_shared_ptr& operator=(lw_shared_ptr&& x) noexcept {
+        if (_p != x._p) {
+            this->~lw_shared_ptr();
+            new (this) lw_shared_ptr(std::move(x));
+        }
+        return *this;
+    }
+    lw_shared_ptr& operator=(std::nullptr_t) noexcept {
+        return *this = lw_shared_ptr();
+    }
+    lw_shared_ptr& operator=(T&& x) noexcept {
+        this->~lw_shared_ptr();
+        new (this) lw_shared_ptr(make_lw_shared<T>(std::move(x)));
+        return *this;
+    }
+
+    T& operator*() const noexcept { return *accessors::to_value(_p); }
+    T* operator->() const noexcept { return accessors::to_value(_p); }
+    T* get() const noexcept {
+        if (_p) {
+            return accessors::to_value(_p);
+        } else {
+            return nullptr;
+        }
+    }
+
+    long int use_count() const noexcept {
+        if (_p) {
+            return _p->_count;
+        } else {
+            return 0;
+        }
+    }
+
+    operator lw_shared_ptr<const T>() const noexcept {
+        return lw_shared_ptr<const T>(_p);
+    }
+
+    explicit operator bool() const noexcept {
+        return _p;
+    }
+
+    // True when this is the only reference; an empty pointer owns nothing.
+    bool owned() const noexcept {
+        return _p && _p->_count == 1;
+    }
+
+    bool operator==(const lw_shared_ptr<const T>& x) const {
+        return _p == x._p;
+    }
+    bool operator!=(const lw_shared_ptr<const T>& x) const {
+        return !operator==(x);
+    }
+
+    bool operator==(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return _p == x._p;
+    }
+    bool operator!=(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return !operator==(x);
+    }
+
+    bool operator<(const lw_shared_ptr<const T>& x) const {
+        return _p < x._p;
+    }
+    bool operator<(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return _p < x._p;
+    }
+
+    template <typename U>
+    friend class lw_shared_ptr;
+
+    template <typename X, typename... A>
+    friend lw_shared_ptr<X> make_lw_shared(A&&...);
+
+    template <typename U>
+    friend lw_shared_ptr<U> make_lw_shared(U&&);
+
+    template <typename U>
+    friend lw_shared_ptr<U> make_lw_shared(U&);
+
+    template <typename U>
+    friend class enable_lw_shared_from_this;
+};
+
+template <typename T, typename... A>
+inline
+lw_shared_ptr<T> make_lw_shared(A&&... a) {
+    return lw_shared_ptr<T>::make(std::forward<A>(a)...);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T&& a) {
+    return lw_shared_ptr<T>::make(std::move(a));
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T& a) {
+    return lw_shared_ptr<T>::make(a);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T>
+enable_lw_shared_from_this<T>::shared_from_this() {
+    return lw_shared_ptr<T>(this);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<const T>
+enable_lw_shared_from_this<T>::shared_from_this() const {
+    return lw_shared_ptr<const T>(const_cast<enable_lw_shared_from_this*>(this));
+}
+
+// Streams the pointee of p, or the literal "null" when p is empty, and
+// returns the stream.
+template <typename T>
+static inline
+std::ostream& operator<<(std::ostream& out, const lw_shared_ptr<T>& p) {
+    if (p) { return out << *p; }
+    return out << "null";
+}
+
+namespace std {
+
+  template <typename T>
+  struct hash<lw_shared_ptr<T>> : private hash<T*> {
+    size_t operator()(const lw_shared_ptr<T>& p) const {
+        return hash<T*>::operator()(p.get());
+    }
+  };
+
+}
+
+#endif /* CEPH_LW_SHARED_PTR_H_ */
diff --git a/src/msg/async/dpdk/stream.h b/src/msg/async/dpdk/stream.h
new file mode 100644
index 00000000..1898e8f8
--- /dev/null
+++ b/src/msg/async/dpdk/stream.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_STREAM_H_
+#define CEPH_MSG_STREAM_H_
+
+#include <exception>
+#include <cassert>
+
+// A stream<> is the producer side.  It may call produce() as long
+// as the returned from the previous invocation is ready.
+// To signify no more data is available, call close().
+//
+// A subscription<> is the consumer side.  It is created by a call
+// to stream::listen().  Calling subscription::start(),
+// which registers the data processing callback, starts processing
+// events.  It may check for end-of-stream by calling done(),
+// which returns the status recorded by the producer's close() or
+// set_exception() call.
+//
+// The consumer can pause generation of new data by returning
+// positive integer; when it becomes ready, the producer
+// will resume processing.
+
+template <typename... T>
+class subscription;
+
+// Producer side of a stream<>/subscription<> pair.
+template <typename... T>
+class stream {
+  subscription<T...>* _sub = nullptr;
+  int done = 0;        // 0 while open; 1 after close(), else the error code
+  bool ready = false;  // becomes true once subscription::start() has run
+ public:
+  using next_fn = std::function<int (T...)>;
+  stream() = default;
+  stream(const stream&) = delete;
+  stream(stream&&) = delete;
+  ~stream() {
+    if (_sub) {
+      _sub->_stream = nullptr;
+    }
+  }
+  void operator=(const stream&) = delete;
+  void operator=(stream&&) = delete;
+
+  // Returns a subscription that reads value from this
+  // stream.
+  subscription<T...> listen() {
+    return subscription<T...>(this);
+  }
+
+  // Returns a subscription that reads value from this
+  // stream, and also sets up the listen function.
+  subscription<T...> listen(next_fn next) {
+    auto sub = subscription<T...>(this);
+    sub.start(std::move(next));
+    return sub;
+  }
+
+  // Becomes ready when the listener is ready to accept
+  // values.  Call only once, when beginning to produce
+  // values.
+  bool started() {
+    return ready;
+  }
+
+  // Produce a value.  Call only after started(), and after
+  // a previous produce() is ready.
+  int produce(T... data) {
+      return _sub->_next(std::move(data)...);
+  }
+
+  // End the stream.   Call only after started(), and after
+  // a previous produce() is ready.  No functions may be called
+  // after this.
+  void close() {
+    done = 1;
+  }
+
+  // Signal an error.   Call only after started(), and after
+  // a previous produce() is ready.  No functions may be called
+  // after this.
+  void set_exception(int error) {
+    done = error;
+  }
+ private:
+  void start();
+  friend class subscription<T...>;
+};
+
+// Consumer end of a stream<>: holds the callback that stream::produce() invokes.
+template <typename... T>
+class subscription {
+ public:
+  using next_fn = typename stream<T...>::next_fn;
+ private:
+  stream<T...>* _stream;
+  next_fn _next;
+ private:
+  explicit subscription(stream<T...>* s): _stream(s) {
+    ceph_assert(!_stream->_sub);
+    _stream->_sub = this;
+  }
+ public:
+  subscription(subscription&& x)
+    : _stream(x._stream), _next(std::move(x._next)) {
+    x._stream = nullptr;
+    if (_stream) {
+      _stream->_sub = this;
+    }
+  }
+  ~subscription() {
+    if (_stream) {
+      _stream->_sub = nullptr;
+    }
+  }
+
+  /// \brief Start receiving events from the stream.
+  ///
+  /// \param next Callback to call for each event
+  void start(std::function<int (T...)> next) {
+    _next = std::move(next);
+    _stream->ready = true;
+  }
+
+  // Returns the stream's completion status as recorded by stream::close()
+  // (1) or stream::set_exception() (the error code).
+  int done() {
+    return _stream->done;
+  }
+
+  friend class stream<T...>;
+};
+
+#endif /* CEPH_MSG_STREAM_H_ */
diff --git a/src/msg/async/dpdk/toeplitz.h b/src/msg/async/dpdk/toeplitz.h
new file mode 100644
index 00000000..3ca38808
--- /dev/null
+++ b/src/msg/async/dpdk/toeplitz.h
@@ -0,0 +1,92 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*-
+ * Copyright (c) 2010 David Malone <dwmalone@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef CEPH_MSG_TOEPLITZ_H_
+#define CEPH_MSG_TOEPLITZ_H_
+
+#include <vector>
+
+using rss_key_type = std::vector<uint8_t>;
+
+// Mellanox Linux's driver key
+static const rss_key_type default_rsskey_40bytes = {
+    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
+    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
+    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
+    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
+    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
+};
+
+// Intel's i40e PMD default RSS key
+static const rss_key_type default_rsskey_52bytes = {
+    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
+    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
+    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
+    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
+    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
+    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
+    0x81, 0x15, 0x03, 0x66
+};
+
+// Toeplitz (RSS) hash of data under key: each set input bit XORs in the 32-bit
+// key window at that bit offset.  key must hold at least 4 bytes; bits past
+// the end of the key are shifted in as zero.
+template<typename T>
+static inline uint32_t toeplitz_hash(const rss_key_type& key, const T& data)
+{
+	uint32_t hash = 0;
+	// Widen before shifting so the top key byte never lands in an int sign bit.
+	uint32_t v = (uint32_t(key[0]) << 24) | (uint32_t(key[1]) << 16) |
+		     (uint32_t(key[2]) << 8) | uint32_t(key[3]);
+	for (size_t i = 0; i < data.size(); i++) {
+		for (unsigned b = 0; b < 8; b++) {
+			if (data[i] & (1 << (7 - b)))
+				hash ^= v;
+			v <<= 1;
+			if ((i + 4) < key.size() && (key[i + 4] & (1 << (7 - b))))
+				v |= 1;
+		}
+	}
+	return hash;
+}
+#endif
diff --git a/src/msg/async/dpdk/transfer.h b/src/msg/async/dpdk/transfer.h
new file mode 100644
index 00000000..599db5bd
--- /dev/null
+++ b/src/msg/async/dpdk/transfer.h
@@ -0,0 +1,64 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_TRANSFER_H_
+#define CEPH_TRANSFER_H_
+
+// Helper functions for copying or moving multiple objects in an exception
+// safe manner, then destroying the sources.
+//
+// To transfer, call transfer_pass1(allocator, &from, &to) on all object pairs,
+// (this copies the object from @from to @to).  If no exceptions are encountered,
+// call transfer_pass2(allocator, &from, &to).  This destroys the object at the
+// origin.  If exceptions were encountered, simply destroy all copied objects.
+//
+// As an optimization, if the objects are moveable without throwing (noexcept)
+// transfer_pass1() simply moves the objects and destroys the source, and
+// transfer_pass2() does nothing.
+
+#include <type_traits>
+#include <utility>
+
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+                           typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+    a.construct(to, std::move(*from));
+    a.destroy(from);
+}
+
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+                           typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+}
+
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+               typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+    a.construct(to, *from);
+}
+
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+               typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+    a.destroy(from);
+}
+
+#endif /* CEPH_TRANSFER_H_ */
diff --git a/src/msg/async/frames_v2.cc b/src/msg/async/frames_v2.cc
new file mode 100644
index 00000000..f047eb18
--- /dev/null
+++ b/src/msg/async/frames_v2.cc
@@ -0,0 +1,480 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "frames_v2.h"
+
+#include <ostream>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include "seastar/fmt/include/fmt/format.h"
+
+namespace ceph::msgr::v2 {
+
+// Truncates bl to unpadded_len bytes, dropping whatever padding follows.
+static void unpad_zero(bufferlist& bl, uint32_t unpadded_len) {
+  ceph_assert(bl.length() >= unpadded_len);  // bl must not be shorter than the requested length
+  if (bl.length() > unpadded_len) {
+    bl.splice(unpadded_len, bl.length() - unpadded_len);  // cut the tail; no destination list is passed
+  }
+}
+
+// Returns the segment count for a frame: trailing empty segments are
+// discarded, but a frame always has at least one (possibly empty) segment.
+static size_t calc_num_segments(const bufferlist segment_bls[],
+                                size_t segment_count) {
+  ceph_assert(segment_count > 0 && segment_count <= MAX_NUM_SEGMENTS);
+  for (size_t i = segment_count; i-- > 0; ) {  // scan from the last segment backwards
+    if (segment_bls[i].length() > 0) {
+      return i + 1;  // index of the last non-empty segment, plus one
+    }
+  }
+  return 1;  // every segment is empty: keep just the first one
+}
+
+static void check_segment_crc(const bufferlist& segment_bl,  // throws FrameError on crc mismatch
+                              uint32_t expected_crc) {
+  uint32_t crc = segment_bl.crc32c(-1);  // same -1 seed as the asm_crc_*() functions use
+  if (crc != expected_crc) {
+    throw FrameError(fmt::format(
+        "bad segment crc calculated={} expected={}", crc, expected_crc));
+  }
+}
+
+// Returns true if the frame is ready for dispatching, or false if it was
+// aborted by the sender and must be dropped.  Throws FrameError otherwise.
+static bool check_epilogue_late_status(__u8 late_status) {
+  __u8 aborted = late_status & FRAME_LATE_STATUS_ABORTED_MASK;  // keep only the low nibble
+  if (aborted != FRAME_LATE_STATUS_ABORTED &&
+      aborted != FRAME_LATE_STATUS_COMPLETE) {
+    throw FrameError(fmt::format("bad late_status"));  // nibble is neither of the two code words
+  }
+  return aborted == FRAME_LATE_STATUS_COMPLETE;
+}
+
+void FrameAssembler::fill_preamble(Tag tag,  // fills preamble from tag and the current m_descs
+                                   preamble_block_t& preamble) const {
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&preamble, 0, sizeof(preamble));  // also zeroes unused segment slots and _reserved
+
+  preamble.tag = static_cast<__u8>(tag);
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    preamble.segments[i].length = m_descs[i].logical_len;
+    preamble.segments[i].alignment = m_descs[i].align;
+  }
+  preamble.num_segments = m_descs.size();
+  preamble.crc = ceph_crc32c(  // covers every preamble byte except the trailing crc field itself
+      0, reinterpret_cast<const unsigned char*>(&preamble),
+      sizeof(preamble) - sizeof(preamble.crc));
+}
+
+uint64_t FrameAssembler::get_frame_logical_len() const {  // sum of the logical lengths of all segments
+  ceph_assert(!m_descs.empty());
+  uint64_t logical_len = 0;  // 64-bit accumulator for the 32-bit per-segment lengths
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    logical_len += m_descs[i].logical_len;
+  }
+  return logical_len;
+}
+
+uint64_t FrameAssembler::get_frame_onwire_len() const {  // preamble + all segments + epilogue, as sent
+  ceph_assert(!m_descs.empty());
+  uint64_t onwire_len = get_preamble_onwire_len();
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    onwire_len += get_segment_onwire_len(i);
+  }
+  onwire_len += get_epilogue_onwire_len();  // 0 for a single-segment msgr2.1 frame
+  return onwire_len;
+}
+
+bufferlist FrameAssembler::asm_crc_rev0(const preamble_block_t& preamble,  // msgr2.0, crc mode
+                                        bufferlist segment_bls[]) const {
+  epilogue_crc_rev0_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));  // late_flags stays 0, i.e. FRAME_LATE_FLAG_ABORTED is clear
+
+  bufferlist frame_bl(sizeof(preamble) + sizeof(epilogue));
+  frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble));
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    epilogue.crc_values[i] = segment_bls[i].crc32c(-1);  // recorded even when the segment is empty
+    if (segment_bls[i].length() > 0) {
+      frame_bl.claim_append(segment_bls[i]);
+    }
+  }
+  frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue));  // epilogue always goes last
+  return frame_bl;
+}
+
+bufferlist FrameAssembler::asm_secure_rev0(const preamble_block_t& preamble,  // msgr2.0, secure mode
+                                           bufferlist segment_bls[]) const {
+  bufferlist preamble_bl(sizeof(preamble));
+  preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+                     sizeof(preamble));
+
+  epilogue_secure_rev0_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));  // late_flags stays 0, i.e. FRAME_LATE_FLAG_ABORTED is clear
+  bufferlist epilogue_bl(sizeof(epilogue));
+  epilogue_bl.append(reinterpret_cast<const char*>(&epilogue),
+                     sizeof(epilogue));
+
+  // one length slot each for: preamble + up to MAX_NUM_SEGMENTS segments + epilogue
+  uint32_t onwire_lens[MAX_NUM_SEGMENTS + 2];
+  onwire_lens[0] = preamble_bl.length();
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    onwire_lens[i + 1] = segment_bls[i].length();  // already padded
+  }
+  onwire_lens[m_descs.size() + 1] = epilogue_bl.length();
+  m_crypto->tx->reset_tx_handler(onwire_lens,  // [begin, end) over the slots filled in above
+                                 onwire_lens + m_descs.size() + 2);
+  m_crypto->tx->authenticated_encrypt_update(preamble_bl);
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    if (segment_bls[i].length() > 0) {
+      m_crypto->tx->authenticated_encrypt_update(segment_bls[i]);
+    }
+  }
+  m_crypto->tx->authenticated_encrypt_update(epilogue_bl);
+  return m_crypto->tx->authenticated_encrypt_final();  // single final call for preamble, segments and epilogue
+}
+
+bufferlist FrameAssembler::asm_crc_rev1(const preamble_block_t& preamble,  // msgr2.1, crc mode
+                                        bufferlist segment_bls[]) const {
+  epilogue_crc_rev1_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));
+  epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE;  // frames assembled here are never marked aborted
+
+  bufferlist frame_bl(sizeof(preamble) + FRAME_CRC_SIZE + sizeof(epilogue));
+  frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble));
+
+  ceph_assert(segment_bls[0].length() == m_descs[0].logical_len);
+  if (segment_bls[0].length() > 0) {
+    uint32_t crc = segment_bls[0].crc32c(-1);  // computed before the segment is handed to frame_bl
+    frame_bl.claim_append(segment_bls[0]);
+    encode(crc, frame_bl);  // the first segment's crc directly follows its payload
+  }
+  if (m_descs.size() == 1) {
+    return frame_bl;  // no epilogue if only one segment
+  }
+
+  for (size_t i = 1; i < m_descs.size(); i++) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    epilogue.crc_values[i - 1] = segment_bls[i].crc32c(-1);  // epilogue slots start at the second segment
+    if (segment_bls[i].length() > 0) {
+      frame_bl.claim_append(segment_bls[i]);
+    }
+  }
+  frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue));
+  return frame_bl;
+}
+
+bufferlist FrameAssembler::asm_secure_rev1(const preamble_block_t& preamble,  // msgr2.1, secure mode
+                                           bufferlist segment_bls[]) const {
+  bufferlist preamble_bl;
+  if (segment_bls[0].length() > FRAME_PREAMBLE_INLINE_SIZE) {
+    // first segment is partially inlined, inline buffer is full
+    preamble_bl.reserve(sizeof(preamble));
+    preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+                       sizeof(preamble));
+    segment_bls[0].splice(0, FRAME_PREAMBLE_INLINE_SIZE, &preamble_bl);  // head of segment 0 goes after the preamble
+  } else {
+    // first segment is fully inlined, inline buffer may need padding
+    uint32_t pad_len = FRAME_PREAMBLE_INLINE_SIZE - segment_bls[0].length();
+    preamble_bl.reserve(sizeof(preamble) + pad_len);
+    preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+                       sizeof(preamble));
+    preamble_bl.claim_append(segment_bls[0]);
+    if (pad_len > 0) {
+      preamble_bl.append_zero(pad_len);  // zero-fill the inline buffer up to FRAME_PREAMBLE_INLINE_SIZE
+    }
+  }
+
+  m_crypto->tx->reset_tx_handler({preamble_bl.length()});  // preamble + inline buffer: encrypted on its own
+  m_crypto->tx->authenticated_encrypt_update(preamble_bl);
+  auto frame_bl = m_crypto->tx->authenticated_encrypt_final();
+
+  if (segment_bls[0].length() > 0) {  // whatever part of segment 0 was not inlined above
+    m_crypto->tx->reset_tx_handler({segment_bls[0].length()});
+    m_crypto->tx->authenticated_encrypt_update(segment_bls[0]);
+    auto tmp = m_crypto->tx->authenticated_encrypt_final();
+    frame_bl.claim_append(tmp);
+  }
+  if (m_descs.size() == 1) {
+    return frame_bl;  // no epilogue if only one segment
+  }
+
+  epilogue_secure_rev1_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));
+  epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE;  // frames assembled here are never marked aborted
+  bufferlist epilogue_bl(sizeof(epilogue));
+  epilogue_bl.append(reinterpret_cast<const char*>(&epilogue),
+                     sizeof(epilogue));
+
+  // one length slot each for: up to MAX_NUM_SEGMENTS - 1 remaining segments + epilogue
+  uint32_t onwire_lens[MAX_NUM_SEGMENTS];
+  for (size_t i = 1; i < m_descs.size(); i++) {
+    onwire_lens[i - 1] = segment_bls[i].length();  // already padded
+  }
+  onwire_lens[m_descs.size() - 1] = epilogue_bl.length();
+  m_crypto->tx->reset_tx_handler(onwire_lens, onwire_lens + m_descs.size());
+  for (size_t i = 1; i < m_descs.size(); i++) {
+    if (segment_bls[i].length() > 0) {
+      m_crypto->tx->authenticated_encrypt_update(segment_bls[i]);
+    }
+  }
+  m_crypto->tx->authenticated_encrypt_update(epilogue_bl);
+  auto tmp = m_crypto->tx->authenticated_encrypt_final();  // remaining segments and epilogue share one final call
+  frame_bl.claim_append(tmp);
+  return frame_bl;
+}
+
+bufferlist FrameAssembler::assemble_frame(Tag tag, bufferlist segment_bls[],  // builds the on-wire frame
+                                          const uint16_t segment_aligns[],
+                                          size_t segment_count) {
+  m_descs.resize(calc_num_segments(segment_bls, segment_count));  // trailing empty segments are dropped
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    m_descs[i].logical_len = segment_bls[i].length();
+    m_descs[i].align = segment_aligns[i];
+  }
+
+  preamble_block_t preamble;
+  fill_preamble(tag, preamble);
+
+  if (m_crypto->rx) {  // NOTE(review): secure mode is keyed off rx although tx encrypts; presumably both are set together
+    for (size_t i = 0; i < m_descs.size(); i++) {
+      ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+      // We're padding segments to biggest cipher's block size. Although
+      // AES-GCM can live without that as it's a stream cipher, we don't
+      // want to be fixed to stream ciphers only.
+      uint32_t padded_len = get_segment_padded_len(i);
+      if (padded_len > segment_bls[i].length()) {
+        uint32_t pad_len = padded_len - segment_bls[i].length();
+        segment_bls[i].reserve(pad_len);
+        segment_bls[i].append_zero(pad_len);  // padding is zeros; m_descs keeps the unpadded length
+      }
+    }
+    if (m_is_rev1) {
+      return asm_secure_rev1(preamble, segment_bls);
+    }
+    return asm_secure_rev0(preamble, segment_bls);
+  }
+  if (m_is_rev1) {  // crc mode from here on
+    return asm_crc_rev1(preamble, segment_bls);
+  }
+  return asm_crc_rev0(preamble, segment_bls);
+}
+
+Tag FrameAssembler::disassemble_preamble(bufferlist& preamble_bl) {  // validates preamble, fills m_descs, returns tag
+  if (m_crypto->rx) {
+    m_crypto->rx->reset_rx_handler();
+    if (m_is_rev1) {
+      ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE +
+                                          get_auth_tag_len());
+      m_crypto->rx->authenticated_decrypt_update_final(preamble_bl);  // rev1: preamble has its own auth tag
+    } else {
+      ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+      m_crypto->rx->authenticated_decrypt_update(preamble_bl);  // rev0: the final call happens with the epilogue
+    }
+  } else {
+    ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+  }
+
+  // The ceph_le* field types are expected to perform the endian conversion
+  // on access, so passing everything through ::Decode is unnecessary.
+  auto preamble = reinterpret_cast<const preamble_block_t*>(
+      preamble_bl.c_str());
+  // check preamble crc before any further processing
+  uint32_t crc = ceph_crc32c(
+      0, reinterpret_cast<const unsigned char*>(preamble),
+      sizeof(*preamble) - sizeof(preamble->crc));
+  if (crc != preamble->crc) {
+    throw FrameError(fmt::format(
+        "bad preamble crc calculated={} expected={}", crc, preamble->crc));
+  }
+
+  // see calc_num_segments()
+  if (preamble->num_segments < 1 ||
+      preamble->num_segments > MAX_NUM_SEGMENTS) {
+    throw FrameError(fmt::format(
+        "bad number of segments num_segments={}", preamble->num_segments));
+  }
+  if (preamble->num_segments > 1 &&
+      preamble->segments[preamble->num_segments - 1].length == 0) {
+    throw FrameError("last segment empty");  // trailing empty segments must not be counted
+  }
+
+  m_descs.resize(preamble->num_segments);
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    m_descs[i].logical_len = preamble->segments[i].length;
+    m_descs[i].align = preamble->segments[i].alignment;
+  }
+  return static_cast<Tag>(preamble->tag);  // the tag value itself is not validated here
+}
+
+bool FrameAssembler::disasm_all_crc_rev0(bufferlist segment_bls[],  // msgr2.0, crc mode
+                                         bufferlist& epilogue_bl) const {
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev0_block_t));
+  auto epilogue = reinterpret_cast<const epilogue_crc_rev0_block_t*>(
+      epilogue_bl.c_str());
+
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    check_segment_crc(segment_bls[i], epilogue->crc_values[i]);  // throws FrameError on mismatch
+  }
+  return !(epilogue->late_flags & FRAME_LATE_FLAG_ABORTED);  // false means the sender aborted the frame
+}
+
+bool FrameAssembler::disasm_all_secure_rev0(bufferlist segment_bls[],  // msgr2.0, secure mode
+                                            bufferlist& epilogue_bl) const {
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    ceph_assert(segment_bls[i].length() == get_segment_padded_len(i));
+    if (segment_bls[i].length() > 0) {
+      m_crypto->rx->authenticated_decrypt_update(segment_bls[i]);
+      unpad_zero(segment_bls[i], m_descs[i].logical_len);  // strip padding back to the logical length
+    }
+  }
+
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev0_block_t) +
+                                      get_auth_tag_len());
+  m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl);  // the epilogue is the last piece and has the auth tag
+  auto epilogue = reinterpret_cast<const epilogue_secure_rev0_block_t*>(
+      epilogue_bl.c_str());
+  return !(epilogue->late_flags & FRAME_LATE_FLAG_ABORTED);  // false means the sender aborted the frame
+}
+
+void FrameAssembler::disasm_first_crc_rev1(bufferlist& preamble_bl,  // msgr2.1, crc mode, first segment
+                                           bufferlist& segment_bl) const {
+  ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+  if (m_descs[0].logical_len > 0) {
+    ceph_assert(segment_bl.length() == m_descs[0].logical_len +
+                                       FRAME_CRC_SIZE);
+    bufferlist::const_iterator it(&segment_bl, m_descs[0].logical_len);  // positioned at the trailing crc
+    uint32_t expected_crc;
+    decode(expected_crc, it);
+    segment_bl.splice(m_descs[0].logical_len, FRAME_CRC_SIZE);  // drop the crc, leaving only the payload
+    check_segment_crc(segment_bl, expected_crc);
+  } else {
+    ceph_assert(segment_bl.length() == 0);  // an empty first segment carries no crc either
+  }
+}
+
+bool FrameAssembler::disasm_remaining_crc_rev1(bufferlist segment_bls[],  // msgr2.1, crc mode
+                                               bufferlist& epilogue_bl) const {
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev1_block_t));
+  auto epilogue = reinterpret_cast<const epilogue_crc_rev1_block_t*>(
+      epilogue_bl.c_str());
+
+  for (size_t i = 1; i < m_descs.size(); i++) {  // segment 0 is handled by disasm_first_crc_rev1()
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    check_segment_crc(segment_bls[i], epilogue->crc_values[i - 1]);  // epilogue slots start at the second segment
+  }
+  return check_epilogue_late_status(epilogue->late_status);
+}
+
+void FrameAssembler::disasm_first_secure_rev1(bufferlist& preamble_bl,  // msgr2.1, secure mode, first segment
+                                              bufferlist& segment_bl) const {
+  ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE);
+  uint32_t padded_len = get_segment_padded_len(0);
+  if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) {  // part of the segment arrived outside the inline buffer
+    ceph_assert(segment_bl.length() == padded_len + get_auth_tag_len() -
+                                       FRAME_PREAMBLE_INLINE_SIZE);
+    m_crypto->rx->reset_rx_handler();
+    m_crypto->rx->authenticated_decrypt_update_final(segment_bl);
+    // prepend the inline buffer (already decrypted) to segment_bl
+    bufferlist tmp;
+    segment_bl.swap(tmp);
+    preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE,
+                       &segment_bl);
+    segment_bl.claim_append(tmp);
+  } else {  // the whole segment was inlined into the preamble
+    ceph_assert(segment_bl.length() == 0);
+    preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE,
+                       &segment_bl);
+  }
+  unpad_zero(segment_bl, m_descs[0].logical_len);  // strip padding back to the logical length
+  ceph_assert(segment_bl.length() == m_descs[0].logical_len);
+}
+
+bool FrameAssembler::disasm_remaining_secure_rev1(  // msgr2.1, secure mode: segments 1..n-1 and epilogue
+    bufferlist segment_bls[], bufferlist& epilogue_bl) const {
+  m_crypto->rx->reset_rx_handler();
+  for (size_t i = 1; i < m_descs.size(); i++) {  // segment 0 is handled by disasm_first_secure_rev1()
+    ceph_assert(segment_bls[i].length() == get_segment_padded_len(i));
+    if (segment_bls[i].length() > 0) {
+      m_crypto->rx->authenticated_decrypt_update(segment_bls[i]);
+      unpad_zero(segment_bls[i], m_descs[i].logical_len);  // strip padding back to the logical length
+    }
+  }
+
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev1_block_t) +
+                                      get_auth_tag_len());
+  m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl);
+  auto epilogue = reinterpret_cast<const epilogue_secure_rev1_block_t*>(
+      epilogue_bl.c_str());
+  return check_epilogue_late_status(epilogue->late_status);
+}
+
+void FrameAssembler::disassemble_first_segment(bufferlist& preamble_bl,  // dispatches on revision and mode
+                                               bufferlist& segment_bl) const {
+  ceph_assert(!m_descs.empty());  // m_descs is filled in by disassemble_preamble()
+  if (m_is_rev1) {
+    if (m_crypto->rx) {
+      disasm_first_secure_rev1(preamble_bl, segment_bl);
+    } else {
+      disasm_first_crc_rev1(preamble_bl, segment_bl);
+    }
+  } else {
+    // msgr2.0: noop, everything is handled in disassemble_remaining_segments()
+  }
+}
+
+bool FrameAssembler::disassemble_remaining_segments(  // returns false if the sender aborted the frame
+    bufferlist segment_bls[], bufferlist& epilogue_bl) const {
+  ceph_assert(!m_descs.empty());  // m_descs is filled in by disassemble_preamble()
+  if (m_is_rev1) {
+    if (m_descs.size() == 1) {
+      // no epilogue if only one segment
+      ceph_assert(epilogue_bl.length() == 0);
+      return true;
+    }
+    if (m_crypto->rx) {
+      return disasm_remaining_secure_rev1(segment_bls, epilogue_bl);
+    }
+    return disasm_remaining_crc_rev1(segment_bls, epilogue_bl);
+  }
+  if (m_crypto->rx) {  // msgr2.0 from here on: all segments are processed in one go
+    return disasm_all_secure_rev0(segment_bls, epilogue_bl);
+  }
+  return disasm_all_crc_rev0(segment_bls, epilogue_bl);
+}
+
+std::ostream& operator<<(std::ostream& os, const FrameAssembler& frame_asm) {  // debug dump of the frame layout
+  if (!frame_asm.m_descs.empty()) {  // the length getters assert on non-empty m_descs
+    os << frame_asm.get_preamble_onwire_len();
+    for (size_t i = 0; i < frame_asm.m_descs.size(); i++) {
+      os << " + " << frame_asm.get_segment_onwire_len(i)
+         << " (logical " << frame_asm.m_descs[i].logical_len
+         << "/" << frame_asm.m_descs[i].align << ")";
+    }
+    os << " + " << frame_asm.get_epilogue_onwire_len() << " ";
+  }
+  os << "rev1=" << frame_asm.m_is_rev1
+     << " rx=" << frame_asm.m_crypto->rx.get()  // raw handler pointers are streamed
+     << " tx=" << frame_asm.m_crypto->tx.get();
+  return os;
+}
+
+}  // namespace ceph::msgr::v2
diff --git a/src/msg/async/frames_v2.h b/src/msg/async/frames_v2.h
new file mode 100644
index 00000000..88fa4e1b
--- /dev/null
+++ b/src/msg/async/frames_v2.h
@@ -0,0 +1,842 @@
+#ifndef _MSG_ASYNC_FRAMES_V2_
+#define _MSG_ASYNC_FRAMES_V2_
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "crypto_onwire.h"
+#include <array>
+#include <iosfwd>
+#include <utility>
+
+#include <boost/container/static_vector.hpp>
+
+/**
+ * Protocol V2 Frame Structures
+ * 
+ * Documentation in: doc/dev/msgr2.rst
+ **/
+
+namespace ceph::msgr::v2 {
+
+// We require these features from any peer, period, in order to encode
+// an entity_addrvec_t.
+const uint64_t msgr2_required = CEPH_FEATUREMASK_MSG_ADDR2;
+
+// We additionally assume the peer has the below features *purely for
+// the purpose of encoding the frames themselves*.  The only complex
+// types in the frames are entity_addr_t and entity_addrvec_t, and we
+// specifically want the peer to understand the (new in nautilus)
+// TYPE_ANY.  We narrow this assumption to frames because we
+// expect there may be future clients (the kernel) that understand
+// msgr v2 and understand this encoding but don't necessarily have
+// everything else that SERVER_NAUTILUS implies.  Yes, a fresh feature
+// bit would be a cleaner approach, but those are scarce these days.
+const uint64_t msgr2_frame_assumed =
+		   msgr2_required |
+		   CEPH_FEATUREMASK_SERVER_NAUTILUS;
+
+enum class Tag : __u8 {  // frame type; travels in preamble_block_t::tag
+  HELLO = 1,  // the enumerators below take consecutive values from here
+  AUTH_REQUEST,
+  AUTH_BAD_METHOD,
+  AUTH_REPLY_MORE,
+  AUTH_REQUEST_MORE,
+  AUTH_DONE,
+  AUTH_SIGNATURE,
+  CLIENT_IDENT,
+  SERVER_IDENT,
+  IDENT_MISSING_FEATURES,
+  SESSION_RECONNECT,
+  SESSION_RESET,
+  SESSION_RETRY,
+  SESSION_RETRY_GLOBAL,
+  SESSION_RECONNECT_OK,
+  WAIT,
+  MESSAGE,
+  KEEPALIVE2,
+  KEEPALIVE2_ACK,
+  ACK
+};
+
+struct segment_t {  // per-segment descriptor as stored in preamble_block_t::segments
+  // TODO: this will be dropped with support for `allocation policies`.
+  // We need them because of the rx_buffers zero-copy optimization.
+  static constexpr __le16 PAGE_SIZE_ALIGNMENT{4096};
+
+  static constexpr __le16 DEFAULT_ALIGNMENT = sizeof(void *);
+
+  ceph_le32 length;  // logical (unpadded) segment length, see fill_preamble()
+  ceph_le16 alignment;  // alignment requested for the segment
+} __attribute__((packed));
+
+struct SegmentIndex {  // named segment indices; values stay below MAX_NUM_SEGMENTS
+  struct Msg {  // presumably the layout of message frames -- confirm against users
+    static constexpr std::size_t HEADER = 0;
+    static constexpr std::size_t FRONT = 1;
+    static constexpr std::size_t MIDDLE = 2;
+    static constexpr std::size_t DATA = 3;
+  };
+
+  struct Control {  // presumably the single segment of control frames -- confirm against users
+    static constexpr std::size_t PAYLOAD = 0;
+  };
+};
+
+static constexpr uint8_t CRYPTO_BLOCK_SIZE { 16 };
+
+static constexpr std::size_t MAX_NUM_SEGMENTS = 4;
+
+// V2 preamble consists of one or more preamble blocks depending on
+// the number of segments a particular frame needs. Each block holds
+// up to MAX_NUM_SEGMENTS segments and has its own CRC.
+//
+// XXX: currently the multi-segment facility is NOT implemented.
+struct preamble_block_t {  // packed on-wire layout: do not reorder or resize fields
+  // Tag. For multi-segmented frames the value is the same
+  // between subsequent preamble blocks.
+  __u8 tag;
+
+  // Number of segments to go in entire frame. The first preamble block
+  // sets this to just #segments, the second to #segments - MAX_NUM_SEGMENTS,
+  // third to #segments - 2 * MAX_NUM_SEGMENTS (presumably) and so on.
+  __u8 num_segments;
+
+  std::array<segment_t, MAX_NUM_SEGMENTS> segments;
+  __u8 _reserved[2];  // zeroed by fill_preamble()
+
+  // CRC32 for this single preamble block (all bytes before this field).
+  ceph_le32 crc;
+} __attribute__((packed));
+static_assert(sizeof(preamble_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout<preamble_block_t>::value);
+
+struct epilogue_crc_rev0_block_t {  // msgr2.0 crc-mode epilogue
+  __u8 late_flags;  // FRAME_LATE_FLAG_ABORTED
+  std::array<ceph_le32, MAX_NUM_SEGMENTS> crc_values;  // one crc slot per segment, first segment included
+} __attribute__((packed));
+static_assert(std::is_standard_layout_v<epilogue_crc_rev0_block_t>);
+
+struct epilogue_crc_rev1_block_t {  // msgr2.1 crc-mode epilogue
+  __u8 late_status;  // FRAME_LATE_STATUS_*
+  ceph_le32 crc_values[MAX_NUM_SEGMENTS - 1];  // slot i holds the crc of segment i + 1
+} __attribute__((packed));
+static_assert(std::is_standard_layout_v<epilogue_crc_rev1_block_t>);
+
+struct epilogue_secure_rev0_block_t {  // msgr2.0 secure-mode epilogue
+  __u8 late_flags;  // FRAME_LATE_FLAG_ABORTED
+  __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_flags)];  // fills the block up to CRYPTO_BLOCK_SIZE
+} __attribute__((packed));
+static_assert(sizeof(epilogue_secure_rev0_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout_v<epilogue_secure_rev0_block_t>);
+
+// msgr2.1 secure-mode epilogue: epilogue_secure_rev0_block_t with late_flags changed to late_status
+struct epilogue_secure_rev1_block_t {
+  __u8 late_status;  // FRAME_LATE_STATUS_*
+  __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_status)];  // fills the block up to CRYPTO_BLOCK_SIZE
+} __attribute__((packed));
+static_assert(sizeof(epilogue_secure_rev1_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout_v<epilogue_secure_rev1_block_t>);
+
+static constexpr uint32_t FRAME_CRC_SIZE = 4;
+static constexpr uint32_t FRAME_PREAMBLE_INLINE_SIZE = 48;
+static_assert(FRAME_PREAMBLE_INLINE_SIZE % CRYPTO_BLOCK_SIZE == 0);
+// just for performance, nothing should break otherwise
+static_assert(sizeof(ceph_msg_header2) <= FRAME_PREAMBLE_INLINE_SIZE);
+static constexpr uint32_t FRAME_PREAMBLE_WITH_INLINE_SIZE =
+    sizeof(preamble_block_t) + FRAME_PREAMBLE_INLINE_SIZE;
+
+// A frame can be aborted by the sender after transmitting the
+// preamble and the first segment.  The remainder of the frame
+// is filled with zeros, up until the epilogue.
+//
+// This flag is for msgr2.0.  Note that in crc mode, late_flags
+// is not covered by any crc -- a single bit flip can result in
+// a completed frame being dropped or in an aborted frame with
+// garbage segment payloads being dispatched.
+#define FRAME_LATE_FLAG_ABORTED           (1<<0)
+
+// For msgr2.1, FRAME_LATE_STATUS_ABORTED has the same meaning
+// as FRAME_LATE_FLAG_ABORTED and late_status replaces late_flags.
+// Bit error detection in crc mode is achieved by using a 4-bit
+// nibble per flag with two code words that are far apart in terms
+// of Hamming Distance (HD=4, same as provided by CRC32-C for
+// input lengths over ~5K).
+#define FRAME_LATE_STATUS_ABORTED         0x1
+#define FRAME_LATE_STATUS_COMPLETE        0xe
+#define FRAME_LATE_STATUS_ABORTED_MASK    0xf
+
+#define FRAME_LATE_STATUS_RESERVED_TRUE   0x10
+#define FRAME_LATE_STATUS_RESERVED_FALSE  0xe0
+#define FRAME_LATE_STATUS_RESERVED_MASK   0xf0
+
+struct FrameError : std::runtime_error {  // thrown on bad crc, segment count or late_status
+  using runtime_error::runtime_error;  // inherit the message-taking constructors
+};
+
+class FrameAssembler {
+public:
+  // crypto must be non-null; only the pointer is stored, so it must outlive this object
+  FrameAssembler(const ceph::crypto::onwire::rxtx_t* crypto, bool is_rev1)
+      : m_crypto(crypto), m_is_rev1(is_rev1) {}
+
+  void set_is_rev1(bool is_rev1) {  // switches between msgr2.0 (false) and msgr2.1 (true)
+    m_descs.clear();  // segment descriptors of any previous frame are discarded
+    m_is_rev1 = is_rev1;
+  }
+
+  bool get_is_rev1() const {  // true for msgr2.1; const like the other accessors of this class
+    return m_is_rev1;
+  }
+
+  size_t get_num_segments() const {  // segment count of the current frame
+    ceph_assert(!m_descs.empty());  // only valid once a frame was assembled or a preamble disassembled
+    return m_descs.size();
+  }
+
+  uint32_t get_segment_logical_len(size_t seg_idx) const {  // payload length, without padding/crc/tag
+    ceph_assert(seg_idx < m_descs.size());
+    return m_descs[seg_idx].logical_len;
+  }
+
+  uint16_t get_segment_align(size_t seg_idx) const {  // alignment recorded for the segment
+    ceph_assert(seg_idx < m_descs.size());
+    return m_descs[seg_idx].align;
+  }
+
+  // Preamble:
+  //
+  //   preamble_block_t
+  //   [preamble inline buffer + auth tag -- only in msgr2.1 secure mode]
+  //
+  // The preamble is generated unconditionally.
+  //
+  // In msgr2.1 secure mode, the first segment is inlined into the
+  // preamble inline buffer, either fully or partially.
+  uint32_t get_preamble_onwire_len() const {  // bytes the preamble occupies on the wire
+    if (m_is_rev1 && m_crypto->rx) {  // msgr2.1 secure mode: inline buffer and auth tag are included
+      return FRAME_PREAMBLE_WITH_INLINE_SIZE + get_auth_tag_len();
+    }
+    return sizeof(preamble_block_t);
+  }
+
+  // Segment:
+  //
+  //   segment payload
+  //   [zero padding -- only in secure mode]
+  //   [crc or auth tag -- only in msgr2.1, only for the first segment]
+  //
+  // For an empty segment, nothing is generated.  In msgr2.1 secure
+  // mode, if the first segment gets fully inlined into the preamble
+  // inline buffer, it is considered empty.
+  uint32_t get_segment_onwire_len(size_t seg_idx) const {  // bytes the segment occupies on the wire
+    ceph_assert(seg_idx < m_descs.size());
+    if (m_crypto->rx) {  // secure mode
+      uint32_t padded_len = get_segment_padded_len(seg_idx);
+      if (m_is_rev1 && seg_idx == 0) {
+        if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) {
+          return padded_len + get_auth_tag_len() - FRAME_PREAMBLE_INLINE_SIZE;  // inlined part is not counted here
+        }
+        return 0;  // fully inlined into the preamble
+      }
+      return padded_len;
+    }
+    if (m_is_rev1 && seg_idx == 0 && m_descs[0].logical_len > 0) {
+      return m_descs[0].logical_len + FRAME_CRC_SIZE;  // msgr2.1 crc mode: first segment carries its own crc
+    }
+    return m_descs[seg_idx].logical_len;
+  }
+
+  // Epilogue:
+  //
+  //   epilogue_*_block_t
+  //   [auth tag -- only in secure mode]
+  //
+  // For msgr2.0, the epilogue is generated unconditionally.  In
+  // crc mode, it stores crcs for all segments; the preamble is
+  // covered by its own crc.  In secure mode, the epilogue auth tag
+  // covers the whole frame.
+  //
+  // For msgr2.1, the epilogue is generated only if the frame has
+  // more than one segment (i.e. at least one of second to fourth
+  // segments is not empty).  In crc mode, it stores crcs for
+  // second to fourth segments; the preamble and the first segment
+  // are covered by their own crcs.  In secure mode, the epilogue
+  // auth tag covers second to fourth segments; the preamble and the
+  // first segment (if not fully inlined into the preamble inline
+  // buffer) are covered by their own auth tags.
+  //
+  // Note that the auth tag format is an implementation detail of a
+  // particular cipher.  FrameAssembler is concerned only with where
+  // the auth tag is placed (at the end of the ciphertext) and how
+  // long it is (RxHandler::get_extra_size_at_final()).  This is to
+  // provide room for other encryption algorithms: currently we use
+  // AES-128-GCM with 16-byte tags, but it is possible to switch to
+  // e.g. AES-128-CBC + HMAC-SHA512 without affecting the protocol
+  // (except for the cipher negotiation, of course).
+  //
+  // Additionally, each variant of the epilogue contains either
+  // late_flags or late_status field that directs handling of frames
+  // with more than one segment.
+  uint32_t get_epilogue_onwire_len() const {  // bytes the epilogue occupies on the wire
+    ceph_assert(!m_descs.empty());
+    if (m_is_rev1 && m_descs.size() == 1) {
+      return 0;  // msgr2.1: no epilogue for a single-segment frame
+    }
+    if (m_crypto->rx) {  // secure mode: epilogue block plus auth tag
+      return (m_is_rev1 ? sizeof(epilogue_secure_rev1_block_t) :
+                  sizeof(epilogue_secure_rev0_block_t)) + get_auth_tag_len();
+    }
+    return m_is_rev1 ? sizeof(epilogue_crc_rev1_block_t) :
+                       sizeof(epilogue_crc_rev0_block_t);
+  }
+
+  uint64_t get_frame_logical_len() const;
+  uint64_t get_frame_onwire_len() const;
+
+  bufferlist assemble_frame(Tag tag, bufferlist segment_bls[],
+                            const uint16_t segment_aligns[],
+                            size_t segment_count);
+
+  Tag disassemble_preamble(bufferlist& preamble_bl);
+
+  // Like msgr1, and unlike msgr2.0, msgr2.1 allows interpreting the
+  // first segment before reading in the rest of the frame.
+  //
+  // For msgr2.1 (set_is_rev1(true)), you may:
+  //
+  // - read in the first segment
+  // - call disassemble_first_segment()
+  // - use the contents of the first segment, for example to
+  //   look up user-provided buffers based on ceph_msg_header2::tid
+  // - read in the remaining segments, possibly directly into
+  //   user-provided buffers
+  // - read in epilogue
+  // - call disassemble_remaining_segments()
+  //
+  // For msgr2.0 (set_is_rev1(false)), disassemble_first_segment() is
+  // a noop.  To accommodate, disassemble_remaining_segments() always
+  // takes all segments and skips over the first segment in msgr2.1
+  // case.  You must:
+  //
+  // - read in all segments
+  // - read in epilogue
+  // - call disassemble_remaining_segments()
+  //
+  // disassemble_remaining_segments() returns true if the frame is
+  // ready for dispatching, or false if it was aborted by the sender
+  // and must be dropped.
+  void disassemble_first_segment(bufferlist& preamble_bl,
+                                 bufferlist& segment_bl) const;
+  bool disassemble_remaining_segments(bufferlist segment_bls[],
+                                      bufferlist& epilogue_bl) const;
+
+private:
+  struct segment_desc_t {  // host-side counterpart of the on-wire segment_t
+    uint32_t logical_len;  // payload length without padding
+    uint16_t align;
+  };
+
+  uint32_t get_segment_padded_len(size_t seg_idx) const {  // logical length rounded up to CRYPTO_BLOCK_SIZE
+    return p2roundup<uint32_t>(m_descs[seg_idx].logical_len,  // note: seg_idx is not range-checked here
+                               CRYPTO_BLOCK_SIZE);
+  }
+
+  uint32_t get_auth_tag_len() const {  // dereferences rx, so only valid in secure mode
+    return m_crypto->rx->get_extra_size_at_final();  // also used when sizing frames to send
+  }
+
+  bufferlist asm_crc_rev0(const preamble_block_t& preamble,
+                          bufferlist segment_bls[]) const;
+  bufferlist asm_secure_rev0(const preamble_block_t& preamble,
+                             bufferlist segment_bls[]) const;
+  bufferlist asm_crc_rev1(const preamble_block_t& preamble,
+                          bufferlist segment_bls[]) const;
+  bufferlist asm_secure_rev1(const preamble_block_t& preamble,
+                             bufferlist segment_bls[]) const;
+
+  bool disasm_all_crc_rev0(bufferlist segment_bls[],
+                           bufferlist& epilogue_bl) const;
+  bool disasm_all_secure_rev0(bufferlist segment_bls[],
+                              bufferlist& epilogue_bl) const;
+  void disasm_first_crc_rev1(bufferlist& preamble_bl,
+                             bufferlist& segment_bl) const;
+  bool disasm_remaining_crc_rev1(bufferlist segment_bls[],
+                                 bufferlist& epilogue_bl) const;
+  void disasm_first_secure_rev1(bufferlist& preamble_bl,
+                                bufferlist& segment_bl) const;
+  bool disasm_remaining_secure_rev1(bufferlist segment_bls[],
+                                    bufferlist& epilogue_bl) const;
+
+  void fill_preamble(Tag tag, preamble_block_t& preamble) const;
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const FrameAssembler& frame_asm);
+
+  boost::container::static_vector<segment_desc_t, MAX_NUM_SEGMENTS> m_descs;
+  const ceph::crypto::onwire::rxtx_t* m_crypto;
+  bool m_is_rev1;  // msgr2.1?
+};
+
+template <class T, uint16_t... SegmentAlignmentVs>  // T: derived frame type supplying T::tag; one alignment value per segment
+struct Frame {
+  static constexpr size_t SegmentsNumV = sizeof...(SegmentAlignmentVs);  // segment count == number of alignment values
+  static_assert(SegmentsNumV > 0 && SegmentsNumV <= MAX_NUM_SEGMENTS);
+protected:
+  std::array<ceph::bufferlist, SegmentsNumV> segments;  // one bufferlist per segment, filled in by derived classes
+
+private:
+  static constexpr std::array<uint16_t, SegmentsNumV> alignments {  // same order as `segments`
+    SegmentAlignmentVs...
+  };
+
+public:
+  ceph::bufferlist get_buffer(FrameAssembler& tx_frame_asm) {  // hands tag, segments and alignments to the assembler
+    auto bl = tx_frame_asm.assemble_frame(T::tag, segments.data(),
+                                          alignments.data(), SegmentsNumV);
+    ceph_assert(bl.length() == tx_frame_asm.get_frame_onwire_len());  // produced length must match what the assembler reports
+    return bl;
+  }
+};
+
+// ControlFrames are used to manage transceiver state (like connections) and
+// orchestrate transfers of MessageFrames. They use only single segment with
+// marshalling facilities -- derived classes specify frame structure through
+// Args pack while ControlFrame provides common encode/decode machinery.
+template <class C, typename... Args>  // C: concrete frame type; Args: payload field types in wire order
+class ControlFrame : public Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */> {
+protected:
+  ceph::bufferlist &get_payload_segment() {
+    return this->segments[SegmentIndex::Control::PAYLOAD];
+  }
+
+  // this tuple is only used when decoding values from a payload segment
+  std::tuple<Args...> _values;
+
+  // FIXME: for now, we assume specific features for the purposes of encoding
+  // the frames themselves (*not* messages in message frames!).
+  uint64_t features = msgr2_frame_assumed;
+
+  template <typename T>
+  inline void _encode_payload_each(T &t) {  // T is deduced const-qualified: _encode() passes const Args&
+    if constexpr (std::is_same<T, std::vector<uint32_t> const>()) {
+      encode((uint32_t)t.size(), this->get_payload_segment(), features);  // explicit 32-bit element count first
+      for (const auto &elem : t) {
+        encode(elem, this->get_payload_segment(), features);
+      }
+    } else {
+      encode(t, this->get_payload_segment(), features);
+    }
+  }
+
+  template <typename T>
+  inline void _decode_payload_each(T &t, bufferlist::const_iterator &ti) {  // non-const: writes into _values
+    if constexpr (std::is_same<T, std::vector<uint32_t>>()) {
+      uint32_t size;
+      decode(size, ti);
+      t.resize(size);
+      for (uint32_t i = 0; i < size; ++i) {
+        decode(t[i], ti);
+      }
+    } else {
+      decode(t, ti);
+    }
+  }
+
+  template <std::size_t... Is>
+  inline void _decode_payload(bufferlist::const_iterator &ti,
+                              std::index_sequence<Is...>) {
+    (_decode_payload_each(std::get<Is>(_values), ti), ...);  // fields decoded in declaration order, no const cast needed
+  }
+
+  template <std::size_t N>
+  inline decltype(auto) get_val() {  // yields a reference to tuple slot N
+    return std::get<N>(_values);
+  }
+
+  ControlFrame()
+    : Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */>() {
+  }
+
+  void _encode(const Args &... args) {  // appends every field to the payload segment; _values is not touched
+    (_encode_payload_each(args), ...);
+  }
+
+  void _decode(const ceph::bufferlist &bl) {  // reads every field from the start of bl into _values
+    auto ti = bl.cbegin();
+    _decode_payload(ti, std::index_sequence_for<Args...>());
+  }
+
+public:
+  static C Encode(const Args &... args) {  // builds a frame whose payload holds args in order
+    C c;
+    c._encode(args...);
+    return c;
+  }
+
+  static C Decode(const ceph::bufferlist &payload) {  // builds a frame whose accessors expose the decoded fields
+    C c;
+    c._decode(payload);
+    return c;
+  }
+};
+
+struct HelloFrame : public ControlFrame<HelloFrame,
+                                        uint8_t,          // entity type
+                                        entity_addr_t> {  // peer address
+  static const Tag tag = Tag::HELLO;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(entity type, peer address)
+  using ControlFrame::Decode;  // Decode(payload) fills the values read by the accessors below
+
+  inline uint8_t &entity_type() { return get_val<0>(); }
+  inline entity_addr_t &peer_addr() { return get_val<1>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct AuthRequestFrame : public ControlFrame<AuthRequestFrame,
+                                              uint32_t, // auth method
+                                              vector<uint32_t>, // preferred modes
+                                              bufferlist> { // auth payload
+  static const Tag tag = Tag::AUTH_REQUEST;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(method, preferred modes, auth payload)
+  using ControlFrame::Decode;  // accessors below are only populated by Decode()
+
+  inline uint32_t &method() { return get_val<0>(); }
+  inline vector<uint32_t> &preferred_modes() { return get_val<1>(); }  // goes through ControlFrame's count + elements vector path
+  inline bufferlist &auth_payload() { return get_val<2>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct AuthBadMethodFrame : public ControlFrame<AuthBadMethodFrame,
+                                                uint32_t, // method
+                                                int32_t,  // result
+                                                std::vector<uint32_t>,   // allowed methods
+                                                std::vector<uint32_t>> { // allowed modes
+  static const Tag tag = Tag::AUTH_BAD_METHOD;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(method, result, allowed methods, allowed modes)
+  using ControlFrame::Decode;  // accessors below are only populated by Decode()
+
+  inline uint32_t &method() { return get_val<0>(); }
+  inline int32_t &result() { return get_val<1>(); }
+  inline std::vector<uint32_t> &allowed_methods() { return get_val<2>(); }  // both vectors use the count + elements path
+  inline std::vector<uint32_t> &allowed_modes() { return get_val<3>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct AuthReplyMoreFrame : public ControlFrame<AuthReplyMoreFrame,
+                                                bufferlist> { // auth payload
+  static const Tag tag = Tag::AUTH_REPLY_MORE;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(auth payload)
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  inline bufferlist &auth_payload() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct AuthRequestMoreFrame : public ControlFrame<AuthRequestMoreFrame,
+                                                  bufferlist> { // auth payload
+  static const Tag tag = Tag::AUTH_REQUEST_MORE;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(auth payload)
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  inline bufferlist &auth_payload() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct AuthDoneFrame : public ControlFrame<AuthDoneFrame,
+                                           uint64_t, // global id
+                                           uint32_t, // connection mode
+                                           bufferlist> { // auth method payload
+  static const Tag tag = Tag::AUTH_DONE;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(global id, connection mode, auth method payload)
+  using ControlFrame::Decode;  // accessors below are only populated by Decode()
+
+  inline uint64_t &global_id() { return get_val<0>(); }
+  inline uint32_t &con_mode() { return get_val<1>(); }
+  inline bufferlist &auth_payload() { return get_val<2>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct AuthSignatureFrame
+    : public ControlFrame<AuthSignatureFrame,
+                          sha256_digest_t> {  // single field: the signature digest
+  static const Tag tag = Tag::AUTH_SIGNATURE;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(signature)
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  inline sha256_digest_t &signature() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct ClientIdentFrame
+    : public ControlFrame<ClientIdentFrame,
+                          entity_addrvec_t,  // my addresses
+                          entity_addr_t,  // target address
+                          int64_t,  // global_id
+                          uint64_t,  // global seq
+                          uint64_t,  // supported features
+                          uint64_t,  // required features
+                          uint64_t,  // flags
+                          uint64_t> {  // client cookie
+  static const Tag tag = Tag::CLIENT_IDENT;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // arguments follow the template field order above
+  using ControlFrame::Decode;  // accessors below are only populated by Decode()
+
+  inline entity_addrvec_t &addrs() { return get_val<0>(); }
+  inline entity_addr_t &target_addr() { return get_val<1>(); }
+  inline int64_t &gid() { return get_val<2>(); }
+  inline uint64_t &global_seq() { return get_val<3>(); }
+  inline uint64_t &supported_features() { return get_val<4>(); }
+  inline uint64_t &required_features() { return get_val<5>(); }
+  inline uint64_t &flags() { return get_val<6>(); }
+  inline uint64_t &cookie() { return get_val<7>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct ServerIdentFrame
+    : public ControlFrame<ServerIdentFrame,
+                          entity_addrvec_t,  // my addresses
+                          int64_t,  // global_id
+                          uint64_t,  // global seq
+                          uint64_t,  // supported features
+                          uint64_t,  // required features
+                          uint64_t,  // flags
+                          uint64_t> {  // server cookie
+  static const Tag tag = Tag::SERVER_IDENT;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // arguments follow the template field order above
+  using ControlFrame::Decode;  // accessors below are only populated by Decode()
+
+  inline entity_addrvec_t &addrs() { return get_val<0>(); }
+  inline int64_t &gid() { return get_val<1>(); }
+  inline uint64_t &global_seq() { return get_val<2>(); }
+  inline uint64_t &supported_features() { return get_val<3>(); }
+  inline uint64_t &required_features() { return get_val<4>(); }
+  inline uint64_t &flags() { return get_val<5>(); }
+  inline uint64_t &cookie() { return get_val<6>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct ReconnectFrame
+    : public ControlFrame<ReconnectFrame,
+                          entity_addrvec_t,  // my addresses
+                          uint64_t,  // client cookie
+                          uint64_t,  // server cookie
+                          uint64_t,  // global sequence
+                          uint64_t,  // connect sequence
+                          uint64_t> { // message sequence
+  static const Tag tag = Tag::SESSION_RECONNECT;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // arguments follow the template field order above
+  using ControlFrame::Decode;  // accessors below are only populated by Decode()
+
+  inline entity_addrvec_t &addrs() { return get_val<0>(); }
+  inline uint64_t &client_cookie() { return get_val<1>(); }
+  inline uint64_t &server_cookie() { return get_val<2>(); }
+  inline uint64_t &global_seq() { return get_val<3>(); }
+  inline uint64_t &connect_seq() { return get_val<4>(); }
+  inline uint64_t &msg_seq() { return get_val<5>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct ResetFrame : public ControlFrame<ResetFrame,
+                                        bool> {  // full reset
+  static const Tag tag = Tag::SESSION_RESET;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(full)
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  inline bool &full() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct RetryFrame : public ControlFrame<RetryFrame,
+                                        uint64_t> {  // connection seq
+  static const Tag tag = Tag::SESSION_RETRY;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(connect seq)
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  inline uint64_t &connect_seq() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct RetryGlobalFrame : public ControlFrame<RetryGlobalFrame,
+                                              uint64_t> { // global seq
+  static const Tag tag = Tag::SESSION_RETRY_GLOBAL;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(global seq)
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  inline uint64_t &global_seq() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct WaitFrame : public ControlFrame<WaitFrame> {  // no payload fields
+  static const Tag tag = Tag::WAIT;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode() appends nothing to the payload segment
+  using ControlFrame::Decode;  // Decode(payload) reads nothing from it
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct ReconnectOkFrame : public ControlFrame<ReconnectOkFrame,
+                                              uint64_t> { // message seq
+  static const Tag tag = Tag::SESSION_RECONNECT_OK;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(message seq)
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  inline uint64_t &msg_seq() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct IdentMissingFeaturesFrame 
+    : public ControlFrame<IdentMissingFeaturesFrame,
+                          uint64_t> { // missing features mask
+  static const Tag tag = Tag::IDENT_MISSING_FEATURES;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(missing features mask)
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  inline uint64_t &features() { return get_val<0>(); }  // note: in this struct the name hides ControlFrame's `features` data member
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct KeepAliveFrame : public ControlFrame<KeepAliveFrame,
+                                            utime_t> {  // timestamp
+  static const Tag tag = Tag::KEEPALIVE2;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // keeps Encode(timestamp) visible next to the overload below
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  static KeepAliveFrame Encode() {  // convenience overload: stamps the frame with ceph_clock_now()
+    return KeepAliveFrame::Encode(ceph_clock_now());
+  }
+
+  inline utime_t &timestamp() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct KeepAliveFrameAck : public ControlFrame<KeepAliveFrameAck,
+                                               utime_t> { // ack timestamp
+  static const Tag tag = Tag::KEEPALIVE2_ACK;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(ack timestamp)
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  inline utime_t &timestamp() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+struct AckFrame : public ControlFrame<AckFrame,
+                                      uint64_t> { // message sequence
+  static const Tag tag = Tag::ACK;  // tag Frame::get_buffer() hands to the assembler
+  using ControlFrame::Encode;  // Encode(message sequence)
+  using ControlFrame::Decode;  // accessor below is only populated by Decode()
+
+  inline uint64_t &seq() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+using segment_bls_t =
+    boost::container::static_vector<bufferlist, MAX_NUM_SEGMENTS>;  // received segments, at most MAX_NUM_SEGMENTS of them
+
+// This class is used for encoding/decoding the header of the message frame.
+// Body is processed almost independently with the sole junction point
+// being the `extra_payload_len` passed to get_buffer().
+// NOTE(review): Frame::get_buffer() here takes only a FrameAssembler; the `extra_payload_len` remark looks stale -- confirm.
+struct MessageFrame : public Frame<MessageFrame,
+                                   /* four segments */
+                                   segment_t::DEFAULT_ALIGNMENT,
+                                   segment_t::DEFAULT_ALIGNMENT,
+                                   segment_t::DEFAULT_ALIGNMENT,
+                                   segment_t::PAGE_SIZE_ALIGNMENT> {
+  static const Tag tag = Tag::MESSAGE;  // tag Frame::get_buffer() hands to the assembler
+
+  static MessageFrame Encode(const ceph_msg_header2 &msg_header,
+                             const ceph::bufferlist &front,
+                             const ceph::bufferlist &middle,
+                             const ceph::bufferlist &data) {
+    MessageFrame f;
+    f.segments[SegmentIndex::Msg::HEADER].append(  // header is stored as its raw in-memory bytes
+        reinterpret_cast<const char*>(&msg_header), sizeof(msg_header));
+
+    f.segments[SegmentIndex::Msg::FRONT] = front;
+    f.segments[SegmentIndex::Msg::MIDDLE] = middle;
+    f.segments[SegmentIndex::Msg::DATA] = data;
+
+    return f;
+  }
+
+  static MessageFrame Decode(segment_bls_t& recv_segments) {  // moves from recv_segments; their contents end up in the frame
+    MessageFrame f;
+    // transfer segments' bufferlists. If a MessageFrame contains fewer than
+    // SegmentsNumV segments, the missing ones will be seen as zeroed.
+    for (__u8 idx = 0; idx < std::size(recv_segments); idx++) {
+      f.segments[idx] = std::move(recv_segments[idx]);
+    }
+    return f;
+  }
+
+  inline const ceph_msg_header2 &header() {  // views the HEADER segment bytes in place; no length check here (presumably validated by the caller -- confirm)
+    auto& hdrbl = segments[SegmentIndex::Msg::HEADER];
+    return reinterpret_cast<const ceph_msg_header2&>(*hdrbl.c_str());
+  }
+
+  ceph::bufferlist &front() {
+    return segments[SegmentIndex::Msg::FRONT];
+  }
+
+  ceph::bufferlist &middle() {
+    return segments[SegmentIndex::Msg::MIDDLE];
+  }
+
+  ceph::bufferlist &data() {
+    return segments[SegmentIndex::Msg::DATA];
+  }
+
+  uint32_t front_len() const {  // current length of the FRONT segment
+    return segments[SegmentIndex::Msg::FRONT].length();
+  }
+
+  uint32_t middle_len() const {  // current length of the MIDDLE segment
+    return segments[SegmentIndex::Msg::MIDDLE].length();
+  }
+
+  uint32_t data_len() const {  // current length of the DATA segment
+    return segments[SegmentIndex::Msg::DATA].length();
+  }
+
+protected:
+  using Frame::Frame;
+};
+
+} // namespace ceph::msgr::v2
+
+#endif // _MSG_ASYNC_FRAMES_V2_
diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc
new file mode 100644
index 00000000..2b4e646d
--- /dev/null
+++ b/src/msg/async/net_handler.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#include "net_handler.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "NetHandler "
+
+namespace ceph{
+
+int NetHandler::create_socket(int domain, bool reuse_addr)  // returns the new stream-socket fd, or -errno on failure
+{
+  int s;
+  int r = 0;
+
+  if ((s = socket_cloexec(domain, SOCK_STREAM, 0)) == -1) {
+    r = errno;
+    lderr(cct) << __func__ << " couldn't create socket " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+
+#if !defined(__FreeBSD__)
+  /* Make sure connection-intensive things like the benchmark
+   * will be able to close/open sockets a zillion of times */
+  if (reuse_addr) {
+    int on = 1;
+    if (::setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
+      r = errno;  // saved before logging/close() can overwrite errno
+      lderr(cct) << __func__ << " setsockopt SO_REUSEADDR failed: "
+                 << cpp_strerror(r) << dendl;  // cpp_strerror, matching the other error paths in this file
+      close(s);  // do not leak the fd on the error path
+      return -r;
+    }
+  }
+#endif
+
+  return s;
+}
+
+int NetHandler::set_nonblock(int sd)  // returns 0 on success, -errno if either fcntl call fails
+{
+  int flags;
+  int r = 0;
+
+  /* Set the socket nonblocking.
+   * Note that fcntl(2) for F_GETFL and F_SETFL can't be
+   * interrupted by a signal. */
+  if ((flags = fcntl(sd, F_GETFL)) < 0 ) {
+    r = errno;
+    lderr(cct) << __func__ << " fcntl(F_GETFL) failed: " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+  if (fcntl(sd, F_SETFL, flags | O_NONBLOCK) < 0) {  // existing flags are kept, O_NONBLOCK is added
+    r = errno;
+    lderr(cct) << __func__ << " fcntl(F_SETFL,O_NONBLOCK): " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+
+  return 0;
+}
+
+int NetHandler::set_socket_options(int sd, bool nodelay, int size)  // applies TCP_NODELAY / SO_RCVBUF (/ SO_NOSIGPIPE); failures are logged, not fatal
+{
+  int r = 0;
+  // disable Nagle algorithm?
+  if (nodelay) {
+    int flag = 1;
+    r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag));
+    if (r < 0) {
+      r = errno;
+      ldout(cct, 0) << "couldn't set TCP_NODELAY: " << cpp_strerror(r) << dendl;
+    }
+  }
+  if (size) {  // size == 0 leaves the receive buffer untouched
+    r = ::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size));  // note: overwrites any TCP_NODELAY error held in r
+    if (r < 0)  {
+      r = errno;
+      ldout(cct, 0) << "couldn't set SO_RCVBUF to " << size << ": " << cpp_strerror(r) << dendl;
+    }
+  }
+
+  // block SIGPIPE
+#ifdef CEPH_USE_SO_NOSIGPIPE
+  int val = 1;
+  r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));  // note: overwrites r once more
+  if (r) {
+    r = errno;
+    ldout(cct,0) << "couldn't set SO_NOSIGPIPE: " << cpp_strerror(r) << dendl;
+  }
+#endif
+  return -r;  // reflects only the last setsockopt attempted: 0 or -errno
+}
+
+void NetHandler::set_priority(int sd, int prio, int domain)  // best effort: failures are only logged
+{
+#ifdef SO_PRIORITY
+  if (prio < 0) {  // negative priority means "leave the socket alone"
+    return;
+  }
+  int r = -1;
+#ifdef IPTOS_CLASS_CS6
+  int iptos = IPTOS_CLASS_CS6;
+  switch (domain) {  // the ToS / traffic-class option differs per address family
+  case AF_INET:
+    r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos));
+    break;
+  case AF_INET6:
+    r = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos));
+    break;
+  default:
+    lderr(cct) << "couldn't set ToS of unknown family (" << domain << ")"
+	       << " to " << iptos << dendl;
+    return;  // unknown family: SO_PRIORITY below is skipped as well
+  }
+  if (r < 0) {
+    r = errno;
+    ldout(cct,0) << "couldn't set TOS to " << iptos
+		 << ": " << cpp_strerror(r) << dendl;
+  }
+
+#endif	// IPTOS_CLASS_CS6
+  // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
+  // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
+  // We need to call setsockopt(SO_PRIORITY) after it.
+  r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
+  if (r < 0) {
+    r = errno;
+    ldout(cct, 0) << __func__ << " couldn't set SO_PRIORITY to " << prio
+		  << ": " << cpp_strerror(r) << dendl;
+  }
+#else
+  return;  // platform has no SO_PRIORITY: nothing to do
+#endif	// SO_PRIORITY
+}
+
+int NetHandler::generic_connect(const entity_addr_t& addr, const entity_addr_t &bind_addr, bool nonblock)
+{  // returns the socket fd (possibly still connecting when nonblock is set), or a negative errno
+  int ret;
+  int s = create_socket(addr.get_family());
+  if (s < 0)
+    return s;
+
+  if (nonblock) {
+    ret = set_nonblock(s);
+    if (ret < 0) {
+      close(s);
+      return ret;
+    }
+  }
+
+  set_socket_options(s, cct->_conf->ms_tcp_nodelay, cct->_conf->ms_tcp_rcvbuf);  // best effort: result deliberately ignored
+
+  {
+    entity_addr_t local_addr = bind_addr;  // mutable copy; named so it no longer shadows the `addr` parameter
+    if (cct->_conf->ms_bind_before_connect && (!local_addr.is_blank_ip())) {
+      local_addr.set_port(0);  // bind to the requested address only, with port 0
+      ret = ::bind(s, local_addr.get_sockaddr(), local_addr.get_sockaddr_len());
+      if (ret < 0) {
+        ret = errno;
+        ldout(cct, 2) << __func__ << " client bind error " << ", " << cpp_strerror(ret) << dendl;
+        close(s);
+        return -ret;
+      }
+    }
+  }
+
+  ret = ::connect(s, addr.get_sockaddr(), addr.get_sockaddr_len());
+  if (ret < 0) {
+    ret = errno;
+    if (ret == EINPROGRESS && nonblock)  // use the saved value; a nonblocking connect in progress is not an error
+      return s;
+
+    ldout(cct, 10) << __func__ << " connect: " << cpp_strerror(ret) << dendl;
+    close(s);
+    return -ret;
+  }
+
+  return s;
+}
+
+int NetHandler::reconnect(const entity_addr_t &addr, int sd)  // 0: connected; 1: still in progress; <0: -errno
+{
+  int r = 0;
+  int ret = ::connect(sd, addr.get_sockaddr(), addr.get_sockaddr_len());
+
+  if (ret < 0 && errno != EISCONN) {  // EISCONN is treated as success
+    r = errno;
+    ldout(cct, 10) << __func__ << " reconnect: " << cpp_strerror(r) << dendl;  // cpp_strerror, matching the rest of the file
+    if (r == EINPROGRESS || r == EALREADY)
+      return 1;  // caller should wait for the socket event
+    return -r;
+  }
+
+  return 0;
+}
+
+int NetHandler::connect(const entity_addr_t &addr, const entity_addr_t& bind_addr)  // blocking variant
+{
+  return generic_connect(addr, bind_addr, false);  // nonblock = false
+}
+
+int NetHandler::nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr)  // may return an fd whose connect is still in progress
+{
+  return generic_connect(addr, bind_addr, true);  // nonblock = true
+}
+
+
+}
diff --git a/src/msg/async/net_handler.h b/src/msg/async/net_handler.h
new file mode 100644
index 00000000..19042377
--- /dev/null
+++ b/src/msg/async/net_handler.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_NET_UTILS_H
+#define CEPH_COMMON_NET_UTILS_H
+#include "common/config.h"
+
+namespace ceph {
+  class NetHandler {  // thin wrappers around socket setup and connect calls, logging through cct
+    int generic_connect(const entity_addr_t& addr, const entity_addr_t& bind_addr, bool nonblock);  // shared body of connect() and nonblock_connect()
+
+    CephContext *cct;  // not owned; supplied by the constructor
+   public:
+    int create_socket(int domain, bool reuse_addr=false);  // new fd, or negative errno
+    explicit NetHandler(CephContext *c): cct(c) {}
+    int set_nonblock(int sd);  // 0, or negative errno
+    int set_socket_options(int sd, bool nodelay, int size);  // size: SO_RCVBUF value, 0 to skip
+    int connect(const entity_addr_t &addr, const entity_addr_t& bind_addr);  // new fd, or negative errno
+    
+    /**
+     * Try to reconnect the socket.
+     *
+     * @return    0         success
+     *            > 0       just break, and wait for event
+     *            < 0       need to goto fail
+     */
+    int reconnect(const entity_addr_t &addr, int sd);
+    int nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr);  // new fd (connect may still be pending), or negative errno
+    void set_priority(int sd, int priority, int domain);  // domain: AF_INET or AF_INET6
+  };
+}
+
+#endif
diff --git a/src/msg/async/rdma/Infiniband.cc b/src/msg/async/rdma/Infiniband.cc
new file mode 100644
index 00000000..34299975
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.cc
@@ -0,0 +1,1234 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "Infiniband.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "RDMAStack.h"
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "Infiniband "
+
+static const uint32_t MAX_SHARED_RX_SGE_COUNT = 1;
+static const uint32_t MAX_INLINE_DATA = 0;  // assigned to qpia.cap.max_inline_data in QueuePair::init()
+static const uint32_t TCP_MSG_LEN = sizeof("0000:00000000:00000000:00000000:00000000000000000000000000000000");  // sizeof a literal counts its terminating NUL
+static const uint32_t CQ_DEPTH = 30000;
+
+Port::Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn): ctxt(ictxt), port_num(ipn), port_attr(new ibv_port_attr), gid_idx(0)
+{  // queries port attributes and the local GID; aborts the process on any verbs failure
+#ifdef HAVE_IBV_EXP
+  union ibv_gid cgid;
+  struct ibv_exp_gid_attr gid_attr;
+  bool malformed = false;
+
+  ldout(cct,1) << __func__ << " using experimental verbs for gid" << dendl;
+  int r = ibv_query_port(ctxt, port_num, port_attr);
+  if (r != 0) {  // ibv_query_port() reports failure as a non-zero errno value, not -1
+    lderr(cct) << __func__  << " query port failed  " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+
+  lid = port_attr->lid;
+
+  // search for requested GID in GIDs table
+  ldout(cct, 1) << __func__ << " looking for local GID " << (cct->_conf->ms_async_rdma_local_gid)
+    << " of type " << (cct->_conf->ms_async_rdma_roce_ver) << dendl;
+  r = sscanf(cct->_conf->ms_async_rdma_local_gid.c_str(),
+	     "%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx"
+	     ":%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx",
+	     &cgid.raw[ 0], &cgid.raw[ 1],
+	     &cgid.raw[ 2], &cgid.raw[ 3],
+	     &cgid.raw[ 4], &cgid.raw[ 5],
+	     &cgid.raw[ 6], &cgid.raw[ 7],
+	     &cgid.raw[ 8], &cgid.raw[ 9],
+	     &cgid.raw[10], &cgid.raw[11],
+	     &cgid.raw[12], &cgid.raw[13],
+	     &cgid.raw[14], &cgid.raw[15]);
+
+  if (r != 16) {  // all 16 GID bytes must parse
+    ldout(cct, 1) << __func__ << " malformed or no GID supplied, using GID index 0" << dendl;
+    malformed = true;
+  }
+
+  gid_attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE;
+
+  for (gid_idx = 0; gid_idx < port_attr->gid_tbl_len; gid_idx++) {
+    r = ibv_query_gid(ctxt, port_num, gid_idx, &gid);
+    if (r) {
+      lderr(cct) << __func__  << " query gid of port " << port_num << " index " << gid_idx << " failed  " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    r = ibv_exp_query_gid_attr(ctxt, port_num, gid_idx, &gid_attr);
+    if (r) {
+      lderr(cct) << __func__  << " query gid attributes of port " << port_num << " index " << gid_idx << " failed  " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+
+    if (malformed) break; // stay with gid_idx=0
+    if ( (gid_attr.type == cct->_conf->ms_async_rdma_roce_ver) &&
+	 (memcmp(&gid, &cgid, 16) == 0) ) {
+      ldout(cct, 1) << __func__ << " found at index " << gid_idx << dendl;
+      break;
+    }
+  }
+
+  if (gid_idx == port_attr->gid_tbl_len) {  // loop ran off the end of the table without a match
+    lderr(cct) << __func__ << " Requested local GID was not found in GID table" << dendl;
+    ceph_abort();
+  }
+#else
+  int r = ibv_query_port(ctxt, port_num, port_attr);
+  if (r != 0) {  // ibv_query_port() reports failure as a non-zero errno value, not -1
+    lderr(cct) << __func__  << " query port failed  " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+
+  lid = port_attr->lid;
+  r = ibv_query_gid(ctxt, port_num, 0, &gid);  // without experimental verbs, GID index 0 is always used
+  if (r) {
+    lderr(cct) << __func__  << " query gid failed  " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+#endif
+}
+
+
+Device::Device(CephContext *cct, ibv_device* d, struct ibv_context *dc)  // dc is only used when ms_async_rdma_cm is set
+  : device(d), device_attr(new ibv_device_attr), active_port(nullptr)
+{
+  if (device == NULL) {
+    lderr(cct) << __func__ << " device == NULL" << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  name = ibv_get_device_name(device);
+  if (cct->_conf->ms_async_rdma_cm) {
+    ctxt = dc;  // context supplied by the caller
+  } else {
+    ctxt = ibv_open_device(device);
+  }
+  if (ctxt == NULL) {
+    lderr(cct) << __func__ << " open rdma device failed. " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  int r = ibv_query_device(ctxt, device_attr);
+  if (r != 0) {  // ibv_query_device() reports failure as a non-zero errno value, not -1
+    lderr(cct) << __func__ << " failed to query rdma device. " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+}
+
+void Device::binding_port(CephContext *cct, int port_num) {  // selects port_num as active_port if it is in IBV_PORT_ACTIVE state
+  port_cnt = device_attr->phys_port_cnt;
+  for (uint8_t i = 0; i < port_cnt; ++i) {
+    Port *port = new Port(cct, ctxt, i+1);  // probe ports are numbered i+1, i.e. starting at 1
+    if (i + 1 == port_num && port->get_port_attr()->state == IBV_PORT_ACTIVE) {
+      active_port = port;  // kept: the break below skips the delete
+      ldout(cct, 1) << __func__ << " found active port " << i+1 << dendl;
+      break;
+    } else {
+      ldout(cct, 10) << __func__ << " port " << i+1 << " is not what we want. state: " << port->get_port_attr()->state << ")"<< dendl;
+    }
+    delete port;  // not the requested active port: discard the probe object
+  }
+  if (nullptr == active_port) {
+    lderr(cct) << __func__ << "  port not found" << dendl;
+    ceph_assert(active_port);  // always fails here: no usable port is fatal
+  }
+}
+
+
+Infiniband::QueuePair::QueuePair(
+    CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+    int port, ibv_srq *srq,
+    Infiniband::CompletionQueue* txcq, Infiniband::CompletionQueue* rxcq,
+    uint32_t tx_queue_len, uint32_t rx_queue_len, struct rdma_cm_id *cid, uint32_t q_key)
+: cct(c), infiniband(infiniband),
+  type(type),
+  ctxt(infiniband.device->ctxt),
+  ib_physical_port(port),
+  pd(infiniband.pd->pd),
+  srq(srq),
+  qp(NULL),  // created later by init()
+  cm_id(cid),
+  txcq(txcq),
+  rxcq(rxcq),
+  initial_psn(0),  // placeholder; overwritten in the body
+  max_send_wr(tx_queue_len),
+  max_recv_wr(rx_queue_len),
+  q_key(q_key),
+  dead(false)
+{
+  initial_psn = lrand48() & 0xffffff;  // random value masked to the low 24 bits
+  if (type != IBV_QPT_RC && type != IBV_QPT_UD && type != IBV_QPT_RAW_PACKET) {  // any other QP type aborts
+    lderr(cct) << __func__ << " invalid queue pair type" << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  pd = infiniband.pd->pd;  // repeats the member initializer above
+}
+
+int Infiniband::QueuePair::init()  // creates the QP and, without rdma_cm, moves it RESET -> INIT; returns 0 or -1
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  ibv_qp_init_attr qpia;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&qpia, 0, sizeof(qpia));
+  qpia.send_cq = txcq->get_cq();
+  qpia.recv_cq = rxcq->get_cq();
+  if (srq) {
+    qpia.srq = srq;                      // use the same shared receive queue
+  } else {
+    qpia.cap.max_recv_wr = max_recv_wr;
+    qpia.cap.max_recv_sge = 1;
+  }
+  qpia.cap.max_send_wr  = max_send_wr; // max outstanding send requests
+  qpia.cap.max_send_sge = 1;           // max send scatter-gather elements
+  qpia.cap.max_inline_data = MAX_INLINE_DATA;          // max bytes of immediate data on send q
+  qpia.qp_type = type;                 // RC, UC, UD, or XRC
+  qpia.sq_sig_all = 0;                 // only generate CQEs on requested WQEs
+
+  if (!cct->_conf->ms_async_rdma_cm) {
+    qp = ibv_create_qp(pd, &qpia);
+    if (qp == NULL) {
+      const int err = errno;  // capture before logging can overwrite errno
+      lderr(cct) << __func__ << " failed to create queue pair" << cpp_strerror(err) << dendl;
+      if (err == ENOMEM) {
+        lderr(cct) << __func__ << " try reducing ms_async_rdma_receive_queue_length, "
+                                  " ms_async_rdma_send_buffers or" " ms_async_rdma_buffer_size" << dendl;
+      }
+      return -1;
+    }
+  } else {
+    ceph_assert(cm_id->verbs == pd->context);
+    if (rdma_create_qp(cm_id, pd, &qpia)) {
+      lderr(cct) << __func__ << " failed to create queue pair with rdmacm library"
+                 << cpp_strerror(errno) << dendl;
+      return -1;
+    }
+    qp = cm_id->qp;
+  }
+  ldout(cct, 20) << __func__ << " successfully create queue pair: "
+                 << "qp=" << qp << dendl;
+
+  if (cct->_conf->ms_async_rdma_cm)  // with rdma_cm the explicit INIT transition below is skipped
+    return 0;
+
+  // move from RESET to INIT state
+  ibv_qp_attr qpa;
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state   = IBV_QPS_INIT;
+  qpa.pkey_index = 0;
+  qpa.port_num   = (uint8_t)(ib_physical_port);
+  qpa.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
+  qpa.qkey       = q_key;
+
+  int mask = IBV_QP_STATE | IBV_QP_PORT;
+  switch (type) {  // extra attribute bits depend on the QP type
+    case IBV_QPT_RC:
+      mask |= IBV_QP_ACCESS_FLAGS;
+      mask |= IBV_QP_PKEY_INDEX;
+      break;
+    case IBV_QPT_UD:
+      mask |= IBV_QP_QKEY;
+      mask |= IBV_QP_PKEY_INDEX;
+      break;
+    case IBV_QPT_RAW_PACKET:
+      break;
+    default:
+      ceph_abort();
+  }
+
+  int ret = ibv_modify_qp(qp, &qpa, mask);
+  if (ret) {  // ibv_modify_qp() returns the errno value itself on failure
+    lderr(cct) << __func__ << " failed to transition to INIT state: " << cpp_strerror(ret) << dendl;
+    ibv_destroy_qp(qp);
+    qp = NULL;  // do not keep a pointer to the destroyed QP
+    return -1;
+  }
+  ldout(cct, 20) << __func__ << " successfully change queue pair to INIT:"
+                 << " qp=" << qp << dendl;
+  return 0;
+}
+
+/**
+ * Change RC QueuePair into the ERROR state. It is necessary to modify
+ * the Queue Pair into the Error state and poll all of the relevant
+ * Work Completions prior to destroying a Queue Pair, since destroying
+ * a Queue Pair does not guarantee that its Work Completions are removed
+ * from the CQ upon destruction. Even if the Work Completions are already
+ * in the CQ, it might not be possible to retrieve them. If the Queue Pair
+ * is associated with an SRQ, it is recommended to wait for the affiliated
+ * event IBV_EVENT_QP_LAST_WQE_REACHED.
+ *
+ * \return
+ *      the negated error code reported by ibv_modify_qp() if the
+ *      QueuePair can't switch to ERROR
+ *      0 for success (also when the QueuePair is already marked dead).
+ */
+int Infiniband::QueuePair::to_dead()
+{
+  if (dead)
+    return 0;
+  ibv_qp_attr qpa;
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_ERR;
+
+  int mask = IBV_QP_STATE;
+  int ret = ibv_modify_qp(qp, &qpa, mask);
+  if (ret) {
+    // the verbs call returns the error code itself; errno is not a reliable
+    // source here and could be 0, which would look like success to the caller
+    const int err = ret < 0 ? -ret : ret;
+    lderr(cct) << __func__ << " failed to transition to ERROR state: "
+               << cpp_strerror(err) << dendl;
+    return -err;
+  }
+  dead = true;
+  return ret;
+}
+
+/**
+ * Query the destination queue pair number of this QueuePair.
+ *
+ * \param[out] rqp
+ *      Receives the destination QP number; may be NULL to only run the query.
+ * \return
+ *      0 on success, -1 if the query fails.
+ */
+int Infiniband::QueuePair::get_remote_qp_number(uint32_t *rqp) const
+{
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  if (ibv_query_qp(qp, &attr, IBV_QP_DEST_QPN, &init_attr)) {
+    lderr(cct) << __func__ << " failed to query qp: "
+      << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (rqp != nullptr)
+    *rqp = attr.dest_qp_num;
+  return 0;
+}
+
+/**
+ * Query the destination LID stored in this QueuePair's address vector.
+ * LIDs are "local IDs" in infiniband terminology. They are short, locally
+ * routable addresses.
+ *
+ * \param[out] lid
+ *      Receives the destination LID; may be NULL to only run the query.
+ * \return
+ *      0 on success, -1 if the query fails.
+ */
+int Infiniband::QueuePair::get_remote_lid(uint16_t *lid) const
+{
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  if (ibv_query_qp(qp, &attr, IBV_QP_AV, &init_attr)) {
+    lderr(cct) << __func__ << " failed to query qp: "
+      << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (lid != nullptr)
+    *lid = attr.ah_attr.dlid;
+  return 0;
+}
+
+/**
+ * Get the state of a QueuePair.
+ *
+ * \return
+ *      the qp_state value reported by the query, or -1 if the query fails.
+ */
+int Infiniband::QueuePair::get_state() const
+{
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  const int rc = ibv_query_qp(qp, &attr, IBV_QP_STATE, &init_attr);
+  if (rc == 0)
+    return attr.qp_state;
+
+  lderr(cct) << __func__ << " failed to get state: "
+    << cpp_strerror(errno) << dendl;
+  return -1;
+}
+
+/**
+ * Return true if the queue pair is in an error state, false otherwise.
+ * A failed query is also reported as an error (returns true).
+ */
+bool Infiniband::QueuePair::is_error() const
+{
+  ibv_qp_attr qpa;
+  ibv_qp_init_attr qpia;
+
+  // NOTE(review): the query uses an all-ones attribute mask (-1) and the
+  // result is read from cur_qp_state, whereas get_state() uses IBV_QP_STATE
+  // and qp_state -- confirm that this difference is intended.
+  int r = ibv_query_qp(qp, &qpa, -1, &qpia);
+  if (r) {
+    lderr(cct) << __func__ << " failed to get state: "
+      << cpp_strerror(errno) << dendl;
+    return true;
+  }
+  return qpa.cur_qp_state == IBV_QPS_ERR;
+}
+
+
+// Build an empty completion channel wrapper: no verbs channel and no CQ are
+// attached yet, and no events are pending acknowledgement. The verbs channel
+// is created by init().
+Infiniband::CompletionChannel::CompletionChannel(CephContext *c, Infiniband &ib)
+  : cct(c), infiniband(ib), channel(NULL), cq(NULL), cq_events_that_need_ack(0)
+{
+}
+
+// Destroy the verbs completion channel, if one was created. A failure to
+// destroy it is logged and then treated as fatal.
+Infiniband::CompletionChannel::~CompletionChannel()
+{
+  if (channel) {
+    int r = ibv_destroy_comp_channel(channel);
+    // any non-zero return is a failure (the verbs call returns the error
+    // code itself), so make sure it is logged before the assert fires
+    if (r != 0)
+      lderr(cct) << __func__ << " failed to destroy cc: " << cpp_strerror(r) << dendl;
+    ceph_assert(r == 0);
+  }
+}
+
+/**
+ * Create the verbs completion channel and switch its file descriptor to
+ * non-blocking mode.
+ *
+ * \return
+ *      0 on success, -1 on failure. On failure no channel is left attached
+ *      to this object.
+ */
+int Infiniband::CompletionChannel::init()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  channel = ibv_create_comp_channel(infiniband.device->ctxt);
+  if (!channel) {
+    lderr(cct) << __func__ << " failed to create receive completion channel: "
+                          << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  int rc = NetHandler(cct).set_nonblock(channel->fd);
+  if (rc < 0) {
+    ibv_destroy_comp_channel(channel);
+    // the destructor destroys any non-NULL channel; clear the pointer so the
+    // channel is not destroyed a second time when this object is deleted
+    channel = NULL;
+    return -1;
+  }
+  return 0;
+}
+
+// Acknowledge, on the member cq, every CQ event counted since the last
+// acknowledgement and reset the pending counter.
+void Infiniband::CompletionChannel::ack_events()
+{
+  ibv_ack_cq_events(cq, cq_events_that_need_ack);
+  cq_events_that_need_ack = 0;
+}
+
+/**
+ * Fetch one completion event from the channel.
+ *
+ * \return
+ *      true if an event was retrieved, false otherwise. EAGAIN and EINTR
+ *      are not logged as errors.
+ */
+bool Infiniband::CompletionChannel::get_cq_event()
+{
+  // note: this local shadows the member cq; it receives the CQ that
+  // ibv_get_cq_event() reports the event for
+  ibv_cq *cq = NULL;
+  void *ev_ctx;
+  if (ibv_get_cq_event(channel, &cq, &ev_ctx)) {
+    if (errno != EAGAIN && errno != EINTR)
+      lderr(cct) << __func__ << " failed to retrieve CQ event: "
+                 << cpp_strerror(errno) << dendl;
+    return false;
+  }
+
+  /* accumulate the number of cq events that need to be acked, and
+   * acknowledge them in one batch once MAX_ACK_EVENT have been collected
+   */
+  if (++cq_events_that_need_ack == MAX_ACK_EVENT) {
+    ldout(cct, 20) << __func__ << " ack aq events." << dendl;
+    ibv_ack_cq_events(cq, MAX_ACK_EVENT);
+    cq_events_that_need_ack = 0;
+  }
+
+  return true;
+}
+
+
+// Destroy the verbs completion queue, if one was created. A failure to
+// destroy it is logged and then treated as fatal.
+Infiniband::CompletionQueue::~CompletionQueue()
+{
+  if (cq) {
+    int r = ibv_destroy_cq(cq);
+    // any non-zero return is a failure (the verbs call returns the error
+    // code itself), so make sure it is logged before the assert fires
+    if (r != 0)
+      lderr(cct) << __func__ << " failed to destroy cq: " << cpp_strerror(r) << dendl;
+    ceph_assert(r == 0);
+  }
+}
+
+/**
+ * Create the verbs completion queue on this object's completion channel,
+ * request notification for it and bind it to the channel.
+ *
+ * \return
+ *      0 on success, -1 if the CQ cannot be created or armed. When arming
+ *      fails the CQ is destroyed again and cq is reset to nullptr.
+ */
+int Infiniband::CompletionQueue::init()
+{
+  // `this` is handed to verbs as the CQ's user context pointer
+  cq = ibv_create_cq(infiniband.device->ctxt, queue_depth, this, channel->get_channel(), 0);
+  if (!cq) {
+    lderr(cct) << __func__ << " failed to create receive completion queue: "
+      << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  // second argument 0: ask for notification on any completion
+  if (ibv_req_notify_cq(cq, 0)) {
+    lderr(cct) << __func__ << " ibv_req_notify_cq failed: " << cpp_strerror(errno) << dendl;
+    ibv_destroy_cq(cq);
+    cq = nullptr;
+    return -1;
+  }
+
+  channel->bind_cq(cq);
+  ldout(cct, 20) << __func__ << " successfully create cq=" << cq << dendl;
+  return 0;
+}
+
+/**
+ * Request a new completion notification on the CQ.
+ *
+ * \param[in] solicite_only
+ *      Currently ignored: notification is always requested for every
+ *      completion (solicited_only argument 0).
+ * \return
+ *      the value returned by ibv_req_notify_cq(): 0 on success, non-zero
+ *      on failure.
+ */
+int Infiniband::CompletionQueue::rearm_notify(bool solicite_only)
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  int r = ibv_req_notify_cq(cq, 0);
+  // any non-zero return is a failure; the verbs call returns the error code
+  // itself, so a "< 0" test would miss it
+  if (r != 0)
+    lderr(cct) << __func__ << " failed to notify cq: " << cpp_strerror(r) << dendl;
+  return r;
+}
+
+/**
+ * Poll up to \a num_entries work completions into \a ret_wc_array.
+ *
+ * \return
+ *      the number of completions retrieved (possibly 0), or -1 on error.
+ */
+int Infiniband::CompletionQueue::poll_cq(int num_entries, ibv_wc *ret_wc_array) {
+  const int polled = ibv_poll_cq(cq, num_entries, ret_wc_array);
+  if (polled >= 0)
+    return polled;
+
+  lderr(cct) << __func__ << " poll_completion_queue occur met error: "
+    << cpp_strerror(errno) << dendl;
+  return -1;
+}
+
+
+// Allocate a verbs protection domain on the device's context.
+// Aborts the process if the allocation fails.
+Infiniband::ProtectionDomain::ProtectionDomain(CephContext *cct, Device *device)
+  : pd(ibv_alloc_pd(device->ctxt))
+{
+  if (pd == NULL) {
+    lderr(cct) << __func__ << " failed to allocate infiniband protection domain: " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+}
+
+// Release the protection domain. The result of ibv_dealloc_pd() is not checked.
+Infiniband::ProtectionDomain::~ProtectionDomain()
+{
+  ibv_dealloc_pd(pd);
+}
+
+
+// Describe a buffer of len bytes at b that belongs to memory region m.
+// The write/read offset starts at 0; bound is not set by this constructor.
+Infiniband::MemoryManager::Chunk::Chunk(ibv_mr* m, uint32_t len, char* b)
+  : mr(m), bytes(len), offset(0), buffer(b)
+{
+}
+
+// Nothing to release here: the destructor body is empty.
+Infiniband::MemoryManager::Chunk::~Chunk()
+{
+}
+
+// Set the current position inside the buffer.
+void Infiniband::MemoryManager::Chunk::set_offset(uint32_t o)
+{
+  offset = o;
+}
+
+// Return the current position inside the buffer.
+uint32_t Infiniband::MemoryManager::Chunk::get_offset()
+{
+  return offset;
+}
+
+// Set the end of the readable data (read() stops at this position).
+void Infiniband::MemoryManager::Chunk::set_bound(uint32_t b)
+{
+  bound = b;
+}
+
+// Rewind to the start of the buffer and mark the first b bytes as readable.
+void Infiniband::MemoryManager::Chunk::prepare_read(uint32_t b)
+{
+  offset = 0;
+  bound = b;
+}
+
+// Return the end of the readable data.
+uint32_t Infiniband::MemoryManager::Chunk::get_bound()
+{
+  return bound;
+}
+
+/**
+ * Copy up to \a len readable bytes (those between offset and bound) into
+ * \a buf.
+ *
+ * If at least \a len bytes are available the offset advances by \a len.
+ * Otherwise everything that is left is copied and the chunk is reset
+ * (offset and bound both become 0).
+ *
+ * \return
+ *      the number of bytes copied.
+ */
+uint32_t Infiniband::MemoryManager::Chunk::read(char* buf, uint32_t len)
+{
+  const uint32_t available = bound - offset;
+  if (len > available) {
+    // not enough data for the whole request: drain the chunk and reset it
+    memcpy(buf, buffer+offset, available);
+    offset = 0;
+    bound = 0;
+    return available;
+  }
+
+  memcpy(buf, buffer+offset, len);
+  offset += len;
+  return len;
+}
+
+/**
+ * Append up to \a len bytes from \a buf at the current offset.
+ *
+ * If the data does not fit, only the remaining capacity is filled and the
+ * offset ends up at the end of the buffer.
+ *
+ * \return
+ *      the number of bytes copied.
+ */
+uint32_t Infiniband::MemoryManager::Chunk::write(char* buf, uint32_t len)
+{
+  const uint32_t room = bytes - offset;
+  if (len > room) {
+    // fill whatever space is left; the chunk is full afterwards
+    memcpy(buffer+offset, buf, room);
+    offset = bytes;
+    return room;
+  }
+
+  memcpy(buffer+offset, buf, len);
+  offset += len;
+  return len;
+}
+
+// True when the offset has reached the buffer size, i.e. nothing more can be written.
+bool Infiniband::MemoryManager::Chunk::full()
+{
+  return offset == bytes;
+}
+
+// True when the offset has reached bound, i.e. all readable data was consumed.
+bool Infiniband::MemoryManager::Chunk::over()
+{
+  return offset == bound;
+}
+
+// Reset the chunk for reuse: position and readable bound both go back to 0.
+void Infiniband::MemoryManager::Chunk::clear()
+{
+  offset = 0;
+  bound = 0;
+}
+
+// Create an empty cluster whose chunks will each be s bytes large.
+// No memory is allocated until fill() is called.
+Infiniband::MemoryManager::Cluster::Cluster(MemoryManager& m, uint32_t s)
+  : manager(m), buffer_size(s), lock("cluster_lock")
+{
+}
+
+// Undo fill(): deregister the memory region (taken from the first chunk),
+// run the destructor of every placement-constructed Chunk, then free the
+// descriptor array and the data buffer.
+// NOTE(review): chunk_base is dereferenced unconditionally, so this
+// presumably relies on fill() having been called -- confirm against callers.
+Infiniband::MemoryManager::Cluster::~Cluster()
+{
+  int r = ibv_dereg_mr(chunk_base->mr);
+  ceph_assert(r == 0);
+  const auto chunk_end = chunk_base + num_chunk;
+  for (auto chunk = chunk_base; chunk != chunk_end; chunk++) {
+    chunk->~Chunk();
+  }
+
+  // the descriptors came from ::malloc, the data buffer from the manager
+  ::free(chunk_base);
+  manager.free(base);
+}
+
+/**
+ * Allocate and register the memory for \a num chunks of buffer_size bytes
+ * each and put all of them on the free list. May only be called once.
+ *
+ * Allocation or registration failures are fatal (assert).
+ *
+ * \return
+ *      always 0.
+ */
+int Infiniband::MemoryManager::Cluster::fill(uint32_t num)
+{
+  ceph_assert(!base);
+  num_chunk = num;
+  uint32_t bytes = buffer_size * num;
+
+  base = (char*)manager.malloc(bytes);
+  // validate the allocation before deriving any pointer from it
+  ceph_assert(base);
+  end = base + bytes;
+  chunk_base = static_cast<Chunk*>(::malloc(sizeof(Chunk) * num));
+  // the descriptor array is written to right below; it must exist
+  ceph_assert(chunk_base);
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(static_cast<void*>(chunk_base), 0, sizeof(Chunk) * num);
+  free_chunks.reserve(num);
+  ibv_mr* m = ibv_reg_mr(manager.pd->pd, base, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+  ceph_assert(m);
+  // carve the data buffer into chunks, constructing each descriptor in place
+  Chunk* chunk = chunk_base;
+  for (uint32_t offset = 0; offset < bytes; offset += buffer_size){
+    new(chunk) Chunk(m, buffer_size, base+offset);
+    free_chunks.push_back(chunk);
+    chunk++;
+  }
+  return 0;
+}
+
+// Return chunks to the free list under the cluster lock; each chunk is
+// cleared before it becomes available again.
+void Infiniband::MemoryManager::Cluster::take_back(std::vector<Chunk*> &ck)
+{
+  Mutex::Locker l(lock);
+  for (auto it = ck.begin(); it != ck.end(); ++it) {
+    Chunk *returned = *it;
+    returned->clear();
+    free_chunks.push_back(returned);
+  }
+}
+
+/**
+ * Hand out free chunks, appending them to \a chunks.
+ *
+ * \param[in] bytes
+ *      Amount of data the caller wants to store. Enough chunks to hold it
+ *      are handed out, or fewer if the free list runs short. 0 means
+ *      "give me every free chunk".
+ * \return
+ *      the number of chunks appended to \a chunks (0 if none are free).
+ */
+int Infiniband::MemoryManager::Cluster::get_buffers(std::vector<Chunk*> &chunks, size_t bytes)
+{
+  // number of chunks needed to hold `bytes`, rounded up
+  uint32_t wanted = bytes / buffer_size + (bytes % buffer_size != 0 ? 1 : 0);
+
+  Mutex::Locker l(lock);
+  if (free_chunks.empty())
+    return 0;
+
+  if (bytes == 0) {
+    const int handed_out = free_chunks.size();
+    chunks.insert(chunks.end(), free_chunks.begin(), free_chunks.end());
+    free_chunks.clear();
+    return handed_out;
+  }
+
+  if (free_chunks.size() < wanted)
+    wanted = free_chunks.size();
+
+  // take chunks from the back of the free list
+  for (uint32_t taken = 0; taken != wanted; ++taken) {
+    chunks.push_back(free_chunks.back());
+    free_chunks.pop_back();
+  }
+  return wanted;
+}
+
+/**
+ * Check whether nbufs more receive buffers may be allocated without
+ * exceeding ms_async_rdma_receive_buffers.
+ *
+ * \return
+ *      true if the allocation is allowed (always true when the limit is
+ *      <= 0), false -- after logging a warning -- if it would exceed the limit.
+ */
+bool Infiniband::MemoryManager::MemPoolContext::can_alloc(unsigned nbufs)
+{
+  /* unlimited */
+  if (manager->cct->_conf->ms_async_rdma_receive_buffers <= 0)
+    return true;
+
+  if (n_bufs_allocated + nbufs > (unsigned)manager->cct->_conf->ms_async_rdma_receive_buffers) {
+    lderr(manager->cct) << __func__ << " WARNING: OUT OF RX BUFFERS: allocated: " <<
+        n_bufs_allocated << " requested: " << nbufs <<
+        " limit: " << manager->cct->_conf->ms_async_rdma_receive_buffers << dendl;
+    return false;
+  }
+
+  return true;
+}
+
+// Attach (or detach, with nullptr) the perf counters used for buffer
+// statistics. A non-null logger is immediately seeded with the number of
+// buffers already allocated.
+void Infiniband::MemoryManager::MemPoolContext::set_stat_logger(PerfCounters *logger) {
+  perf_logger = logger;
+  if (perf_logger != nullptr)
+    perf_logger->set(l_msgr_rdma_rx_bufs_total, n_bufs_allocated);
+}
+
+// Adjust the allocated-buffer count by nbufs (negative when buffers are
+// released) and mirror the change into the perf counter, if one is set.
+void Infiniband::MemoryManager::MemPoolContext::update_stats(int nbufs)
+{
+  n_bufs_allocated += nbufs;
+
+  // the count is tracked even when no logger has been attached yet
+  if (!perf_logger)
+    return;
+
+  if (nbufs > 0) {
+    perf_logger->inc(l_msgr_rdma_rx_bufs_total, nbufs);
+  } else {
+    // dec() takes a magnitude, so negate the (non-positive) delta
+    perf_logger->dec(l_msgr_rdma_rx_bufs_total, -nbufs);
+  }
+}
+
+// Allocate from the boost pool while holding PoolAllocator::lock. The pool
+// context is published through PoolAllocator::g_ctx for the duration of the
+// call so that PoolAllocator::malloc() can find it, and cleared afterwards.
+void *Infiniband::MemoryManager::mem_pool::slow_malloc()
+{
+  void *p;
+
+  Mutex::Locker l(PoolAllocator::lock);
+  PoolAllocator::g_ctx = ctx;
+  // this will trigger pool expansion via PoolAllocator::malloc()
+  p = boost::pool<PoolAllocator>::malloc();
+  PoolAllocator::g_ctx = nullptr;
+  return p;
+}
+
+Infiniband::MemoryManager::MemPoolContext *Infiniband::MemoryManager::PoolAllocator::g_ctx = nullptr;
+Mutex Infiniband::MemoryManager::PoolAllocator::lock("pool-alloc-lock");
+
+/**
+ * Allocate and register one block of receive buffers for the pool.
+ *
+ * The block consists of a mem_info header followed by `bytes` bytes of
+ * chunks; each chunk occupies sizeof(Chunk) + ms_async_rdma_buffer_size
+ * bytes. Only the chunk area is registered as a memory region.
+ *
+ * The lock is taken by mem_pool::slow_malloc(), which also sets g_ctx.
+ *
+ * \return
+ *      pointer to the first chunk, or NULL if the buffer limit would be
+ *      exceeded, or if the allocation or the registration fails.
+ */
+char *Infiniband::MemoryManager::PoolAllocator::malloc(const size_type bytes)
+{
+  mem_info *m;
+  Chunk *ch;
+  size_t rx_buf_size;
+  unsigned nbufs;
+  MemoryManager *manager;
+  CephContext *cct;
+
+  ceph_assert(g_ctx);
+  manager     = g_ctx->manager;
+  cct         = manager->cct;
+  rx_buf_size = sizeof(Chunk) + cct->_conf->ms_async_rdma_buffer_size;
+  nbufs       = bytes/rx_buf_size;
+
+  if (!g_ctx->can_alloc(nbufs))
+    return NULL;
+
+  // room for the bookkeeping header in front of the chunks
+  m = static_cast<mem_info *>(manager->malloc(bytes + sizeof(*m)));
+  if (!m) {
+    lderr(cct) << __func__ << " failed to allocate " <<
+        bytes << " + " << sizeof(*m) << " bytes of memory for " << nbufs << dendl;
+    return NULL;
+  }
+
+  m->mr = ibv_reg_mr(manager->pd->pd, m->chunks, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+  if (m->mr == NULL) {
+    lderr(cct) << __func__ << " failed to register " <<
+        bytes << " + " << sizeof(*m) << " bytes of memory for " << nbufs << dendl;
+    manager->free(m);
+    return NULL;
+  }
+
+  m->nbufs = nbufs;
+  // save this chunk context
+  m->ctx   = g_ctx;
+
+  // note that the memory can be allocated before perf logger is set
+  g_ctx->update_stats(nbufs);
+
+  /* initialize chunks */
+  ch = m->chunks;
+  for (unsigned i = 0; i < nbufs; i++) {
+    ch->lkey   = m->mr->lkey;
+    ch->bytes  = cct->_conf->ms_async_rdma_buffer_size;
+    ch->offset = 0;
+    ch->buffer = ch->data; // TODO: refactor tx and remove buffer
+    // chunks are variable-sized records, so step by bytes rather than by Chunk
+    ch = reinterpret_cast<Chunk *>(reinterpret_cast<char *>(ch) + rx_buf_size);
+  }
+
+  return reinterpret_cast<char *>(m->chunks);
+}
+
+
+// Release a block previously returned by PoolAllocator::malloc(): update the
+// buffer statistics, deregister the memory region and free the memory.
+// Takes PoolAllocator::lock itself.
+void Infiniband::MemoryManager::PoolAllocator::free(char * const block)
+{
+  mem_info *m;
+  Mutex::Locker l(lock);
+    
+  // step back from the chunk area to the mem_info header in front of it
+  m = reinterpret_cast<mem_info *>(block) - 1;
+  m->ctx->update_stats(-m->nbufs);
+  ibv_dereg_mr(m->mr);
+  m->ctx->manager->free(m);
+}
+
+// Set up the receive buffer pool. Each pool element is one Chunk header plus
+// ms_async_rdma_buffer_size bytes of data. The last argument passed to
+// rxbuf_pool is, per the comments below, the initial pool size:
+// 2 * receive_queue_len, capped at ms_async_rdma_receive_buffers when that
+// limit is set (> 0).
+Infiniband::MemoryManager::MemoryManager(CephContext *c, Device *d, ProtectionDomain *p)
+  : cct(c), device(d), pd(p),
+    rxbuf_pool_ctx(this),
+    rxbuf_pool(&rxbuf_pool_ctx, sizeof(Chunk) + c->_conf->ms_async_rdma_buffer_size,
+               c->_conf->ms_async_rdma_receive_buffers > 0 ?
+                  // if possible make initial pool size 2 * receive_queue_len
+                  // that way there will be no pool expansion upon receive of the
+                  // first packet.
+                  (c->_conf->ms_async_rdma_receive_buffers < 2 * c->_conf->ms_async_rdma_receive_queue_len ?
+                   c->_conf->ms_async_rdma_receive_buffers :  2 * c->_conf->ms_async_rdma_receive_queue_len) :
+                  // rx pool is infinite, we can set any initial size that we want
+                   2 * c->_conf->ms_async_rdma_receive_queue_len)
+{
+}
+
+// Destroy the send cluster created by create_tx_pool(), if any.
+Infiniband::MemoryManager::~MemoryManager()
+{
+  // deleting a null pointer is a no-op, so no explicit check is needed
+  delete send;
+}
+
+/**
+ * Allocate size bytes, backed by huge pages when possible.
+ *
+ * The allocation is enlarged by HUGE_PAGE_SIZE and rounded up to a multiple
+ * of HUGE_PAGE_SIZE. The first bytes of the allocation hold the mapped size
+ * and the caller receives a pointer HUGE_PAGE_SIZE past the start. If the
+ * huge page mapping fails, std::malloc() is used instead and the stored
+ * size is 0, which huge_pages_free() uses to tell the two cases apart.
+ *
+ * \return
+ *      pointer to the usable memory, or NULL if both allocations fail.
+ */
+void* Infiniband::MemoryManager::huge_pages_malloc(size_t size)
+{
+  size_t real_size = ALIGN_TO_PAGE_SIZE(size + HUGE_PAGE_SIZE);
+  char *ptr = (char *)mmap(NULL, real_size, PROT_READ | PROT_WRITE,MAP_PRIVATE | MAP_ANONYMOUS |MAP_POPULATE | MAP_HUGETLB,-1, 0);
+  if (ptr == MAP_FAILED) {
+    ptr = (char *)std::malloc(real_size);
+    if (ptr == NULL) return NULL;
+    // 0 marks "allocated with malloc, not mmap"
+    real_size = 0;
+  }
+  *((size_t *)ptr) = real_size;
+  return ptr + HUGE_PAGE_SIZE;
+}
+
+// Free memory obtained from huge_pages_malloc(). The size stored in front of
+// the user pointer selects the release method: non-zero means the block was
+// mmap()ed, zero means it came from std::malloc(). NULL is ignored.
+void Infiniband::MemoryManager::huge_pages_free(void *ptr)
+{
+  if (ptr == NULL) return;
+  // step back to the real start of the allocation, where the size is stored
+  void *real_ptr = (char *)ptr -HUGE_PAGE_SIZE;
+  size_t real_size = *((size_t *)real_ptr);
+  ceph_assert(real_size % HUGE_PAGE_SIZE == 0);
+  if (real_size != 0)
+    munmap(real_ptr, real_size);
+  else
+    std::free(real_ptr);
+}
+
+
+// Allocate size bytes with the allocator selected by
+// ms_async_rdma_enable_hugepage: huge_pages_malloc() when set,
+// std::malloc() otherwise.
+void* Infiniband::MemoryManager::malloc(size_t size)
+{
+  if (!cct->_conf->ms_async_rdma_enable_hugepage)
+    return std::malloc(size);
+  return huge_pages_malloc(size);
+}
+
+// Release memory with the deallocator selected by
+// ms_async_rdma_enable_hugepage: huge_pages_free() when set,
+// std::free() otherwise.
+void Infiniband::MemoryManager::free(void *ptr)
+{
+  if (!cct->_conf->ms_async_rdma_enable_hugepage) {
+    std::free(ptr);
+    return;
+  }
+  huge_pages_free(ptr);
+}
+
+// Create the send buffer cluster: tx_num chunks of size bytes each.
+// Requires device and pd to be set.
+void Infiniband::MemoryManager::create_tx_pool(uint32_t size, uint32_t tx_num)
+{
+  ceph_assert(device);
+  ceph_assert(pd);
+
+  send = new Cluster(*this, size);
+  send->fill(tx_num);
+}
+
+// Give send chunks back to the send cluster.
+void Infiniband::MemoryManager::return_tx(std::vector<Chunk*> &chunks)
+{
+  send->take_back(chunks);
+}
+
+// Fetch send chunks for `bytes` bytes of data from the send cluster;
+// returns the number of chunks appended to c.
+int Infiniband::MemoryManager::get_send_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  return send->get_buffers(c, bytes);
+}
+
+static std::atomic<bool> init_prereq = {false};
+
+/**
+ * One-time process-wide preparation for RDMA:
+ *  - call ibv_fork_init() (aborts on failure),
+ *  - export RDMAV_HUGEPAGES_SAFE=1 when huge pages are enabled (aborts on
+ *    failure),
+ *  - warn if the locked-memory limit is not unlimited.
+ * Sets init_prereq once done.
+ */
+void Infiniband::verify_prereq(CephContext *cct) {
+
+  //On RDMA MUST be called before fork
+  int rc = ibv_fork_init();
+  if (rc) {
+    lderr(cct) << __func__ << " failed to call ibv_fork_init(). On RDMA must be called before fork. Application aborts." << dendl;
+    ceph_abort();
+  }
+
+  ldout(cct, 20) << __func__ << " ms_async_rdma_enable_hugepage value is: " << cct->_conf->ms_async_rdma_enable_hugepage <<  dendl;
+  if (cct->_conf->ms_async_rdma_enable_hugepage){
+    rc =  setenv("RDMAV_HUGEPAGES_SAFE","1",1);
+    // check the result first: if setenv() failed, getenv() may return NULL,
+    // which must not be streamed into the log
+    if (rc) {
+      lderr(cct) << __func__ << " failed to export RDMAV_HUGEPAGES_SAFE. On RDMA must be exported before using huge pages. Application aborts." << dendl;
+      ceph_abort();
+    }
+    ldout(cct, 0) << __func__ << " RDMAV_HUGEPAGES_SAFE is set as: " << getenv("RDMAV_HUGEPAGES_SAFE") <<  dendl;
+  }
+
+  //Check ulimit
+  struct rlimit limit;
+  if (getrlimit(RLIMIT_MEMLOCK, &limit) != 0) {
+    // limit is not filled in on failure, so do not inspect it
+    lderr(cct) << __func__ << " failed to query RLIMIT_MEMLOCK: " << cpp_strerror(errno) << dendl;
+  } else if (limit.rlim_cur != RLIM_INFINITY || limit.rlim_max != RLIM_INFINITY) {
+    lderr(cct) << __func__ << "!!! WARNING !!! For RDMA to work properly user memlock (ulimit -l) must be big enough to allow large amount of registered memory."
+                                  " We recommend setting this parameter to infinity" << dendl;
+  }
+  init_prereq = true;
+}
+
+// Record the configured device name and port number. No verbs resources are
+// acquired here (that happens in init()); the process-wide prerequisites are
+// verified if that has not been done yet.
+Infiniband::Infiniband(CephContext *cct)
+  : cct(cct), lock("IB lock"),
+    device_name(cct->_conf->ms_async_rdma_device_name),
+    port_num( cct->_conf->ms_async_rdma_port_num)
+{
+  if (!init_prereq)
+    verify_prereq(cct);
+  ldout(cct, 20) << __func__ << " constructing Infiniband..." << dendl;
+}
+
+/**
+ * Acquire the verbs resources: open the configured device and port,
+ * allocate the protection domain, size the receive and send queues from the
+ * device limits and the configuration, create the memory manager with its
+ * send pool and, when SRQ support is enabled, create and fill the shared
+ * receive queue.
+ *
+ * Runs under the object lock and does nothing after the first call.
+ * Unrecoverable problems (missing device, misconfiguration, SRQ creation
+ * failure) abort the process.
+ */
+void Infiniband::init()
+{
+  Mutex::Locker l(lock);
+
+  if (initialized)
+    return;
+
+  device_list = new DeviceList(cct);
+  initialized = true;
+
+  device = device_list->get_device(device_name.c_str());
+  ceph_assert(device);
+  device->binding_port(cct, port_num);
+  ib_physical_port = device->active_port->get_port_num();
+  pd = new ProtectionDomain(cct, device);
+  ceph_assert(NetHandler(cct).set_nonblock(device->ctxt->async_fd) == 0);
+
+  // start from the device limit, then clamp to the configured length
+  support_srq = cct->_conf->ms_async_rdma_support_srq;
+  if (support_srq)
+    rx_queue_len = device->device_attr->max_srq_wr;
+  else
+    rx_queue_len = device->device_attr->max_qp_wr;
+  if (rx_queue_len > cct->_conf->ms_async_rdma_receive_queue_len) {
+    rx_queue_len = cct->_conf->ms_async_rdma_receive_queue_len;
+    ldout(cct, 1) << __func__ << " receive queue length is " << rx_queue_len << " receive buffers" << dendl;
+  } else {
+    ldout(cct, 0) << __func__ << " requested receive queue length " <<
+                  cct->_conf->ms_async_rdma_receive_queue_len <<
+                  " is too big. Setting " << rx_queue_len << dendl;
+  }
+
+  // check for the misconfiguration
+  if (cct->_conf->ms_async_rdma_receive_buffers > 0 &&
+      rx_queue_len > (unsigned)cct->_conf->ms_async_rdma_receive_buffers) {
+    lderr(cct) << __func__ << " rdma_receive_queue_len (" <<
+                  rx_queue_len << ") > ms_async_rdma_receive_buffers(" <<
+                  cct->_conf->ms_async_rdma_receive_buffers << ")." << dendl;
+    ceph_abort();
+  }
+
+  tx_queue_len = device->device_attr->max_qp_wr;
+  if (tx_queue_len > cct->_conf->ms_async_rdma_send_buffers) {
+    tx_queue_len = cct->_conf->ms_async_rdma_send_buffers;
+    ldout(cct, 1) << __func__ << " assigning: " << tx_queue_len << " send buffers"  << dendl;
+  } else {
+    ldout(cct, 0) << __func__ << " using the max allowed send buffers: " << tx_queue_len << dendl;
+  }
+
+  ldout(cct, 1) << __func__ << " device allow " << device->device_attr->max_cqe
+                << " completion entries" << dendl;
+
+  memory_manager = new MemoryManager(cct, device, pd);
+  memory_manager->create_tx_pool(cct->_conf->ms_async_rdma_buffer_size, tx_queue_len);
+
+  if (support_srq) {
+    srq = create_shared_receive_queue(rx_queue_len, MAX_SHARED_RX_SGE_COUNT);
+    // a NULL srq would be passed straight to the verbs post/destroy calls;
+    // fail loudly here instead
+    if (srq == NULL) {
+      lderr(cct) << __func__ << " failed to create shared receive queue: "
+                 << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    post_chunks_to_rq(rx_queue_len, NULL); //add to srq
+  }
+}
+
+// Release what init() acquired: the shared receive queue (if used), the
+// memory manager and the protection domain. Nothing to do if init() never ran.
+// NOTE(review): device_list, allocated in init(), is not released here --
+// confirm whether that is intentional.
+Infiniband::~Infiniband()
+{
+  if (!initialized)
+    return;
+  if (support_srq)
+    ibv_destroy_srq(srq);
+  delete memory_manager;
+  delete pd;
+}
+
+/**
+ * Create a shared receive queue. This basically wraps the verbs call.
+ * The SRQ is created on this object's protection domain and carries the
+ * device context as its user context pointer.
+ *
+ * \param[in] max_wr
+ *      The max number of outstanding work requests in the SRQ.
+ * \param[in] max_sge
+ *      The max number of scatter elements per WR.
+ * \return
+ *      A valid ibv_srq pointer, or NULL on error.
+ */
+ibv_srq* Infiniband::create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge)
+{
+  ibv_srq_init_attr sia;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&sia, 0, sizeof(sia));
+  sia.srq_context = device->ctxt;
+  sia.attr.max_wr = max_wr;
+  sia.attr.max_sge = max_sge;
+  return ibv_create_srq(pd->pd, &sia);
+}
+
+// Fetch send chunks for `bytes` bytes of data from the memory manager;
+// returns the number of chunks appended to c.
+int Infiniband::get_tx_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  return memory_manager->get_send_buffers(c, bytes);
+}
+
+/**
+ * Create a new QueuePair. This factory should be used in preference to
+ * the QueuePair constructor directly, since this lets derivatives of
+ * Infiniband, e.g. MockInfiniband (if it existed),
+ * return mocked out QueuePair derivatives.
+ *
+ * The queue pair uses this object's physical port, shared receive queue
+ * and queue lengths.
+ *
+ * \return
+ *      QueuePair on success or NULL if init fails
+ * See QueuePair::QueuePair for parameter documentation.
+ */
+Infiniband::QueuePair* Infiniband::create_queue_pair(CephContext *cct, CompletionQueue *tx,
+    CompletionQueue* rx, ibv_qp_type type, struct rdma_cm_id *cm_id)
+{
+  Infiniband::QueuePair *created = new QueuePair(
+      cct, *this, type, ib_physical_port, srq, tx, rx, tx_queue_len, rx_queue_len, cm_id);
+  if (created->init() == 0)
+    return created;
+
+  // initialisation failed: do not hand out a half-built queue pair
+  delete created;
+  return NULL;
+}
+
+/**
+ * Take up to \a num receive buffers from the memory manager and post them
+ * as receive work requests, either to the shared receive queue (when SRQ
+ * support is enabled) or to \a qp.
+ *
+ * \param[in] num
+ *      Number of buffers to post. Values <= 0 post nothing.
+ * \param[in] qp
+ *      Queue pair to post to; only used (and then required) without SRQ.
+ * \return
+ *      the number of buffers actually posted, which is smaller than \a num
+ *      if the memory manager runs out of buffers.
+ */
+int Infiniband::post_chunks_to_rq(int num, ibv_qp *qp)
+{
+  // nothing to do; also keeps the arrays below from getting a zero or
+  // negative length and an uninitialised request from being posted
+  if (num <= 0)
+    return 0;
+
+  int ret, i = 0;
+  ibv_sge isge[num];
+  Chunk *chunk;
+  ibv_recv_wr rx_work_request[num];
+
+  while (i < num) {
+    chunk = get_memory_manager()->get_rx_buffer();
+    if (chunk == NULL) {
+      lderr(cct) << __func__ << " WARNING: out of memory. Requested " << num <<
+        " rx buffers. Got " << i << dendl;
+      if (i == 0)
+        return 0;
+      // if we got some buffers post them and hope for the best
+      rx_work_request[i-1].next = 0;
+      break;
+    }
+
+    isge[i].addr = reinterpret_cast<uint64_t>(chunk->data);
+    isge[i].length = chunk->bytes;
+    isge[i].lkey = chunk->lkey;
+
+    memset(&rx_work_request[i], 0, sizeof(rx_work_request[i]));
+    rx_work_request[i].wr_id = reinterpret_cast<uint64_t>(chunk);// stash descriptor ptr
+    // chain the requests into one list; the last one terminates it
+    if (i == num - 1) {
+      rx_work_request[i].next = 0;
+    } else {
+      rx_work_request[i].next = &rx_work_request[i+1];
+    }
+    rx_work_request[i].sg_list = &isge[i];
+    rx_work_request[i].num_sge = 1;
+    i++;
+  }
+  ibv_recv_wr *badworkrequest;
+  if (support_srq) {
+    ret = ibv_post_srq_recv(srq, &rx_work_request[0], &badworkrequest);
+    ceph_assert(ret == 0);
+  } else {
+    ceph_assert(qp);
+    ret = ibv_post_recv(qp, &rx_work_request[0], &badworkrequest);
+    ceph_assert(ret == 0);
+  }
+  return i;
+}
+
+// Create and initialise a completion channel; returns NULL if
+// initialisation fails.
+Infiniband::CompletionChannel* Infiniband::create_comp_channel(CephContext *c)
+{
+  Infiniband::CompletionChannel *created = new Infiniband::CompletionChannel(c, *this);
+  if (created->init() == 0)
+    return created;
+
+  delete created;
+  return NULL;
+}
+
+// Create and initialise a completion queue of CQ_DEPTH entries on the given
+// completion channel; returns NULL if initialisation fails.
+Infiniband::CompletionQueue* Infiniband::create_comp_queue(
+    CephContext *cct, CompletionChannel *cc)
+{
+  Infiniband::CompletionQueue *created = new Infiniband::CompletionQueue(
+      cct, *this, CQ_DEPTH, cc);
+  if (created->init() == 0)
+    return created;
+
+  delete created;
+  return NULL;
+}
+
+/**
+ * Read one connection-setup message ("lid:qpn:psn:peer_qpn:gid", all
+ * fields in hex) from socket \a sd and decode it into \a im.
+ *
+ * The message comes from the network, so it is treated as untrusted: it is
+ * explicitly terminated, the gid field is length-limited, and every field
+ * must parse before \a im is touched.
+ *
+ * \return
+ *      > 0 (the message length) if a valid message was decoded,
+ *      0 if the peer closed the connection (zero-length read),
+ *      < 0 on error: -errno from read(), or -EINVAL for a message of the
+ *      wrong length, a malformed message or an injected failure.
+ */
+int Infiniband::recv_msg(CephContext *cct, int sd, IBSYNMsg& im)
+{
+  // one extra byte so the buffer can always be NUL-terminated before parsing
+  char msg[TCP_MSG_LEN + 1];
+  char gid[33];
+  ssize_t r = ::read(sd, msg, TCP_MSG_LEN);
+  // Drop incoming qpt
+  if (cct->_conf->ms_inject_socket_failures && sd >= 0) {
+    if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+      return -EINVAL;
+    }
+  }
+  if (r < 0) {
+    r = -errno;
+    lderr(cct) << __func__ << " got error " << r << ": "
+               << cpp_strerror(r) << dendl;
+  } else if (r == 0) { // valid disconnect message of length 0
+    ldout(cct, 10) << __func__ << " got disconnect message " << dendl;
+  } else if ((size_t)r != TCP_MSG_LEN) { // invalid message
+    ldout(cct, 1) << __func__ << " got bad length (" << r << ") " << dendl;
+    r = -EINVAL;
+  } else {
+    msg[TCP_MSG_LEN] = '\0';
+    // parse into aligned locals; IBSYNMsg is packed, so pointers to its
+    // members must not be handed to sscanf
+    uint16_t lid = 0;
+    uint32_t qpn = 0, psn = 0, peer_qpn = 0;
+    // %32s bounds the write into gid[33] (32 characters plus terminator)
+    int fields = sscanf(msg, "%hx:%x:%x:%x:%32s", &lid, &qpn, &psn, &peer_qpn, gid);
+    if (fields != 5 || strlen(gid) != 32 ||
+        strspn(gid, "0123456789abcdefABCDEF") != 32) {
+      ldout(cct, 1) << __func__ << " got malformed message" << dendl;
+      r = -EINVAL;
+    } else { // valid message
+      im.lid = lid;
+      im.qpn = qpn;
+      im.psn = psn;
+      im.peer_qpn = peer_qpn;
+      wire_gid_to_gid(gid, &(im.gid));
+      ldout(cct, 5) << __func__ << " recevd: " << lid << ", " << qpn << ", " << psn << ", " << peer_qpn << ", " << gid  << dendl;
+    }
+  }
+  return r;
+}
+
+/**
+ * Encode \a im as "lid:qpn:psn:peer_qpn:gid" (all fields in hex) and write
+ * it to socket \a sd. A write interrupted by EINTR or EAGAIN is retried up
+ * to three times.
+ *
+ * \return
+ *      0 on success,
+ *      -errno if write() fails,
+ *      -EIO if only part of the message could be written,
+ *      -EINVAL for an injected failure.
+ */
+int Infiniband::send_msg(CephContext *cct, int sd, IBSYNMsg& im)
+{
+  int retry = 0;
+  int err = 0;
+  ssize_t r;
+
+  char msg[TCP_MSG_LEN];
+  char gid[33];
+retry:
+  gid_to_wire_gid(&(im.gid), gid);
+  snprintf(msg, sizeof(msg), "%04x:%08x:%08x:%08x:%s", im.lid, im.qpn, im.psn, im.peer_qpn, gid);
+  ldout(cct, 10) << __func__ << " sending: " << im.lid << ", " << im.qpn << ", " << im.psn
+                 << ", " << im.peer_qpn << ", "  << gid  << dendl;
+  r = ::write(sd, msg, sizeof(msg));
+  // capture errno right away; the logging below may overwrite it
+  err = errno;
+  // Drop incoming qpt
+  if (cct->_conf->ms_inject_socket_failures && sd >= 0) {
+    if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+      return -EINVAL;
+    }
+  }
+
+  if ((size_t)r != sizeof(msg)) {
+    // FIXME need to handle EAGAIN instead of retry
+    if (r < 0 && (err == EINTR || err == EAGAIN) && retry < 3) {
+      retry++;
+      goto retry;
+    }
+    if (r < 0) {
+      lderr(cct) << __func__ << " send returned error " << err << ": "
+                 << cpp_strerror(err) << dendl;
+      return -err;
+    }
+    // a short write does not set errno, so report it with a fixed error
+    // code rather than a stale (possibly zero) errno
+    lderr(cct) << __func__ << " send got bad length (" << r << ") " << dendl;
+    return -EIO;
+  }
+  return 0;
+}
+
+/**
+ * Decode a GID from its wire form: 32 hex characters, read as four groups
+ * of eight, each group holding one 32-bit word.
+ *
+ * \param[in] wgid
+ *      At least 32 characters of hex text (no terminator required).
+ * \param[out] gid
+ *      Receives the 16 raw GID bytes. A group that is not valid hex
+ *      decodes as zero.
+ */
+void Infiniband::wire_gid_to_gid(const char *wgid, union ibv_gid *gid)
+{
+  char tmp[9];
+  tmp[8] = 0;
+
+  for (int i = 0; i < 4; ++i) {
+    // start from a defined value in case the group does not parse
+    uint32_t v32 = 0;
+    memcpy(tmp, wgid + i * 8, 8);
+    sscanf(tmp, "%x", &v32);
+    // copy the bytes instead of storing through a cast pointer, which would
+    // assume alignment and break aliasing rules
+    const uint32_t word = ntohl(v32);
+    memcpy(&gid->raw[i * 4], &word, sizeof(word));
+  }
+}
+
+// Encode the 16 raw GID bytes as 32 hex characters plus a terminating NUL
+// (four 32-bit words, eight digits each). wgid must have room for 33 bytes.
+void Infiniband::gid_to_wire_gid(const union ibv_gid *gid, char wgid[])
+{
+  for (int word_idx = 0; word_idx < 4; ++word_idx) {
+    uint32_t word;
+    memcpy(&word, gid->raw + word_idx * 4, sizeof(word));
+    sprintf(&wgid[word_idx * 8], "%08x", htonl(word));
+  }
+}
+
+// Destroy the verbs queue pair, if one exists. A failure to destroy it is
+// treated as fatal.
+Infiniband::QueuePair::~QueuePair()
+{
+  if (qp) {
+    ldout(cct, 20) << __func__ << " destroy qp=" << qp << dendl;
+    // keep the call outside the assertion so that the destroy never depends
+    // on how the assert macro evaluates its argument
+    int r = ibv_destroy_qp(qp);
+    ceph_assert(r == 0);
+  }
+}
+
+/**
+ * Return a string representation of the `status' field from the Verbs
+ * struct `ibv_wc'.
+ *
+ * \param[in] status
+ *      The integer status obtained in ibv_wc.status.
+ * \return
+ *      A string corresponding to the given status, or a fixed
+ *      "out of range" marker for values outside
+ *      [IBV_WC_SUCCESS, IBV_WC_GENERAL_ERR].
+ */
+const char* Infiniband::wc_status_to_string(int status)
+{
+  // names indexed by status value
+  static const char *status_names[] = {
+      "SUCCESS",
+      "LOC_LEN_ERR",
+      "LOC_QP_OP_ERR",
+      "LOC_EEC_OP_ERR",
+      "LOC_PROT_ERR",
+      "WR_FLUSH_ERR",
+      "MW_BIND_ERR",
+      "BAD_RESP_ERR",
+      "LOC_ACCESS_ERR",
+      "REM_INV_REQ_ERR",
+      "REM_ACCESS_ERR",
+      "REM_OP_ERR",
+      "RETRY_EXC_ERR",
+      "RNR_RETRY_EXC_ERR",
+      "LOC_RDD_VIOL_ERR",
+      "REM_INV_RD_REQ_ERR",
+      "REM_ABORT_ERR",
+      "INV_EECN_ERR",
+      "INV_EEC_STATE_ERR",
+      "FATAL_ERR",
+      "RESP_TIMEOUT_ERR",
+      "GENERAL_ERR"
+  };
+
+  const bool in_range = status >= IBV_WC_SUCCESS && status <= IBV_WC_GENERAL_ERR;
+  return in_range ? status_names[status] : "<status out of range!>";
+}
+
+// Map a queue pair state value to the name of its IBV_QPS_* constant;
+// unknown values yield " out of range.".
+const char* Infiniband::qp_state_string(int status) {
+  const char *name = " out of range.";
+  switch(status) {
+    case IBV_QPS_RESET:
+      name = "IBV_QPS_RESET";
+      break;
+    case IBV_QPS_INIT:
+      name = "IBV_QPS_INIT";
+      break;
+    case IBV_QPS_RTR:
+      name = "IBV_QPS_RTR";
+      break;
+    case IBV_QPS_RTS:
+      name = "IBV_QPS_RTS";
+      break;
+    case IBV_QPS_SQD:
+      name = "IBV_QPS_SQD";
+      break;
+    case IBV_QPS_SQE:
+      name = "IBV_QPS_SQE";
+      break;
+    case IBV_QPS_ERR:
+      name = "IBV_QPS_ERR";
+      break;
+    default:
+      break;
+  }
+  return name;
+}
diff --git a/src/msg/async/rdma/Infiniband.h b/src/msg/async/rdma/Infiniband.h
new file mode 100644
index 00000000..2889cdfc
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.h
@@ -0,0 +1,529 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INFINIBAND_H
+#define CEPH_INFINIBAND_H
+
+#include <boost/pool/pool.hpp>
+// need this because boost messes with ceph log/assert definitions
+#include "include/ceph_assert.h"
+
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include <atomic>
+#include <string>
+#include <vector>
+
+#include "include/int_types.h"
+#include "include/page.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Mutex.h"
+#include "common/perf_counters.h"
+#include "msg/msg_types.h"
+#include "msg/async/net_handler.h"
+
+#define HUGE_PAGE_SIZE (2 * 1024 * 1024)
+#define ALIGN_TO_PAGE_SIZE(x) \
+  (((x) + HUGE_PAGE_SIZE -1) / HUGE_PAGE_SIZE * HUGE_PAGE_SIZE)
+
+// Handshake record passed to Infiniband::send_msg()/recv_msg().
+// The struct is packed, so no padding is inserted between the fields.
+struct IBSYNMsg {
+  uint16_t lid;       // sender's LID
+  uint32_t qpn;       // sender's queue pair number
+  uint32_t psn;       // sender's initial packet sequence number
+  uint32_t peer_qpn;  // peer QPN as known to the sender; 0 if not yet known
+  union ibv_gid gid;  // sender's GID
+} __attribute__((packed));
+
+class RDMAStack;
+class CephContext;
+
+// Per-port state: the device context it belongs to, the port number and
+// attributes, and the LID/GID returned by the getters below.
+// The constructor is defined elsewhere; nothing in this header changes the
+// fields after construction.
+class Port {
+  struct ibv_context* ctxt;
+  int port_num;
+  struct ibv_port_attr* port_attr;
+  uint16_t lid;
+  int gid_idx = 0;   // defaults to 0 unless the constructor overrides it
+  union ibv_gid gid;
+
+ public:
+  explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
+  uint16_t get_lid() { return lid; }
+  ibv_gid  get_gid() { return gid; }   // returned by value
+  int get_port_num() { return port_num; }
+  ibv_port_attr* get_port_attr() { return port_attr; }
+  int get_gid_idx() { return gid_idx; }
+};
+
+
+// Wraps one ibv_device together with its context and the Port currently
+// selected as active.
+class Device {
+  ibv_device *device;
+  const char* name;
+  uint8_t  port_cnt = 0;
+ public:
+  explicit Device(CephContext *c, ibv_device* d, struct ibv_context *dc);
+  // The context is closed only when an active port exists.
+  // NOTE(review): device_attr is not released here -- confirm who owns it.
+  ~Device() {
+    if (active_port) {
+      delete active_port;
+      ceph_assert(ibv_close_device(ctxt) == 0);
+    }
+  }
+  const char* get_name() { return name;}
+  // The three getters below dereference active_port without a null check.
+  uint16_t get_lid() { return active_port->get_lid(); }
+  ibv_gid get_gid() { return active_port->get_gid(); }
+  int get_gid_idx() { return active_port->get_gid_idx(); }
+  // Defined elsewhere; presumably selects active_port -- TODO confirm.
+  void binding_port(CephContext *c, int port_num);
+  struct ibv_context *ctxt;
+  ibv_device_attr *device_attr;
+  Port* active_port;
+};
+
+
+// Enumerates the RDMA devices of this host and owns one Device wrapper per
+// entry. Both underlying arrays are released again in the destructor.
+class DeviceList {
+  struct ibv_device ** device_list;           // from ibv_get_device_list()
+  struct ibv_context ** device_context_list;  // from rdma_get_devices()
+  int num;                                    // count written by the calls above
+  Device** devices;
+ public:
+  // Aborts when either enumeration fails or no device is present.
+  explicit DeviceList(CephContext *cct): device_list(ibv_get_device_list(&num)),
+                                device_context_list(rdma_get_devices(&num)) {
+    // device_context_list is indexed in the loop below, so a NULL result
+    // must be rejected here just like a NULL device_list.
+    if (device_list == NULL || device_context_list == NULL || num == 0) {
+      lderr(cct) << __func__ << " failed to get rdma device list.  " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    devices = new Device*[num];
+
+    for (int i = 0;i < num; ++i) {
+      devices[i] = new Device(cct, device_list[i], device_context_list[i]);
+    }
+  }
+  ~DeviceList() {
+    for (int i=0; i < num; ++i) {
+      delete devices[i];
+    }
+    delete []devices;
+    ibv_free_device_list(device_list);
+    // The array returned by rdma_get_devices() has to be handed back too,
+    // otherwise it is leaked.
+    rdma_free_devices(device_context_list);
+  }
+
+  // Return the device called device_name, or the first device when
+  // device_name is the empty string. NULL if no device matches.
+  Device* get_device(const char* device_name) {
+    ceph_assert(devices);
+    for (int i = 0; i < num; ++i) {
+      if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) {
+        return devices[i];
+      }
+    }
+    return NULL;
+  }
+};
+
+// stat counters
+// Counter indices bracketed by l_msgr_rdma_dispatcher_first and
+// l_msgr_rdma_dispatcher_last. RDMAConnectedSocketImpl bumps several of
+// them through dispatcher->perf_logger.
+enum {
+  l_msgr_rdma_dispatcher_first = 94000,
+
+  l_msgr_rdma_polling,
+  l_msgr_rdma_inflight_tx_chunks,
+  l_msgr_rdma_rx_bufs_in_use,
+  l_msgr_rdma_rx_bufs_total,
+
+  // send-side work completions
+  l_msgr_rdma_tx_total_wc,
+  l_msgr_rdma_tx_total_wc_errors,
+  l_msgr_rdma_tx_wc_retry_errors,
+  l_msgr_rdma_tx_wc_wr_flush_errors,
+
+  // receive-side work completions
+  l_msgr_rdma_rx_total_wc,
+  l_msgr_rdma_rx_total_wc_errors,
+  l_msgr_rdma_rx_fin,   // incremented when a zero-length completion is read
+
+  l_msgr_rdma_handshake_errors,
+
+  l_msgr_rdma_total_async_events,
+  l_msgr_rdma_async_last_wqe_events,
+
+  l_msgr_rdma_created_queue_pair,
+  l_msgr_rdma_active_queue_pair,
+
+  l_msgr_rdma_dispatcher_last,
+};
+
+// Counter indices bracketed by l_msgr_rdma_first and l_msgr_rdma_last.
+// RDMAConnectedSocketImpl bumps several of them through
+// worker->perf_logger.
+enum {
+  l_msgr_rdma_first = 95000,
+
+  l_msgr_rdma_tx_no_mem,
+  l_msgr_rdma_tx_parital_mem,   // sic: identifier is spelled "parital"
+  l_msgr_rdma_tx_failed,
+
+  l_msgr_rdma_tx_chunks,
+  l_msgr_rdma_tx_bytes,
+  l_msgr_rdma_rx_chunks,
+  l_msgr_rdma_rx_bytes,
+  l_msgr_rdma_pending_sent_conns,
+
+  l_msgr_rdma_last,
+};
+
+class RDMADispatcher;
+
+class Infiniband {
+ public:
+  // Holds an ibv_pd handle for a Device. Constructor and destructor are
+  // defined elsewhere; presumably they allocate and release the domain --
+  // TODO confirm.
+  class ProtectionDomain {
+   public:
+    explicit ProtectionDomain(CephContext *cct, Device *device);
+    ~ProtectionDomain();
+
+    ibv_pd* const pd;   // the pointer itself cannot be reseated
+  };
+
+
+  class MemoryManager {
+   public:
+    // Bookkeeping header for one buffer of registered memory.
+    // The member functions are defined elsewhere; the notes on them below
+    // are inferred from their names and should be confirmed there.
+    class Chunk {
+     public:
+      Chunk(ibv_mr* m, uint32_t len, char* b);
+      ~Chunk();
+
+      void set_offset(uint32_t o);
+      uint32_t get_offset();
+      void set_bound(uint32_t b);
+      // Called by the receive path with the byte count of a completion.
+      void prepare_read(uint32_t b);
+      uint32_t get_bound();
+      // presumably copies up to len bytes out of / into the chunk and
+      // returns the number of bytes moved -- TODO confirm
+      uint32_t read(char* buf, uint32_t len);
+      uint32_t write(char* buf, uint32_t len);
+      bool full();
+      bool over();
+      void clear();
+
+     public:
+      ibv_mr* mr;
+      uint32_t lkey = 0;
+      uint32_t bytes;
+      uint32_t bound = 0;
+      uint32_t offset;
+      char* buffer; // TODO: remove buffer/refactor TX
+      char  data[0];   // zero-length trailing array; must stay the last member
+    };
+
+    // A set of equally sized buffers living in the address range
+    // [base, end), with a parallel array of Chunk headers at chunk_base.
+    class Cluster {
+     public:
+      Cluster(MemoryManager& m, uint32_t s);
+      ~Cluster();
+
+      int fill(uint32_t num);
+      void take_back(std::vector<Chunk*> &ck);
+      int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
+      // Map an address inside the buffer range to the Chunk header of the
+      // buffer containing it. No range check is done here; callers can use
+      // is_my_buffer() first.
+      Chunk *get_chunk_by_buffer(const char *c) {
+        uint32_t idx = (c - base) / buffer_size;
+        Chunk *chunk = chunk_base + idx;
+        return chunk;
+      }
+      // True when c lies in the half-open range [base, end).
+      bool is_my_buffer(const char *c) const {
+        return c >= base && c < end;
+      }
+
+      MemoryManager& manager;
+      uint32_t buffer_size;
+      uint32_t num_chunk = 0;
+      Mutex lock;
+      std::vector<Chunk*> free_chunks;
+      char *base = nullptr;
+      char *end = nullptr;
+      Chunk* chunk_base = nullptr;
+    };
+
+    // Per-pool bookkeeping: the owning MemoryManager, the number of buffers
+    // allocated so far and an optional perf counter sink.
+    class MemPoolContext {
+      PerfCounters *perf_logger;   // nullptr until set_stat_logger() is called
+
+     public:
+      MemoryManager *manager;
+      unsigned n_bufs_allocated;
+      explicit MemPoolContext(MemoryManager *m) :
+        perf_logger(nullptr),
+        manager(m),
+        n_bufs_allocated(0) {}
+      // true if it is possible to alloc
+      // more memory for the pool
+      bool can_alloc(unsigned nbufs);
+      void update_stats(int val);
+      void set_stat_logger(PerfCounters *logger);
+    };
+
+    // Allocator type handed to boost::pool by mem_pool. It provides the
+    // static malloc/free pair and the size typedefs; the implementations
+    // live elsewhere.
+    class PoolAllocator {
+      // Header of one allocation, followed by its chunks.
+      struct mem_info {
+        ibv_mr   *mr;
+        MemPoolContext *ctx;
+        unsigned nbufs;
+        Chunk    chunks[0];   // zero-length trailing array
+      };
+     public:
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+
+      static char * malloc(const size_type bytes);
+      static void free(char * const block);
+
+      // Shared by all users of the allocator.
+      // NOTE(review): presumably g_ctx is set under `lock` around each
+      // allocation to pass the active pool context in -- confirm.
+      static MemPoolContext  *g_ctx;
+      static Mutex lock;
+    };
+
+    /**
+     * Modified boost pool that makes it possible to have a thread-safe
+     * 'context' when allocating/freeing memory. This is needed to allow
+     * different pool configurations and bookkeeping per CephContext, and
+     * to be able to use the same allocator for both the RX and TX pools.
+     * TODO: use boost pool to allocate TX chunks too
+     */
+    class mem_pool : public boost::pool<PoolAllocator> {
+     private:
+      MemPoolContext *ctx;
+      // Out-of-line path taken when the free store is empty.
+      void *slow_malloc();
+
+     public:
+      // The three size arguments are forwarded unchanged to boost::pool.
+      explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size,
+          const size_type nnext_size = 32,
+          const size_type nmax_size = 0) :
+        pool(nrequested_size, nnext_size, nmax_size),
+        ctx(ctx) { }
+
+      // Take a block from the free store when one is available, otherwise
+      // fall back to slow_malloc().
+      void *malloc() {
+        if (!store().empty())
+          return (store().malloc)();
+        // need to alloc more memory...
+        // slow path code
+        return slow_malloc();
+      }
+    };
+
+    MemoryManager(CephContext *c, Device *d, ProtectionDomain *p);
+    ~MemoryManager();
+
+    void* malloc(size_t size);
+    void  free(void *ptr);
+
+    void create_tx_pool(uint32_t size, uint32_t tx_num);
+    void return_tx(std::vector<Chunk*> &chunks);
+    int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
+    bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
+    Chunk *get_tx_chunk_by_buffer(const char *c) {
+      return send->get_chunk_by_buffer(c);
+    }
+    uint32_t get_tx_buffer_size() const {
+      return send->buffer_size;
+    }
+
+    Chunk *get_rx_buffer() {
+       return reinterpret_cast<Chunk *>(rxbuf_pool.malloc());
+    }
+
+    void release_rx_buffer(Chunk *chunk) {
+      rxbuf_pool.free(chunk);
+    }
+
+    void set_rx_stat_logger(PerfCounters *logger) {
+      rxbuf_pool_ctx.set_stat_logger(logger);
+    }
+
+    CephContext  *cct;
+   private:
+    // TODO: Cluster -> TxPool txbuf_pool
+    // chunk layout fix
+    //  
+    Cluster* send = nullptr;// SEND
+    Device *device;
+    ProtectionDomain *pd;
+    MemPoolContext rxbuf_pool_ctx;
+    mem_pool     rxbuf_pool;
+
+
+    void* huge_pages_malloc(size_t size);
+    void  huge_pages_free(void *ptr);
+  };
+
+ private:
+  uint32_t tx_queue_len = 0;
+  uint32_t rx_queue_len = 0;
+  uint32_t max_sge = 0;
+  uint8_t  ib_physical_port = 0;
+  MemoryManager* memory_manager = nullptr;
+  ibv_srq* srq = nullptr;             // shared receive work queue
+  Device *device = NULL;
+  ProtectionDomain *pd = NULL;
+  DeviceList *device_list = nullptr;
+  void wire_gid_to_gid(const char *wgid, union ibv_gid *gid);
+  void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]);
+  CephContext *cct;
+  Mutex lock;
+  bool initialized = false;
+  const std::string &device_name;
+  uint8_t port_num;
+  bool support_srq = false;
+
+ public:
+  explicit Infiniband(CephContext *c);
+  ~Infiniband();
+  void init();
+  static void verify_prereq(CephContext *cct);
+
+  // Wraps an ibv_comp_channel and the completion queue bound to it.
+  // init(), get_cq_event() and ack_events() are defined elsewhere.
+  class CompletionChannel {
+    // NOTE(review): presumably the number of unacknowledged CQ events at
+    // which they are acknowledged in one batch -- confirm in get_cq_event().
+    static const uint32_t MAX_ACK_EVENT = 5000;
+    CephContext *cct;
+    Infiniband& infiniband;
+    ibv_comp_channel *channel;
+    ibv_cq *cq;
+    uint32_t cq_events_that_need_ack;
+
+   public:
+    CompletionChannel(CephContext *c, Infiniband &ib);
+    ~CompletionChannel();
+    int init();
+    bool get_cq_event();
+    // File descriptor of the underlying channel.
+    int get_fd() { return channel->fd; }
+    ibv_comp_channel* get_channel() { return channel; }
+    // Remember which completion queue this channel serves.
+    void bind_cq(ibv_cq *c) { cq = c; }
+    void ack_events();
+  };
+
+  // This class encapsulates the creation, use, and destruction of an RC
+  // completion queue.
+  //
+  // The constructor only stores its arguments and leaves cq NULL. You need
+  // to call init(), which creates the cq and associates it with the
+  // completion channel.
+  class CompletionQueue {
+   public:
+    CompletionQueue(CephContext *c, Infiniband &ib,
+                    const uint32_t qd, CompletionChannel *cc)
+      : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
+    ~CompletionQueue();
+    int init();
+    int poll_cq(int num_entries, ibv_wc *ret_wc_array);
+
+    ibv_cq* get_cq() const { return cq; }
+    int rearm_notify(bool solicited_only=true);
+    CompletionChannel* get_cc() const { return channel; }
+   private:
+    CephContext *cct;
+    Infiniband&  infiniband;     // Infiniband to which this CQ belongs
+    CompletionChannel *channel;
+    ibv_cq *cq;
+    uint32_t queue_depth;
+  };
+
+  // this class encapsulates the creation, use, and destruction of an RC
+  // queue pair.
+  //
+  // You need to call init(), which creates a qp and brings it to the INIT state.
+  // after obtaining the lid, qpn, and psn of a remote queue pair, one
+  // must call plumb() to bring the queue pair to the RTS state.
+  class QueuePair {
+   public:
+    QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+              int ib_physical_port,  ibv_srq *srq,
+              Infiniband::CompletionQueue* txcq,
+              Infiniband::CompletionQueue* rxcq,
+              uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0);
+    ~QueuePair();
+
+    int init();
+
+    /**
+     * Get the initial packet sequence number for this QueuePair.
+     * This is randomly generated on creation. It should not be confused
+     * with the remote side's PSN, which is set in #plumb(). 
+     */
+    uint32_t get_initial_psn() const { return initial_psn; };
+    /**
+     * Get the local queue pair number for this QueuePair.
+     * QPNs are analogous to UDP/TCP port numbers.
+     */
+    uint32_t get_local_qp_number() const { return qp->qp_num; };
+    /**
+     * Get the remote queue pair number for this QueuePair, as set in #plumb().
+     * QPNs are analogous to UDP/TCP port numbers.
+     */
+    int get_remote_qp_number(uint32_t *rqp) const;
+    /**
+     * Get the remote infiniband address for this QueuePair, as set in #plumb().
+     * LIDs are "local IDs" in infiniband terminology. They are short, locally
+     * routable addresses.
+     */
+    int get_remote_lid(uint16_t *lid) const;
+    /**
+     * Get the state of a QueuePair.
+     */
+    int get_state() const;
+    /**
+     * Return true if the queue pair is in an error state, false otherwise.
+     */
+    bool is_error() const;
+    void add_tx_wr(uint32_t amt) { tx_wr_inflight += amt; }
+    void dec_tx_wr(uint32_t amt) { tx_wr_inflight -= amt; }
+    uint32_t get_tx_wr() const { return tx_wr_inflight; }
+    ibv_qp* get_qp() const { return qp; }
+    Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
+    Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
+    int to_dead();
+    bool is_dead() const { return dead; }
+
+   private:
+    CephContext  *cct;
+    Infiniband&  infiniband;     // Infiniband to which this QP belongs
+    ibv_qp_type  type;           // QP type (IBV_QPT_RC, etc.)
+    ibv_context* ctxt;           // device context of the HCA to use
+    int ib_physical_port;
+    ibv_pd*      pd;             // protection domain
+    ibv_srq*     srq;            // shared receive queue
+    ibv_qp*      qp;             // infiniband verbs QP handle
+    struct rdma_cm_id *cm_id;
+    Infiniband::CompletionQueue* txcq;
+    Infiniband::CompletionQueue* rxcq;
+    uint32_t     initial_psn;    // initial packet sequence number
+    uint32_t     max_send_wr;
+    uint32_t     max_recv_wr;
+    uint32_t     q_key;
+    bool dead;
+    std::atomic<uint32_t> tx_wr_inflight = {0}; // counter for inflight Tx WQEs
+  };
+
+ public:
+  typedef MemoryManager::Cluster Cluster;
+  typedef MemoryManager::Chunk Chunk;
+  QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*,
+      ibv_qp_type type, struct rdma_cm_id *cm_id);
+  ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
+  // post rx buffers to srq, return number of buffers actually posted
+  int post_chunks_to_rq(int num, ibv_qp *qp=NULL);
+  void post_chunk_to_pool(Chunk* chunk) {
+    get_memory_manager()->release_rx_buffer(chunk);
+  }
+  int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
+  CompletionChannel *create_comp_channel(CephContext *c);
+  CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);
+  uint8_t get_ib_physical_port() { return ib_physical_port; }
+  int send_msg(CephContext *cct, int sd, IBSYNMsg& msg);
+  int recv_msg(CephContext *cct, int sd, IBSYNMsg& msg);
+  uint16_t get_lid() { return device->get_lid(); }
+  ibv_gid get_gid() { return device->get_gid(); }
+  MemoryManager* get_memory_manager() { return memory_manager; }
+  Device* get_device() { return device; }
+  int get_async_fd() { return device->ctxt->async_fd; }
+  bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);}
+  Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
+  static const char* wc_status_to_string(int status);
+  static const char* qp_state_string(int status);
+  uint32_t get_rx_queue_len() const { return rx_queue_len; }
+};
+
+#endif
diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
new file mode 100644
index 00000000..89be7428
--- /dev/null
+++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
@@ -0,0 +1,743 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "RDMAStack.h"
+
+// Event callback that forwards to
+// RDMAConnectedSocketImpl::handle_connection_established() until close()
+// has been called on it.
+class C_handle_connection_established : public EventCallback {
+  RDMAConnectedSocketImpl *sock;
+  bool enabled = true;   // cleared by close(); later events are dropped
+ public:
+  C_handle_connection_established(RDMAConnectedSocketImpl *w) : sock(w) {}
+  void do_request(uint64_t fd) final {
+    if (!enabled)
+      return;
+    sock->handle_connection_established();
+  }
+  void close() { enabled = false; }
+};
+
+// Event callback that forwards to
+// RDMAConnectedSocketImpl::handle_connection() until close() has been
+// called on it.
+class C_handle_connection_read : public EventCallback {
+  RDMAConnectedSocketImpl *sock;
+  bool enabled = true;   // cleared by close(); later events are dropped
+ public:
+  explicit C_handle_connection_read(RDMAConnectedSocketImpl *w): sock(w) {}
+  void do_request(uint64_t fd) final {
+    if (!enabled)
+      return;
+    sock->handle_connection();
+  }
+  void close() { enabled = false; }
+};
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAConnectedSocketImpl "
+
+// Build the socket object. Unless ms_async_rdma_cm is enabled, this also
+// creates the queue pair, fills in the local handshake message, opens the
+// notification eventfd and registers the queue pair with the dispatcher.
+RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+						 RDMAWorker *w)
+  : cct(cct), connected(0), error(0), infiniband(ib),
+    dispatcher(s), worker(w), lock("RDMAConnectedSocketImpl::lock"),
+    is_server(false), read_handler(new C_handle_connection_read(this)),
+    established_handler(new C_handle_connection_established(this)),
+    active(false), pending(false)
+{
+  // With the RDMA connection manager in use nothing more is set up here.
+  if (cct->_conf->ms_async_rdma_cm)
+    return;
+
+  qp = infiniband->create_queue_pair(cct, s->get_tx_cq(), s->get_rx_cq(), IBV_QPT_RC, NULL);
+
+  // What we advertise to the peer during the handshake.
+  my_msg.lid = infiniband->get_lid();
+  my_msg.gid = infiniband->get_gid();
+  my_msg.qpn = qp->get_local_qp_number();
+  my_msg.psn = qp->get_initial_psn();
+  my_msg.peer_qpn = 0;
+
+  notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+  dispatcher->register_qp(qp, this);
+  dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+  dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+}
+
+// Detach from worker and dispatcher, return every receive chunk still held
+// (unread completions as well as partially read buffers) to the pool, then
+// close both file descriptors under the lock.
+RDMAConnectedSocketImpl::~RDMAConnectedSocketImpl()
+{
+  ldout(cct, 20) << __func__ << " destruct." << dendl;
+  cleanup();
+  worker->remove_pending_conn(this);
+  dispatcher->erase_qpn(my_msg.qpn);
+
+  // wr_id of a receive completion carries the Chunk pointer.
+  for (auto &completion : wc)
+    dispatcher->post_chunk_to_pool(reinterpret_cast<Chunk*>(completion.wr_id));
+  for (auto *chunk : buffers)
+    dispatcher->post_chunk_to_pool(chunk);
+
+  Mutex::Locker l(lock);
+  if (notify_fd >= 0)
+    ::close(notify_fd);
+  if (tcp_fd >= 0)
+    ::close(tcp_fd);
+  error = ECONNRESET;
+}
+
+// Queue a batch of work completions for this socket and wake up the
+// reader. The batch is moved in when nothing is pending, otherwise it is
+// appended to what is already queued.
+void RDMAConnectedSocketImpl::pass_wc(std::vector<ibv_wc> &&v)
+{
+  Mutex::Locker l(lock);
+  if (!wc.empty()) {
+    wc.insert(wc.end(), v.begin(), v.end());
+  } else {
+    wc = std::move(v);
+  }
+  notify();
+}
+
+// Hand all pending work completions over to w by swapping the vectors.
+// When nothing is pending, w is left untouched.
+void RDMAConnectedSocketImpl::get_wc(std::vector<ibv_wc> &w)
+{
+  Mutex::Locker l(lock);
+  if (!wc.empty())
+    w.swap(wc);
+}
+
+/**
+ * Bring the queue pair from its current state to RTR and then to RTS,
+ * using the peer parameters received in peer_msg.
+ *
+ * On the client side (is_server == false) the socket is additionally
+ * marked connected and any data queued so far is submitted.
+ *
+ * \return 0 on success, -1 if either state transition fails.
+ */
+int RDMAConnectedSocketImpl::activate()
+{
+  ibv_qp_attr qpa;
+  int r;
+
+  // now connect up the qps and switch to RTR
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_RTR;
+  qpa.path_mtu = IBV_MTU_1024;
+  qpa.dest_qp_num = peer_msg.qpn;
+  qpa.rq_psn = peer_msg.psn;
+  qpa.max_dest_rd_atomic = 1;
+  qpa.min_rnr_timer = 12;
+  //qpa.ah_attr.is_global = 0;
+  qpa.ah_attr.is_global = 1;
+  qpa.ah_attr.grh.hop_limit = 6;
+  qpa.ah_attr.grh.dgid = peer_msg.gid;
+
+  qpa.ah_attr.grh.sgid_index = infiniband->get_device()->get_gid_idx();
+
+  qpa.ah_attr.dlid = peer_msg.lid;
+  qpa.ah_attr.sl = cct->_conf->ms_async_rdma_sl;
+  qpa.ah_attr.grh.traffic_class = cct->_conf->ms_async_rdma_dscp;
+  qpa.ah_attr.src_path_bits = 0;
+  qpa.ah_attr.port_num = (uint8_t)(infiniband->get_ib_physical_port());
+
+  ldout(cct, 20) << __func__ << " Choosing gid_index " << (int)qpa.ah_attr.grh.sgid_index << ", sl " << (int)qpa.ah_attr.sl << dendl;
+
+  r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE |
+      IBV_QP_AV |
+      IBV_QP_PATH_MTU |
+      IBV_QP_DEST_QPN |
+      IBV_QP_RQ_PSN |
+      IBV_QP_MIN_RNR_TIMER |
+      IBV_QP_MAX_DEST_RD_ATOMIC);
+  if (r) {
+    // ibv_modify_qp() reports the failure reason through its return value,
+    // so log that rather than whatever errno currently holds.
+    lderr(cct) << __func__ << " failed to transition to RTR state: "
+               << cpp_strerror(r) << dendl;
+    return -1;
+  }
+
+  ldout(cct, 20) << __func__ << " transition to RTR state successfully." << dendl;
+
+  // now move to RTS
+  qpa.qp_state = IBV_QPS_RTS;
+
+  // How long to wait before retrying if packet lost or server dead.
+  // Supposedly the timeout is 4.096us*2^timeout.  However, the actual
+  // timeout appears to be 4.096us*2^(timeout+1), so the setting
+  // below creates a 135ms timeout.
+  qpa.timeout = 14;
+
+  // How many times to retry after timeouts before giving up.
+  qpa.retry_cnt = 7;
+
+  // How many times to retry after RNR (receiver not ready) condition
+  // before giving up. Occurs when the remote side has not yet posted
+  // a receive request.
+  qpa.rnr_retry = 7; // 7 is infinite retry.
+  qpa.sq_psn = my_msg.psn;
+  qpa.max_rd_atomic = 1;
+
+  r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE |
+      IBV_QP_TIMEOUT |
+      IBV_QP_RETRY_CNT |
+      IBV_QP_RNR_RETRY |
+      IBV_QP_SQ_PSN |
+      IBV_QP_MAX_QP_RD_ATOMIC);
+  if (r) {
+    // See above: the return value is the error code.
+    lderr(cct) << __func__ << " failed to transition to RTS state: "
+               << cpp_strerror(r) << dendl;
+    return -1;
+  }
+
+  // the queue pair should be ready to use once the client has finished
+  // setting up their end.
+  ldout(cct, 20) << __func__ << " transition to RTS state successfully." << dendl;
+  ldout(cct, 20) << __func__ << " QueuePair: " << qp << " with qp:" << qp->get_qp() << dendl;
+
+  if (!is_server) {
+    connected = 1; //indicate successfully
+    ldout(cct, 20) << __func__ << " handle fake send, wake it up. QP: " << my_msg.qpn << dendl;
+    // flush whatever send() queued while we were not yet connected
+    submit(false);
+  }
+  active = true;
+
+  return 0;
+}
+
+/**
+ * Open the TCP side channel to peer_addr that carries the handshake
+ * messages.
+ *
+ * In non-blocking mode the established_handler is registered on the socket
+ * and the handshake continues from the event loop; otherwise the handshake
+ * message is sent right away.
+ *
+ * \return 0 on success, a negative value on failure.
+ */
+int RDMAConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+  ldout(cct, 20) << __func__ << " nonblock:" << opts.nonblock << ", nodelay:"
+                 << opts.nodelay << ", rbuf_size: " << opts.rcbuf_size << dendl;
+  NetHandler net(cct);
+
+  // we construct a socket to transport ib sync message
+  // but we shouldn't block in tcp connecting
+  if (opts.nonblock) {
+    tcp_fd = net.nonblock_connect(peer_addr, opts.connect_bind_addr);
+  } else {
+    tcp_fd = net.connect(peer_addr, opts.connect_bind_addr);
+  }
+
+  if (tcp_fd < 0) {
+    return -errno;
+  }
+
+  int r = net.set_socket_options(tcp_fd, opts.nodelay, opts.rcbuf_size);
+  if (r < 0) {
+    // Save errno first: ::close() may overwrite it.
+    const int err = errno;
+    ::close(tcp_fd);
+    tcp_fd = -1;
+    return -err;
+  }
+
+  ldout(cct, 20) << __func__ << " tcp_fd: " << tcp_fd << dendl;
+  net.set_priority(tcp_fd, opts.priority, peer_addr.get_family());
+  r = 0;
+  if (opts.nonblock) {
+    worker->center.create_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE , established_handler);
+  } else {
+    r = handle_connection_established(false);
+  }
+  return r;
+}
+
+/**
+ * Runs once the TCP side channel is usable: drops the connect-time file
+ * event, sends our handshake message and starts listening for the reply.
+ *
+ * \param need_set_fault  call fault() before returning an error
+ * \return 0 on success, -1 if the socket is already connected, otherwise
+ *         the negative result of send_msg().
+ */
+int RDMAConnectedSocketImpl::handle_connection_established(bool need_set_fault) {
+  ldout(cct, 20) << __func__ << " start " << dendl;
+  // delete read event
+  worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+
+  int rc = -1;
+  if (1 == connected) {
+    ldout(cct, 1) << __func__ << " warnning: logic failed " << dendl;
+  } else {
+    // send handshake msg to server
+    my_msg.peer_qpn = 0;
+    rc = infiniband->send_msg(cct, tcp_fd, my_msg);
+    if (rc >= 0) {
+      worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+      ldout(cct, 20) << __func__ << " finish " << dendl;
+      return 0;
+    }
+    ldout(cct, 1) << __func__ << " send handshake msg failed." << rc << dendl;
+  }
+
+  // shared failure exit
+  if (need_set_fault) {
+    fault();
+  }
+  return rc;
+}
+
+/**
+ * Read one handshake message from the TCP side channel and advance the
+ * handshake.
+ *
+ *  - client: the message is the server's reply; activate the queue pair and
+ *    send our acknowledgement.
+ *  - server, peer_qpn == 0: first message from the client; activate the
+ *    queue pair and reply.
+ *  - server, peer_qpn != 0: the client's acknowledgement; mark the socket
+ *    connected and flush pending data.
+ */
+void RDMAConnectedSocketImpl::handle_connection() {
+  ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " tcp_fd: " << tcp_fd << " notify_fd: " << notify_fd << dendl;
+  int r = infiniband->recv_msg(cct, tcp_fd, peer_msg);
+  if (r <= 0) {
+    if (r == -EAGAIN)
+      return;
+    dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+    ldout(cct, 1) << __func__ << " recv handshake msg failed." << dendl;
+    fault();
+    return;
+  }
+
+  if (1 == connected) {
+    ldout(cct, 1) << __func__ << " warnning: logic failed: read len: " << r << dendl;
+    fault();
+    return;
+  }
+
+  if (is_server) {
+    if (peer_msg.peer_qpn != 0) {
+      // ack from client
+      connected = 1;
+      ldout(cct, 10) << __func__ << " handshake of rdma is done. server connected: " << connected << dendl;
+      //cleanup();
+      submit(false);
+      notify();
+      return;
+    }
+
+    // syn from client
+    if (active) {
+      ldout(cct, 10) << __func__ << " server is already active." << dendl;
+      return ;
+    }
+    r = activate();
+    ceph_assert(!r);
+    r = infiniband->send_msg(cct, tcp_fd, my_msg);
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " server ack failed." << dendl;
+      dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+      fault();
+    }
+    return;
+  }
+
+  // client side: syn + ack from server
+  my_msg.peer_qpn = peer_msg.qpn;
+  ldout(cct, 20) << __func__ << " peer msg :  < " << peer_msg.qpn << ", " << peer_msg.psn
+                 <<  ", " << peer_msg.lid << ", " << peer_msg.peer_qpn << "> " << dendl;
+  if (!connected) {
+    r = activate();
+    ceph_assert(!r);
+  }
+  notify();
+  r = infiniband->send_msg(cct, tcp_fd, my_msg);
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " send client ack failed." << dendl;
+    dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+    fault();
+  }
+}
+
+/**
+ * Copy up to len received bytes into buf.
+ *
+ * Data left over from earlier calls (held in `buffers`) is consumed first,
+ * then newly arrived work completions are drained. Chunks that do not fit
+ * into buf are kept in `buffers` for the next call.
+ *
+ * \return the number of bytes copied; -EAGAIN when nothing is available or
+ *         the socket is not yet active/connected; -error when no data was
+ *         copied and an error is recorded.
+ */
+ssize_t RDMAConnectedSocketImpl::read(char* buf, size_t len)
+{
+  // Drain the eventfd counter; the result is only used for logging.
+  uint64_t i = 0;
+  int r = ::read(notify_fd, &i, sizeof(i));
+  ldout(cct, 20) << __func__ << " notify_fd : " << i << " in " << my_msg.qpn << " r = " << r << dendl;
+  
+  if (!active) {
+    ldout(cct, 1) << __func__ << " when ib not active. len: " << len << dendl;
+    return -EAGAIN;
+  }
+  
+  if (0 == connected) {
+    ldout(cct, 1) << __func__ << " when ib not connected. len: " << len <<dendl;
+    return -EAGAIN;
+  }
+  // From here on `read` names the running byte count, not ::read().
+  ssize_t read = 0;
+  if (!buffers.empty())
+    read = read_buffers(buf,len);
+
+  std::vector<ibv_wc> cqe;
+  get_wc(cqe);
+  if (cqe.empty()) {
+    // Leftover data remains: signal again so the caller comes back for it.
+    if (!buffers.empty()) {
+      notify();
+    }
+    if (read > 0) {
+      return read;
+    }
+    if (error) {
+      return -error;
+    } else {
+      return -EAGAIN;
+    }
+  }
+
+  ldout(cct, 20) << __func__ << " poll queue got " << cqe.size() << " responses. QP: " << my_msg.qpn << dendl;
+  for (size_t i = 0; i < cqe.size(); ++i) {
+    ibv_wc* response = &cqe[i];
+    ceph_assert(response->status == IBV_WC_SUCCESS);
+    // wr_id carries the Chunk pointer of the receive buffer.
+    Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+    ldout(cct, 25) << __func__ << " chunk length: " << response->byte_len << " bytes." << chunk << dendl;
+    chunk->prepare_read(response->byte_len);
+    worker->perf_logger->inc(l_msgr_rdma_rx_bytes, response->byte_len);
+    if (response->byte_len == 0) {
+      // A zero-length completion is treated as the peer's close message.
+      dispatcher->perf_logger->inc(l_msgr_rdma_rx_fin);
+      if (connected) {
+        error = ECONNRESET;
+        ldout(cct, 20) << __func__ << " got remote close msg..." << dendl;
+      }
+      dispatcher->post_chunk_to_pool(chunk);
+    } else {
+      if (read == (ssize_t)len) {
+        // buf is already full: keep the whole chunk for a later call.
+        buffers.push_back(chunk);
+        ldout(cct, 25) << __func__ << " buffers add a chunk: " << response->byte_len << dendl;
+      } else if (read + response->byte_len > (ssize_t)len) {
+        // Only part of the chunk fits: copy that part, keep the remainder.
+        read += chunk->read(buf+read, (ssize_t)len-read);
+        buffers.push_back(chunk);
+        ldout(cct, 25) << __func__ << " buffers add a chunk: " << chunk->get_offset() << ":" << chunk->get_bound() << dendl;
+      } else {
+        // The whole chunk fits: copy it and give the chunk back to the pool.
+        read += chunk->read(buf+read, response->byte_len);
+        dispatcher->post_chunk_to_pool(chunk);
+        update_post_backlog();
+      }
+    }
+  }
+
+  worker->perf_logger->inc(l_msgr_rdma_rx_chunks, cqe.size());
+  if (is_server && connected == 0) {
+    ldout(cct, 20) << __func__ << " we do not need last handshake, QP: " << my_msg.qpn << " peer QP: " << peer_msg.qpn << dendl;
+    connected = 1; //if so, we don't need the last handshake
+    cleanup();
+    submit(false);
+  }
+
+  // Leftover data remains: signal again so the caller comes back for it.
+  if (!buffers.empty()) {
+    notify();
+  }
+
+  if (read == 0 && error)
+    return -error;
+  return read == 0 ? -EAGAIN : read;
+}
+
+/**
+ * Copy data out of the chunks kept in `buffers` into buf, oldest first,
+ * until len bytes were copied or the chunks run out. Fully consumed chunks
+ * are returned to the pool and removed from `buffers`.
+ *
+ * \return the number of bytes copied.
+ */
+ssize_t RDMAConnectedSocketImpl::read_buffers(char* buf, size_t len)
+{
+  size_t copied = 0;
+  auto it = buffers.begin();
+  while (it != buffers.end()) {
+    const size_t n = (*it)->read(buf + copied, len - copied);
+    copied += n;
+    ldout(cct, 25) << __func__ << " this iter read: " << n << " bytes." << " offset: " << (*it)->get_offset() << " ,bound: " << (*it)->get_bound()  << ". Chunk:" << *it  << dendl;
+    if ((*it)->over()) {
+      dispatcher->post_chunk_to_pool(*it);
+      update_post_backlog();
+      ldout(cct, 25) << __func__ << " one chunk over." << dendl;
+    }
+    if (copied == len) {
+      // leave `it` on the chunk that satisfied the request
+      break;
+    }
+    ++it;
+  }
+
+  // If the chunk we stopped on was consumed completely, drop it as well.
+  if (it != buffers.end() && (*it)->over())
+    ++it;
+  buffers.erase(buffers.begin(), it);
+  ldout(cct, 25) << __func__ << " got " << copied  << " bytes, buffers size: " << buffers.size() << dendl;
+  return copied;
+}
+
+/**
+ * Take the next received chunk without copying its payload.
+ *
+ * The oldest chunk in `buffers` is preferred; otherwise the first newly
+ * arrived completion is used. All other new completions are queued in
+ * `buffers`.
+ *
+ * NOTE: `data` is not filled in yet and the selected chunk is not released
+ * (see the FIXMEs below).
+ *
+ * \return the bound of the selected chunk, -EAGAIN when that is 0 or no
+ *         chunk is available, or -error when an error is recorded.
+ */
+ssize_t RDMAConnectedSocketImpl::zero_copy_read(bufferptr &data)
+{
+  if (error)
+    return -error;
+  ssize_t size = 0;
+  bool loaded = false;
+
+  auto iter = buffers.begin();
+  if (iter != buffers.end()) {
+    Chunk *chunk = *iter;
+    // FIXME need to handle release
+    // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband);
+    buffers.erase(iter);
+    loaded = true;
+    size = chunk->bound;
+  }
+
+  std::vector<ibv_wc> cqe;
+  get_wc(cqe);
+  if (cqe.empty())
+    return size == 0 ? -EAGAIN : size;
+
+  ldout(cct, 20) << __func__ << " pool completion queue got " << cqe.size() << " responses."<< dendl;
+
+  for (size_t i = 0; i < cqe.size(); ++i) {
+    // Read the completions just fetched into cqe, not an unrelated array.
+    ibv_wc *response = &cqe[i];
+    Chunk *chunk = reinterpret_cast<Chunk*>(response->wr_id);
+    chunk->prepare_read(response->byte_len);
+    if (!loaded && i == 0) {
+      // FIXME need to handle release
+      // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband);
+      size = chunk->bound;
+      continue;
+    }
+    buffers.push_back(chunk);
+  }
+
+  if (size == 0)
+    return -EAGAIN;
+  return size;
+}
+
+/**
+ * Queue the contents of bl for transmission.
+ *
+ * The data is always moved into pending_bl. While the socket is not yet
+ * connected the call still reports success for the full length ("fake
+ * send"); once connected, submit() is invoked to push the data out.
+ *
+ * \return the number of bytes accepted, 0 for an empty bl, -EPIPE or
+ *         -error when an error is recorded, or the negative result of
+ *         submit() if it is anything other than -EAGAIN.
+ */
+ssize_t RDMAConnectedSocketImpl::send(bufferlist &bl, bool more)
+{
+  if (error)
+    return active ? -error : -EPIPE;
+
+  const size_t bytes = bl.length();
+  if (bytes == 0)
+    return 0;
+
+  {
+    Mutex::Locker l(lock);
+    pending_bl.claim_append(bl);
+    if (!connected) {
+      ldout(cct, 20) << __func__ << " fake send to upper, QP: " << my_msg.qpn << dendl;
+      return bytes;
+    }
+  }
+
+  ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << dendl;
+  const ssize_t rc = submit(more);
+  const bool hard_failure = rc < 0 && rc != -EAGAIN;
+  if (hard_failure)
+    return rc;
+  return bytes;
+}
+
+// Turn as much of pending_bl as possible into tx chunks and post them.
+//
+// Segments of pending_bl that already live in a registered tx buffer are sent
+// through their own chunk; runs of other segments are copied into chunks
+// obtained from the worker. The part that was queued is removed from
+// pending_bl before the work requests are posted.
+//
+// Returns 0 when pending_bl was fully posted (or was empty), -EAGAIN when
+// nothing could be queued or data is still pending, -error when the socket is
+// in an error state, or the negative result of post_work_request().
+ssize_t RDMAConnectedSocketImpl::submit(bool more)
+{
+  if (error)
+    return -error;
+  Mutex::Locker l(lock);
+  size_t bytes = pending_bl.length();
+  ldout(cct, 20) << __func__ << " we need " << bytes << " bytes. iov size: "
+                 << pending_bl.buffers().size() << dendl;
+  if (!bytes)
+    return 0;
+
+  // Copy the segments in [start, end) into freshly reserved tx chunks that are
+  // appended to tx_buffers. `start` is advanced past every fully copied
+  // segment. Returns the number of bytes copied; 0 means no memory was
+  // available, and a value below `bytes` means the chunks ran out midway.
+  auto fill_tx_via_copy = [this](std::vector<Chunk*> &tx_buffers,
+                                 unsigned bytes,
+                                 auto& start,
+                                 const auto& end) -> unsigned {
+    ceph_assert(start != end);
+    // Index of the first chunk this call adds.
+    auto chunk_idx = tx_buffers.size();
+    int ret = worker->get_reged_mem(this, tx_buffers, bytes);
+    if (ret == 0) {
+      ldout(cct, 1) << __func__ << " no enough buffers in worker " << worker << dendl;
+      worker->perf_logger->inc(l_msgr_rdma_tx_no_mem);
+      return 0;
+    }
+
+    unsigned total_copied = 0;
+    Chunk *current_chunk = tx_buffers[chunk_idx];
+    while (start != end) {
+      const uintptr_t addr = reinterpret_cast<uintptr_t>(start->c_str());
+      unsigned copied = 0;
+      while (copied < start->length()) {
+        uint32_t r = current_chunk->write((char*)addr+copied, start->length() - copied);
+        copied += r;
+        total_copied += r;
+        bytes -= r;
+        if (current_chunk->full()){
+          // Out of reserved chunks: report the partial copy to the caller.
+          if (++chunk_idx == tx_buffers.size())
+            return total_copied;
+          current_chunk = tx_buffers[chunk_idx];
+        }
+      }
+      ++start;
+    }
+    ceph_assert(bytes == 0);
+    return total_copied;
+  };
+
+  std::vector<Chunk*> tx_buffers;
+  auto it = std::cbegin(pending_bl.buffers());
+  // copy_it marks the first segment that still has to be copied; it trails
+  // `it` while non-tx segments accumulate in need_reserve_bytes.
+  auto copy_it = it;
+  unsigned total = 0;
+  unsigned need_reserve_bytes = 0;
+  while (it != pending_bl.buffers().end()) {
+    if (infiniband->is_tx_buffer(it->raw_c_str())) {
+      // Flush the pending run of segments that need copying first so the
+      // chunk order matches the data order.
+      if (need_reserve_bytes) {
+        unsigned copied = fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it);
+        total += copied;
+        if (copied < need_reserve_bytes)
+          goto sending;
+        need_reserve_bytes = 0;
+      }
+      ceph_assert(copy_it == it);
+      tx_buffers.push_back(infiniband->get_tx_chunk_by_buffer(it->raw_c_str()));
+      total += it->length();
+      ++copy_it;
+    } else {
+      need_reserve_bytes += it->length();
+    }
+    ++it;
+  }
+  if (need_reserve_bytes)
+    total += fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it);
+
+ sending:
+  if (total == 0)
+    return -EAGAIN;
+  ceph_assert(total <= pending_bl.length());
+  bufferlist swapped;
+  if (total < pending_bl.length()) {
+    // Partial send: keep only the unsent tail in pending_bl.
+    worker->perf_logger->inc(l_msgr_rdma_tx_parital_mem);
+    pending_bl.splice(total, pending_bl.length()-total, &swapped);
+    pending_bl.swap(swapped);
+  } else {
+    pending_bl.clear();
+  }
+
+  ldout(cct, 20) << __func__ << " left bytes: " << pending_bl.length() << " in buffers "
+                 << pending_bl.buffers().size() << " tx chunks " << tx_buffers.size() << dendl;
+
+  int r = post_work_request(tx_buffers);
+  if (r < 0)
+    return r;
+
+  // Note: `bytes` is the pending length sampled on entry, not `total`.
+  ldout(cct, 20) << __func__ << " finished sending " << bytes << " bytes." << dendl;
+  return pending_bl.length() ? -EAGAIN : 0;
+}
+
+// Build one signaled IBV_WR_SEND work request per chunk in tx_buffers, link
+// them into a single chain and post the chain on this connection's QP.
+//
+// tx_buffers must not be empty. Returns 0 on success or a negative errno
+// value when ibv_post_send() fails.
+int RDMAConnectedSocketImpl::post_work_request(std::vector<Chunk*> &tx_buffers)
+{
+  // The arrays below are sized from tx_buffers and element 0 is logged.
+  ceph_assert(!tx_buffers.empty());
+  ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " " << tx_buffers[0] << dendl;
+  vector<Chunk*>::iterator current_buffer = tx_buffers.begin();
+  ibv_sge isge[tx_buffers.size()];
+  uint32_t current_sge = 0;
+  ibv_send_wr iswr[tx_buffers.size()];
+  uint32_t current_swr = 0;
+  ibv_send_wr* pre_wr = NULL;
+  uint32_t num = 0;
+
+  // FIPS zeroization audit 20191115: these memsets are not security related.
+  memset(iswr, 0, sizeof(iswr));
+  memset(isge, 0, sizeof(isge));
+
+  while (current_buffer != tx_buffers.end()) {
+    isge[current_sge].addr = reinterpret_cast<uint64_t>((*current_buffer)->buffer);
+    isge[current_sge].length = (*current_buffer)->get_offset();
+    isge[current_sge].lkey = (*current_buffer)->mr->lkey;
+    ldout(cct, 25) << __func__ << " sending buffer: " << *current_buffer << " length: " << isge[current_sge].length  << dendl;
+
+    // The chunk pointer travels in wr_id.
+    iswr[current_swr].wr_id = reinterpret_cast<uint64_t>(*current_buffer);
+    iswr[current_swr].next = NULL;
+    iswr[current_swr].sg_list = &isge[current_sge];
+    iswr[current_swr].num_sge = 1;
+    iswr[current_swr].opcode = IBV_WR_SEND;
+    iswr[current_swr].send_flags = IBV_SEND_SIGNALED;
+    /*if (isge[current_sge].length < infiniband->max_inline_data) {
+      iswr[current_swr].send_flags = IBV_SEND_INLINE;
+      ldout(cct, 20) << __func__ << " send_inline." << dendl;
+      }*/
+
+    num++;
+    worker->perf_logger->inc(l_msgr_rdma_tx_bytes, isge[current_sge].length);
+    // Chain this request behind the previous one.
+    if (pre_wr)
+      pre_wr->next = &iswr[current_swr];
+    pre_wr = &iswr[current_swr];
+    ++current_sge;
+    ++current_swr;
+    ++current_buffer;
+  }
+
+  ibv_send_wr *bad_tx_work_request;
+  const int rc = ibv_post_send(qp->get_qp(), iswr, &bad_tx_work_request);
+  if (rc) {
+    // ibv_post_send() reports the failure reason through its return value.
+    // Capture it before logging, which may overwrite errno. Only fall back to
+    // errno when the provider returned a non-positive code.
+    const int err = rc > 0 ? rc : (errno ? errno : EIO);
+    ldout(cct, 1) << __func__ << " failed to send data"
+                  << " (most probably should be peer not ready): "
+                  << cpp_strerror(err) << dendl;
+    worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+    return -err;
+  }
+  qp->add_tx_wr(num);
+  worker->perf_logger->inc(l_msgr_rdma_tx_chunks, tx_buffers.size());
+  ldout(cct, 20) << __func__ << " qp state is " << Infiniband::qp_state_string(qp->get_state()) << dendl;
+  return 0;
+}
+
+// Post an empty (num_sge == 0) signaled send whose wr_id carries the QP
+// pointer. On success the QP's outstanding tx work-request count is bumped;
+// on failure the error is logged and counted, nothing else happens.
+void RDMAConnectedSocketImpl::fin() {
+  ibv_send_wr fin_wr;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&fin_wr, 0, sizeof(fin_wr));
+
+  fin_wr.wr_id = reinterpret_cast<uint64_t>(qp);
+  fin_wr.num_sge = 0;
+  fin_wr.opcode = IBV_WR_SEND;
+  fin_wr.send_flags = IBV_SEND_SIGNALED;
+
+  ibv_send_wr* bad_wr;
+  const int rc = ibv_post_send(qp->get_qp(), &fin_wr, &bad_wr);
+  if (rc == 0) {
+    qp->add_tx_wr(1);
+    return;
+  }
+
+  ldout(cct, 1) << __func__ << " failed to send message="
+                << " ibv_post_send failed(most probably should be peer not ready): "
+                << cpp_strerror(errno) << dendl;
+  worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+}
+
+// Tear down the event callbacks owned by this socket.
+//
+// The read handler is closed, its file event for tcp_fd is removed through
+// the worker's event center, and the handler is deleted. The established
+// handler is closed and deleted as well. Both pointers are reset to nullptr.
+void RDMAConnectedSocketImpl::cleanup() {
+  if (read_handler && tcp_fd >= 0) {
+    (static_cast<C_handle_connection_read*>(read_handler))->close();
+    // Unregister both readable and writable interest for tcp_fd.
+    worker->center.submit_to(worker->center.get_id(), [this]() {
+      worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+    }, false);
+    delete read_handler;
+    read_handler = nullptr;
+  }
+  if (established_handler) {
+    (static_cast<C_handle_connection_established*>(established_handler))->close();
+    delete established_handler;
+    established_handler = nullptr;
+  }
+}
+
+// Signal notify_fd by writing the value 1 to it; the write must transfer the
+// whole 8-byte value.
+void RDMAConnectedSocketImpl::notify()
+{
+  // note: notify_fd is an event fd (man eventfd)
+  // write argument must be a 64bit integer
+  const uint64_t one = 1;
+
+  const ssize_t written = write(notify_fd, &one, sizeof(one));
+  ceph_assert(written == static_cast<ssize_t>(sizeof(one)));
+}
+
+// Send a fin to the peer unless the socket already failed, then mark the
+// socket as reset and inactive.
+void RDMAConnectedSocketImpl::shutdown()
+{
+  if (!error) {
+    fin();
+  }
+  error = ECONNRESET;
+  active = false;
+}
+
+// Same sequence as shutdown(): fin to the peer if the socket has not failed
+// yet, then record ECONNRESET and deactivate.
+void RDMAConnectedSocketImpl::close()
+{
+  if (!error) {
+    fin();
+  }
+  error = ECONNRESET;
+  active = false;
+}
+
+// Put the socket into the ECONNRESET error state and wake up whoever waits on
+// notify_fd. `connected` is set to 1 here as well.
+// NOTE(review): presumably so send() stops taking the "not connected" path
+// and the error becomes visible to callers; confirm.
+void RDMAConnectedSocketImpl::fault()
+{
+  ldout(cct, 1) << __func__ << " tcp fd " << tcp_fd << dendl;
+  /*if (qp) {
+    qp->to_dead();
+    qp = NULL;
+    }*/
+  error = ECONNRESET;
+  connected = 1;
+  notify();
+}
+
+// Adopt an accepted TCP descriptor: remember it, flag this socket as the
+// server side, and register read_handler for readable events on it through
+// the worker's event center.
+void RDMAConnectedSocketImpl::set_accept_fd(int sd)
+{
+  tcp_fd = sd;
+  is_server = true;
+  worker->center.submit_to(
+    worker->center.get_id(),
+    [this]() {
+      worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+    },
+    true);
+}
+
+// Ask infiniband to post `num` chunks on this QP and add whatever could not
+// be posted to post_backlog.
+void RDMAConnectedSocketImpl::post_chunks_to_rq(int num)
+{
+  const auto posted = infiniband->post_chunks_to_rq(num, qp->get_qp());
+  post_backlog += num - posted;
+}
+
+// Retry posting the backlog of receive chunks and shrink the backlog by the
+// number that was actually posted this time.
+//
+// The previous expression, `post_backlog -= post_backlog - posted`, reduces
+// to `post_backlog = posted`: a fully successful retry left the backlog
+// unchanged and a fully failed retry cleared it.
+void RDMAConnectedSocketImpl::update_post_backlog()
+{
+  if (!post_backlog)
+    return;
+  post_backlog -= dispatcher->post_chunks_to_rq(post_backlog, qp->get_qp());
+}
diff --git a/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
new file mode 100644
index 00000000..432c2d2b
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
@@ -0,0 +1,183 @@
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPConnectedSocketImpl "
+
+#define TIMEOUT_MS 3000
+#define RETRY_COUNT 7
+
+// Construct an iWARP (RDMA CM based) connected socket.
+//
+// With a non-null `info` the socket is the server side of an incoming
+// connection: it adopts the cm id / event channel from `info`, allocates its
+// queue pair and registers cm_con_handler for the channel's fd. With a null
+// `info` it is a client socket and creates its own event channel and cm id.
+RDMAIWARPConnectedSocketImpl::RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+						 RDMAWorker *w, RDMACMInfo *info)
+  : RDMAConnectedSocketImpl(cct, ib, s, w), cm_con_handler(new C_handle_cm_connection(this))
+{
+  status = IDLE;
+  notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+  if (info) {
+    is_server = true;
+    cm_id = info->cm_id;
+    cm_channel = info->cm_channel;
+    status = RDMA_ID_CREATED;
+    remote_qpn = info->qp_num;
+    if (alloc_resource()) {
+      // No queue pair: mark the socket closed so the destructor won't block.
+      close_notify();
+      return;
+    }
+    worker->center.submit_to(worker->center.get_id(), [this]() {
+      worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+      status = CHANNEL_FD_CREATED;
+    }, false);
+    // NOTE(review): this assignment and the one inside the lambda above both
+    // write `status`; which one wins depends on when submit_to runs the
+    // lambda -- confirm the intended final state.
+    status = RESOURCE_ALLOCATED;
+    local_qpn = qp->get_local_qp_number();
+    my_msg.qpn = local_qpn;
+  } else {
+    is_server = false;
+    // NOTE(review): the results of rdma_create_event_channel() and
+    // rdma_create_id() are not checked here.
+    cm_channel = rdma_create_event_channel();
+    rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP);
+    status = RDMA_ID_CREATED;
+    ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+  }
+}
+
+// Destructor. Blocks on close_condition until `closed` is true (set by
+// close_notify()), then destroys the cm id and its event channel if they were
+// created.
+RDMAIWARPConnectedSocketImpl::~RDMAIWARPConnectedSocketImpl() {
+  ldout(cct, 20) << __func__ << " destruct." << dendl;
+  std::unique_lock l(close_mtx);
+  close_condition.wait(l, [&] { return closed; });
+  if (status >= RDMA_ID_CREATED) {
+    rdma_destroy_id(cm_id);
+    rdma_destroy_event_channel(cm_channel);
+  }
+}
+
+// Start an outgoing connection: register cm_con_handler for the CM channel's
+// fd and kick off address resolution for peer_addr (timeout TIMEOUT_MS).
+// Returns 0 when resolution was started, -1 when rdma_resolve_addr() failed.
+int RDMAIWARPConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+  worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+  status = CHANNEL_FD_CREATED;
+
+  struct sockaddr *dst = const_cast<struct sockaddr*>(peer_addr.get_sockaddr());
+  const int rc = rdma_resolve_addr(cm_id, NULL, dst, TIMEOUT_MS);
+  if (rc == 0)
+    return 0;
+
+  lderr(cct) << __func__ << " failed to resolve addr" << dendl;
+  return -1;
+}
+
+// Mark the socket reset and inactive, disconnect the cm id when the
+// connection had reached CONNECTED (or a later status), and finally run
+// close_notify().
+void RDMAIWARPConnectedSocketImpl::close() {
+  error = ECONNRESET;
+  active = false;
+
+  const bool was_connected = (status >= CONNECTED);
+  if (was_connected)
+    rdma_disconnect(cm_id);
+
+  close_notify();
+}
+
+// Only flips local state: the socket is marked reset and inactive. Nothing is
+// sent to the peer here.
+void RDMAIWARPConnectedSocketImpl::shutdown()
+{
+  error = ECONNRESET;
+  active = false;
+}
+
+// Event-center callback for the CM event channel: fetch one RDMA CM event and
+// advance the connection state machine accordingly.
+//
+//   ADDR_RESOLVED  -> start route resolution
+//   ROUTE_RESOLVED -> allocate the queue pair and call rdma_connect()
+//   ESTABLISHED    -> client side records the remote QPN and activates
+//   error/rejected -> connected = -ECONNREFUSED and notify()
+//   DISCONNECTED   -> close_notify(), and flag ECONNRESET if not yet failed
+//
+// The event is acknowledged before returning. Unknown events abort.
+void RDMAIWARPConnectedSocketImpl::handle_cm_connection() {
+  struct rdma_cm_event *event = nullptr;
+  // Without this check a failed fetch left `event` uninitialised and it was
+  // dereferenced right below.
+  if (rdma_get_cm_event(cm_channel, &event)) {
+    const int err = errno;
+    lderr(cct) << __func__ << " failed to get cm event (errno=" << err << ")" << dendl;
+    return;
+  }
+  ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(event->event)
+                             << " (cm id: " << cm_id << ")" << dendl;
+  struct rdma_conn_param cm_params;
+  switch (event->event) {
+    case RDMA_CM_EVENT_ADDR_RESOLVED:
+      status = ADDR_RESOLVED;
+      if (rdma_resolve_route(cm_id, TIMEOUT_MS)) {
+        // NOTE(review): unlike the other failure paths, `connected` is not
+        // set to -ECONNREFUSED before notify() here.
+        lderr(cct) << __func__ << " failed to resolve rdma addr" << dendl;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ROUTE_RESOLVED:
+      status = ROUTE_RESOLVED;
+      if (alloc_resource()) {
+        lderr(cct) << __func__ << " failed to alloc resource while resolving the route" << dendl;
+        connected = -ECONNREFUSED;
+        notify();
+        break;
+      }
+      local_qpn = qp->get_local_qp_number();
+      my_msg.qpn = local_qpn;
+
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(&cm_params, 0, sizeof(cm_params));
+      cm_params.retry_count = RETRY_COUNT;
+      cm_params.qp_num = local_qpn;
+      if (rdma_connect(cm_id, &cm_params)) {
+        lderr(cct) << __func__ << " failed to connect remote rdma port" << dendl;
+        connected = -ECONNREFUSED;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ESTABLISHED:
+      ldout(cct, 20) << __func__ << " qp_num=" << cm_id->qp->qp_num << dendl;
+      status = CONNECTED;
+      // Only the client activates here; the server socket is activated by
+      // the accepting code.
+      if (!is_server) {
+        remote_qpn = event->param.conn.qp_num;
+        activate();
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ADDR_ERROR:
+    case RDMA_CM_EVENT_ROUTE_ERROR:
+    case RDMA_CM_EVENT_CONNECT_ERROR:
+    case RDMA_CM_EVENT_UNREACHABLE:
+    case RDMA_CM_EVENT_REJECTED:
+      lderr(cct) << __func__ << " rdma connection rejected" << dendl;
+      connected = -ECONNREFUSED;
+      notify();
+      break;
+
+    case RDMA_CM_EVENT_DISCONNECTED:
+      status = DISCONNECTED;
+      close_notify();
+      if (!error) {
+        error = ECONNRESET;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_DEVICE_REMOVAL:
+      break;
+
+    default:
+      ceph_abort_msg("unhandled event");
+      break;
+  }
+  rdma_ack_cm_event(event);
+}
+
+// Mark the socket as active and connected.
+void RDMAIWARPConnectedSocketImpl::activate()
+{
+  ldout(cct, 30) << __func__ << dendl;
+  active = true;
+  connected = 1;
+}
+
+// Create the RC queue pair for this connection on top of cm_id, pre-post
+// receive chunks when SRQ support is disabled, and register the QP with the
+// dispatcher. Returns 0 on success, -1 when no queue pair could be created.
+int RDMAIWARPConnectedSocketImpl::alloc_resource() {
+  ldout(cct, 30) << __func__ << dendl;
+
+  qp = infiniband->create_queue_pair(cct, dispatcher->get_tx_cq(),
+                                     dispatcher->get_rx_cq(), IBV_QPT_RC, cm_id);
+  if (qp == nullptr)
+    return -1;
+
+  if (!cct->_conf->ms_async_rdma_support_srq) {
+    dispatcher->post_chunks_to_rq(infiniband->get_rx_queue_len(), qp->get_qp());
+  }
+
+  dispatcher->register_qp(qp, this);
+  dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+  dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+  return 0;
+}
+
+// Stop listening on the CM channel fd (if it was registered) and publish the
+// `closed` flag under close_mtx, waking every waiter on close_condition --
+// the destructor waits for this.
+void RDMAIWARPConnectedSocketImpl::close_notify() {
+  ldout(cct, 30) << __func__ << dendl;
+  if (status >= CHANNEL_FD_CREATED) {
+    worker->center.delete_file_event(cm_channel->fd, EVENT_READABLE);
+  }
+  std::unique_lock l(close_mtx);
+  // Only the first call flips the flag and notifies.
+  if (!closed) {
+    closed = true;
+    close_condition.notify_all();
+  }
+}
diff --git a/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
new file mode 100644
index 00000000..210eaf00
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
@@ -0,0 +1,107 @@
+#include <poll.h>
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPServerSocketImpl "
+
+// Forwards every argument unchanged to the RDMAServerSocketImpl constructor;
+// no additional state is initialised here.
+RDMAIWARPServerSocketImpl::RDMAIWARPServerSocketImpl(
+  CephContext *cct, Infiniband* i,
+  RDMADispatcher *s, RDMAWorker *w, entity_addr_t& a, unsigned addr_slot)
+  : RDMAServerSocketImpl(cct, i, s, w, a, addr_slot)
+{
+}
+
+// Create an RDMA CM event channel and cm id, bind the id to `sa` and start
+// listening (backlog 128). On success the channel's fd is stored in
+// server_setup_socket and 0 is returned.
+//
+// On failure everything created so far is destroyed, server_setup_socket is
+// set to -1 and a negative errno value is returned.
+int RDMAIWARPServerSocketImpl::listen(entity_addr_t &sa,
+				      const SocketOptions &opt)
+{
+  ldout(cct, 20) << __func__ << " bind to rdma point" << dendl;
+  int rc = 0;
+
+  // Both creation calls used to go unchecked; a failure there led to a
+  // null/uninitialised handle being used below.
+  cm_channel = rdma_create_event_channel();
+  if (!cm_channel) {
+    const int saved_errno = errno;
+    lderr(cct) << __func__ << " failed to create cm event channel: "
+               << cpp_strerror(saved_errno) << dendl;
+    server_setup_socket = -1;
+    return -saved_errno;
+  }
+  if (rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP)) {
+    const int saved_errno = errno;
+    lderr(cct) << __func__ << " failed to create cm id: "
+               << cpp_strerror(saved_errno) << dendl;
+    rdma_destroy_event_channel(cm_channel);
+    server_setup_socket = -1;
+    return -saved_errno;
+  }
+  ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+
+  rc = rdma_bind_addr(cm_id, const_cast<struct sockaddr*>(sa.get_sockaddr()));
+  if (rc < 0) {
+    // Save errno once; logging may change it.
+    const int saved_errno = errno;
+    rc = -saved_errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(saved_errno) << dendl;
+    goto err;
+  }
+  rc = rdma_listen(cm_id, 128);
+  if (rc < 0) {
+    const int saved_errno = errno;
+    rc = -saved_errno;
+    ldout(cct, 10) << __func__ << " unable to listen to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(saved_errno) << dendl;
+    goto err;
+  }
+  server_setup_socket = cm_channel->fd;
+  ldout(cct, 20) << __func__ << " fd of cm_channel is " << server_setup_socket << dendl;
+  return 0;
+
+err:
+  server_setup_socket = -1;
+  rdma_destroy_id(cm_id);
+  rdma_destroy_event_channel(cm_channel);
+  return rc;
+}
+
+// Accept one pending RDMA CM connection request without blocking.
+//
+// Polls the listening channel; when an event is ready, the event's cm id is
+// migrated onto a fresh event channel, wrapped in a new
+// RDMAIWARPConnectedSocketImpl and accepted. On success `*sock` owns the new
+// socket and `*out` holds the peer address taken from the cm id's route.
+//
+// Returns 0 on success and -EAGAIN when nothing is pending or the request
+// could not be fetched/accepted.
+int RDMAIWARPServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt,
+    entity_addr_t *out, Worker *w)
+{
+  ldout(cct, 15) << __func__ << dendl;
+
+  ceph_assert(sock);
+  struct pollfd pfd = {
+    .fd = cm_channel->fd,
+    .events = POLLIN,
+  };
+  // Zero timeout: just probe for a pending event.
+  int ret = poll(&pfd, 1, 0);
+  ceph_assert(ret >= 0);
+  if (!ret)
+    return -EAGAIN;
+
+  struct rdma_cm_event *cm_event = nullptr;
+  // A failed fetch used to leave cm_event uninitialised and dereferenced.
+  if (rdma_get_cm_event(cm_channel, &cm_event)) {
+    const int saved_errno = errno;
+    lderr(cct) << __func__ << " failed to get cm event: "
+               << cpp_strerror(saved_errno) << dendl;
+    return -EAGAIN;
+  }
+  ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(cm_event->event) << dendl;
+
+  struct rdma_cm_id *event_cm_id = cm_event->id;
+  struct rdma_event_channel *event_channel = rdma_create_event_channel();
+
+  rdma_migrate_id(event_cm_id, event_channel);
+
+  struct rdma_cm_id *new_cm_id = event_cm_id;
+  struct rdma_conn_param *remote_conn_param = &cm_event->param.conn;
+  struct rdma_conn_param local_conn_param;
+
+  RDMACMInfo info(new_cm_id, event_channel, remote_conn_param->qp_num);
+  RDMAIWARPConnectedSocketImpl* server =
+    new RDMAIWARPConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w), &info);
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&local_conn_param, 0, sizeof(local_conn_param));
+  local_conn_param.qp_num = server->get_local_qpn();
+
+  if (rdma_accept(new_cm_id, &local_conn_param)) {
+    // Every fetched CM event has to be acknowledged, also on failure.
+    // NOTE(review): `server` is still leaked on this path. It is not deleted
+    // here because its destructor waits until the socket has been closed.
+    rdma_ack_cm_event(cm_event);
+    return -EAGAIN;
+  }
+  server->activate();
+  ldout(cct, 20) << __func__ << " accepted a new QP" << dendl;
+
+  rdma_ack_cm_event(cm_event);
+
+  std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+  *sock = ConnectedSocket(std::move(csi));
+  struct sockaddr *addr = &new_cm_id->route.addr.dst_addr;
+  out->set_sockaddr(addr);
+
+  return 0;
+}
+
+// Stop accepting: destroy the listening cm id and its event channel, but only
+// when listen() had succeeded (server_setup_socket is a valid fd).
+void RDMAIWARPServerSocketImpl::abort_accept()
+{
+  if (server_setup_socket < 0)
+    return;
+
+  rdma_destroy_id(cm_id);
+  rdma_destroy_event_channel(cm_channel);
+}
diff --git a/src/msg/async/rdma/RDMAServerSocketImpl.cc b/src/msg/async/rdma/RDMAServerSocketImpl.cc
new file mode 100644
index 00000000..98402cfd
--- /dev/null
+++ b/src/msg/async/rdma/RDMAServerSocketImpl.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAServerSocketImpl "
+
+// Store the collaborators (infiniband, dispatcher, worker) and the listen
+// address. No socket is created yet: server_setup_socket starts at -1 until
+// listen() succeeds.
+RDMAServerSocketImpl::RDMAServerSocketImpl(
+  CephContext *cct, Infiniband* i, RDMADispatcher *s, RDMAWorker *w,
+  entity_addr_t& a, unsigned slot)
+  : ServerSocketImpl(a.get_type(), slot),
+    cct(cct), net(cct), server_setup_socket(-1), infiniband(i),
+    dispatcher(s), worker(w), sa(a)
+{
+}
+
+// Create the TCP setup socket, make it non-blocking, apply the socket
+// options, then bind it to `sa` and start listening.
+//
+// Returns 0 on success. On any failure after the socket was created the
+// socket is closed, server_setup_socket is reset to -1 and the negative error
+// code is returned.
+int RDMAServerSocketImpl::listen(entity_addr_t &sa, const SocketOptions &opt)
+{
+  server_setup_socket = net.create_socket(sa.get_family(), true);
+  if (server_setup_socket < 0) {
+    const int create_rc = -errno;
+    lderr(cct) << __func__ << " failed to create server socket: "
+               << cpp_strerror(errno) << dendl;
+    return create_rc;
+  }
+
+  // Common failure path: release the socket and hand back the error code.
+  auto bail_out = [this](int failure) {
+    ::close(server_setup_socket);
+    server_setup_socket = -1;
+    return failure;
+  };
+
+  int rc = net.set_nonblock(server_setup_socket);
+  if (rc < 0)
+    return bail_out(rc);
+
+  rc = net.set_socket_options(server_setup_socket, opt.nodelay, opt.rcbuf_size);
+  if (rc < 0)
+    return bail_out(rc);
+
+  rc = ::bind(server_setup_socket, sa.get_sockaddr(), sa.get_sockaddr_len());
+  if (rc < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    return bail_out(rc);
+  }
+
+  rc = ::listen(server_setup_socket, cct->_conf->ms_tcp_listen_backlog);
+  if (rc < 0) {
+    rc = -errno;
+    lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(errno) << dendl;
+    return bail_out(rc);
+  }
+
+  ldout(cct, 20) << __func__ << " bind to " << sa.get_sockaddr() << " on port " << sa.get_port()  << dendl;
+  return 0;
+}
+
+// Accept one TCP connection on the setup socket and wrap it in a new
+// RDMAConnectedSocketImpl.
+//
+// On success `*sock` owns the new connected socket, `*out` holds the peer
+// address and 0 is returned. On failure a negative error code is returned and
+// the accepted descriptor (if any) is closed.
+int RDMAServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w)
+{
+  ldout(cct, 15) << __func__ << dendl;
+
+  ceph_assert(sock);
+
+  sockaddr_storage ss;
+  socklen_t slen = sizeof(ss);
+  int sd = accept_cloexec(server_setup_socket, (sockaddr*)&ss, &slen);
+  if (sd < 0) {
+    return -errno;
+  }
+
+  // Return the helper's own error code, as listen() does. The previous
+  // `-errno` was read after ::close(), which may have overwritten errno.
+  int r = net.set_nonblock(sd);
+  if (r < 0) {
+    ::close(sd);
+    return r;
+  }
+
+  r = net.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    ::close(sd);
+    return r;
+  }
+
+  ceph_assert(NULL != out); //out should not be NULL in accept connection
+
+  out->set_type(addr_type);
+  out->set_sockaddr((sockaddr*)&ss);
+  net.set_priority(sd, opt.priority, out->get_family());
+
+  RDMAConnectedSocketImpl* server;
+  //Worker* w = dispatcher->get_stack()->get_worker();
+  server = new RDMAConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w));
+  server->set_accept_fd(sd);
+  ldout(cct, 20) << __func__ << " accepted a new QP, tcp_fd: " << sd << dendl;
+  std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+  *sock = ConnectedSocket(std::move(csi));
+
+  return 0;
+}
+
+// Close the listening setup socket if one is open.
+void RDMAServerSocketImpl::abort_accept()
+{
+  if (server_setup_socket < 0)
+    return;
+  ::close(server_setup_socket);
+}
diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc
new file mode 100644
index 00000000..f63a8e7d
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.cc
@@ -0,0 +1,610 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <poll.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "include/str_list.h"
+#include "include/compat.h"
+#include "common/Cycles.h"
+#include "common/deleter.h"
+#include "common/Tub.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "RDMAStack "
+
+// Stop the polling thread, verify that no connections or dead queue pairs are
+// left behind, and release the async event handler.
+RDMADispatcher::~RDMADispatcher()
+{
+  ldout(cct, 20) << __func__ << " destructing rdma dispatcher" << dendl;
+  polling_stop();
+
+  // Everything must have been unregistered/reaped before destruction.
+  ceph_assert(qp_conns.empty());
+  ceph_assert(num_qp_conn == 0);
+  ceph_assert(dead_queue_pairs.empty());
+  ceph_assert(num_dead_queue_pair == 0);
+
+  delete async_handler;
+}
+
+// Set up the dispatcher: create the async event handler and the two locks,
+// build and register the "AsyncMessenger::RDMADispatcher" perf counters, and
+// initialise the cycle counter helper.
+RDMADispatcher::RDMADispatcher(CephContext* c, RDMAStack* s)
+  : cct(c), async_handler(new C_handle_cq_async(this)), lock("RDMADispatcher::lock"),
+  w_lock("RDMADispatcher::for worker pending list"), stack(s)
+{
+  PerfCountersBuilder plb(cct, "AsyncMessenger::RDMADispatcher", l_msgr_rdma_dispatcher_first, l_msgr_rdma_dispatcher_last);
+
+  plb.add_u64_counter(l_msgr_rdma_polling, "polling", "Whether dispatcher thread is polling");
+  plb.add_u64_counter(l_msgr_rdma_inflight_tx_chunks, "inflight_tx_chunks", "The number of inflight tx chunks");
+  plb.add_u64_counter(l_msgr_rdma_rx_bufs_in_use, "rx_bufs_in_use", "The number of rx buffers that are holding data and being processed");
+  plb.add_u64_counter(l_msgr_rdma_rx_bufs_total, "rx_bufs_total", "The total number of rx buffers");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc, "tx_total_wc", "The number of tx work completions");
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc_errors, "tx_total_wc_errors", "The number of tx errors");
+  plb.add_u64_counter(l_msgr_rdma_tx_wc_retry_errors, "tx_retry_errors", "The number of tx retry errors");
+  plb.add_u64_counter(l_msgr_rdma_tx_wc_wr_flush_errors, "tx_wr_flush_errors", "The number of tx work request flush errors");
+
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc, "rx_total_wc", "The number of total rx work completion");
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc_errors, "rx_total_wc_errors", "The number of total rx error work completion");
+  plb.add_u64_counter(l_msgr_rdma_rx_fin, "rx_fin", "The number of rx finish work request");
+
+  plb.add_u64_counter(l_msgr_rdma_total_async_events, "total_async_events", "The number of async events");
+  plb.add_u64_counter(l_msgr_rdma_async_last_wqe_events, "async_last_wqe_events", "The number of last wqe events");
+
+  plb.add_u64_counter(l_msgr_rdma_handshake_errors, "handshake_errors", "The number of handshake errors");
+
+
+  // The two descriptions below used to be swapped between the counters.
+  plb.add_u64_counter(l_msgr_rdma_created_queue_pair, "created_queue_pair", "Created queue pair number");
+  plb.add_u64_counter(l_msgr_rdma_active_queue_pair, "active_queue_pair", "Active queue pair number");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+  Cycles::init();
+}
+
+// Lazily start the dispatcher: create the tx/rx completion channels and
+// completion queues, then launch the polling thread (named "rdma-polling").
+// Calling it again while the thread is running is a no-op.
+void RDMADispatcher::polling_start()
+{
+  // take lock because listen/connect can happen from different worker threads
+  Mutex::Locker l(lock);
+
+  if (t.joinable()) 
+    return; // dispatcher thread already running 
+
+  get_stack()->get_infiniband().get_memory_manager()->set_rx_stat_logger(perf_logger);
+
+  // Failing to create any of the channels/queues is fatal.
+  tx_cc = get_stack()->get_infiniband().create_comp_channel(cct);
+  ceph_assert(tx_cc);
+  rx_cc = get_stack()->get_infiniband().create_comp_channel(cct);
+  ceph_assert(rx_cc);
+  tx_cq = get_stack()->get_infiniband().create_comp_queue(cct, tx_cc);
+  ceph_assert(tx_cq);
+  rx_cq = get_stack()->get_infiniband().create_comp_queue(cct, rx_cc);
+  ceph_assert(rx_cq);
+
+  t = std::thread(&RDMADispatcher::polling, this);
+  ceph_pthread_setname(t.native_handle(), "rdma-polling");
+}
+
+// Ask the polling thread to finish (done = true), wait for it to exit, then
+// acknowledge outstanding channel events and free the completion queues and
+// channels. Does nothing beyond setting `done` if the thread never started.
+void RDMADispatcher::polling_stop()
+{
+  {
+    Mutex::Locker l(lock);
+    done = true;
+  }
+
+  if (!t.joinable())
+    return;
+
+  t.join();
+
+  // Queues are deleted before the channels they were created on.
+  tx_cc->ack_events();
+  rx_cc->ack_events();
+  delete tx_cq;
+  delete rx_cq;
+  delete tx_cc;
+  delete rx_cc;
+}
+
+// Drain the device's asynchronous event queue.
+//
+// Loops until ibv_get_async_event() fails (EAGAIN is the silent "queue is
+// empty" case). For IBV_EVENT_QP_LAST_WQE_REACHED the owning connection is
+// looked up by QP number and faulted; every other event is only logged. Each
+// fetched event is acknowledged.
+void RDMADispatcher::handle_async_event()
+{
+  ldout(cct, 30) << __func__ << dendl;
+  while (1) {
+    ibv_async_event async_event;
+    if (ibv_get_async_event(get_stack()->get_infiniband().get_device()->ctxt, &async_event)) {
+      if (errno != EAGAIN)
+       lderr(cct) << __func__ << " ibv_get_async_event failed. (errno=" << errno
+                  << " " << cpp_strerror(errno) << ")" << dendl;
+      return;
+    }
+    perf_logger->inc(l_msgr_rdma_total_async_events);
+    // FIXME: Currently we must ensure no other factor make QP in ERROR state,
+    // otherwise this qp can't be deleted in current cleanup flow.
+    if (async_event.event_type == IBV_EVENT_QP_LAST_WQE_REACHED) {
+      perf_logger->inc(l_msgr_rdma_async_last_wqe_events);
+      uint64_t qpn = async_event.element.qp->qp_num;
+      ldout(cct, 10) << __func__ << " event associated qp=" << async_event.element.qp
+                     << " evt: " << ibv_event_type_str(async_event.event_type) << dendl;
+      // The lock is held across the lookup, the fault and the erase.
+      Mutex::Locker l(lock);
+      RDMAConnectedSocketImpl *conn = get_conn_lockless(qpn);
+      if (!conn) {
+        ldout(cct, 1) << __func__ << " missing qp_num=" << qpn << " discard event" << dendl;
+      } else {
+        ldout(cct, 1) << __func__ << " it's not forwardly stopped by us, reenable=" << conn << dendl;
+        conn->fault();
+        // The QP registration is dropped here only when RDMA CM is not used.
+        if (!cct->_conf->ms_async_rdma_cm)
+          erase_qpn_lockless(qpn);
+      }
+    } else {
+      ldout(cct, 1) << __func__ << " ibv_get_async_event: dev=" << get_stack()->get_infiniband().get_device()->ctxt
+                    << " evt: " << ibv_event_type_str(async_event.event_type)
+                    << dendl;
+    }
+    ibv_ack_async_event(&async_event);
+  }
+}
+
+// Return one rx chunk to the pool under the dispatcher lock and decrement the
+// "rx buffers in use" counter.
+void RDMADispatcher::post_chunk_to_pool(Chunk* chunk)
+{
+  Mutex::Locker l(lock);
+  auto &ib = get_stack()->get_infiniband();
+  ib.post_chunk_to_pool(chunk);
+  perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+}
+
+// Locked wrapper around Infiniband::post_chunks_to_rq(); returns whatever
+// that call returns for `num` chunks on `qp`.
+int RDMADispatcher::post_chunks_to_rq(int num, ibv_qp *qp)
+{
+  Mutex::Locker l(lock);
+  auto &ib = get_stack()->get_infiniband();
+  return ib.post_chunks_to_rq(num, qp);
+}
+
+// Body of the dispatcher thread.
+//
+// Busy-polls the tx and rx completion queues. Tx completions go to
+// handle_tx_event(); rx completions are grouped per connection and handed
+// over with pass_wc(). When both queues are idle, dead queue pairs without
+// outstanding tx work requests are deleted, and after
+// ms_async_rdma_polling_us of idleness the thread re-arms CQ notifications
+// and sleeps in poll() on the completion channels. The loop exits once `done`
+// is set and no connections or dead queue pairs remain.
+void RDMADispatcher::polling()
+{
+  // const so that wc[] is a regular fixed-size array rather than a VLA.
+  static const int MAX_COMPLETIONS = 32;
+  ibv_wc wc[MAX_COMPLETIONS];
+
+  std::map<RDMAConnectedSocketImpl*, std::vector<ibv_wc> > polled;
+  std::vector<ibv_wc> tx_cqe;
+  ldout(cct, 20) << __func__ << " going to poll tx cq: " << tx_cq << " rx cq: " << rx_cq << dendl;
+  RDMAConnectedSocketImpl *conn = nullptr;
+  uint64_t last_inactive = Cycles::rdtsc();
+  bool rearmed = false;
+  int r = 0;
+
+  while (true) {
+    int tx_ret = tx_cq->poll_cq(MAX_COMPLETIONS, wc);
+    if (tx_ret > 0) {
+      ldout(cct, 20) << __func__ << " tx completion queue got " << tx_ret
+                     << " responses."<< dendl;
+      handle_tx_event(wc, tx_ret);
+    }
+
+    int rx_ret = rx_cq->poll_cq(MAX_COMPLETIONS, wc);
+    if (rx_ret > 0) {
+      ldout(cct, 20) << __func__ << " rx completion queue got " << rx_ret
+                     << " responses."<< dendl;
+      perf_logger->inc(l_msgr_rdma_rx_total_wc, rx_ret);
+      perf_logger->inc(l_msgr_rdma_rx_bufs_in_use, rx_ret);
+
+      Mutex::Locker l(lock);//make sure connected socket alive when pass wc
+
+      for (int i = 0; i < rx_ret; ++i) {
+        ibv_wc* response = &wc[i];
+        Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+
+        if (response->status == IBV_WC_SUCCESS) {
+          ceph_assert(wc[i].opcode == IBV_WC_RECV);
+          conn = get_conn_lockless(response->qp_num);
+          if (!conn) {
+            // No owner any more: give the chunk straight back to the pool.
+            ldout(cct, 1) << __func__ << " csi with qpn " << response->qp_num << " may be dead. chunk " << chunk << " will be back ? " << r << dendl;
+            get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+            perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+          } else {
+            // Replace the consumed receive buffer, then queue the completion
+            // for this connection.
+            conn->post_chunks_to_rq(1);
+            polled[conn].push_back(*response);
+          }
+        } else {
+          perf_logger->inc(l_msgr_rdma_rx_total_wc_errors);
+          ldout(cct, 1) << __func__ << " work request returned error for buffer(" << chunk
+              << ") status(" << response->status << ":"
+              << get_stack()->get_infiniband().wc_status_to_string(response->status) << ")" << dendl;
+          // Flush errors do not fault the connection.
+          if (response->status != IBV_WC_WR_FLUSH_ERR) {
+            conn = get_conn_lockless(response->qp_num);
+            if (conn && conn->is_connected())
+              conn->fault();
+          }
+          get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+          perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+        }
+      }
+      for (auto &&i : polled)
+        i.first->pass_wc(std::move(i.second));
+      polled.clear();
+    }
+
+    if (!tx_ret && !rx_ret) {
+      // NOTE: Has TX just transitioned to idle? We should do it when idle!
+      // It's now safe to delete queue pairs (see comment by declaration
+      // for dead_queue_pairs).
+      // Additionally, don't delete qp while outstanding_buffers isn't empty,
+      // because we need to check qp's state before sending
+      perf_logger->set(l_msgr_rdma_inflight_tx_chunks, inflight);
+      if (num_dead_queue_pair) {
+        Mutex::Locker l(lock); // FIXME reuse dead qp because creating one qp costs 1 ms
+        auto it = dead_queue_pairs.begin();
+        while (it != dead_queue_pairs.end()) {
+          auto i = *it;
+          // Bypass QPs that do not collect all Tx completions yet.
+          if (i->get_tx_wr()) {
+            ldout(cct, 20) << __func__ << " bypass qp=" << i << " tx_wr=" << i->get_tx_wr() << dendl;
+            ++it;
+          } else {
+            ldout(cct, 10) << __func__ << " finally delete qp=" << i << dendl;
+            delete i;
+            it = dead_queue_pairs.erase(it);
+            perf_logger->dec(l_msgr_rdma_active_queue_pair);
+            --num_dead_queue_pair;
+          }
+        }
+      }
+      if (!num_qp_conn && done && dead_queue_pairs.empty())
+        break;
+
+      uint64_t now = Cycles::rdtsc();
+      if (Cycles::to_microseconds(now - last_inactive) > cct->_conf->ms_async_rdma_polling_us) {
+        handle_async_event();
+        if (!rearmed) {
+          // Clean up cq events after rearm notify ensure no new incoming event
+          // arrived between polling and rearm
+          tx_cq->rearm_notify();
+          rx_cq->rearm_notify();
+          rearmed = true;
+          continue;
+        }
+
+        struct pollfd channel_poll[2];
+        channel_poll[0].fd = tx_cc->get_fd();
+        channel_poll[0].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+        channel_poll[0].revents = 0;
+        channel_poll[1].fd = rx_cc->get_fd();
+        channel_poll[1].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+        channel_poll[1].revents = 0;
+        r = 0;
+        perf_logger->set(l_msgr_rdma_polling, 0);
+        // Sleep in 100 ms slices so that `done` is noticed.
+        while (!done && r == 0) {
+          r = TEMP_FAILURE_RETRY(poll(channel_poll, 2, 100));
+          if (r < 0) {
+            r = -errno;
+            lderr(cct) << __func__ << " poll failed " << r << dendl;
+            ceph_abort();
+          }
+        }
+        if (r > 0 && tx_cc->get_cq_event())
+          ldout(cct, 20) << __func__ << " got tx cq event." << dendl;
+        if (r > 0 && rx_cc->get_cq_event())
+          ldout(cct, 20) << __func__ << " got rx cq event." << dendl;
+        last_inactive = Cycles::rdtsc();
+        perf_logger->set(l_msgr_rdma_polling, 1);
+        rearmed = false;
+      }
+    }
+  }
+}
+
+/**
+ * Wake at most one worker from pending_workers.
+ *
+ * The atomic counter is tested first, so w_lock is not taken when no worker
+ * is queued. The worker is popped under w_lock and notified only after the
+ * lock has been released.
+ */
+void RDMADispatcher::notify_pending_workers() {
+  if (!num_pending_workers)
+    return;
+
+  RDMAWorker *next = nullptr;
+  {
+    Mutex::Locker guard(w_lock);
+    if (!pending_workers.empty()) {
+      next = pending_workers.front();
+      pending_workers.pop_front();
+      --num_pending_workers;
+    }
+  }
+
+  if (next != nullptr)
+    next->notify_worker();
+}
+
+/**
+ * Register a queue pair together with its connected socket.
+ *
+ * The entry is keyed by the queue pair's local QP number; registering the
+ * same number twice trips the assertion. Takes `lock`.
+ *
+ * \param[in] qp   queue pair to register
+ * \param[in] csi  connected socket stored alongside the queue pair
+ */
+void RDMADispatcher::register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi)
+{
+  Mutex::Locker guard(lock);
+  ceph_assert(!qp_conns.count(qp->get_local_qp_number()));
+  qp_conns[qp->get_local_qp_number()] =
+    std::pair<QueuePair*, RDMAConnectedSocketImpl*>(qp, csi);
+  ++num_qp_conn;
+}
+
+/**
+ * Look up the connected socket registered for a QP number.
+ *
+ * This function does not take `lock` itself.
+ *
+ * \param[in] qp  local QP number
+ * \return the registered socket, or nullptr when the number is unknown or
+ *         the queue pair reports is_dead()
+ */
+RDMAConnectedSocketImpl* RDMADispatcher::get_conn_lockless(uint32_t qp)
+{
+  const auto entry = qp_conns.find(qp);
+  const bool usable =
+    entry != qp_conns.end() && !entry->second.first->is_dead();
+  return usable ? entry->second.second : nullptr;
+}
+
+/**
+ * Find a queue pair by its local QP number while holding `lock`.
+ *
+ * \param[in] qp  local QP number
+ * \return the queue pair from qp_conns if registered there, otherwise the
+ *         first match in dead_queue_pairs, otherwise nullptr
+ */
+Infiniband::QueuePair* RDMADispatcher::get_qp(uint32_t qp)
+{
+  Mutex::Locker guard(lock);
+
+  // Registered queue pairs take precedence.
+  const auto live = qp_conns.find(qp);
+  if (live != qp_conns.end())
+    return live->second.first;
+
+  // Fall back to queue pairs that were unregistered but not deleted yet.
+  for (auto dead = dead_queue_pairs.begin(); dead != dead_queue_pairs.end(); ++dead) {
+    if ((*dead)->get_local_qp_number() == qp)
+      return *dead;
+  }
+
+  return nullptr;
+}
+
+/**
+ * Unregister a QP number and park its queue pair in dead_queue_pairs.
+ *
+ * Does nothing when the number is not registered. This function does not
+ * take `lock` itself.
+ *
+ * \param[in] qpn  local QP number to unregister
+ */
+void RDMADispatcher::erase_qpn_lockless(uint32_t qpn)
+{
+  auto entry = qp_conns.find(qpn);
+  if (entry != qp_conns.end()) {
+    ++num_dead_queue_pair;
+    dead_queue_pairs.push_back(entry->second.first);
+    qp_conns.erase(entry);
+    --num_qp_conn;
+  }
+}
+
+/**
+ * Locked wrapper around erase_qpn_lockless(): holds `lock` for the duration
+ * of the call.
+ *
+ * \param[in] qpn  local QP number to unregister
+ */
+void RDMADispatcher::erase_qpn(uint32_t qpn)
+{
+  Mutex::Locker guard(lock);
+  erase_qpn_lockless(qpn);
+}
+
+/**
+ * Process a batch of TX work completions.
+ *
+ * For each completion the outstanding TX work-request count of the queue
+ * pair (looked up via get_qp()) is decremented. Failed completions are
+ * counted and logged; for statuses other than IBV_WC_RETRY_EXC_ERR and
+ * IBV_WC_WR_FLUSH_ERR the connection, if still registered and connected, is
+ * faulted. Chunks that belong to the TX buffer pool are collected and handed
+ * to post_tx_buffer() after the whole batch has been examined.
+ *
+ * \param[in] cqe  array of work completions
+ * \param[in] n    number of entries in cqe
+ */
+void RDMADispatcher::handle_tx_event(ibv_wc *cqe, int n)
+{
+  std::vector<Chunk*> tx_chunks;
+
+  for (int i = 0; i < n; ++i) {
+    ibv_wc* response = &cqe[i];
+    // Only a cast here; whether wr_id really is a Chunk is decided below.
+    Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+    ldout(cct, 25) << __func__ << " QP: " << response->qp_num
+                   << " len: " << response->byte_len << " , addr:" << chunk
+                   << " " << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl;
+
+    QueuePair *qp = get_qp(response->qp_num);
+    if (qp)
+      qp->dec_tx_wr(1);
+
+    if (response->status != IBV_WC_SUCCESS) {
+      perf_logger->inc(l_msgr_rdma_tx_total_wc_errors);
+      if (response->status == IBV_WC_RETRY_EXC_ERR) {
+        ldout(cct, 1) << __func__ << " connection between server and client not working. Disconnect this now" << dendl;
+        perf_logger->inc(l_msgr_rdma_tx_wc_retry_errors);
+      } else if (response->status == IBV_WC_WR_FLUSH_ERR) {
+        ldout(cct, 1) << __func__ << " Work Request Flushed Error: this connection's qp="
+                      << response->qp_num << " should be down while this WR=" << response->wr_id
+                      << " still in flight." << dendl;
+        perf_logger->inc(l_msgr_rdma_tx_wc_wr_flush_errors);
+      } else {
+        ldout(cct, 1) << __func__ << " send work request returned error for buffer("
+                      << response->wr_id << ") status(" << response->status << "): "
+                      << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl;
+        Mutex::Locker l(lock);//make sure connected socket alive when pass wc
+        RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+
+        if (conn && conn->is_connected()) {
+          ldout(cct, 25) << __func__ << " qp state is : " << conn->get_qp_state() << dendl;
+          conn->fault();
+        } else {
+          ldout(cct, 1) << __func__ << " missing qp_num=" << response->qp_num << " discard event" << dendl;
+        }
+      }
+    }
+
+    // TX completion may come either from regular send message or from 'fin'
+    // message. In the case of 'fin' wr_id points to the QueuePair.
+    // Recognise 'fin' by pointer identity first, so that a QueuePair is not
+    // inspected through a Chunk* when the queue pair is known. The original
+    // checks remain as fallback for completions whose queue pair could not
+    // be looked up.
+    if (qp && reinterpret_cast<QueuePair*>(response->wr_id) == qp) {
+      ldout(cct, 1) << __func__ << " sending of the disconnect msg completed" << dendl;
+    } else if (get_stack()->get_infiniband().get_memory_manager()->is_tx_buffer(chunk->buffer)) {
+      tx_chunks.push_back(chunk);
+    } else if (reinterpret_cast<QueuePair*>(response->wr_id)->get_local_qp_number() == response->qp_num ) {
+      ldout(cct, 1) << __func__ << " sending of the disconnect msg completed" << dendl;
+    } else {
+      ldout(cct, 1) << __func__ << " not tx buffer, chunk " << chunk << dendl;
+      ceph_abort();
+    }
+  }
+
+  perf_logger->inc(l_msgr_rdma_tx_total_wc, n);
+  post_tx_buffer(tx_chunks);
+}
+
+/**
+ * Give completed TX chunks back to the memory manager.
+ *
+ * Does nothing for an empty vector. Otherwise the inflight counter is
+ * reduced by the number of chunks, the chunks are passed to the memory
+ * manager's return_tx(), and notify_pending_workers() is called so that a
+ * waiting worker can retry.
+ *
+ * \param[in] chunks
+ *      The Chunks to return.
+ *
+ * This function returns nothing.
+ */
+void RDMADispatcher::post_tx_buffer(std::vector<Chunk*> &chunks)
+{
+  if (chunks.empty())
+    return ;
+
+  inflight -= chunks.size();
+  get_stack()->get_infiniband().get_memory_manager()->return_tx(chunks);
+  ldout(cct, 30) << __func__ << " release " << chunks.size()
+                 << " chunks, inflight " << inflight << dendl;
+  notify_pending_workers();
+}
+
+
+/**
+ * Construct an RDMA worker and register its perf counters.
+ *
+ * The worker starts without a stack (stack is nullptr until set_stack() is
+ * called). The perf counter set is named "AsyncMessenger::RDMAWorker-<id>"
+ * and is added to the context's perf counter collection.
+ *
+ * \param[in] c  Ceph context, passed to the Worker base class
+ * \param[in] i  worker index, passed to the Worker base class
+ */
+RDMAWorker::RDMAWorker(CephContext *c, unsigned i)
+  : Worker(c, i), stack(nullptr),
+    tx_handler(new C_handle_cq_tx(this)), lock("RDMAWorker::lock")
+{
+  // initialize perf_logger
+  char name[128];
+  // Bounded formatting so the fixed-size buffer can never be overrun.
+  snprintf(name, sizeof(name), "AsyncMessenger::RDMAWorker-%u", id);
+  PerfCountersBuilder plb(cct, name, l_msgr_rdma_first, l_msgr_rdma_last);
+
+  plb.add_u64_counter(l_msgr_rdma_tx_no_mem, "tx_no_mem", "The count of no tx buffer");
+  plb.add_u64_counter(l_msgr_rdma_tx_parital_mem, "tx_parital_mem", "The count of parital tx buffer");
+  plb.add_u64_counter(l_msgr_rdma_tx_failed, "tx_failed_post", "The number of tx failed posted");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_chunks, "tx_chunks", "The number of tx chunks transmitted");
+  plb.add_u64_counter(l_msgr_rdma_tx_bytes, "tx_bytes", "The bytes of tx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_rx_chunks, "rx_chunks", "The number of rx chunks transmitted");
+  plb.add_u64_counter(l_msgr_rdma_rx_bytes, "rx_bytes", "The bytes of rx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_pending_sent_conns, "pending_sent_conns", "The count of pending sent conns");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+}
+
+/**
+ * Release the TX event callback allocated by the constructor.
+ */
+RDMAWorker::~RDMAWorker()
+{
+  // tx_handler was created with `new` in the constructor
+  delete tx_handler;
+}
+
+/**
+ * Cache a pointer to the stack's dispatcher. Calls after the first one
+ * leave the cached pointer untouched.
+ */
+void RDMAWorker::initialize()
+{
+  if (dispatcher)
+    return;
+
+  dispatcher = &stack->get_dispatcher();
+}
+
+/**
+ * Create a listening RDMA server socket.
+ *
+ * Initialises the Infiniband layer, starts dispatcher polling, then builds
+ * an iWARP server socket when ms_async_rdma_type is "iwarp" and a plain
+ * RDMA server socket otherwise.
+ *
+ * \param[in]  sa         address to listen on
+ * \param[in]  addr_slot  forwarded to the server socket constructor
+ * \param[in]  opt        socket options forwarded to listen()
+ * \param[out] sock       receives the new server socket on success
+ * \return 0 on success, otherwise the negative value returned by the
+ *         socket's listen()
+ */
+int RDMAWorker::listen(entity_addr_t &sa, unsigned addr_slot,
+                       const SocketOptions &opt,ServerSocket *sock)
+{
+  get_stack()->get_infiniband().init();
+  dispatcher->polling_start();
+
+  std::unique_ptr<RDMAServerSocketImpl> server;
+  const bool use_iwarp = (cct->_conf->ms_async_rdma_type == "iwarp");
+  if (use_iwarp) {
+    server.reset(new RDMAIWARPServerSocketImpl(
+      cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this,
+      sa, addr_slot));
+  } else {
+    server.reset(new RDMAServerSocketImpl(
+      cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this,
+      sa, addr_slot));
+  }
+
+  const int rc = server->listen(sa, opt);
+  if (rc < 0)
+    return rc;  // `server` goes out of scope and deletes the socket
+
+  *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(std::move(server)));
+  return 0;
+}
+
+/**
+ * Create a connected RDMA socket and start connecting it to `addr`.
+ *
+ * Initialises the Infiniband layer, starts dispatcher polling, then builds
+ * an iWARP connected socket when ms_async_rdma_type is "iwarp" and a plain
+ * RDMA connected socket otherwise.
+ *
+ * \param[in]  addr    peer address
+ * \param[in]  opts    socket options forwarded to try_connect()
+ * \param[out] socket  receives the new connected socket on success
+ * \return 0 on success, otherwise the negative value returned by
+ *         try_connect()
+ */
+int RDMAWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+  get_stack()->get_infiniband().init();
+  dispatcher->polling_start();
+
+  std::unique_ptr<RDMAConnectedSocketImpl> csi;
+  const bool use_iwarp = (cct->_conf->ms_async_rdma_type == "iwarp");
+  if (use_iwarp) {
+    csi.reset(new RDMAIWARPConnectedSocketImpl(
+      cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this));
+  } else {
+    csi.reset(new RDMAConnectedSocketImpl(
+      cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this));
+  }
+
+  const int rc = csi->try_connect(addr, opts);
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " try connecting failed." << dendl;
+    return rc;  // `csi` goes out of scope and deletes the socket
+  }
+
+  *socket = ConnectedSocket(std::move(csi));
+  return 0;
+}
+
+/**
+ * Reserve registered TX chunks for up to `bytes` bytes.
+ *
+ * Must run in the worker's own event-center thread (asserted). The number
+ * of chunks obtained is added to the dispatcher's inflight counter. When
+ * fewer bytes than requested could be reserved and `o` is non-null, `o` is
+ * queued in pending_sent_conns (once) and this worker is registered with
+ * the dispatcher as pending.
+ *
+ * \param[in]  o      connection asking for memory; may be nullptr
+ * \param[out] c      receives the reserved chunks
+ * \param[in]  bytes  number of bytes wanted
+ * \return number of chunks reserved
+ */
+int RDMAWorker::get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes)
+{
+  ceph_assert(center.in_thread());
+  int r = get_stack()->get_infiniband().get_tx_buffers(c, bytes);
+  ceph_assert(r >= 0);
+  size_t reserved = get_stack()->get_infiniband().get_memory_manager()->get_tx_buffer_size() * r;
+  ldout(cct, 30) << __func__ << " need " << bytes << " bytes, reserve " << reserved << " registered  bytes, inflight " << dispatcher->inflight << dendl;
+  stack->get_dispatcher().inflight += r;
+
+  const bool came_up_short = reserved < bytes;
+  if (came_up_short && o) {
+    if (!o->is_pending()) {
+      pending_sent_conns.push_back(o);
+      perf_logger->inc(l_msgr_rdma_pending_sent_conns, 1);
+      o->set_pending(1);
+    }
+    dispatcher->make_pending_worker(this);
+  }
+  return r;
+}
+
+
+/**
+ * Retry sending for every connection queued in pending_sent_conns.
+ *
+ * Connections are taken from the front of the queue and submitted. On
+ * -EAGAIN the connection is re-queued at the back, this worker is
+ * registered with the dispatcher as pending again, and processing stops.
+ * Any other negative result faults the connection. Once the queue has been
+ * drained the dispatcher is asked to notify further pending workers.
+ */
+void RDMAWorker::handle_pending_message()
+{
+  ldout(cct, 20) << __func__ << " pending conns " << pending_sent_conns.size() << dendl;
+  while (!pending_sent_conns.empty()) {
+    RDMAConnectedSocketImpl *conn = pending_sent_conns.front();
+    pending_sent_conns.pop_front();
+
+    const ssize_t sent = conn->submit(false);
+    ldout(cct, 20) << __func__ << " sent pending bl socket=" << conn << " r=" << sent << dendl;
+
+    if (sent == -EAGAIN) {
+      // Still short of resources: keep the connection pending and wait for
+      // the next notification.
+      pending_sent_conns.push_back(conn);
+      dispatcher->make_pending_worker(this);
+      return ;
+    }
+    if (sent < 0)
+      conn->fault();
+
+    conn->set_pending(0);
+    perf_logger->dec(l_msgr_rdma_pending_sent_conns, 1);
+  }
+  dispatcher->notify_pending_workers();
+}
+
+/**
+ * Construct the RDMA network stack and hand every worker a pointer back to
+ * this stack.
+ *
+ * \param[in] cct  Ceph context
+ * \param[in] t    stack type string, forwarded to NetworkStack
+ */
+RDMAStack::RDMAStack(CephContext *cct, const string &t)
+  : NetworkStack(cct, t), ib(cct), dispatcher(cct, this)
+{
+  ldout(cct, 20) << __func__ << " constructing RDMAStack..." << dendl;
+
+  unsigned num = get_num_worker();
+  for (unsigned i = 0; i < num; ++i) {
+    RDMAWorker* w = dynamic_cast<RDMAWorker*>(get_worker(i));
+    // dynamic_cast yields nullptr when the worker is not an RDMAWorker;
+    // fail loudly instead of dereferencing a null pointer.
+    ceph_assert(w);
+    w->set_stack(this);
+  }
+  ldout(cct, 20) << " creating RDMAStack:" << this << " with dispatcher:" << &dispatcher << dendl;
+}
+
+/**
+ * Remove the RDMAV_HUGEPAGES_SAFE variable from the environment when
+ * ms_async_rdma_enable_hugepage is set.
+ */
+RDMAStack::~RDMAStack()
+{
+  const bool hugepage_enabled = cct->_conf->ms_async_rdma_enable_hugepage;
+  if (!hugepage_enabled)
+    return;
+
+  // remove env variable on destruction
+  unsetenv("RDMAV_HUGEPAGES_SAFE");
+}
+
+/**
+ * Start the thread for worker slot `i`.
+ *
+ * The thread table only ever grows: an unconditional resize(i+1) would
+ * shrink it when `i` is below the current size and destroy the
+ * std::thread objects of higher slots.
+ *
+ * \param[in] i     worker slot
+ * \param[in] func  callable to run on the new thread
+ */
+void RDMAStack::spawn_worker(unsigned i, std::function<void ()> &&func)
+{
+  if (threads.size() <= i)
+    threads.resize(i+1);
+  // `func` is an rvalue reference parameter: move it into the thread
+  // instead of copying the std::function.
+  threads[i] = std::thread(std::move(func));
+}
+
+/**
+ * Join the thread of worker slot `i`. The slot must exist and hold a
+ * joinable thread (asserted).
+ *
+ * \param[in] i  worker slot
+ */
+void RDMAStack::join_worker(unsigned i)
+{
+  ceph_assert(threads.size() > i && threads[i].joinable());
+  std::thread &worker_thread = threads[i];
+  worker_thread.join();
+}
diff --git a/src/msg/async/rdma/RDMAStack.h b/src/msg/async/rdma/RDMAStack.h
new file mode 100644
index 00000000..e4d34ee0
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.h
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_RDMASTACK_H
+#define CEPH_MSG_RDMASTACK_H
+
+#include <sys/eventfd.h>
+
+#include <list>
+#include <vector>
+#include <thread>
+
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "msg/async/Stack.h"
+#include "Infiniband.h"
+
+class RDMAConnectedSocketImpl;
+class RDMAServerSocketImpl;
+class RDMAStack;
+class RDMAWorker;
+
+/**
+ * Dispatcher owned by an RDMAStack and used by its RDMA workers.
+ *
+ * It keeps the table of registered queue pairs (qp_conns), the queue pairs
+ * that were unregistered but not deleted yet (dead_queue_pairs), the list of
+ * workers waiting for a notification (pending_workers) and the number of TX
+ * chunks currently handed out (inflight).
+ */
+class RDMADispatcher {
+  typedef Infiniband::MemoryManager::Chunk Chunk;
+  typedef Infiniband::QueuePair QueuePair;
+
+  // presumably the thread running polling(); it is started and joined
+  // outside this header -- TODO confirm
+  std::thread t;
+  CephContext *cct;
+  // Completion queues and their completion channels, TX and RX side.
+  Infiniband::CompletionQueue* tx_cq = nullptr;
+  Infiniband::CompletionQueue* rx_cq = nullptr;
+  Infiniband::CompletionChannel *tx_cc = nullptr, *rx_cc = nullptr;
+  EventCallbackRef async_handler;
+  // Checked by polling() when deciding whether to leave its loop.
+  bool done = false;
+  // Atomic mirrors of dead_queue_pairs.size() / qp_conns.size(); they are
+  // updated together with the containers in erase_qpn_lockless(),
+  // register_qp() and polling().
+  std::atomic<uint64_t> num_dead_queue_pair = {0};
+  std::atomic<uint64_t> num_qp_conn = {0};
+  Mutex lock; // protect `qp_conns`, `dead_queue_pairs`
+  // Local QP number -> (queue pair, connected socket).
+  // The main usage of `qp_conns` is looking up connection by qp_num,
+  // so the lifecycle of element in `qp_conns` is the lifecycle of qp.
+  //
+  // How a queue pair is retired:
+  /**
+   * 1. Connection call mark_down
+   * 2. Move the Queue Pair into the Error state(QueuePair::to_dead)
+   * 3. Wait for the affiliated event IBV_EVENT_QP_LAST_WQE_REACHED(handle_async_event)
+   * 4. Wait for the outstanding TX completions to arrive (handle_tx_event
+   *    decrements the per-QP TX work-request count)
+   * 5. Delete the QueuePair once that count is zero; polling() does this
+   *    for the entries of dead_queue_pairs
+   */
+  ceph::unordered_map<uint32_t, std::pair<QueuePair*, RDMAConnectedSocketImpl*> > qp_conns;
+
+  /// if a queue pair is closed when transmit buffers are active
+  /// on it, the transmit buffers never get returned via tx_cq.  To
+  /// work around this problem, don't delete queue pairs immediately. Instead,
+  /// save them in this vector and delete them at a safe time, when there are
+  /// no outstanding transmit buffers to be lost.
+  std::vector<QueuePair*> dead_queue_pairs;
+
+  // Atomic mirror of pending_workers.size(); lets notify_pending_workers()
+  // skip w_lock when no worker is queued.
+  std::atomic<uint64_t> num_pending_workers = {0};
+  Mutex w_lock; // protect pending workers
+  // fixme: lockfree
+  std::list<RDMAWorker*> pending_workers;
+  RDMAStack* stack;
+
+  // Event callback that forwards to handle_async_event().
+  class C_handle_cq_async : public EventCallback {
+    RDMADispatcher *dispatcher;
+   public:
+    explicit C_handle_cq_async(RDMADispatcher *w): dispatcher(w) {}
+    void do_request(uint64_t fd) {
+      // worker->handle_tx_event();
+      dispatcher->handle_async_event();
+    }
+  };
+
+ public:
+  PerfCounters *perf_logger;
+
+  explicit RDMADispatcher(CephContext* c, RDMAStack* s);
+  virtual ~RDMADispatcher();
+  void handle_async_event();
+
+  void polling_start();
+  void polling_stop();
+  void polling();
+  // Add qp/csi to qp_conns under `lock`, keyed by the local QP number.
+  void register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi);
+  // Queue `w` for a later notify_pending_workers() call; a worker that is
+  // already queued is not added twice.
+  // NOTE(review): std::find needs <algorithm>, which this header does not
+  // include directly.
+  void make_pending_worker(RDMAWorker* w) {
+    Mutex::Locker l(w_lock);
+    auto it = std::find(pending_workers.begin(), pending_workers.end(), w);
+    if (it != pending_workers.end())
+      return;
+    pending_workers.push_back(w);
+    ++num_pending_workers;
+  }
+  RDMAStack* get_stack() { return stack; }
+  // Lookup in qp_conns without taking `lock`; nullptr if unknown or dead.
+  RDMAConnectedSocketImpl* get_conn_lockless(uint32_t qp);
+  // Lookup in qp_conns, then dead_queue_pairs, under `lock`.
+  QueuePair* get_qp(uint32_t qp);
+  // Move the queue pair from qp_conns to dead_queue_pairs; the _lockless
+  // variant does not take `lock`, erase_qpn() does.
+  void erase_qpn_lockless(uint32_t qpn);
+  void erase_qpn(uint32_t qpn);
+  Infiniband::CompletionQueue* get_tx_cq() const { return tx_cq; }
+  Infiniband::CompletionQueue* get_rx_cq() const { return rx_cq; }
+  // Pop one worker from pending_workers and call its notify_worker().
+  void notify_pending_workers();
+  // Process `n` TX work completions from `cqe`.
+  void handle_tx_event(ibv_wc *cqe, int n);
+  // Return TX chunks to the memory manager and reduce `inflight`.
+  void post_tx_buffer(std::vector<Chunk*> &chunks);
+
+  // Number of TX chunks handed out by RDMAWorker::get_reged_mem() and not
+  // yet returned through post_tx_buffer().
+  std::atomic<uint64_t> inflight = {0};
+
+  void post_chunk_to_pool(Chunk* chunk);
+  int post_chunks_to_rq(int num, ibv_qp *qp=NULL);
+};
+
+/**
+ * Worker of the RDMA network stack.
+ *
+ * Besides creating server and connected sockets it hands out registered TX
+ * memory (get_reged_mem) and keeps a queue of connections whose send had to
+ * be postponed (pending_sent_conns).
+ */
+class RDMAWorker : public Worker {
+  typedef Infiniband::CompletionQueue CompletionQueue;
+  typedef Infiniband::CompletionChannel CompletionChannel;
+  typedef Infiniband::MemoryManager::Chunk Chunk;
+  typedef Infiniband::MemoryManager MemoryManager;
+  typedef std::vector<Chunk*>::iterator ChunkIter;
+  // Set through set_stack(); nullptr after construction.
+  RDMAStack *stack;
+  // Callback dispatched by notify_worker(); allocated in the constructor
+  // and deleted in the destructor.
+  EventCallbackRef tx_handler;
+  // Connections waiting for handle_pending_message() to retry their send.
+  std::list<RDMAConnectedSocketImpl*> pending_sent_conns;
+  // Cached by initialize() from the stack.
+  RDMADispatcher* dispatcher = nullptr;
+  Mutex lock;
+
+  // Event callback that forwards to handle_pending_message().
+  class C_handle_cq_tx : public EventCallback {
+    RDMAWorker *worker;
+    public:
+    explicit C_handle_cq_tx(RDMAWorker *w): worker(w) {}
+    void do_request(uint64_t fd) {
+      worker->handle_pending_message();
+    }
+  };
+
+ public:
+  PerfCounters *perf_logger;
+  explicit RDMAWorker(CephContext *c, unsigned i);
+  virtual ~RDMAWorker();
+  virtual int listen(entity_addr_t &addr,
+		     unsigned addr_slot,
+		     const SocketOptions &opts, ServerSocket *) override;
+  virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+  virtual void initialize() override;
+  RDMAStack *get_stack() { return stack; }
+  // Reserve TX chunks for up to `bytes` bytes; returns the number of chunks
+  // obtained and queues `o` as pending when that was not enough.
+  int get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes);
+  // Drop `o` from pending_sent_conns; must run in this worker's
+  // event-center thread (asserted).
+  void remove_pending_conn(RDMAConnectedSocketImpl *o) {
+    ceph_assert(center.in_thread());
+    pending_sent_conns.remove(o);
+  }
+  void handle_pending_message();
+  void set_stack(RDMAStack *s) { stack = s; }
+  // Hand tx_handler to this worker's event center via
+  // dispatch_event_external(); called by
+  // RDMADispatcher::notify_pending_workers().
+  void notify_worker() {
+    center.dispatch_event_external(tx_handler);
+  }
+};
+
+// Plain bundle of an rdma_cm id, its event channel and a QP number.
+struct RDMACMInfo {
+  rdma_cm_id *cm_id;
+  rdma_event_channel *cm_channel;
+  uint32_t qp_num;
+
+  RDMACMInfo(rdma_cm_id *cid, rdma_event_channel *cm_channel_, uint32_t qp_num_)
+    : cm_id(cid),
+      cm_channel(cm_channel_),
+      qp_num(qp_num_)
+  {}
+};
+
+/**
+ * Connected-socket implementation on top of an Infiniband queue pair.
+ *
+ * Most member functions are defined outside this header; the notes below
+ * state only what the declarations and the callers in RDMAStack.cc show.
+ */
+class RDMAConnectedSocketImpl : public ConnectedSocketImpl {
+ public:
+  typedef Infiniband::MemoryManager::Chunk Chunk;
+  typedef Infiniband::CompletionChannel CompletionChannel;
+  typedef Infiniband::CompletionQueue CompletionQueue;
+
+ protected:
+  CephContext *cct;
+  Infiniband::QueuePair *qp;
+  IBSYNMsg peer_msg;
+  IBSYNMsg my_msg;
+  // Returned as-is by is_connected().
+  int connected;
+  int error;
+  Infiniband* infiniband;
+  RDMADispatcher* dispatcher;
+  RDMAWorker* worker;
+  std::vector<Chunk*> buffers;
+  // Returned by fd().
+  int notify_fd = -1;
+  bufferlist pending_bl;
+
+  Mutex lock;
+  std::vector<ibv_wc> wc;
+  bool is_server;
+  EventCallbackRef read_handler;
+  EventCallbackRef established_handler;
+  // Returned by socket_fd().
+  int tcp_fd = -1;
+  bool active;// qp is active ?
+  // Read and written through is_pending()/set_pending(); RDMAWorker uses it
+  // to avoid queueing the same connection twice in pending_sent_conns.
+  bool pending;
+  int post_backlog = 0;
+
+  void notify();
+  ssize_t read_buffers(char* buf, size_t len);
+  int post_work_request(std::vector<Chunk*>&);
+
+ public:
+  RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+                          RDMAWorker *w);
+  virtual ~RDMAConnectedSocketImpl();
+
+  // Called by RDMADispatcher::polling() with the RX completions collected
+  // for this connection.
+  void pass_wc(std::vector<ibv_wc> &&v);
+  void get_wc(std::vector<ibv_wc> &w);
+  virtual int is_connected() override { return connected; }
+
+  virtual ssize_t read(char* buf, size_t len) override;
+  virtual ssize_t zero_copy_read(bufferptr &data) override;
+  virtual ssize_t send(bufferlist &bl, bool more) override;
+  virtual void shutdown() override;
+  virtual void close() override;
+  virtual int fd() const override { return notify_fd; }
+  virtual int socket_fd() const override { return tcp_fd; }
+  // Called by the dispatcher and the worker when a work request or a
+  // pending send fails.
+  void fault();
+  // Human-readable name of the queue pair's current state.
+  const char* get_qp_state() { return Infiniband::qp_state_string(qp->get_state()); }
+  // Called by RDMAWorker::handle_pending_message() with more=false; a
+  // return of -EAGAIN makes the worker re-queue the connection.
+  ssize_t submit(bool more);
+  int activate();
+  void fin();
+  void handle_connection();
+  int handle_connection_established(bool need_set_fault = true);
+  void cleanup();
+  void set_accept_fd(int sd);
+  virtual int try_connect(const entity_addr_t&, const SocketOptions &opt);
+  bool is_pending() {return pending;}
+  void set_pending(bool val) {pending = val;}
+  // Called by RDMADispatcher::polling() with num=1 while it handles RX
+  // completions for this connection.
+  void post_chunks_to_rq(int num);
+  void update_post_backlog();
+};
+
+// State values used for RDMAIWARPConnectedSocketImpl::status.
+// Numbering starts at 1 and increases by one in declaration order.
+enum RDMA_CM_STATUS {
+  IDLE               = 1,
+  RDMA_ID_CREATED    = 2,
+  CHANNEL_FD_CREATED = 3,
+  RESOURCE_ALLOCATED = 4,
+  ADDR_RESOLVED      = 5,
+  ROUTE_RESOLVED     = 6,
+  CONNECTED          = 7,
+  DISCONNECTED       = 8,
+  ERROR              = 9
+};
+
+/**
+ * iWARP flavour of RDMAConnectedSocketImpl; RDMAWorker::connect() creates
+ * it when ms_async_rdma_type is "iwarp". It adds rdma_cm handles and a
+ * connection status on top of the base class.
+ */
+class RDMAIWARPConnectedSocketImpl : public RDMAConnectedSocketImpl {
+  public:
+    RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+                          RDMAWorker *w, RDMACMInfo *info = nullptr);
+    ~RDMAIWARPConnectedSocketImpl();
+    virtual int try_connect(const entity_addr_t&, const SocketOptions &opt) override;
+    virtual void close() override;
+    virtual void shutdown() override;
+    virtual void handle_cm_connection();
+    uint32_t get_local_qpn() const { return local_qpn; }
+    // NOTE(review): hides the non-virtual `int activate()` of the base class
+    // and has a different return type.
+    void activate();
+    int alloc_resource();
+    void close_notify();
+
+  private:
+    rdma_cm_id *cm_id;
+    rdma_event_channel *cm_channel;
+    uint32_t local_qpn;
+    uint32_t remote_qpn;
+    EventCallbackRef cm_con_handler;
+    // NOTE(review): shadows the protected `is_server` member of the base
+    // class.
+    bool is_server;
+    // presumably used together to wait for the close to complete -- the
+    // code using them is outside this header, TODO confirm
+    std::mutex close_mtx;
+    std::condition_variable close_condition;
+    bool closed;
+    RDMA_CM_STATUS status;
+
+
+  // Event callback that forwards to handle_cm_connection().
+  class C_handle_cm_connection : public EventCallback {
+    RDMAIWARPConnectedSocketImpl *csi;
+    public:
+      C_handle_cm_connection(RDMAIWARPConnectedSocketImpl *w): csi(w) {}
+      void do_request(uint64_t fd) {
+        csi->handle_cm_connection();
+      }
+  };
+};
+
+/**
+ * Server-socket implementation of the RDMA stack; RDMAWorker::listen()
+ * creates it unless ms_async_rdma_type is "iwarp".
+ */
+class RDMAServerSocketImpl : public ServerSocketImpl {
+  protected:
+    CephContext *cct;
+    NetHandler net;
+    // Returned by both fd() and get_fd().
+    int server_setup_socket;
+    Infiniband* infiniband;
+    RDMADispatcher *dispatcher;
+    RDMAWorker *worker;
+    entity_addr_t sa;
+
+ public:
+  RDMAServerSocketImpl(CephContext *cct, Infiniband* i, RDMADispatcher *s,
+		       RDMAWorker *w, entity_addr_t& a, unsigned slot);
+
+  // RDMAWorker::listen() treats a negative return value as failure.
+  virtual int listen(entity_addr_t &sa, const SocketOptions &opt);
+  virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+  virtual void abort_accept() override;
+  virtual int fd() const override { return server_setup_socket; }
+  int get_fd() { return server_setup_socket; }
+};
+
+/**
+ * iWARP flavour of RDMAServerSocketImpl; RDMAWorker::listen() creates it
+ * when ms_async_rdma_type is "iwarp". It overrides listen/accept/
+ * abort_accept and carries its own rdma_cm id and event channel.
+ */
+class RDMAIWARPServerSocketImpl : public RDMAServerSocketImpl {
+  public:
+    RDMAIWARPServerSocketImpl(
+      CephContext *cct, Infiniband *i, RDMADispatcher *s, RDMAWorker *w,
+      entity_addr_t& addr, unsigned addr_slot);
+    virtual int listen(entity_addr_t &sa, const SocketOptions &opt) override;
+    virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+    virtual void abort_accept() override;
+  private:
+    rdma_cm_id *cm_id;
+    rdma_event_channel *cm_channel;
+};
+
+/**
+ * NetworkStack implementation for the RDMA transport.
+ *
+ * Owns the Infiniband object and the RDMADispatcher shared by its workers,
+ * plus one std::thread slot per worker.
+ */
+class RDMAStack : public NetworkStack {
+  // One slot per worker index, filled by spawn_worker().
+  std::vector<std::thread> threads;
+  // Not set by the constructor, so give it a defined value.
+  PerfCounters *perf_counter = nullptr;
+  // Declared before `dispatcher`, hence constructed first.
+  Infiniband ib;
+  RDMADispatcher dispatcher;
+
+  // Set by ready(), read by is_ready().
+  std::atomic<bool> fork_finished = {false};
+
+ public:
+  explicit RDMAStack(CephContext *cct, const string &t);
+  virtual ~RDMAStack();
+  virtual bool support_zero_copy_read() const override { return false; }
+  virtual bool nonblock_connect_need_writable_event() const override { return false; }
+
+  virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override;
+  virtual void join_worker(unsigned i) override;
+  RDMADispatcher &get_dispatcher() { return dispatcher; }
+  Infiniband &get_infiniband() { return ib; }
+  virtual bool is_ready() override { return fork_finished.load(); }
+  virtual void ready() override { fork_finished = true; }
+};
+
+
+#endif
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
commit	483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree	e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/msg/async
parent	Initial commit. (diff)
download	ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip