Adding upstream version 16.2.11+ds. upstream/16.2.11+ds upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit: 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree: 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/msg
parent: Initial commit. (diff)
download: ceph-upstream/16.2.11+ds.tar.xz
ceph-upstream/16.2.11+ds.zip
89 files changed, 32413 insertions, 0 deletions
diff --git a/src/msg/CMakeLists.txt b/src/msg/CMakeLists.txt
new file mode 100644
index 000000000..9cca15c81
--- /dev/null
+++ b/src/msg/CMakeLists.txt
@@ -0,0 +1,65 @@
+# Transport-independent messenger sources.
+set(msg_srcs
+  DispatchQueue.cc
+  Message.cc
+  Messenger.cc
+  Connection.cc
+  msg_types.cc)
+
+# Async messenger sources compiled on every platform. EventSelect.cc is always
+# part of the list; platform-specific event drivers are appended below.
+list(APPEND msg_srcs
+  async/AsyncConnection.cc
+  async/AsyncMessenger.cc
+  async/Protocol.cc
+  async/ProtocolV1.cc
+  async/ProtocolV2.cc
+  async/Event.cc
+  async/EventSelect.cc
+  async/PosixStack.cc
+  async/Stack.cc
+  async/crypto_onwire.cc
+  async/frames_v2.cc
+  async/net_handler.cc)
+
+# Platform-specific event driver source: EventEpoll.cc on Linux,
+# EventKqueue.cc on FreeBSD/macOS. No extra driver is added on other platforms.
+if(LINUX)
+  list(APPEND msg_srcs
+    async/EventEpoll.cc)
+elseif(FREEBSD OR APPLE)
+  list(APPEND msg_srcs
+    async/EventKqueue.cc)
+endif(LINUX)
+
+# RDMA transport sources, only added when HAVE_RDMA is set.
+if(HAVE_RDMA)
+  list(APPEND msg_srcs
+    async/rdma/Infiniband.cc
+    async/rdma/RDMAConnectedSocketImpl.cc
+    async/rdma/RDMAIWARPConnectedSocketImpl.cc
+    async/rdma/RDMAServerSocketImpl.cc
+    async/rdma/RDMAIWARPServerSocketImpl.cc
+    async/rdma/RDMAStack.cc)
+endif()
+
+# All sources collected above are compiled into one OBJECT library.
+add_library(common-msg-objs OBJECT ${msg_srcs})
+# compile_with_fmt() is a project helper defined elsewhere
+# (presumably applies the fmt build settings to the target -- TODO confirm).
+compile_with_fmt(common-msg-objs)
+target_include_directories(common-msg-objs PRIVATE ${OPENSSL_INCLUDE_DIR})
+
+# The DPDK stack is built as its own static library when WITH_DPDK is set.
+if(WITH_DPDK)
+  set(async_dpdk_srcs
+    async/dpdk/ARP.cc
+    async/dpdk/DPDK.cc
+    async/dpdk/dpdk_rte.cc
+    async/dpdk/DPDKStack.cc
+    async/dpdk/EventDPDK.cc
+    async/dpdk/IP.cc
+    async/dpdk/net.cc
+    async/dpdk/IPChecksum.cc
+    async/dpdk/Packet.cc
+    async/dpdk/TCP.cc
+    async/dpdk/UserspaceEvent.cc
+    async/dpdk/ethernet.cc)
+  add_library(common_async_dpdk STATIC
+    ${async_dpdk_srcs})
+  target_link_libraries(common_async_dpdk PRIVATE
+    dpdk::dpdk)
+  # Stack.cc includes DPDKStack.h, which includes rte_config.h indirectly
+  target_include_directories(common-msg-objs PRIVATE
+    $<TARGET_PROPERTY:dpdk::dpdk,INTERFACE_INCLUDE_DIRECTORIES>)
+endif(WITH_DPDK)
diff --git a/src/msg/Connection.cc b/src/msg/Connection.cc
new file mode 100644
index 000000000..9183871b5
--- /dev/null
+++ b/src/msg/Connection.cc
@@ -0,0 +1,23 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "msg/Connection.h"
+#include "msg/Messenger.h"
+
+
+// Tell whether this connection's peer type has its ms_blackhole_* option
+// set. Peer types without such an option are never blackholed.
+bool Connection::is_blackhole() const {
+  auto& cfg = msgr->cct->_conf;
+
+  if (peer_type == CEPH_ENTITY_TYPE_MON) {
+    return cfg->ms_blackhole_mon;
+  }
+  if (peer_type == CEPH_ENTITY_TYPE_OSD) {
+    return cfg->ms_blackhole_osd;
+  }
+  if (peer_type == CEPH_ENTITY_TYPE_MDS) {
+    return cfg->ms_blackhole_mds;
+  }
+  if (peer_type == CEPH_ENTITY_TYPE_CLIENT) {
+    return cfg->ms_blackhole_client;
+  }
+  // every other peer type
+  return false;
+}
diff --git a/src/msg/Connection.h b/src/msg/Connection.h
new file mode 100644
index 000000000..801d3fa20
--- /dev/null
+++ b/src/msg/Connection.h
@@ -0,0 +1,256 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONNECTION_H
+#define CEPH_CONNECTION_H
+
+#include <stdlib.h>
+#include <ostream>
+
+#include "auth/Auth.h"
+#include "common/RefCountedObj.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/ref.h"
+#include "common/ceph_mutex.h"
+#include "include/ceph_assert.h" // Because intusive_ptr clobbers our assert...
+#include "include/buffer.h"
+#include "include/types.h"
+#include "common/item_history.h"
+#include "msg/MessageRef.h"
+
+// ======================================================
+
+// abstract Connection, for keeping per-connection state
+
+class Messenger;
+
+#ifdef UNIT_TESTS_BUILT
+class Interceptor;
+#endif
+
+/**
+ * Abstract per-connection state shared by messenger implementations.
+ *
+ * Holds the peer identity (type, id, addresses, auth info), the feature
+ * bits, the keepalive stamps and an opaque `priv` reference, and declares
+ * the transport operations a concrete connection has to implement.
+ *
+ * Within this struct only `priv`, `last_keepalive` and `last_keepalive_ack`
+ * are accessed under `lock`; the other accessors below do not take it.
+ */
+struct Connection : public RefCountedObjectSafe {
+  mutable ceph::mutex lock = ceph::make_mutex("Connection::lock");
+  Messenger *msgr;
+  RefCountedPtr priv;
+  int peer_type = -1;
+  int64_t peer_id = -1;  // [msgr2 only] the 0 of osd.0, 4567 of client.4567
+  safe_item_history<entity_addrvec_t> peer_addrs;
+  utime_t last_keepalive, last_keepalive_ack;
+  bool anon = false;  ///< anonymous outgoing connection
+private:
+  uint64_t features = 0;
+public:
+  bool is_loopback = false;
+  bool failed = false; // true if we are a lossy connection that has failed.
+
+  int rx_buffers_version = 0;
+  std::map<ceph_tid_t,std::pair<ceph::buffer::list, int>> rx_buffers;
+
+  // authentication state
+  // FIXME make these private after ms_handle_authorizer is removed
+public:
+  AuthCapsInfo peer_caps_info;
+  EntityName peer_name;
+  uint64_t peer_global_id = 0;
+
+#ifdef UNIT_TESTS_BUILT
+  // Initialized to null so that a connection which never gets an
+  // interceptor assigned does not carry an indeterminate pointer.
+  Interceptor *interceptor = nullptr;
+#endif
+
+public:
+  /// Replace the opaque private reference (takes `lock`).
+  void set_priv(const RefCountedPtr& o) {
+    std::lock_guard l{lock};
+    priv = o;
+  }
+
+  /// Return a copy of the opaque private reference (takes `lock`).
+  RefCountedPtr get_priv() {
+    std::lock_guard l{lock};
+    return priv;
+  }
+
+  /// Drop the opaque private reference (takes `lock`).
+  void clear_priv() {
+    std::lock_guard l{lock};
+    priv.reset(nullptr);
+  }
+
+  /**
+   * Used to judge whether this connection is ready to send. Usually, the
+   * implementation needs to build its own handshake or session before it
+   * can be ready to send.
+   *
+   * @return true if ready to send, or false otherwise
+   */
+  virtual bool is_connected() = 0;
+
+  /// @return false in this base class; implementations may override.
+  virtual bool is_msgr2() const {
+    return false;
+  }
+
+  /// @return the value of the `anon` flag.
+  bool is_anon() const {
+    return anon;
+  }
+
+  /// @return the Messenger this connection was constructed with.
+  Messenger *get_messenger() {
+    return msgr;
+  }
+
+  /**
+   * Queue the given Message to send out on the given Connection.
+   * Success in this function does not guarantee Message delivery, only
+   * success in queueing the Message. Other guarantees may be provided based
+   * on the Connection policy.
+   *
+   * @param m The Message to send. The Messenger consumes a single reference
+   * when you pass it in.
+   *
+   * @return 0 on success, or -errno on failure.
+   */
+  virtual int send_message(Message *m) = 0;
+
+  /**
+   * Reference-counted variant of send_message(): by default detaches the
+   * reference held by @p m and hands it to send_message(Message*).
+   *
+   * @return whatever send_message(Message*) returns.
+   */
+  virtual int send_message2(MessageRef m)
+  {
+    return send_message(m.detach()); /* send_message(Message *m) consumes a reference */
+  }
+
+  /**
+   * Send a "keepalive" ping along the given Connection, if it's working.
+   * If the underlying connection has broken, this function does nothing.
+   *
+   * No status is reported to the caller.
+   */
+  virtual void send_keepalive() = 0;
+  /**
+   * Mark down the given Connection.
+   *
+   * This will cause us to discard its outgoing queue, and if reset
+   * detection is enabled in the policy and the endpoint tries to
+   * reconnect they will discard their queue when we inform them of
+   * the session reset.
+   *
+   * It does not generate any notifications to the Dispatcher.
+   */
+  virtual void mark_down() = 0;
+
+  /**
+   * Mark a Connection as "disposable", setting it to lossy
+   * (regardless of initial Policy).  This does not immediately close
+   * the Connection once Messages have been delivered, so as long as
+   * there are no errors you can continue to receive responses; but it
+   * will not attempt to reconnect for message delivery or preserve
+   * your old delivery semantics, either.
+   *
+   * TODO: There's some odd stuff going on in our SimpleMessenger
+   * implementation during connect that looks unused; is there
+   * more of a contract that it is enforcing?
+   */
+  virtual void mark_disposable() = 0;
+
+  // WARNING / FIXME: this is not populated for loopback connections
+  AuthCapsInfo& get_peer_caps_info() {
+    return peer_caps_info;
+  }
+  const EntityName& get_peer_entity_name() {
+    return peer_name;
+  }
+  uint64_t get_peer_global_id() {
+    return peer_global_id;
+  }
+
+  int get_peer_type() const { return peer_type; }
+  void set_peer_type(int t) { peer_type = t; }
+
+  // peer_id is only defined for msgr2
+  int64_t get_peer_id() const { return peer_id; }
+  void set_peer_id(int64_t t) { peer_id = t; }
+
+  // convenience predicates on peer_type
+  bool peer_is_mon() const { return peer_type == CEPH_ENTITY_TYPE_MON; }
+  bool peer_is_mgr() const { return peer_type == CEPH_ENTITY_TYPE_MGR; }
+  bool peer_is_mds() const { return peer_type == CEPH_ENTITY_TYPE_MDS; }
+  bool peer_is_osd() const { return peer_type == CEPH_ENTITY_TYPE_OSD; }
+  bool peer_is_client() const { return peer_type == CEPH_ENTITY_TYPE_CLIENT; }
+
+  /// which of the peer's addrs is actually in use for this connection
+  virtual entity_addr_t get_peer_socket_addr() const = 0;
+
+  /// @return the first address of the peer's address vector.
+  entity_addr_t get_peer_addr() const {
+    return peer_addrs->front();
+  }
+  const entity_addrvec_t& get_peer_addrs() const {
+    return *peer_addrs;
+  }
+  /// Replace the peer addresses with a vector built from the single addr @p a.
+  void set_peer_addr(const entity_addr_t& a) {
+    peer_addrs = entity_addrvec_t(a);
+  }
+  void set_peer_addrs(const entity_addrvec_t& av) { peer_addrs = av; }
+
+  uint64_t get_features() const { return features; }
+  /// @return true if any bit of @p f is set in the feature mask.
+  bool has_feature(uint64_t f) const { return features & f; }
+  /// @return true only if every bit of @p f is set in the feature mask.
+  bool has_features(uint64_t f) const {
+    return (features & f) == f;
+  }
+  /// Overwrite the whole feature mask.
+  void set_features(uint64_t f) { features = f; }
+  /// OR the bits of @p f into the feature mask.
+  void set_feature(uint64_t f) { features |= f; }
+
+  /// @return CEPH_CON_MODE_CRC in this base class; implementations may override.
+  virtual int get_con_mode() const {
+    return CEPH_CON_MODE_CRC;
+  }
+
+  // The body is compiled out (#if 0): this is currently a no-op.
+  void post_rx_buffer(ceph_tid_t tid, ceph::buffer::list& bl) {
+#if 0
+    std::lock_guard l{lock};
+    ++rx_buffers_version;
+    rx_buffers[tid] = pair<bufferlist,int>(bl, rx_buffers_version);
+#endif
+  }
+
+  // The body is compiled out (#if 0): this is currently a no-op.
+  void revoke_rx_buffer(ceph_tid_t tid) {
+#if 0
+    std::lock_guard l{lock};
+    rx_buffers.erase(tid);
+#endif
+  }
+
+  // keepalive stamps; all four accessors take `lock`
+  utime_t get_last_keepalive() const {
+    std::lock_guard l{lock};
+    return last_keepalive;
+  }
+  void set_last_keepalive(utime_t t) {
+    std::lock_guard l{lock};
+    last_keepalive = t;
+  }
+  utime_t get_last_keepalive_ack() const {
+    std::lock_guard l{lock};
+    return last_keepalive_ack;
+  }
+  void set_last_keepalive_ack(utime_t t) {
+    std::lock_guard l{lock};
+    last_keepalive_ack = t;
+  }
+  /// @return true if the ms_blackhole_* option matching peer_type is set.
+  bool is_blackhole() const;
+
+protected:
+  // Construction and destruction are restricted to derived classes.
+  Connection(CephContext *cct, Messenger *m)
+    : RefCountedObjectSafe(cct),
+      msgr(m)
+  {}
+
+  ~Connection() override {
+    //generic_dout(0) << "~Connection " << this << dendl;
+  }
+};
+
+using ConnectionRef = ceph::ref_t<Connection>;
+
+#endif /* CEPH_CONNECTION_H */
diff --git a/src/msg/DispatchQueue.cc b/src/msg/DispatchQueue.cc
new file mode 100644
index 000000000..b8ed6f7ef
--- /dev/null
+++ b/src/msg/DispatchQueue.cc
@@ -0,0 +1,261 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "msg/Message.h"
+#include "DispatchQueue.h"
+#include "Messenger.h"
+#include "common/ceph_context.h"
+
+#define dout_subsys ceph_subsys_ms
+#include "common/debug.h"
+
+using ceph::cref_t;
+using ceph::ref_t;
+
+/*******************
+ * DispatchQueue
+ */
+
+#undef dout_prefix
+#define dout_prefix *_dout << "-- " << msgr->get_myaddrs() << " "
+
+/// Difference between @p now and the smallest arrival stamp currently
+/// tracked in marrival; 0 when nothing is tracked.
+double DispatchQueue::get_max_age(utime_t now) const {
+  std::lock_guard guard{lock};
+  const auto oldest = marrival.begin();
+  if (oldest == marrival.end()) {
+    return 0;
+  }
+  return (now - oldest->first);
+}
+
+/**
+ * Log the incoming message and take over its dispatch-throttle accounting.
+ *
+ * @param m message that is about to be dispatched
+ * @return the dispatch throttle size that was recorded on @p m before it
+ *         was reset to 0 here
+ */
+uint64_t DispatchQueue::pre_dispatch(const ref_t<Message>& m)
+{
+  ldout(cct,1) << "<== " << m->get_source_inst()
+	       << " " << m->get_seq()
+	       << " ==== " << *m
+	       << " ==== " << m->get_payload().length()
+	       << "+" << m->get_middle().length()
+	       << "+" << m->get_data().length()
+	       << " (" << ceph_con_mode_name(m->get_connection()->get_con_mode())
+	       << " " << m->get_footer().front_crc << " "
+	       << m->get_footer().middle_crc
+	       << " " << m->get_footer().data_crc << ")"
+	       << " " << m << " con " << m->get_connection()
+	       << dendl;
+  uint64_t msize = m->get_dispatch_throttle_size();
+  m->set_dispatch_throttle_size(0); // clear it out, in case we requeue this message.
+  return msize;
+}
+
+/**
+ * Finish a dispatch: release @p msize back to the dispatch throttler and
+ * log completion.
+ *
+ * @param m     the message that was dispatched (only used for logging)
+ * @param msize throttle size previously returned by pre_dispatch()
+ */
+void DispatchQueue::post_dispatch(const ref_t<Message>& m, uint64_t msize)
+{
+  dispatch_throttle_release(msize);
+  ldout(cct,20) << "done calling dispatch on " << m << dendl;
+}
+
+/// Forward the fast-dispatch eligibility question for @p m to the Messenger.
+bool DispatchQueue::can_fast_dispatch(const cref_t<Message> &m) const
+{
+  return msgr->ms_can_fast_dispatch(m);
+}
+
+/// Deliver @p m through the Messenger's fast-dispatch path in the calling
+/// thread, bracketed by pre_dispatch()/post_dispatch().
+void DispatchQueue::fast_dispatch(const ref_t<Message>& m)
+{
+  const uint64_t throttle_bytes = pre_dispatch(m);
+  msgr->ms_fast_dispatch(m);
+  post_dispatch(m, throttle_bytes);
+}
+
+/// Forward @p m to the Messenger's fast-preprocess hook.
+void DispatchQueue::fast_preprocess(const ref_t<Message>& m)
+{
+  msgr->ms_fast_preprocess(m);
+}
+
+/**
+ * Put @p m on the dispatch queue under class @p id and wake the waiters.
+ *
+ * Messages with priority >= CEPH_MSG_PRIO_LOW go to the strict queue, the
+ * others to the cost-based queue using the message's cost. The message is
+ * silently dropped once stop has been set.
+ */
+void DispatchQueue::enqueue(const ref_t<Message>& m, int priority, uint64_t id)
+{
+  std::lock_guard guard{lock};
+  if (!stop) {
+    ldout(cct,20) << "queue " << m << " prio " << priority << dendl;
+    add_arrival(m);
+    if (priority < CEPH_MSG_PRIO_LOW) {
+      mqueue.enqueue(id, priority, m->get_cost(), QueueItem(m));
+    } else {
+      mqueue.enqueue_strict(id, priority, QueueItem(m));
+    }
+    cond.notify_all();
+  }
+}
+
+/**
+ * Hand @p m to the local delivery thread.
+ *
+ * The receive, throttle and receive-complete stamps of the message are all
+ * set to the current time before it is appended to local_messages.
+ */
+void DispatchQueue::local_delivery(const ref_t<Message>& m, int priority)
+{
+  const auto stamp = ceph_clock_now();
+  m->set_recv_stamp(stamp);
+  m->set_throttle_stamp(stamp);
+  m->set_recv_complete_stamp(stamp);
+
+  std::lock_guard guard{local_delivery_lock};
+  // waiters only need waking on the empty -> non-empty transition
+  const bool was_empty = local_messages.empty();
+  if (was_empty) {
+    local_delivery_cond.notify_all();
+  }
+  local_messages.emplace(m, priority);
+}
+
+/**
+ * Body of the local delivery thread.
+ *
+ * Pops messages from local_messages one at a time and either fast-dispatches
+ * them or enqueues them (class id 0) on the regular dispatch queue. Returns
+ * once stop_local_delivery is observed.
+ */
+void DispatchQueue::run_local_delivery()
+{
+  std::unique_lock l{local_delivery_lock};
+  while (true) {
+    if (stop_local_delivery)
+      break;
+    if (local_messages.empty()) {
+      local_delivery_cond.wait(l);
+      continue;
+    }
+    auto p = std::move(local_messages.front());
+    local_messages.pop();
+    // local_delivery_lock is released while the message is being handled
+    l.unlock();
+    const ref_t<Message>& m = p.first;
+    int priority = p.second;
+    fast_preprocess(m);
+    if (can_fast_dispatch(m)) {
+      fast_dispatch(m);
+    } else {
+      enqueue(m, priority, 0);
+    }
+    // re-acquire before looking at the flag/queue again
+    l.lock();
+  }
+}
+
+/// Give @p msize back to the dispatch throttler; a size of 0 is a no-op.
+void DispatchQueue::dispatch_throttle_release(uint64_t msize)
+{
+  if (!msize) {
+    return;
+  }
+  ldout(cct,10) << __func__ << " " << msize << " to dispatch throttler "
+                << dispatch_throttler.get_current() << "/"
+                << dispatch_throttler.get_max() << dendl;
+  dispatch_throttler.put(msize);
+}
+
+/*
+ * This function delivers incoming messages to the Messenger.
+ * Connections with messages are kept in queues; when beginning a message
+ * delivery the highest-priority queue is selected, the connection from the
+ * front of the queue is removed, and its message read. If the connection
+ * has remaining messages at that priority level, it is re-placed on to the
+ * end of the queue. If the queue is empty, it's removed.
+ * The message is then delivered and the process starts again.
+ *
+ * This is the body of the dispatch thread: it drains mqueue, waits on cond
+ * when the queue is empty, and returns once stop is set and the queue has
+ * been drained. Items are handled with `lock` released.
+ */
+void DispatchQueue::entry()
+{
+  std::unique_lock l{lock};
+  while (true) {
+    while (!mqueue.empty()) {
+      QueueItem qitem = mqueue.dequeue();
+      // only message items are tracked in the arrival bookkeeping
+      if (!qitem.is_code())
+	remove_arrival(qitem.get_message());
+      l.unlock();
+
+      if (qitem.is_code()) {
+	// optional debug delay injection, controlled by the ms_inject_* options
+	if (cct->_conf->ms_inject_internal_delays &&
+	    cct->_conf->ms_inject_delay_probability &&
+	    (rand() % 10000)/10000.0 < cct->_conf->ms_inject_delay_probability) {
+	  utime_t t;
+	  t.set_from_double(cct->_conf->ms_inject_internal_delays);
+	  ldout(cct, 1) << "DispatchQueue::entry  inject delay of " << t
+			<< dendl;
+	  t.sleep();
+	}
+	// connection event: call the matching Messenger notification
+	switch (qitem.get_code()) {
+	case D_BAD_REMOTE_RESET:
+	  msgr->ms_deliver_handle_remote_reset(qitem.get_connection());
+	  break;
+	case D_CONNECT:
+	  msgr->ms_deliver_handle_connect(qitem.get_connection());
+	  break;
+	case D_ACCEPT:
+	  msgr->ms_deliver_handle_accept(qitem.get_connection());
+	  break;
+	case D_BAD_RESET:
+	  msgr->ms_deliver_handle_reset(qitem.get_connection());
+	  break;
+	case D_CONN_REFUSED:
+	  msgr->ms_deliver_handle_refused(qitem.get_connection());
+	  break;
+	default:
+	  ceph_abort();
+	}
+      } else {
+	const ref_t<Message>& m = qitem.get_message();
+	if (stop) {
+	  // shutting down: the message is dropped without being dispatched
+	  ldout(cct,10) << " stop flag set, discarding " << m << " " << *m << dendl;
+	} else {
+	  uint64_t msize = pre_dispatch(m);
+	  msgr->ms_deliver_dispatch(m);
+	  post_dispatch(m, msize);
+	}
+      }
+
+      l.lock();
+    }
+    if (stop)
+      break;
+
+    // wait for something to be put on queue
+    cond.wait(l);
+  }
+}
+
+/**
+ * Remove every queued item belonging to class @p id, dropping each removed
+ * message from the arrival bookkeeping and releasing its dispatch throttle
+ * size.
+ */
+void DispatchQueue::discard_queue(uint64_t id) {
+  std::lock_guard guard{lock};
+  std::list<QueueItem> dropped;
+  mqueue.remove_by_class(id, &dropped);
+  for (auto& item : dropped) {
+    ceph_assert(!(item.is_code())); // We don't discard id 0, ever!
+    const ref_t<Message>& msg = item.get_message();
+    remove_arrival(msg);
+    dispatch_throttle_release(msg->get_dispatch_throttle_size());
+  }
+}
+
+/**
+ * Start the dispatch thread ("ms_dispatch") and the local delivery thread
+ * ("ms_local"). Asserts that the queue has not been stopped and that the
+ * dispatch thread is not already running.
+ */
+void DispatchQueue::start()
+{
+  ceph_assert(!stop);
+  ceph_assert(!dispatch_thread.is_started());
+  dispatch_thread.create("ms_dispatch");
+  local_delivery_thread.create("ms_local");
+}
+
+/// Join the local delivery thread, then the dispatch thread.
+void DispatchQueue::wait()
+{
+  local_delivery_thread.join();
+  dispatch_thread.join();
+}
+
+/**
+ * Drop all pending local-delivery messages by swapping local_messages with
+ * an empty queue.
+ *
+ * NOTE(review): local_delivery_lock is not taken here; presumably this is
+ * only called once the local delivery thread has stopped -- confirm against
+ * the callers.
+ */
+void DispatchQueue::discard_local()
+{
+  decltype(local_messages)().swap(local_messages);
+}
+
+/**
+ * Ask both worker threads to stop: set the stop flags under their
+ * respective locks and wake any waiter. This does not join the threads;
+ * wait() does that.
+ */
+void DispatchQueue::shutdown()
+{
+  // stop my local delivery thread
+  {
+    std::scoped_lock l{local_delivery_lock};
+    stop_local_delivery = true;
+    local_delivery_cond.notify_all();
+  }
+  // stop my dispatch thread
+  {
+    std::scoped_lock l{lock};
+    stop = true;
+    cond.notify_all();
+  }
+}
diff --git a/src/msg/DispatchQueue.h b/src/msg/DispatchQueue.h
new file mode 100644
index 000000000..de0cb7d1a
--- /dev/null
+++ b/src/msg/DispatchQueue.h
@@ -0,0 +1,242 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_DISPATCHQUEUE_H
+#define CEPH_DISPATCHQUEUE_H
+
+#include <atomic>
+#include <map>
+#include <queue>
+#include <boost/intrusive_ptr.hpp>
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+#include "common/Throttle.h"
+#include "common/ceph_mutex.h"
+#include "common/Thread.h"
+#include "common/PrioritizedQueue.h"
+
+#include "Message.h"
+
+class Messenger;
+struct Connection;
+
+/**
+ * The DispatchQueue contains all the connections which have Messages
+ * they want to be dispatched, carefully organized by Message priority
+ * and permitted to deliver in a round-robin fashion.
+ * See DispatchQueue::entry() for details.
+ */
+class DispatchQueue {
+  /**
+   * One entry of the dispatch queue: either a Message to deliver
+   * (type == -1) or a connection event code (one of the D_* values)
+   * together with the Connection it refers to.
+   */
+  class QueueItem {
+    int type;
+    ConnectionRef con;
+    ceph::ref_t<Message> m;
+  public:
+    explicit QueueItem(const ceph::ref_t<Message>& m) : type(-1), con(nullptr), m(m) {}
+    QueueItem(int type, Connection *con) : type(type), con(con), m(nullptr) {}
+    /// true for a connection event item, false for a message item
+    bool is_code() const {
+      return type != -1;
+    }
+    int get_code () const {
+      ceph_assert(is_code());
+      return type;
+    }
+    const ceph::ref_t<Message>& get_message() {
+      ceph_assert(!is_code());
+      return m;
+    }
+    Connection *get_connection() {
+      ceph_assert(is_code());
+      return con.get();
+    }
+  };
+
+  CephContext *cct;
+  Messenger *msgr;
+  mutable ceph::mutex lock;
+  ceph::condition_variable cond;
+
+  PrioritizedQueue<QueueItem, uint64_t> mqueue;
+
+  // Arrival bookkeeping for queued messages: marrival holds
+  // (receive stamp, message) pairs, marrival_map maps each message to its
+  // entry in marrival so it can be erased again.
+  std::set<std::pair<double, ceph::ref_t<Message>>> marrival;
+  std::map<ceph::ref_t<Message>, decltype(marrival)::iterator> marrival_map;
+  void add_arrival(const ceph::ref_t<Message>& m) {
+    marrival_map.insert(
+      std::make_pair(
+	m,
+	marrival.insert(std::make_pair(m->get_recv_stamp(), m)).first
+	)
+      );
+  }
+  void remove_arrival(const ceph::ref_t<Message>& m) {
+    auto it = marrival_map.find(m);
+    ceph_assert(it != marrival_map.end());
+    marrival.erase(it->second);
+    marrival_map.erase(it);
+  }
+
+  std::atomic<uint64_t> next_id;
+
+  enum { D_CONNECT = 1, D_ACCEPT, D_BAD_REMOTE_RESET, D_BAD_RESET, D_CONN_REFUSED, D_NUM_CODES };
+
+  /**
+   * The DispatchThread runs DispatchQueue::entry() to empty out mqueue.
+   */
+  class DispatchThread : public Thread {
+    DispatchQueue *dq;
+  public:
+    explicit DispatchThread(DispatchQueue *dq) : dq(dq) {}
+    void *entry() override {
+      dq->entry();
+      return nullptr;
+    }
+  } dispatch_thread;
+
+  ceph::mutex local_delivery_lock;
+  ceph::condition_variable local_delivery_cond;
+  bool stop_local_delivery;
+  std::queue<std::pair<ceph::ref_t<Message>, int>> local_messages;
+  /**
+   * The LocalDeliveryThread runs DispatchQueue::run_local_delivery() to
+   * process local_messages.
+   */
+  class LocalDeliveryThread : public Thread {
+    DispatchQueue *dq;
+  public:
+    explicit LocalDeliveryThread(DispatchQueue *dq) : dq(dq) {}
+    void *entry() override {
+      dq->run_local_delivery();
+      return nullptr;
+    }
+  } local_delivery_thread;
+
+  uint64_t pre_dispatch(const ceph::ref_t<Message>& m);
+  void post_dispatch(const ceph::ref_t<Message>& m, uint64_t msize);
+
+  /**
+   * Shared implementation of the queue_*() methods: queue the connection
+   * event @p code for @p con under class id 0 at CEPH_MSG_PRIO_HIGHEST and
+   * wake the dispatch thread. Does nothing once stop is set.
+   */
+  void queue_code(int code, Connection *con) {
+    std::lock_guard l{lock};
+    if (stop)
+      return;
+    mqueue.enqueue_strict(
+      0,
+      CEPH_MSG_PRIO_HIGHEST,
+      QueueItem(code, con));
+    cond.notify_all();
+  }
+
+ public:
+
+  /// Throttle preventing us from building up a big backlog waiting for dispatch
+  Throttle dispatch_throttler;
+
+  bool stop;
+  void local_delivery(const ceph::ref_t<Message>& m, int priority);
+  void local_delivery(Message* m, int priority) {
+    return local_delivery(ceph::ref_t<Message>(m, false), priority); /* consume ref */
+  }
+  void run_local_delivery();
+
+  double get_max_age(utime_t now) const;
+
+  int get_queue_len() const {
+    std::lock_guard l{lock};
+    return mqueue.length();
+  }
+
+  /**
+   * Release memory accounting back to the dispatch throttler.
+   *
+   * @param msize The amount of memory to release.
+   */
+  void dispatch_throttle_release(uint64_t msize);
+
+  // Connection event notifications; each queues the matching D_* code.
+  void queue_connect(Connection *con) {
+    queue_code(D_CONNECT, con);
+  }
+  void queue_accept(Connection *con) {
+    queue_code(D_ACCEPT, con);
+  }
+  void queue_remote_reset(Connection *con) {
+    queue_code(D_BAD_REMOTE_RESET, con);
+  }
+  void queue_reset(Connection *con) {
+    queue_code(D_BAD_RESET, con);
+  }
+  void queue_refused(Connection *con) {
+    queue_code(D_CONN_REFUSED, con);
+  }
+
+  bool can_fast_dispatch(const ceph::cref_t<Message> &m) const;
+  void fast_dispatch(const ceph::ref_t<Message>& m);
+  void fast_dispatch(Message* m) {
+    return fast_dispatch(ceph::ref_t<Message>(m, false)); /* consume ref */
+  }
+  void fast_preprocess(const ceph::ref_t<Message>& m);
+  void enqueue(const ceph::ref_t<Message>& m, int priority, uint64_t id);
+  void enqueue(Message* m, int priority, uint64_t id) {
+    return enqueue(ceph::ref_t<Message>(m, false), priority, id); /* consume ref */
+  }
+  void discard_queue(uint64_t id);
+  void discard_local();
+  /// Hand out the next queue class id (ids start at 1; 0 is used for
+  /// connection events and local delivery).
+  uint64_t get_id() {
+    return next_id++;
+  }
+  void start();
+  void entry();
+  void wait();
+  void shutdown();
+  bool is_started() const {return dispatch_thread.is_started();}
+
+  /**
+   * @param cct  context providing the ms_pq_* and ms_dispatch_throttle_bytes
+   *             configuration values read here
+   * @param msgr Messenger that receives the dispatched items
+   * @param name suffix used for the lock and throttler names
+   */
+  DispatchQueue(CephContext *cct, Messenger *msgr, std::string &name)
+    : cct(cct), msgr(msgr),
+      lock(ceph::make_mutex("Messenger::DispatchQueue::lock" + name)),
+      mqueue(cct->_conf->ms_pq_max_tokens_per_priority,
+	     cct->_conf->ms_pq_min_cost),
+      next_id(1),
+      dispatch_thread(this),
+      local_delivery_lock(ceph::make_mutex("Messenger::DispatchQueue::local_delivery_lock" + name)),
+      stop_local_delivery(false),
+      local_delivery_thread(this),
+      dispatch_throttler(cct, std::string("msgr_dispatch_throttler-") + name,
+                         cct->_conf->ms_dispatch_throttle_bytes),
+      stop(false)
+    {}
+  /// All queues must have been drained or discarded before destruction.
+  ~DispatchQueue() {
+    ceph_assert(mqueue.empty());
+    ceph_assert(marrival.empty());
+    ceph_assert(local_messages.empty());
+  }
+};
+
+#endif
diff --git a/src/msg/Dispatcher.h b/src/msg/Dispatcher.h
new file mode 100644
index 000000000..5e025437b
--- /dev/null
+++ b/src/msg/Dispatcher.h
@@ -0,0 +1,228 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_DISPATCHER_H
+#define CEPH_DISPATCHER_H
+
+#include <memory>
+#include "include/buffer_fwd.h"
+#include "include/ceph_assert.h"
+#include "include/common_fwd.h"
+#include "msg/MessageRef.h"
+
+class Messenger;
+class Connection;
+class CryptoKey;
+class KeyStore;
+
+/**
+ * Interface through which a Messenger hands messages and connection events
+ * to its user. Implementations must provide ms_handle_reset(),
+ * ms_handle_remote_reset() and ms_handle_refused(); the other hooks have
+ * defaults. Dispatchers are not copyable.
+ */
+class Dispatcher {
+public:
+  explicit Dispatcher(CephContext *cct_)
+    : cct(cct_)
+  {
+  }
+  virtual ~Dispatcher() { }
+
+  /**
+   * The Messenger calls this function to query if you are capable
+   * of "fast dispatch"ing a message. Indicating that you can fast
+   * dispatch it requires that you:
+   * 1) Handle the Message quickly and without taking long-term contended
+   * locks. (This function is likely to be called in-line with message
+   * receipt.)
+   * 2) Be able to accept the Message even if you have not yet received
+   * an ms_handle_accept() notification for the Connection it is associated
+   * with, and even if you *have* called mark_down() or received an
+   * ms_handle_reset() (or similar) call on the Connection. You will
+   * not receive more than one dead "message" (and should generally be
+   * prepared for that circumstance anyway, since the normal dispatch can begin,
+   * then trigger Connection failure before it's percolated through your system).
+   * We provide ms_handle_fast_[connect|accept] calls if you need them, under
+   * similar speed and state constraints as fast_dispatch itself.
+   * 3) Be able to make a determination on fast_dispatch without relying
+   * on particular system state -- the ms_can_fast_dispatch() call might
+   * be called multiple times on a single message; the state might change between
+   * calling ms_can_fast_dispatch and ms_fast_dispatch; etc.
+   *
+   * @param m The message we want to fast dispatch.
+   * @returns True if the message can be fast dispatched; false otherwise.
+   */
+  virtual bool ms_can_fast_dispatch(const Message *m) const { return false; }
+  /* ms_can_fast_dispatch2 forwards to the raw-pointer variant by default */
+  virtual bool ms_can_fast_dispatch2(const MessageConstRef& m) const {
+    return ms_can_fast_dispatch(m.get());
+  }
+  /**
+   * This function determines if a dispatcher is included in the
+   * list of fast-dispatch capable Dispatchers.
+   * @returns True if the Dispatcher can handle any messages via
+   * fast dispatch; false otherwise.
+   */
+  virtual bool ms_can_fast_dispatch_any() const { return false; }
+  /**
+   * Perform a "fast dispatch" on a given message. See
+   * ms_can_fast_dispatch() for the requirements.
+   *
+   * The default implementation aborts.
+   *
+   * @param m The Message to fast dispatch.
+   */
+  virtual void ms_fast_dispatch(Message *m) { ceph_abort(); }
+
+  /* ms_fast_dispatch2 because otherwise the child must define both */
+  virtual void ms_fast_dispatch2(const MessageRef &m) {
+    /* allow old style dispatch handling that expects a Message * with a floating ref */
+    return ms_fast_dispatch(MessageRef(m).detach()); /* XXX N.B. always consumes ref */
+  }
+
+  /**
+   * Let the Dispatcher preview a Message before it is dispatched. This
+   * function is called on *every* Message, prior to the fast/regular dispatch
+   * decision point, but it is only used on fast-dispatch capable systems. An
+   * implementation of ms_fast_preprocess must be essentially lock-free in the
+   * same way as the ms_fast_dispatch function is (in particular, ms_fast_preprocess
+   * may be called while the Messenger holds internal locks that prevent progress from
+   * other threads, so any locks it takes must be at the very bottom of the hierarchy).
+   * Messages are delivered in receipt order within a single Connection, but there are
+   * no guarantees across Connections. This makes it useful for some limited
+   * coordination between Messages which can be fast_dispatch'ed and those which must
+   * go through normal dispatch.
+   *
+   * @param m A message which has been received
+   */
+  virtual void ms_fast_preprocess(Message *m) {}
+
+  /* ms_fast_preprocess2 because otherwise the child must define both */
+  virtual void ms_fast_preprocess2(const MessageRef &m) {
+    /* allow old style dispatch handling that expects a Message* */
+    return ms_fast_preprocess(m.get());
+  }
+
+  /**
+   * The Messenger calls this function to deliver a single message.
+   *
+   * The default implementation aborts.
+   *
+   * @param m The message being delivered. You (the Dispatcher)
+   * are given a single reference count on it.
+   * @return true if the message (and the reference to it) was consumed;
+   * false otherwise.
+   */
+  virtual bool ms_dispatch(Message *m) {
+    ceph_abort();
+  }
+
+  /* ms_dispatch2 because otherwise the child must define both */
+  virtual bool ms_dispatch2(const MessageRef &m) {
+    /* allow old style dispatch handling that expects a Message * with a floating ref */
+    MessageRef mr(m);
+    if (ms_dispatch(mr.get())) {
+      mr.detach(); /* dispatcher consumed ref */
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * This function will be called whenever a Connection is newly-created
+   * or reconnects in the Messenger.
+   *
+   * @param con The new Connection which has been established. You are not
+   * granted a reference to it -- take one if you need one!
+   */
+  virtual void ms_handle_connect(Connection *con) {}
+
+  /**
+   * This function will be called synchronously whenever a Connection is
+   * newly-created or reconnects in the Messenger, if you support fast
+   * dispatch. It is guaranteed to be called before any messages are
+   * dispatched.
+   *
+   * @param con The new Connection which has been established. You are not
+   * granted a reference to it -- take one if you need one!
+   */
+  virtual void ms_handle_fast_connect(Connection *con) {}
+
+  /**
+   * Callback indicating we have accepted an incoming connection.
+   *
+   * @param con The (new or existing) Connection associated with the session
+   */
+  virtual void ms_handle_accept(Connection *con) {}
+
+  /**
+   * Callback indicating we have accepted an incoming connection, if you
+   * support fast dispatch. It is guaranteed to be called before any messages
+   * are dispatched.
+   *
+   * @param con The (new or existing) Connection associated with the session
+   */
+  virtual void ms_handle_fast_accept(Connection *con) {}
+
+  /**
+   * This indicates that the ordered+reliable delivery semantics have
+   * been violated.  Messages may have been lost due to a fault
+   * in the network connection.
+   * Only called on lossy Connections.
+   *
+   * @param con The Connection which broke. You are not granted
+   * a reference to it.
+   * @return NOTE(review): the meaning of the result is not defined in this
+   * header; presumably true when this Dispatcher handled the event --
+   * confirm against the Messenger.
+   */
+  virtual bool ms_handle_reset(Connection *con) = 0;
+
+  /**
+   * This indicates that the ordered+reliable delivery semantics
+   * have been violated because the remote somehow reset.
+   * It implies that incoming messages were dropped, and
+   * probably some of our previous outgoing messages were too.
+   *
+   * @param con The Connection which broke. You are not granted
+   * a reference to it.
+   */
+  virtual void ms_handle_remote_reset(Connection *con) = 0;
+
+  /**
+   * This indicates that the connection is both broken and further
+   * connection attempts are failing because other side refuses
+   * it.
+   *
+   * @param con The Connection which broke. You are not granted
+   * a reference to it.
+   * @return NOTE(review): the meaning of the result is not defined in this
+   * header; presumably true when this Dispatcher handled the event --
+   * confirm against the Messenger.
+   */
+  virtual bool ms_handle_refused(Connection *con) = 0;
+
+  /**
+   * @defgroup Authentication
+   * @{
+   */
+
+  /**
+   * handle successful authentication (msgr2)
+   *
+   * Authenticated result/state will be attached to the Connection.
+   *
+   * return 1 for success
+   * return 0 for no action (let another Dispatcher handle it)
+   * return <0 for failure (failure to parse caps, for instance)
+   */
+  virtual int ms_handle_authentication(Connection *con) {
+    return 0;
+  }
+
+  /**
+   * @} //Authentication
+   */
+
+protected:
+  CephContext *cct;
+private:
+  // Not copyable: explicitly deleted, so a copy attempt is rejected at
+  // compile time.
+  Dispatcher(const Dispatcher &rhs) = delete;
+  Dispatcher& operator=(const Dispatcher &rhs) = delete;
+};
+
+#endif
diff --git a/src/msg/Message.cc b/src/msg/Message.cc
new file mode 100644
index 000000000..3f4405a50
--- /dev/null
+++ b/src/msg/Message.cc
@@ -0,0 +1,1080 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#ifdef ENCODE_DUMP
+# include <typeinfo>
+# include <cxxabi.h>
+#endif
+
+#include <iostream>
+
+#include "include/types.h"
+
+#include "global/global_context.h"
+
+#include "Message.h"
+
+#include "messages/MPGStats.h"
+
+#include "messages/MGenericMessage.h"
+
+#include "messages/MPGStatsAck.h"
+
+#include "messages/MStatfs.h"
+#include "messages/MStatfsReply.h"
+
+#include "messages/MGetPoolStats.h"
+#include "messages/MGetPoolStatsReply.h"
+
+
+#include "messages/MPoolOp.h"
+#include "messages/MPoolOpReply.h"
+
+#include "messages/PaxosServiceMessage.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+#include "messages/MMonPaxos.h"
+#include "messages/MConfig.h"
+#include "messages/MGetConfig.h"
+#include "messages/MKVData.h"
+
+#include "messages/MMonProbe.h"
+#include "messages/MMonJoin.h"
+#include "messages/MMonElection.h"
+#include "messages/MMonSync.h"
+#include "messages/MMonPing.h"
+#include "messages/MMonScrub.h"
+
+#include "messages/MLog.h"
+#include "messages/MLogAck.h"
+
+#include "messages/MPing.h"
+
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MBackfillReserve.h"
+#include "messages/MRecoveryReserve.h"
+
+#include "messages/MRoute.h"
+#include "messages/MForward.h"
+
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDAlive.h"
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDPGTemp.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDMarkMeDead.h"
+#include "messages/MOSDFull.h"
+#include "messages/MOSDPing.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
+#include "messages/MMonGetPurgedSnaps.h"
+#include "messages/MMonGetPurgedSnapsReply.h"
+
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGNotify2.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGQuery2.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+#include "messages/MOSDPGInfo.h"
+#include "messages/MOSDPGInfo2.h"
+#include "messages/MOSDPGCreate.h"
+#include "messages/MOSDPGCreate2.h"
+#include "messages/MOSDPGTrim.h"
+#include "messages/MOSDPGLease.h"
+#include "messages/MOSDPGLeaseAck.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrub2.h"
+#include "messages/MOSDScrubReserve.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDForceRecovery.h"
+#include "messages/MOSDPGScan.h"
+#include "messages/MOSDPGBackfill.h"
+#include "messages/MOSDBackoff.h"
+#include "messages/MOSDPGBackfillRemove.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+#include "messages/MOSDPGReadyToMerge.h"
+
+#include "messages/MRemoveSnaps.h"
+
+#include "messages/MMonMap.h"
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersion.h"
+#include "messages/MMonGetVersionReply.h"
+#include "messages/MMonHealth.h"
+#include "messages/MMonHealthChecks.h"
+#include "messages/MAuth.h"
+#include "messages/MAuthReply.h"
+#include "messages/MMonSubscribe.h"
+#include "messages/MMonSubscribeAck.h"
+#include "messages/MMonGlobalID.h"
+#include "messages/MClientSession.h"
+#include "messages/MClientReconnect.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientRequestForward.h"
+#include "messages/MClientReply.h"
+#include "messages/MClientReclaim.h"
+#include "messages/MClientReclaimReply.h"
+#include "messages/MClientCaps.h"
+#include "messages/MClientCapRelease.h"
+#include "messages/MClientLease.h"
+#include "messages/MClientSnap.h"
+#include "messages/MClientQuota.h"
+#include "messages/MClientMetrics.h"
+
+#include "messages/MMDSPeerRequest.h"
+
+#include "messages/MMDSMap.h"
+#include "messages/MFSMap.h"
+#include "messages/MFSMapUser.h"
+#include "messages/MMDSBeacon.h"
+#include "messages/MMDSLoadTargets.h"
+#include "messages/MMDSResolve.h"
+#include "messages/MMDSResolveAck.h"
+#include "messages/MMDSCacheRejoin.h"
+#include "messages/MMDSFindIno.h"
+#include "messages/MMDSFindInoReply.h"
+#include "messages/MMDSOpenIno.h"
+#include "messages/MMDSOpenInoReply.h"
+#include "messages/MMDSSnapUpdate.h"
+#include "messages/MMDSScrub.h"
+#include "messages/MMDSScrubStats.h"
+
+#include "messages/MDirUpdate.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+
+#include "messages/MMDSFragmentNotify.h"
+#include "messages/MMDSFragmentNotifyAck.h"
+
+#include "messages/MExportDirDiscover.h"
+#include "messages/MExportDirDiscoverAck.h"
+#include "messages/MExportDirCancel.h"
+#include "messages/MExportDirPrep.h"
+#include "messages/MExportDirPrepAck.h"
+#include "messages/MExportDir.h"
+#include "messages/MExportDirAck.h"
+#include "messages/MExportDirNotify.h"
+#include "messages/MExportDirNotifyAck.h"
+#include "messages/MExportDirFinish.h"
+
+#include "messages/MExportCaps.h"
+#include "messages/MExportCapsAck.h"
+#include "messages/MGatherCaps.h"
+
+
+#include "messages/MDentryUnlink.h"
+#include "messages/MDentryLink.h"
+
+#include "messages/MHeartbeat.h"
+
+#include "messages/MMDSTableRequest.h"
+#include "messages/MMDSMetrics.h"
+#include "messages/MMDSPing.h"
+
+//#include "messages/MInodeUpdate.h"
+#include "messages/MCacheExpire.h"
+#include "messages/MInodeFileCaps.h"
+
+#include "messages/MMgrBeacon.h"
+#include "messages/MMgrMap.h"
+#include "messages/MMgrDigest.h"
+#include "messages/MMgrReport.h"
+#include "messages/MMgrOpen.h"
+#include "messages/MMgrUpdate.h"
+#include "messages/MMgrClose.h"
+#include "messages/MMgrConfigure.h"
+#include "messages/MMonMgrReport.h"
+#include "messages/MMgrCommand.h"
+#include "messages/MMgrCommandReply.h"
+#include "messages/MServiceMap.h"
+
+#include "messages/MLock.h"
+
+#include "messages/MWatchNotify.h"
+#include "messages/MTimeCheck.h"
+#include "messages/MTimeCheck2.h"
+
+#include "common/config.h"
+
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGPull.h"
+
+#include "messages/MOSDECSubOpWrite.h"
+#include "messages/MOSDECSubOpWriteReply.h"
+#include "messages/MOSDECSubOpRead.h"
+#include "messages/MOSDECSubOpReadReply.h"
+
+#include "messages/MOSDPGUpdateLogMissing.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+
+#ifdef WITH_BLKIN
+#include "Messenger.h"
+#endif
+
+#define DEBUGLVL  10    // debug level of output
+
+#define dout_subsys ceph_subsys_ms
+
+/**
+ * Finalize this message for sending.
+ *
+ * If the payload has not been encoded yet, encode it with the given feature
+ * bits and charge the byte throttler (if any) for the encoded size.  Then
+ * fill in the envelope lengths, compute the CRCs selected by crcflags and set
+ * the footer flags.
+ *
+ * @param features        feature bits passed through to encode_payload()
+ * @param crcflags        MSG_CRC_HEADER and/or MSG_CRC_DATA
+ * @param skip_header_crc when true, do not compute the header crc even if
+ *                        MSG_CRC_HEADER is set
+ */
+void Message::encode(uint64_t features, int crcflags, bool skip_header_crc)
+{
+  // encode and copy out of *m
+  if (empty_payload()) {
+    ceph_assert(middle.length() == 0);
+    encode_payload(features);
+
+    if (byte_throttler) {
+      byte_throttler->take(payload.length() + middle.length());
+    }
+
+    // if the encoder didn't specify past compatibility, we assume it
+    // is incompatible.
+    if (header.compat_version == 0)
+      header.compat_version = header.version;
+  }
+  if (crcflags & MSG_CRC_HEADER)
+    calc_front_crc();
+
+  // update envelope
+  header.front_len = get_payload().length();
+  header.middle_len = get_middle().length();
+  header.data_len = get_data().length();
+  if (!skip_header_crc && (crcflags & MSG_CRC_HEADER))
+    calc_header_crc();
+
+  footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+
+  if (crcflags & MSG_CRC_DATA) {
+    calc_data_crc();
+
+#ifdef ENCODE_DUMP
+    // Inside this member function an unqualified encode() is looked up as
+    // Message::encode() first; pull in the free encoders explicitly.
+    using ceph::encode;
+    ceph::bufferlist bl;
+    encode(get_header(), bl);
+
+    // dump the old footer format
+    ceph_msg_footer_old old_footer;
+    old_footer.front_crc = footer.front_crc;
+    old_footer.middle_crc = footer.middle_crc;
+    old_footer.data_crc = footer.data_crc;
+    old_footer.flags = footer.flags;
+    encode(old_footer, bl);
+
+    encode(get_payload(), bl);
+    encode(get_middle(), bl);
+    encode(get_data(), bl);
+
+    // this is almost an exponential backoff, except because we count
+    // bits we tend to sample things we encode later, which should be
+    // more representative.
+    static int i = 0;
+    i++;
+    int bits = 0;
+    for (unsigned t = i; t; bits++)
+      t &= t - 1;
+    if (bits <= 2) {
+      char fn[200];
+      int status;
+      // __cxa_demangle() returns a malloc()ed string, or NULL on failure;
+      // fall back to the mangled name and always release the buffer.
+      const char *mangled = typeid(*this).name();
+      char *demangled = abi::__cxa_demangle(mangled, nullptr, nullptr, &status);
+      snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP) "/%s__%d.%x",
+	       demangled ? demangled : mangled,
+	       getpid(), i++);
+      free(demangled);
+      int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC|O_BINARY, 0644);
+      if (fd >= 0) {
+	bl.write_fd(fd);
+	::close(fd);
+      }
+    }
+#endif
+  } else {
+    // no data crc was computed; tell the receiver not to check it
+    footer.flags = (unsigned)footer.flags | CEPH_MSG_FOOTER_NOCRC;
+  }
+}
+
+/**
+ * Dump a one-line description of this message to the formatter.
+ *
+ * The text produced by print() is emitted as a single string field
+ * named "summary".
+ */
+void Message::dump(ceph::Formatter *f) const
+{
+  std::ostringstream summary;
+  print(summary);
+  f->dump_string("summary", summary.str());
+}
+
+/**
+ * Build a typed Message from the raw pieces of a received message.
+ *
+ * Steps, in order:
+ *  - verify the front and middle crcs if MSG_CRC_HEADER is set in crcflags;
+ *  - verify the data crc if MSG_CRC_DATA is set and the footer does not
+ *    carry CEPH_MSG_FOOTER_NOCRC;
+ *  - instantiate the Message subclass matching header.type;
+ *  - refuse the message if header.compat_version is newer than the version
+ *    this code supports for that type;
+ *  - attach connection, header, footer and buffers, then decode_payload().
+ *
+ * Unknown types, version mismatches and payload decode errors abort the
+ * process when cct is set and ms_die_on_bad_msg is enabled.
+ *
+ * @param cct      context used for logging and config; may be null, in which
+ *                 case failures are not logged
+ * @param crcflags MSG_CRC_HEADER and/or MSG_CRC_DATA
+ * @param conn     connection to attach to the message; may be null (the
+ *                 bufferlist-iterator overload passes nullptr)
+ * @return the decoded message (the caller takes over the reference), or
+ *         nullptr on any failure
+ */
+Message *decode_message(CephContext *cct,
+                        int crcflags,
+                        ceph_msg_header& header,
+                        ceph_msg_footer& footer,
+                        ceph::bufferlist& front,
+                        ceph::bufferlist& middle,
+                        ceph::bufferlist& data,
+                        Message::ConnectionRef conn)
+{
+  // verify crc
+  if (crcflags & MSG_CRC_HEADER) {
+    __u32 front_crc = front.crc32c(0);
+    __u32 middle_crc = middle.crc32c(0);
+
+    if (front_crc != footer.front_crc) {
+      if (cct) {
+	ldout(cct, 0) << "bad crc in front " << front_crc << " != exp " << footer.front_crc;
+	// conn may be null, so only report the peer when we have one
+	if (conn) {
+	  *_dout << " from " << conn->get_peer_addr();
+	}
+	*_dout << dendl;
+	ldout(cct, 20) << " ";
+	front.hexdump(*_dout);
+	*_dout << dendl;
+      }
+      return nullptr;
+    }
+    if (middle_crc != footer.middle_crc) {
+      if (cct) {
+	ldout(cct, 0) << "bad crc in middle " << middle_crc << " != exp " << footer.middle_crc;
+	if (conn) {
+	  *_dout << " from " << conn->get_peer_addr();
+	}
+	*_dout << dendl;
+	ldout(cct, 20) << " ";
+	middle.hexdump(*_dout);
+	*_dout << dendl;
+      }
+      return nullptr;
+    }
+  }
+  if (crcflags & MSG_CRC_DATA) {
+    if ((footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0) {
+      __u32 data_crc = data.crc32c(0);
+      if (data_crc != footer.data_crc) {
+	if (cct) {
+	  ldout(cct, 0) << "bad crc in data " << data_crc << " != exp " << footer.data_crc;
+	  if (conn) {
+	    *_dout << " from " << conn->get_peer_addr();
+	  }
+	  *_dout << dendl;
+	  ldout(cct, 20) << " ";
+	  data.hexdump(*_dout);
+	  *_dout << dendl;
+	}
+	return nullptr;
+      }
+    }
+  }
+
+  // make message
+  ceph::ref_t<Message> m;
+  int type = header.type;
+  switch (type) {
+
+    // -- with payload --
+
+    using ceph::make_message;
+
+  case MSG_PGSTATS:
+    m = make_message<MPGStats>();
+    break;
+  case MSG_PGSTATSACK:
+    m = make_message<MPGStatsAck>();
+    break;
+
+  case CEPH_MSG_STATFS:
+    m = make_message<MStatfs>();
+    break;
+  case CEPH_MSG_STATFS_REPLY:
+    m = make_message<MStatfsReply>();
+    break;
+  case MSG_GETPOOLSTATS:
+    m = make_message<MGetPoolStats>();
+    break;
+  case MSG_GETPOOLSTATSREPLY:
+    m = make_message<MGetPoolStatsReply>();
+    break;
+  case CEPH_MSG_POOLOP:
+    m = make_message<MPoolOp>();
+    break;
+  case CEPH_MSG_POOLOP_REPLY:
+    m = make_message<MPoolOpReply>();
+    break;
+  case MSG_MON_COMMAND:
+    m = make_message<MMonCommand>();
+    break;
+  case MSG_MON_COMMAND_ACK:
+    m = make_message<MMonCommandAck>();
+    break;
+  case MSG_MON_PAXOS:
+    m = make_message<MMonPaxos>();
+    break;
+  case MSG_CONFIG:
+    m = make_message<MConfig>();
+    break;
+  case MSG_GET_CONFIG:
+    m = make_message<MGetConfig>();
+    break;
+  case MSG_KV_DATA:
+    m = make_message<MKVData>();
+    break;
+
+  case MSG_MON_PROBE:
+    m = make_message<MMonProbe>();
+    break;
+  case MSG_MON_JOIN:
+    m = make_message<MMonJoin>();
+    break;
+  case MSG_MON_ELECTION:
+    m = make_message<MMonElection>();
+    break;
+  case MSG_MON_SYNC:
+    m = make_message<MMonSync>();
+    break;
+  case MSG_MON_PING:
+    m = make_message<MMonPing>();
+    break;
+  case MSG_MON_SCRUB:
+    m = make_message<MMonScrub>();
+    break;
+
+  case MSG_LOG:
+    m = make_message<MLog>();
+    break;
+  case MSG_LOGACK:
+    m = make_message<MLogAck>();
+    break;
+
+  case CEPH_MSG_PING:
+    m = make_message<MPing>();
+    break;
+  case MSG_COMMAND:
+    m = make_message<MCommand>();
+    break;
+  case MSG_COMMAND_REPLY:
+    m = make_message<MCommandReply>();
+    break;
+  case MSG_OSD_BACKFILL_RESERVE:
+    m = make_message<MBackfillReserve>();
+    break;
+  case MSG_OSD_RECOVERY_RESERVE:
+    m = make_message<MRecoveryReserve>();
+    break;
+  case MSG_OSD_FORCE_RECOVERY:
+    m = make_message<MOSDForceRecovery>();
+    break;
+
+  case MSG_ROUTE:
+    m = make_message<MRoute>();
+    break;
+  case MSG_FORWARD:
+    m = make_message<MForward>();
+    break;
+
+  case CEPH_MSG_MON_MAP:
+    m = make_message<MMonMap>();
+    break;
+  case CEPH_MSG_MON_GET_MAP:
+    m = make_message<MMonGetMap>();
+    break;
+  case CEPH_MSG_MON_GET_OSDMAP:
+    m = make_message<MMonGetOSDMap>();
+    break;
+  case MSG_MON_GET_PURGED_SNAPS:
+    m = make_message<MMonGetPurgedSnaps>();
+    break;
+  case MSG_MON_GET_PURGED_SNAPS_REPLY:
+    m = make_message<MMonGetPurgedSnapsReply>();
+    break;
+  case CEPH_MSG_MON_GET_VERSION:
+    m = make_message<MMonGetVersion>();
+    break;
+  case CEPH_MSG_MON_GET_VERSION_REPLY:
+    m = make_message<MMonGetVersionReply>();
+    break;
+
+  case MSG_OSD_BOOT:
+    m = make_message<MOSDBoot>();
+    break;
+  case MSG_OSD_ALIVE:
+    m = make_message<MOSDAlive>();
+    break;
+  case MSG_OSD_BEACON:
+    m = make_message<MOSDBeacon>();
+    break;
+  case MSG_OSD_PGTEMP:
+    m = make_message<MOSDPGTemp>();
+    break;
+  case MSG_OSD_FAILURE:
+    m = make_message<MOSDFailure>();
+    break;
+  case MSG_OSD_MARK_ME_DOWN:
+    m = make_message<MOSDMarkMeDown>();
+    break;
+  case MSG_OSD_MARK_ME_DEAD:
+    m = make_message<MOSDMarkMeDead>();
+    break;
+  case MSG_OSD_FULL:
+    m = make_message<MOSDFull>();
+    break;
+  case MSG_OSD_PING:
+    m = make_message<MOSDPing>();
+    break;
+  case CEPH_MSG_OSD_OP:
+    m = make_message<MOSDOp>();
+    break;
+  case CEPH_MSG_OSD_OPREPLY:
+    m = make_message<MOSDOpReply>();
+    break;
+  case MSG_OSD_REPOP:
+    m = make_message<MOSDRepOp>();
+    break;
+  case MSG_OSD_REPOPREPLY:
+    m = make_message<MOSDRepOpReply>();
+    break;
+  case MSG_OSD_PG_CREATED:
+    m = make_message<MOSDPGCreated>();
+    break;
+  case MSG_OSD_PG_UPDATE_LOG_MISSING:
+    m = make_message<MOSDPGUpdateLogMissing>();
+    break;
+  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+    m = make_message<MOSDPGUpdateLogMissingReply>();
+    break;
+  case CEPH_MSG_OSD_BACKOFF:
+    m = make_message<MOSDBackoff>();
+    break;
+
+  case CEPH_MSG_OSD_MAP:
+    m = make_message<MOSDMap>();
+    break;
+
+  case CEPH_MSG_WATCH_NOTIFY:
+    m = make_message<MWatchNotify>();
+    break;
+
+  case MSG_OSD_PG_NOTIFY:
+    m = make_message<MOSDPGNotify>();
+    break;
+  case MSG_OSD_PG_NOTIFY2:
+    m = make_message<MOSDPGNotify2>();
+    break;
+  case MSG_OSD_PG_QUERY:
+    m = make_message<MOSDPGQuery>();
+    break;
+  case MSG_OSD_PG_QUERY2:
+    m = make_message<MOSDPGQuery2>();
+    break;
+  case MSG_OSD_PG_LOG:
+    m = make_message<MOSDPGLog>();
+    break;
+  case MSG_OSD_PG_REMOVE:
+    m = make_message<MOSDPGRemove>();
+    break;
+  case MSG_OSD_PG_INFO:
+    m = make_message<MOSDPGInfo>();
+    break;
+  case MSG_OSD_PG_INFO2:
+    m = make_message<MOSDPGInfo2>();
+    break;
+  case MSG_OSD_PG_CREATE:
+    m = make_message<MOSDPGCreate>();
+    break;
+  case MSG_OSD_PG_CREATE2:
+    m = make_message<MOSDPGCreate2>();
+    break;
+  case MSG_OSD_PG_TRIM:
+    m = make_message<MOSDPGTrim>();
+    break;
+  case MSG_OSD_PG_LEASE:
+    m = make_message<MOSDPGLease>();
+    break;
+  case MSG_OSD_PG_LEASE_ACK:
+    m = make_message<MOSDPGLeaseAck>();
+    break;
+
+  case MSG_OSD_SCRUB:
+    m = make_message<MOSDScrub>();
+    break;
+  case MSG_OSD_SCRUB2:
+    m = make_message<MOSDScrub2>();
+    break;
+  case MSG_OSD_SCRUB_RESERVE:
+    m = make_message<MOSDScrubReserve>();
+    break;
+  case MSG_REMOVE_SNAPS:
+    m = make_message<MRemoveSnaps>();
+    break;
+  case MSG_OSD_REP_SCRUB:
+    m = make_message<MOSDRepScrub>();
+    break;
+  case MSG_OSD_REP_SCRUBMAP:
+    m = make_message<MOSDRepScrubMap>();
+    break;
+  case MSG_OSD_PG_SCAN:
+    m = make_message<MOSDPGScan>();
+    break;
+  case MSG_OSD_PG_BACKFILL:
+    m = make_message<MOSDPGBackfill>();
+    break;
+  case MSG_OSD_PG_BACKFILL_REMOVE:
+    m = make_message<MOSDPGBackfillRemove>();
+    break;
+  case MSG_OSD_PG_PUSH:
+    m = make_message<MOSDPGPush>();
+    break;
+  case MSG_OSD_PG_PULL:
+    m = make_message<MOSDPGPull>();
+    break;
+  case MSG_OSD_PG_PUSH_REPLY:
+    m = make_message<MOSDPGPushReply>();
+    break;
+  case MSG_OSD_PG_RECOVERY_DELETE:
+    m = make_message<MOSDPGRecoveryDelete>();
+    break;
+  case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+    m = make_message<MOSDPGRecoveryDeleteReply>();
+    break;
+  case MSG_OSD_PG_READY_TO_MERGE:
+    m = make_message<MOSDPGReadyToMerge>();
+    break;
+  case MSG_OSD_EC_WRITE:
+    m = make_message<MOSDECSubOpWrite>();
+    break;
+  case MSG_OSD_EC_WRITE_REPLY:
+    m = make_message<MOSDECSubOpWriteReply>();
+    break;
+  case MSG_OSD_EC_READ:
+    m = make_message<MOSDECSubOpRead>();
+    break;
+  case MSG_OSD_EC_READ_REPLY:
+    m = make_message<MOSDECSubOpReadReply>();
+    break;
+   // auth
+  case CEPH_MSG_AUTH:
+    m = make_message<MAuth>();
+    break;
+  case CEPH_MSG_AUTH_REPLY:
+    m = make_message<MAuthReply>();
+    break;
+
+  case MSG_MON_GLOBAL_ID:
+    m = make_message<MMonGlobalID>();
+    break;
+
+    // clients
+  case CEPH_MSG_MON_SUBSCRIBE:
+    m = make_message<MMonSubscribe>();
+    break;
+  case CEPH_MSG_MON_SUBSCRIBE_ACK:
+    m = make_message<MMonSubscribeAck>();
+    break;
+  case CEPH_MSG_CLIENT_SESSION:
+    m = make_message<MClientSession>();
+    break;
+  case CEPH_MSG_CLIENT_RECONNECT:
+    m = make_message<MClientReconnect>();
+    break;
+  case CEPH_MSG_CLIENT_REQUEST:
+    m = make_message<MClientRequest>();
+    break;
+  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
+    m = make_message<MClientRequestForward>();
+    break;
+  case CEPH_MSG_CLIENT_REPLY:
+    m = make_message<MClientReply>();
+    break;
+  case CEPH_MSG_CLIENT_RECLAIM:
+    m = make_message<MClientReclaim>();
+    break;
+  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
+    m = make_message<MClientReclaimReply>();
+    break;
+  case CEPH_MSG_CLIENT_CAPS:
+    m = make_message<MClientCaps>();
+    break;
+  case CEPH_MSG_CLIENT_CAPRELEASE:
+    m = make_message<MClientCapRelease>();
+    break;
+  case CEPH_MSG_CLIENT_LEASE:
+    m = make_message<MClientLease>();
+    break;
+  case CEPH_MSG_CLIENT_SNAP:
+    m = make_message<MClientSnap>();
+    break;
+  case CEPH_MSG_CLIENT_QUOTA:
+    m = make_message<MClientQuota>();
+    break;
+  case CEPH_MSG_CLIENT_METRICS:
+    m = make_message<MClientMetrics>();
+    break;
+
+    // mds
+  case MSG_MDS_PEER_REQUEST:
+    m = make_message<MMDSPeerRequest>();
+    break;
+
+  case CEPH_MSG_MDS_MAP:
+    m = make_message<MMDSMap>();
+    break;
+  case CEPH_MSG_FS_MAP:
+    m = make_message<MFSMap>();
+    break;
+  case CEPH_MSG_FS_MAP_USER:
+    m = make_message<MFSMapUser>();
+    break;
+  case MSG_MDS_BEACON:
+    m = make_message<MMDSBeacon>();
+    break;
+  case MSG_MDS_OFFLOAD_TARGETS:
+    m = make_message<MMDSLoadTargets>();
+    break;
+  case MSG_MDS_RESOLVE:
+    m = make_message<MMDSResolve>();
+    break;
+  case MSG_MDS_RESOLVEACK:
+    m = make_message<MMDSResolveAck>();
+    break;
+  case MSG_MDS_CACHEREJOIN:
+    m = make_message<MMDSCacheRejoin>();
+    break;
+
+  case MSG_MDS_DIRUPDATE:
+    m = make_message<MDirUpdate>();
+    break;
+
+  case MSG_MDS_DISCOVER:
+    m = make_message<MDiscover>();
+    break;
+  case MSG_MDS_DISCOVERREPLY:
+    m = make_message<MDiscoverReply>();
+    break;
+
+  case MSG_MDS_FINDINO:
+    m = make_message<MMDSFindIno>();
+    break;
+  case MSG_MDS_FINDINOREPLY:
+    m = make_message<MMDSFindInoReply>();
+    break;
+
+  case MSG_MDS_OPENINO:
+    m = make_message<MMDSOpenIno>();
+    break;
+  case MSG_MDS_OPENINOREPLY:
+    m = make_message<MMDSOpenInoReply>();
+    break;
+
+  case MSG_MDS_SNAPUPDATE:
+    m = make_message<MMDSSnapUpdate>();
+    break;
+
+  case MSG_MDS_FRAGMENTNOTIFY:
+    m = make_message<MMDSFragmentNotify>();
+    break;
+
+  case MSG_MDS_FRAGMENTNOTIFYACK:
+    m = make_message<MMDSFragmentNotifyAck>();
+    break;
+
+  case MSG_MDS_SCRUB:
+    m = make_message<MMDSScrub>();
+    break;
+
+  case MSG_MDS_SCRUB_STATS:
+    m = make_message<MMDSScrubStats>();
+    break;
+
+  case MSG_MDS_EXPORTDIRDISCOVER:
+    m = make_message<MExportDirDiscover>();
+    break;
+  case MSG_MDS_EXPORTDIRDISCOVERACK:
+    m = make_message<MExportDirDiscoverAck>();
+    break;
+  case MSG_MDS_EXPORTDIRCANCEL:
+    m = make_message<MExportDirCancel>();
+    break;
+
+  case MSG_MDS_EXPORTDIR:
+    m = make_message<MExportDir>();
+    break;
+  case MSG_MDS_EXPORTDIRACK:
+    m = make_message<MExportDirAck>();
+    break;
+  case MSG_MDS_EXPORTDIRFINISH:
+    m = make_message<MExportDirFinish>();
+    break;
+
+  case MSG_MDS_EXPORTDIRNOTIFY:
+    m = make_message<MExportDirNotify>();
+    break;
+
+  case MSG_MDS_EXPORTDIRNOTIFYACK:
+    m = make_message<MExportDirNotifyAck>();
+    break;
+
+  case MSG_MDS_EXPORTDIRPREP:
+    m = make_message<MExportDirPrep>();
+    break;
+
+  case MSG_MDS_EXPORTDIRPREPACK:
+    m = make_message<MExportDirPrepAck>();
+    break;
+
+  case MSG_MDS_EXPORTCAPS:
+    m = make_message<MExportCaps>();
+    break;
+  case MSG_MDS_EXPORTCAPSACK:
+    m = make_message<MExportCapsAck>();
+    break;
+  case MSG_MDS_GATHERCAPS:
+    m = make_message<MGatherCaps>();
+    break;
+
+
+  case MSG_MDS_DENTRYUNLINK_ACK:
+    m = make_message<MDentryUnlinkAck>();
+    break;
+  case MSG_MDS_DENTRYUNLINK:
+    m = make_message<MDentryUnlink>();
+    break;
+  case MSG_MDS_DENTRYLINK:
+    m = make_message<MDentryLink>();
+    break;
+
+  case MSG_MDS_HEARTBEAT:
+    m = make_message<MHeartbeat>();
+    break;
+
+  case MSG_MDS_CACHEEXPIRE:
+    m = make_message<MCacheExpire>();
+    break;
+
+  case MSG_MDS_TABLE_REQUEST:
+    m = make_message<MMDSTableRequest>();
+    break;
+
+	/*  case MSG_MDS_INODEUPDATE:
+    m = make_message<MInodeUpdate>();
+    break;
+	*/
+
+  case MSG_MDS_INODEFILECAPS:
+    m = make_message<MInodeFileCaps>();
+    break;
+
+  case MSG_MDS_LOCK:
+    m = make_message<MLock>();
+    break;
+
+  case MSG_MDS_METRICS:
+    m = make_message<MMDSMetrics>();
+    break;
+
+  case MSG_MDS_PING:
+    m = make_message<MMDSPing>();
+    break;
+
+  case MSG_MGR_BEACON:
+    m = make_message<MMgrBeacon>();
+    break;
+
+  case MSG_MON_MGR_REPORT:
+    m = make_message<MMonMgrReport>();
+    break;
+
+  case MSG_SERVICE_MAP:
+    m = make_message<MServiceMap>();
+    break;
+
+  case MSG_MGR_MAP:
+    m = make_message<MMgrMap>();
+    break;
+
+  case MSG_MGR_DIGEST:
+    m = make_message<MMgrDigest>();
+    break;
+
+  case MSG_MGR_COMMAND:
+    m = make_message<MMgrCommand>();
+    break;
+
+  case MSG_MGR_COMMAND_REPLY:
+    m = make_message<MMgrCommandReply>();
+    break;
+
+  case MSG_MGR_OPEN:
+    m = make_message<MMgrOpen>();
+    break;
+
+  case MSG_MGR_UPDATE:
+    m = make_message<MMgrUpdate>();
+    break;
+
+  case MSG_MGR_CLOSE:
+    m = make_message<MMgrClose>();
+    break;
+
+  case MSG_MGR_REPORT:
+    m = make_message<MMgrReport>();
+    break;
+
+  case MSG_MGR_CONFIGURE:
+    m = make_message<MMgrConfigure>();
+    break;
+
+  case MSG_TIMECHECK:
+    m = make_message<MTimeCheck>();
+    break;
+  case MSG_TIMECHECK2:
+    m = make_message<MTimeCheck2>();
+    break;
+
+  case MSG_MON_HEALTH:
+    m = make_message<MMonHealth>();
+    break;
+
+  case MSG_MON_HEALTH_CHECKS:
+    m = make_message<MMonHealthChecks>();
+    break;
+
+    // -- simple messages without payload --
+
+  case CEPH_MSG_SHUTDOWN:
+    m = make_message<MGenericMessage>(type);
+    break;
+
+  default:
+    if (cct) {
+      ldout(cct, 0) << "can't decode unknown message type " << type << " MSG_AUTH=" << CEPH_MSG_AUTH << dendl;
+      if (cct->_conf->ms_die_on_bad_msg)
+	ceph_abort();
+    }
+    return nullptr;
+  }
+
+  m->set_cct(cct);
+
+  // m->header.version, if non-zero, should be populated with the
+  // newest version of the encoding the code supports.  If set, check
+  // it against compat_version.
+  if (m->get_header().version &&
+      m->get_header().version < header.compat_version) {
+    if (cct) {
+      ldout(cct, 0) << "will not decode message of type " << type
+		    << " version " << header.version
+		    << " because compat_version " << header.compat_version
+		    << " > supported version " << m->get_header().version << dendl;
+      if (cct->_conf->ms_die_on_bad_msg)
+	ceph_abort();
+    }
+    return nullptr;
+  }
+
+  m->set_connection(std::move(conn));
+  m->set_header(header);
+  m->set_footer(footer);
+  m->set_payload(front);
+  m->set_middle(middle);
+  m->set_data(data);
+
+  try {
+    m->decode_payload();
+  }
+  catch (const ceph::buffer::error &e) {
+    if (cct) {
+      lderr(cct) << "failed to decode message of type " << type
+		 << " v" << header.version
+		 << ": " << e.what() << dendl;
+      ldout(cct, ceph::dout::need_dynamic(
+	cct->_conf->ms_dump_corrupt_message_level)) << "dump: \n";
+      m->get_payload().hexdump(*_dout);
+      *_dout << dendl;
+      if (cct->_conf->ms_die_on_bad_msg)
+	ceph_abort();
+    }
+    return nullptr;
+  }
+
+  // done!  hand our reference over to the caller
+  return m.detach();
+}
+
+/**
+ * Append this message's blkin trace info to bl.
+ *
+ * When the trace has no info attached, an all-zero blkin_trace_info is
+ * encoded in its place.  The features argument is not used.
+ */
+void Message::encode_trace(ceph::bufferlist &bl, uint64_t features) const
+{
+  using ceph::encode;
+  static const blkin_trace_info empty = { 0, 0, 0 };
+  auto info = trace.get_info();
+  encode(info ? *info : empty, bl);
+}
+
+/**
+ * Decode the blkin trace info from p and, when built WITH_BLKIN and the
+ * message has a connection, set up this message's trace from it.
+ *
+ * @param p      iterator positioned at the encoded blkin_trace_info
+ * @param create start a fresh trace when none arrived on the wire
+ */
+void Message::decode_trace(ceph::bufferlist::const_iterator &p, bool create)
+{
+  // the trace info is decoded unconditionally, even without blkin support
+  blkin_trace_info wire_info = {};
+  decode(wire_info, p);
+
+#ifdef WITH_BLKIN
+  if (connection) {
+    const auto messenger = connection->get_messenger();
+    const auto trace_endpoint = messenger->get_trace_endpoint();
+    if (wire_info.trace_id) {
+      // continue the trace the sender started
+      trace.init(get_type_name().data(), trace_endpoint, &wire_info, true);
+      trace.event("decoded trace");
+    } else {
+      const bool start_new =
+	create || (messenger->get_myname().is_osd() &&
+		   messenger->cct->_conf->osd_blkin_trace_all);
+      if (start_new) {
+	// create a trace even if we didn't get one on the wire
+	trace.init(get_type_name().data(), trace_endpoint);
+	trace.event("created trace");
+      }
+    }
+    trace.keyval("tid", get_tid());
+    trace.keyval("entity type", get_source().type_str());
+    trace.keyval("entity num", get_source().num());
+  }
+#endif
+}
+
+
+// This routine is not used for ordinary messages, but only when encapsulating a message
+// for forwarding and routing.  It's also used in a backward compatibility test, which only
+// effectively tests backward compatibility for those functions.  To avoid backward compatibility
+// problems, we currently always encode and decode using the old footer format that doesn't
+// allow for message authentication.  Eventually we should fix that.  PLR
+
+/**
+ * Serialize a whole message (header, old-format footer, front, middle and
+ * data) into payload.
+ *
+ * The message is first finalized with Message::encode() using MSG_CRC_ALL.
+ * The footer is written in the ceph_msg_footer_old layout, which carries the
+ * three crcs and the flags only.
+ */
+void encode_message(Message *msg, uint64_t features, ceph::bufferlist& payload)
+{
+  msg->encode(features, MSG_CRC_ALL);
+  encode(msg->get_header(), payload);
+
+  // Here's where we switch to the old footer format.  PLR
+  const ceph_msg_footer current = msg->get_footer();
+  ceph_msg_footer_old legacy;
+  legacy.front_crc = current.front_crc;
+  legacy.middle_crc = current.middle_crc;
+  legacy.data_crc = current.data_crc;
+  legacy.flags = current.flags;
+  encode(legacy, payload);
+
+  using ceph::encode;
+  encode(msg->get_payload(), payload);
+  encode(msg->get_middle(), payload);
+  encode(msg->get_data(), payload);
+}
+
+// See above for somewhat bogus use of the old message footer.  We switch to the current footer
+// after decoding the old one so the other form of decode_message() doesn't have to change.
+// We've slipped in a 0 signature at this point, so any signature checking after this will
+// fail.  PLR
+
+/**
+ * Decode a message from a buffer holding, in order, the header, an
+ * old-format footer, and the front, middle and data bufferlists.
+ *
+ * The old footer is converted to the current ceph_msg_footer with a zero
+ * signature, and the pieces are handed to the header/footer/buffers overload
+ * of decode_message() with a null connection.
+ *
+ * @return whatever that overload returns
+ */
+Message *decode_message(CephContext *cct, int crcflags, ceph::bufferlist::const_iterator& p)
+{
+  ceph_msg_header hdr;
+  ceph_msg_footer_old legacy_footer;
+  decode(hdr, p);
+  decode(legacy_footer, p);
+
+  // translate the old footer layout into the current one
+  ceph_msg_footer footer;
+  footer.front_crc = legacy_footer.front_crc;
+  footer.middle_crc = legacy_footer.middle_crc;
+  footer.data_crc = legacy_footer.data_crc;
+  footer.flags = legacy_footer.flags;
+  footer.sig = 0;
+
+  using ceph::decode;
+  ceph::bufferlist front, middle, data;
+  decode(front, p);
+  decode(middle, p);
+  decode(data, p);
+  return decode_message(cct, crcflags, hdr, footer, front, middle, data, nullptr);
+}
diff --git a/src/msg/Message.h b/src/msg/Message.h
new file mode 100644
index 000000000..b211474c3
--- /dev/null
+++ b/src/msg/Message.h
@@ -0,0 +1,564 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MESSAGE_H
+#define CEPH_MESSAGE_H
+
+#include <cstdlib>
+#include <ostream>
+#include <string_view>
+
+#include <boost/intrusive/list.hpp>
+
+#include "include/Context.h"
+#include "common/RefCountedObj.h"
+#include "common/ThrottleInterface.h"
+#include "common/config.h"
+#include "common/ref.h"
+#include "common/debug.h"
+#include "common/zipkin_trace.h"
+#include "include/ceph_assert.h" // Because intrusive_ptr clobbers our assert...
+#include "include/buffer.h"
+#include "include/types.h"
+#include "msg/Connection.h"
+#include "msg/MessageRef.h"
+#include "msg_types.h"
+
+#ifdef WITH_SEASTAR
+#  include "crimson/net/SocketConnection.h"
+#endif // WITH_SEASTAR
+
+// monitor internal
+#define MSG_MON_SCRUB              64
+#define MSG_MON_ELECTION           65
+#define MSG_MON_PAXOS              66
+#define MSG_MON_PROBE              67
+#define MSG_MON_JOIN               68
+#define MSG_MON_SYNC		   69
+#define MSG_MON_PING               140
+
+/* monitor <-> mon admin tool */
+#define MSG_MON_COMMAND            50
+#define MSG_MON_COMMAND_ACK        51
+#define MSG_LOG                    52
+#define MSG_LOGACK                 53
+
+#define MSG_GETPOOLSTATS           58
+#define MSG_GETPOOLSTATSREPLY      59
+
+#define MSG_MON_GLOBAL_ID          60
+
+#define MSG_ROUTE                  47
+#define MSG_FORWARD                46
+
+#define MSG_PAXOS                  40
+
+#define MSG_CONFIG           62
+#define MSG_GET_CONFIG       63
+
+#define MSG_KV_DATA          54
+
+#define MSG_MON_GET_PURGED_SNAPS 76
+#define MSG_MON_GET_PURGED_SNAPS_REPLY 77
+
+// osd internal
+#define MSG_OSD_PING         70
+#define MSG_OSD_BOOT         71
+#define MSG_OSD_FAILURE      72
+#define MSG_OSD_ALIVE        73
+#define MSG_OSD_MARK_ME_DOWN 74
+#define MSG_OSD_FULL         75
+#define MSG_OSD_MARK_ME_DEAD 123
+
+// removed right after luminous
+//#define MSG_OSD_SUBOP        76
+//#define MSG_OSD_SUBOPREPLY   77
+
+#define MSG_OSD_PGTEMP       78
+
+#define MSG_OSD_BEACON       79
+
+#define MSG_OSD_PG_NOTIFY      80
+#define MSG_OSD_PG_NOTIFY2    130
+#define MSG_OSD_PG_QUERY       81
+#define MSG_OSD_PG_QUERY2     131
+#define MSG_OSD_PG_LOG         83
+#define MSG_OSD_PG_REMOVE      84
+#define MSG_OSD_PG_INFO        85
+#define MSG_OSD_PG_INFO2      132
+#define MSG_OSD_PG_TRIM        86
+
+#define MSG_PGSTATS            87
+#define MSG_PGSTATSACK         88
+
+#define MSG_OSD_PG_CREATE      89
+#define MSG_REMOVE_SNAPS       90
+
+#define MSG_OSD_SCRUB          91
+#define MSG_OSD_SCRUB_RESERVE  92  // previous PG_MISSING
+#define MSG_OSD_REP_SCRUB      93
+
+#define MSG_OSD_PG_SCAN        94
+#define MSG_OSD_PG_BACKFILL    95
+#define MSG_OSD_PG_BACKFILL_REMOVE 96
+
+#define MSG_COMMAND            97
+#define MSG_COMMAND_REPLY      98
+
+#define MSG_OSD_BACKFILL_RESERVE 99
+#define MSG_OSD_RECOVERY_RESERVE 150
+#define MSG_OSD_FORCE_RECOVERY 151
+
+#define MSG_OSD_PG_PUSH        105
+#define MSG_OSD_PG_PULL        106
+#define MSG_OSD_PG_PUSH_REPLY  107
+
+#define MSG_OSD_EC_WRITE       108
+#define MSG_OSD_EC_WRITE_REPLY 109
+#define MSG_OSD_EC_READ        110
+#define MSG_OSD_EC_READ_REPLY  111
+
+#define MSG_OSD_REPOP         112
+#define MSG_OSD_REPOPREPLY    113
+#define MSG_OSD_PG_UPDATE_LOG_MISSING  114
+#define MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY  115
+
+#define MSG_OSD_PG_CREATED      116
+#define MSG_OSD_REP_SCRUBMAP    117
+#define MSG_OSD_PG_RECOVERY_DELETE 118
+#define MSG_OSD_PG_RECOVERY_DELETE_REPLY 119
+#define MSG_OSD_PG_CREATE2      120
+#define MSG_OSD_SCRUB2          121
+
+#define MSG_OSD_PG_READY_TO_MERGE 122
+
+#define MSG_OSD_PG_LEASE        133
+#define MSG_OSD_PG_LEASE_ACK    134
+
+// *** MDS ***
+
+#define MSG_MDS_BEACON             100  // to monitor
+#define MSG_MDS_PEER_REQUEST       101
+#define MSG_MDS_TABLE_REQUEST      102
+#define MSG_MDS_SCRUB              135
+
+                                // 150 already in use (MSG_OSD_RECOVERY_RESERVE)
+
+#define MSG_MDS_RESOLVE            0x200 // 0x2xx are for mdcache of mds
+#define MSG_MDS_RESOLVEACK         0x201
+#define MSG_MDS_CACHEREJOIN        0x202
+#define MSG_MDS_DISCOVER           0x203
+#define MSG_MDS_DISCOVERREPLY      0x204
+#define MSG_MDS_INODEUPDATE        0x205
+#define MSG_MDS_DIRUPDATE          0x206
+#define MSG_MDS_CACHEEXPIRE        0x207
+#define MSG_MDS_DENTRYUNLINK       0x208
+#define MSG_MDS_FRAGMENTNOTIFY     0x209
+#define MSG_MDS_OFFLOAD_TARGETS    0x20a
+#define MSG_MDS_DENTRYLINK         0x20c
+#define MSG_MDS_FINDINO            0x20d
+#define MSG_MDS_FINDINOREPLY       0x20e
+#define MSG_MDS_OPENINO            0x20f
+#define MSG_MDS_OPENINOREPLY       0x210
+#define MSG_MDS_SNAPUPDATE         0x211
+#define MSG_MDS_FRAGMENTNOTIFYACK  0x212
+#define MSG_MDS_DENTRYUNLINK_ACK   0x213
+#define MSG_MDS_LOCK               0x300 // 0x3xx are for locker of mds
+#define MSG_MDS_INODEFILECAPS      0x301
+
+#define MSG_MDS_EXPORTDIRDISCOVER     0x449 // 0x4xx are for migrator of mds
+#define MSG_MDS_EXPORTDIRDISCOVERACK  0x450
+#define MSG_MDS_EXPORTDIRCANCEL       0x451
+#define MSG_MDS_EXPORTDIRPREP         0x452
+#define MSG_MDS_EXPORTDIRPREPACK      0x453
+#define MSG_MDS_EXPORTDIRWARNING      0x454
+#define MSG_MDS_EXPORTDIRWARNINGACK   0x455
+#define MSG_MDS_EXPORTDIR             0x456
+#define MSG_MDS_EXPORTDIRACK          0x457
+#define MSG_MDS_EXPORTDIRNOTIFY       0x458
+#define MSG_MDS_EXPORTDIRNOTIFYACK    0x459
+#define MSG_MDS_EXPORTDIRFINISH       0x460
+
+#define MSG_MDS_EXPORTCAPS            0x470
+#define MSG_MDS_EXPORTCAPSACK         0x471
+#define MSG_MDS_GATHERCAPS            0x472
+
+#define MSG_MDS_HEARTBEAT          0x500  // for mds load balancer
+#define MSG_MDS_METRICS            0x501  // for mds metric aggregator
+#define MSG_MDS_PING               0x502  // for mds pinger
+#define MSG_MDS_SCRUB_STATS        0x503  // for mds scrub stack
+
+// *** generic ***
+#define MSG_TIMECHECK             0x600
+#define MSG_MON_HEALTH            0x601
+
+// *** Message::encode() crcflags bits (combined as a bitmask) ***
+#define MSG_CRC_DATA           (1 << 0)  // set in a Messenger's default crcflags when ms_crc_data is on
+#define MSG_CRC_HEADER         (1 << 1)  // set in a Messenger's default crcflags when ms_crc_header is on
+#define MSG_CRC_ALL            (MSG_CRC_DATA | MSG_CRC_HEADER)  // both bits
+
+
+// Special
+#define MSG_NOP                   0x607
+
+#define MSG_MON_HEALTH_CHECKS     0x608
+#define MSG_TIMECHECK2            0x609
+
+// *** ceph-mgr <-> OSD/MDS daemons ***
+#define MSG_MGR_OPEN              0x700
+#define MSG_MGR_CONFIGURE         0x701
+#define MSG_MGR_REPORT            0x702
+
+// *** ceph-mgr <-> ceph-mon ***
+#define MSG_MGR_BEACON            0x703
+
+// *** ceph-mon(MgrMonitor) -> OSD/MDS daemons ***
+#define MSG_MGR_MAP               0x704
+
+// *** ceph-mon(MgrMonitor) -> ceph-mgr
+#define MSG_MGR_DIGEST               0x705
+// *** ceph-mgr -> ceph-mon
+#define MSG_MON_MGR_REPORT        0x706
+#define MSG_SERVICE_MAP           0x707
+
+#define MSG_MGR_CLOSE             0x708
+#define MSG_MGR_COMMAND           0x709
+#define MSG_MGR_COMMAND_REPLY     0x70a
+
+// *** ceph-mgr <-> MON daemons ***
+#define MSG_MGR_UPDATE     0x70b
+
+// ======================================================
+
+// abstract Message class
+
+class Message : public RefCountedObject {
+public:
+#ifdef WITH_SEASTAR
+  using ConnectionRef = crimson::net::ConnectionRef;
+#else
+  using ConnectionRef = ::ConnectionRef;
+#endif // WITH_SEASTAR
+
+protected:
+  ceph_msg_header  header;      // message header (type, version, tid, seq, priority, src, crc, ...)
+  ceph_msg_footer  footer;      // CRCs of the sections below, plus flags and signature
+  ceph::buffer::list       payload;  // "front" unaligned blob
+  ceph::buffer::list       middle;   // "middle" unaligned blob
+  ceph::buffer::list       data;     // data payload (page-alignment will be preserved where possible)
+
+  /* recv_stamp is set when the Messenger starts reading the
+   * Message off the wire */
+  utime_t recv_stamp;
+  /* dispatch_stamp is set when the Messenger starts calling dispatch() on
+   * its endpoints */
+  utime_t dispatch_stamp;
+  /* throttle_stamp is the point at which we got throttled */
+  utime_t throttle_stamp;
+  /* time at which message was fully read */
+  utime_t recv_complete_stamp;
+
+  ConnectionRef connection;
+
+  uint32_t magic = 0;
+
+  boost::intrusive::list_member_hook<> dispatch_q;  // hook used by the intrusive Queue typedef below
+
+public:
+  // zipkin tracing
+  ZTracer::Trace trace;
+  void encode_trace(ceph::buffer::list &bl, uint64_t features) const;
+  void decode_trace(ceph::buffer::list::const_iterator &p, bool create = false);
+
+  class CompletionHook : public Context {  // completed (with 0) from ~Message() when one is set
+  protected:
+    Message *m;
+    friend class Message;
+  public:
+    explicit CompletionHook(Message *_m) : m(_m) {}
+    virtual void set_message(Message *_m) { m = _m; }
+  };
+
+  typedef boost::intrusive::list<Message,
+				 boost::intrusive::member_hook<
+				   Message,
+				   boost::intrusive::list_member_hook<>,
+				   &Message::dispatch_q>> Queue;
+
+  ceph::mono_time queue_start;
+protected:
+  CompletionHook* completion_hook = nullptr; // owned by Messenger
+
+  // release our size in bytes back to this throttler when our payload
+  // is adjusted or when we are destroyed.
+  ThrottleInterface *byte_throttler = nullptr;
+
+  // release a count back to this throttler when we are destroyed
+  ThrottleInterface *msg_throttler = nullptr;
+
+  // keep track of how big this message was when we reserved space in
+  // the msgr dispatch_throttler, so that we can properly release it
+  // later.  this is necessary because messages can enter the dispatch
+  // queue locally (not via read_message()), and those are not
+  // currently throttled.
+  uint64_t dispatch_throttle_size = 0;
+
+  friend class Messenger;
+
+public:
+  Message() {
+    memset(&header, 0, sizeof(header));  // header and footer both start out zero-filled
+    memset(&footer, 0, sizeof(footer));
+  }
+  Message(int t, int version=1, int compat_version=0) {
+    memset(&header, 0, sizeof(header));  // zero first, then fill in type and versions
+    header.type = t;
+    header.version = version;
+    header.compat_version = compat_version;
+    memset(&footer, 0, sizeof(footer));
+  }
+
+  Message *get() {
+    return static_cast<Message *>(RefCountedObject::get());  // same as the base get(), typed as Message*
+  }
+
+protected:
+  ~Message() override {
+    if (byte_throttler)
+      byte_throttler->put(payload.length() + middle.length() + data.length());  // give back the bytes of all three sections
+    release_message_throttle();
+    trace.event("message destructed");
+    /* call completion hooks (if any) */
+    if (completion_hook)
+      completion_hook->complete(0);
+  }
+public:
+  const ConnectionRef& get_connection() const { return connection; }
+  void set_connection(ConnectionRef c) {
+    connection = std::move(c);
+  }
+  CompletionHook* get_completion_hook() { return completion_hook; }
+  void set_completion_hook(CompletionHook *hook) { completion_hook = hook; }
+  void set_byte_throttler(ThrottleInterface *t) {
+    byte_throttler = t;
+  }
+  void set_message_throttler(ThrottleInterface *t) {
+    msg_throttler = t;
+  }
+
+  void set_dispatch_throttle_size(uint64_t s) { dispatch_throttle_size = s; }
+  uint64_t get_dispatch_throttle_size() const { return dispatch_throttle_size; }
+
+  const ceph_msg_header &get_header() const { return header; }
+  ceph_msg_header &get_header() { return header; }
+  void set_header(const ceph_msg_header &e) { header = e; }
+  void set_footer(const ceph_msg_footer &e) { footer = e; }
+  const ceph_msg_footer &get_footer() const { return footer; }
+  ceph_msg_footer &get_footer() { return footer; }
+  void set_src(const entity_name_t& src) { header.src = src; }
+
+  uint32_t get_magic() const { return magic; }
+  void set_magic(int _magic) { magic = _magic; }
+
+  /*
+   * If you use get_[data, middle, payload] you shouldn't
+   * use it to change those ceph::buffer::lists unless you KNOW
+   * there is no throttle being used. The other
+   * functions are throttling-aware as appropriate.
+   */
+
+  void clear_payload() {
+    if (byte_throttler) {
+      byte_throttler->put(payload.length() + middle.length());  // front and middle are released together
+    }
+    payload.clear();
+    middle.clear();
+  }
+
+  virtual void clear_buffers() {}
+  void clear_data() {
+    if (byte_throttler)
+      byte_throttler->put(data.length());
+    data.clear();
+    clear_buffers(); // let subclass drop buffers as well
+  }
+  void release_message_throttle() {
+    if (msg_throttler)
+      msg_throttler->put();
+    msg_throttler = nullptr;  // cleared so a later call does not put() again
+  }
+
+  bool empty_payload() const { return payload.length() == 0; }
+  ceph::buffer::list& get_payload() { return payload; }
+  const ceph::buffer::list& get_payload() const { return payload; }
+  void set_payload(ceph::buffer::list& bl) {
+    if (byte_throttler)
+      byte_throttler->put(payload.length());  // release the old size ...
+    payload = std::move(bl);  // moves from the caller's bufferlist
+    if (byte_throttler)
+      byte_throttler->take(payload.length());  // ... then account for the new one
+  }
+
+  void set_middle(ceph::buffer::list& bl) {
+    if (byte_throttler)
+      byte_throttler->put(middle.length());
+    middle = std::move(bl);  // moves from the caller's bufferlist
+    if (byte_throttler)
+      byte_throttler->take(middle.length());
+  }
+  ceph::buffer::list& get_middle() { return middle; }
+
+  void set_data(const ceph::buffer::list &bl) {
+    if (byte_throttler)
+      byte_throttler->put(data.length());
+    data.share(bl);
+    if (byte_throttler)
+      byte_throttler->take(data.length());
+  }
+
+  const ceph::buffer::list& get_data() const { return data; }
+  ceph::buffer::list& get_data() { return data; }
+  void claim_data(ceph::buffer::list& bl) {
+    if (byte_throttler)
+      byte_throttler->put(data.length());
+    bl = std::move(data);  // hand the data section over to the caller
+  }
+  off_t get_data_len() const { return data.length(); }
+
+  void set_recv_stamp(utime_t t) { recv_stamp = t; }
+  const utime_t& get_recv_stamp() const { return recv_stamp; }
+  void set_dispatch_stamp(utime_t t) { dispatch_stamp = t; }
+  const utime_t& get_dispatch_stamp() const { return dispatch_stamp; }
+  void set_throttle_stamp(utime_t t) { throttle_stamp = t; }
+  const utime_t& get_throttle_stamp() const { return throttle_stamp; }
+  void set_recv_complete_stamp(utime_t t) { recv_complete_stamp = t; }
+  const utime_t& get_recv_complete_stamp() const { return recv_complete_stamp; }
+
+  void calc_header_crc() {
+    header.crc = ceph_crc32c(0, (unsigned char*)&header,
+			     sizeof(header) - sizeof(header.crc));  // covers the header minus its last sizeof(crc) bytes (presumably the crc field itself -- confirm)
+  }
+  void calc_front_crc() {
+    footer.front_crc = payload.crc32c(0);
+    footer.middle_crc = middle.crc32c(0);  // despite the name, the middle CRC is computed here too
+  }
+  void calc_data_crc() {
+    footer.data_crc = data.crc32c(0);
+  }
+
+  virtual int get_cost() const {
+    return data.length();  // default cost is the length of the data section
+  }
+
+  // type
+  int get_type() const { return header.type; }
+  void set_type(int t) { header.type = t; }
+
+  uint64_t get_tid() const { return header.tid; }
+  void set_tid(uint64_t t) { header.tid = t; }
+
+  uint64_t get_seq() const { return header.seq; }
+  void set_seq(uint64_t s) { header.seq = s; }
+
+  unsigned get_priority() const { return header.priority; }
+  void set_priority(__s16 p) { header.priority = p; }
+
+  // source/dest
+  entity_inst_t get_source_inst() const {
+    return entity_inst_t(get_source(), get_source_addr());
+  }
+  entity_name_t get_source() const {
+    return entity_name_t(header.src);
+  }
+  entity_addr_t get_source_addr() const {
+    if (connection)
+      return connection->get_peer_addr();
+    return entity_addr_t();  // no connection: default-constructed address
+  }
+  entity_addrvec_t get_source_addrs() const {
+    if (connection)
+      return connection->get_peer_addrs();
+    return entity_addrvec_t();  // no connection: default-constructed address vector
+  }
+
+  // forwarded? (in this class the get_orig_* accessors simply return the get_source_* values)
+  entity_inst_t get_orig_source_inst() const {
+    return get_source_inst();
+  }
+  entity_name_t get_orig_source() const {
+    return get_source();
+  }
+  entity_addr_t get_orig_source_addr() const {
+    return get_source_addr();
+  }
+  entity_addrvec_t get_orig_source_addrs() const {
+    return get_source_addrs();
+  }
+
+  // virtual bits
+  virtual void decode_payload() = 0;
+  virtual void encode_payload(uint64_t features) = 0;
+  virtual std::string_view get_type_name() const = 0;
+  virtual void print(std::ostream& out) const {
+    out << get_type_name() << " magic: " << magic;
+  }
+
+  virtual void dump(ceph::Formatter *f) const;
+
+  void encode(uint64_t features, int crcflags, bool skip_header_crc = false);  // crcflags: MSG_CRC_* bits
+};
+
+extern Message *decode_message(CephContext *cct,
+                               int crcflags,
+                               ceph_msg_header& header,
+                               ceph_msg_footer& footer,
+                               ceph::buffer::list& front,
+                               ceph::buffer::list& middle,
+                               ceph::buffer::list& data,
+                               Message::ConnectionRef conn);
+inline std::ostream& operator<<(std::ostream& out, const Message& m) {
+  m.print(out);  // virtual: the concrete message type decides what is printed
+  if (m.get_header().version)  // append " v<version>" only when the header version is non-zero
+    out << " v" << m.get_header().version;
+  return out;
+}
+
+extern void encode_message(Message *m, uint64_t features, ceph::buffer::list& bl);
+extern Message *decode_message(CephContext *cct, int crcflags,
+                               ceph::buffer::list::const_iterator& bl);
+
+/// this is a "safe" version of Message. it does not allow calling get/put
+/// methods on its derived classes. This is intended to prevent some accidental
+/// reference leaks. Instead, you must either cast the derived class to a
+/// RefCountedObject to do the get/put or detach a temporary reference.
+class SafeMessage : public Message {
+public:
+  using Message::Message;
+  bool is_a_client() const {
+    return get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_CLIENT;  // NOTE(review): dereferences the connection without a null check
+  }
+
+private:
+  using RefCountedObject::get;  // re-declared private so get()/put() cannot be called through this type
+  using RefCountedObject::put;
+};
+
+namespace ceph {
+template<class T, typename... Args>
+ceph::ref_t<T> make_message(Args&&... args) {  // constructs a T from the forwarded args and wraps it in a ref_t
+  return {new T(std::forward<Args>(args)...), false};  // NOTE(review): 'false' presumably means "do not add a reference" -- confirm against ref_t
+}
+}
+
+#endif
diff --git a/src/msg/MessageRef.h b/src/msg/MessageRef.h
new file mode 100644
index 000000000..6bed38311
--- /dev/null
+++ b/src/msg/MessageRef.h
@@ -0,0 +1,183 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc. <contact@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MESSAGEREF_H
+#define CEPH_MESSAGEREF_H
+ 
+#include <boost/intrusive_ptr.hpp>
+
+template<typename T>
+using MRef = boost::intrusive_ptr<T>;
+template<typename T>
+using MConstRef = boost::intrusive_ptr<T const>;
+
+using MessageRef = MRef<class Message>;
+using MessageConstRef = MConstRef<class Message>;
+
+/* cd src/messages/ && for f in *; do printf 'class '; basename "$f" .h | tr -d '\n'; printf ';\n'; done >> ../msg/MessageRef.h */
+
+class MAuth;
+class MAuthReply;
+class MBackfillReserve;
+class MCacheExpire;
+class MClientCapRelease;
+class MClientCaps;
+class MClientLease;
+class MClientQuota;
+class MClientReclaim;
+class MClientReclaimReply;
+class MClientReconnect;
+class MClientReply;
+class MClientRequestForward;
+class MClientRequest;
+class MClientSession;
+class MClientSnap;
+class MCommand;
+class MCommandReply;
+class MConfig;
+class MDentryLink;
+class MDentryUnlink;
+class MDirUpdate;
+class MDiscover;
+class MDiscoverReply;
+class MExportCapsAck;
+class MExportCaps;
+class MExportDirAck;
+class MExportDirCancel;
+class MExportDirDiscoverAck;
+class MExportDirDiscover;
+class MExportDirFinish;
+class MExportDir;
+class MExportDirNotifyAck;
+class MExportDirNotify;
+class MExportDirPrepAck;
+class MExportDirPrep;
+class MForward;
+class MFSMap;
+class MFSMapUser;
+class MGatherCaps;
+class MGenericMessage;
+class MGetConfig;
+class MGetPoolStats;
+class MGetPoolStatsReply;
+class MHeartbeat;
+class MInodeFileCaps;
+class MLock;
+class MLogAck;
+class MLog;
+class MMDSBeacon;
+class MMDSCacheRejoin;
+class MMDSFindIno;
+class MMDSFindInoReply;
+class MMDSFragmentNotifyAck;
+class MMDSFragmentNotify;
+class MMDSLoadTargets;
+class MMDSMap;
+class MMDSOpenIno;
+class MMDSOpenInoReply;
+class MMDSResolveAck;
+class MMDSResolve;
+class MMDSPeerRequest;
+class MMDSSnapUpdate;
+class MMDSTableRequest;
+class MMgrBeacon;
+class MMgrClose;
+class MMgrConfigure;
+class MMgrDigest;
+class MMgrMap;
+class MMgrOpen;
+class MMgrUpdate;
+class MMgrReport;
+class MMonCommandAck;
+class MMonCommand;
+class MMonElection;
+class MMonGetMap;
+class MMonGetOSDMap;
+class MMonGetVersion;
+class MMonGetVersionReply;
+class MMonGlobalID;
+class MMonHealthChecks;
+class MMonHealth;
+class MMonJoin;
+class MMonMap;
+class MMonMetadata;
+class MMonMgrReport;
+class MMonPaxos;
+class MMonProbe;
+class MMonQuorumService;
+class MMonScrub;
+class MMonSubscribeAck;
+class MMonSubscribe;
+class MMonSync;
+class MOSDAlive;
+class MOSDBackoff;
+class MOSDBeacon;
+class MOSDBoot;
+class MOSDECSubOpRead;
+class MOSDECSubOpReadReply;
+class MOSDECSubOpWrite;
+class MOSDECSubOpWriteReply;
+class MOSDFailure;
+class MOSDFastDispatchOp;
+class MOSDForceRecovery;
+class MOSDFull;
+class MOSDMap;
+class MOSDMarkMeDown;
+class MOSDPeeringOp;
+class MOSDPGBackfill;
+class MOSDPGBackfillRemove;
+class MOSDPGCreate2;
+class MOSDPGCreated;
+class MOSDPGCreate;
+class MOSDPGInfo;
+class MOSDPGLog;
+class MOSDPGNotify;
+class MOSDPGPull;
+class MOSDPGPush;
+class MOSDPGPushReply;
+class MOSDPGQuery;
+class MOSDPGReadyToMerge;
+class MOSDPGRecoveryDelete;
+class MOSDPGRecoveryDeleteReply;
+class MOSDPGRemove;
+class MOSDPGScan;
+class MOSDPGTemp;
+class MOSDPGTrim;
+class MOSDPGUpdateLogMissing;
+class MOSDPGUpdateLogMissingReply;
+class MOSDPing;
+class MOSDRepOp;
+class MOSDRepOpReply;
+class MOSDRepScrub;
+class MOSDRepScrubMap;
+class MOSDScrub2;
+class MOSDScrub;
+class MOSDScrubReserve;
+class MPGStatsAck;
+class MPGStats;
+class MPing;
+class MPoolOp;
+class MPoolOpReply;
+class MRecoveryReserve;
+class MRemoveSnaps;
+class MRoute;
+class MServiceMap;
+class MStatfs;
+class MStatfsReply;
+class MTimeCheck2;
+class MTimeCheck;
+class MWatchNotify;
+class PaxosServiceMessage;
+
+#endif
diff --git a/src/msg/Messenger.cc b/src/msg/Messenger.cc
new file mode 100644
index 000000000..eab2f2909
--- /dev/null
+++ b/src/msg/Messenger.cc
@@ -0,0 +1,114 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <netdb.h>
+
+#include "include/types.h"
+#include "include/random.h"
+
+#include "Messenger.h"
+
+#include "msg/async/AsyncMessenger.h"
+
+Messenger *Messenger::create_client_messenger(CephContext *cct, std::string lname)
+{
+  std::string public_msgr_type = cct->_conf->ms_public_type.empty() ? cct->_conf.get_val<std::string>("ms_type") : cct->_conf->ms_public_type;  // ms_public_type wins; fall back to ms_type when it is empty
+  auto nonce = get_random_nonce();  // clients get a random nonce rather than a pid-based one
+  return Messenger::create(cct, public_msgr_type, entity_name_t::CLIENT(),
+			   std::move(lname), nonce);
+}
+
+uint64_t Messenger::get_pid_nonce()
+{
+  uint64_t nonce = getpid();  // default nonce is the process id
+  if (nonce == 1 || getenv("CEPH_USE_RANDOM_NONCE")) {
+    // pid 1 (taken to mean we're in a container) or CEPH_USE_RANDOM_NONCE is set; use a random number instead!
+    nonce = ceph::util::generate_random_number<uint64_t>();
+  }
+  return nonce;
+}
+
+uint64_t Messenger::get_random_nonce()
+{
+  return ceph::util::generate_random_number<uint64_t>();  // random 64-bit value; unlike get_pid_nonce(), never the pid
+}
+
+Messenger *Messenger::create(CephContext *cct, const std::string &type,
+			     entity_name_t name, std::string lname,
+			     uint64_t nonce)
+{
+  int r = -1;  // stays -1 unless type is "random"
+  if (type == "random") {
+    r = 0;  // "random" currently always selects the async messenger
+    //r = ceph::util::generate_random_number(0, 1);
+  }
+  if (r == 0 || type.find("async") != std::string::npos)  // any type string containing "async" matches
+    return new AsyncMessenger(cct, name, type, std::move(lname), nonce);
+  lderr(cct) << "unrecognized ms_type '" << type << "'" << dendl;
+  return nullptr;  // callers must handle a null result for unknown types
+}
+
+/**
+ * Get the default crc flags for this messenger.
+ * Declared here because the constructor below uses it; defined further down.
+ */
+static int get_default_crc_flags(const ConfigProxy&);
+
+Messenger::Messenger(CephContext *cct_, entity_name_t w)
+  : trace_endpoint("0.0.0.0", 0, "Messenger"),  // placeholder ip/port; set_endpoint_addr() updates them later
+    my_name(w),
+    default_send_priority(CEPH_MSG_PRIO_DEFAULT),
+    started(false),
+    magic(0),
+    socket_priority(-1),
+    cct(cct_),
+    crcflags(get_default_crc_flags(cct->_conf)),  // reads the cct member initialized on the previous line
+    auth_registry(cct)
+{
+  auth_registry.refresh_config();
+}
+
+void Messenger::set_endpoint_addr(const entity_addr_t& a,
+                                  const entity_name_t &name)  // NOTE(review): name is not used in this function
+{
+  size_t hostlen;  // sockaddr size for the address family, or 0 to skip the IP lookup
+  if (a.get_family() == AF_INET)
+    hostlen = sizeof(struct sockaddr_in);
+  else if (a.get_family() == AF_INET6)
+    hostlen = sizeof(struct sockaddr_in6);
+  else
+    hostlen = 0;  // any other family: only the port is updated below
+
+  if (hostlen) {
+    char buf[NI_MAXHOST] = { 0 };  // zero-filled, so it stays an empty string if nothing is written
+    getnameinfo(a.get_sockaddr(), hostlen, buf, sizeof(buf),
+                NULL, 0, NI_NUMERICHOST);  // numeric host string only; the return value is not checked
+
+    trace_endpoint.copy_ip(buf);
+  }
+  trace_endpoint.set_port(a.get_port());
+}
+
+/**
+ * Get the default crc flags for this messenger.
+ * Returns a bitmask built from MSG_CRC_DATA and MSG_CRC_HEADER.
+ *
+ * Pre-calculate desired software CRC settings.  CRC computation may
+ * be disabled by default for some transports (e.g., those with strong
+ * hardware checksum support).
+ */
+int get_default_crc_flags(const ConfigProxy& conf)
+{
+  int r = 0;  // no CRC bits unless enabled below
+  if (conf->ms_crc_data)
+    r |= MSG_CRC_DATA;
+  if (conf->ms_crc_header)
+    r |= MSG_CRC_HEADER;
+  return r;
+}
+
+int Messenger::bindv(const entity_addrvec_t& addrs)
+{
+  return bind(addrs.legacy_addr());  // this implementation binds only the vector's legacy address
+}
+
diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h
new file mode 100644
index 000000000..e87f3196b
--- /dev/null
+++ b/src/msg/Messenger.h
@@ -0,0 +1,824 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+
+#ifndef CEPH_MESSENGER_H
+#define CEPH_MESSENGER_H
+
+#include <map>
+#include <deque>
+
+#include <errno.h>
+#include <sstream>
+#include <memory>
+
+#include "Message.h"
+#include "Dispatcher.h"
+#include "Policy.h"
+#include "common/Throttle.h"
+#include "include/Context.h"
+#include "include/types.h"
+#include "include/ceph_features.h"
+#include "auth/Crypto.h"
+#include "common/item_history.h"
+#include "auth/AuthRegistry.h"
+#include "include/ceph_assert.h"
+
+#include <errno.h>
+#include <sstream>
+#include <signal.h>
+
+#define SOCKET_PRIORITY_MIN_DELAY 6
+
+class Timer;
+
+class AuthClient;
+class AuthServer;
+
+#ifdef UNIT_TESTS_BUILT
+
+struct Interceptor {  // only compiled when UNIT_TESTS_BUILT is defined
+  std::mutex lock;
+  std::condition_variable cond_var;
+
+  enum ACTION : uint32_t {  // return type of intercept()
+    CONTINUE = 0,
+    FAIL,
+    STOP
+  };
+
+  enum STEP {  // NOTE(review): presumably the values passed as intercept()'s step argument -- confirm
+    START_CLIENT_BANNER_EXCHANGE = 1,
+    START_SERVER_BANNER_EXCHANGE,
+    BANNER_EXCHANGE_BANNER_CONNECTING,
+    BANNER_EXCHANGE,
+    HANDLE_PEER_BANNER_BANNER_CONNECTING,
+    HANDLE_PEER_BANNER,
+    HANDLE_PEER_BANNER_PAYLOAD_HELLO_CONNECTING,
+    HANDLE_PEER_BANNER_PAYLOAD,
+    SEND_AUTH_REQUEST,
+    HANDLE_AUTH_REQUEST_ACCEPTING_SIGN,
+    SEND_CLIENT_IDENTITY,
+    SEND_SERVER_IDENTITY,
+    SEND_RECONNECT,
+    SEND_RECONNECT_OK,
+    READY,
+    HANDLE_MESSAGE,
+    READ_MESSAGE_COMPLETE,
+    SESSION_RETRY
+  };
+
+  virtual ~Interceptor() {}
+  virtual ACTION intercept(Connection *conn, uint32_t step) = 0;  // implemented by the test's interceptor subclass
+};
+
+#endif
+
+class Messenger {
+private:
+  std::deque<Dispatcher*> dispatchers;
+  std::deque<Dispatcher*> fast_dispatchers;
+  ZTracer::Endpoint trace_endpoint;
+
+protected:
+  void set_endpoint_addr(const entity_addr_t& a,
+                         const entity_name_t &name);
+
+protected:
+  /// the "name" of the local daemon. eg client.99
+  entity_name_t my_name;
+
+  /// my addr
+  safe_item_history<entity_addrvec_t> my_addrs;
+
+  int default_send_priority;
+  /// set to true once the Messenger has started, and set to false on shutdown
+  bool started;
+  uint32_t magic;
+  int socket_priority;
+
+public:
+  AuthClient *auth_client = 0;
+  AuthServer *auth_server = 0;
+
+#ifdef UNIT_TESTS_BUILT
+  Interceptor *interceptor = nullptr;
+#endif
+
+  /**
+   *  The CephContext this Messenger uses. Many other components initialize themselves
+   *  from this value.
+   */
+  CephContext *cct;
+  int crcflags;
+
+  using Policy = ceph::net::Policy<Throttle>;
+
+public:
+  // allow unauthenticated connections.  This is needed for
+  // compatibility with pre-nautilus OSDs, which do not authenticate
+  // the heartbeat sessions.
+  bool require_authorizer = true;
+
+protected:
+  // for authentication
+  AuthRegistry auth_registry;
+
+public:
+  /**
+   * Messenger constructor. Call this from your implementation.
+   * Messenger users should construct full implementations directly,
+   * or use the create() function.
+   */
+  Messenger(CephContext *cct_, entity_name_t w);
+  virtual ~Messenger() {}
+
+  /**
+   * create a new messenger
+   *
+   * Create a new messenger instance, with whatever implementation is
+   * available or specified via the configuration in cct.
+   *
+   * @param cct context
+   * @param type name of messenger type
+   * @param name entity name to register
+   * @param lname logical name of the messenger in this process (e.g., "client")
+   * @param nonce nonce value to uniquely identify this instance on the current host
+   */
+  static Messenger *create(CephContext *cct,
+                           const std::string &type,
+                           entity_name_t name,
+			   std::string lname,
+                           uint64_t nonce);
+
+  static uint64_t get_random_nonce();
+  static uint64_t get_pid_nonce();
+
+  /**
+   * create a new messenger
+   *
+   * Create a new messenger instance.
+   * Same as the above, but a slightly simpler interface for clients:
+   * - Generate a random nonce
+   * - get the messenger type from cct
+   * - use the client entity_type
+   *
+   * @param cct context
+   * @param lname logical name of the messenger in this process (e.g., "client")
+   */
+  static Messenger *create_client_messenger(CephContext *cct, std::string lname);
+
+  /**
+   * @defgroup Accessors
+   * @{
+   */
+  int get_mytype() const { return my_name.type(); }
+
+  /**
+   * Retrieve the Messenger's name
+   *
+   * @return A const reference to the name this Messenger
+   * currently believes to be its own.
+   */
+  const entity_name_t& get_myname() { return my_name; }
+
+  /**
+   * Retrieve the Messenger's address.
+   *
+   * @return A const reference to the address this Messenger
+   * currently believes to be its own.
+   */
+  const entity_addrvec_t& get_myaddrs() {
+    return *my_addrs;
+  }
+
+  /**
+   * get legacy addr for myself, suitable for protocol v1
+   *
+   * Note that myaddrs might be a proper addrvec with v1 in it, or it might be an
+   * ANY addr (if i am a pure client).
+   */
+  entity_addr_t get_myaddr_legacy() {
+    return my_addrs->as_legacy_addr();
+  }
+
+
+  /**
+   * get/set this messenger instance's magic value
+   */
+  uint32_t get_magic() { return magic; }
+  void set_magic(int _magic) { magic = _magic; }
+
+  void set_auth_client(AuthClient *ac) {
+    auth_client = ac;
+  }
+  void set_auth_server(AuthServer *as) {
+    auth_server = as;
+  }
+
+protected:
+  /**
+   * set messenger's address
+   */
+  virtual void set_myaddrs(const entity_addrvec_t& a) {
+    my_addrs = a;
+    set_endpoint_addr(a.front(), my_name);
+  }
+public:
+  /**
+   * @return pointer to this Messenger's zipkin trace endpoint member
+   */
+  const ZTracer::Endpoint* get_trace_endpoint() const { return &trace_endpoint; }
+
+  /**
+   * Set the name of the local entity. The name is reported to others and
+   * can be changed while the system is running, but doing so at incorrect
+   * times may have bad results.
+   *
+   * @param m The name to set.
+   */
+  void set_myname(const entity_name_t& m) {
+    my_name = m;
+  }
+
+  /**
+   * set the unknown address components for this Messenger.
+   * This is useful if the Messenger doesn't know its full address just by
+   * binding, but another Messenger on the same interface has already learned
+   * its full address. This function does not fill in known address elements,
+   * cause a rebind, or do anything of that sort.
+   *
+   * @param addrs The addresses to use as a template.
+   */
+  virtual bool set_addr_unknowns(const entity_addrvec_t &addrs) = 0;
+  /**
+   * set the address for this Messenger. This is useful if the Messenger
+   * binds to a specific address but advertises a different address on
+   * the network.
+   *
+   * @param addr The address to use.
+   */
+  virtual void set_addrs(const entity_addrvec_t &addr) = 0;
+  /// @return the current default send priority (default_send_priority).
+  int get_default_send_priority() {
+    return default_send_priority;
+  }
+  /**
+   * Get the number of Messages which the Messenger has received
+   * but not yet dispatched.
+   */
+  virtual int get_dispatch_queue_len() = 0;
+
+  /**
+   * Get age of oldest undelivered message
+   * (0 if the queue is empty)
+   */
+  virtual double get_dispatch_queue_max_age(utime_t now) = 0;
+
+  /**
+   * @} // Accessors
+   */
+
+  /**
+   * @defgroup Configuration
+   * @{
+   */
+  /**
+   * set the cluster protocol in use by this daemon.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param p The cluster protocol to use. Defined externally.
+   */
+  virtual void set_cluster_protocol(int p) = 0;
+  /**
+   * set a policy which is applied to all peers who do not have a type-specific
+   * Policy.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param p The Policy to apply.
+   */
+  virtual void set_default_policy(Policy p) = 0;
+  /**
+   * set a policy which is applied to all peers of the given type.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param type The peer type this policy applies to.
+   * @param p The policy to apply.
+   */
+  virtual void set_policy(int type, Policy p) = 0;
+  /**
+   * Get the Policy associated with a type of peer.
+   *
+   * This can be called either on initial setup, or after connections
+   * are already established.  However, the policies for existing
+   * connections will not be affected; the new policy will only apply
+   * to future connections.
+   *
+   * @param t The peer type to get the default policy for.
+   * @return The Policy for peer type @p t, returned by value.
+   */
+  virtual Policy get_policy(int t) = 0;
+  /**
+   * Get the default Policy
+   *
+   * @return The default Policy, returned by value.
+   */
+  virtual Policy get_default_policy() = 0;
+  /**
+   * set Throttlers applied to all Messages from the given type of peer
+   *
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param type The peer type the Throttlers will apply to.
+   * @param bytes The Throttle for the number of bytes carried by the message
+   * @param msgs The Throttle for the number of messages for this @p type
+   * @note The Messenger does not take ownership of the Throttle pointers, but
+   * you must not destroy them before you destroy the Messenger.
+   */
+  virtual void set_policy_throttlers(int type, Throttle *bytes, Throttle *msgs=NULL) = 0;
+  /**
+   * Set the default send priority.
+   *
+   * This is an init-time function and must be called *before* calling
+   * start(); the implementation asserts that the Messenger has not been
+   * started.
+   *
+   * @param p The default send priority to use.
+   */
+  void set_default_send_priority(int p) {
+    ceph_assert(!started);
+    default_send_priority = p;
+  }
+  /**
+   * Record the priority (SO_PRIORITY) for all packets to be sent on this
+   * socket.
+   *
+   * Linux uses this value to order the networking queues: depending on the
+   * selected device queueing discipline, packets with a higher priority
+   * may be processed first.
+   *
+   * @param prio The priority. Setting a priority outside the range 0 to 6
+   * requires the CAP_NET_ADMIN capability.
+   */
+  void set_socket_priority(int prio) { socket_priority = prio; }
+  /**
+   * Socket priority accessor.
+   *
+   * @return the stored socket priority (socket_priority)
+   */
+  int get_socket_priority() { return socket_priority; }
+  /**
+   * Insert a Dispatcher at the front of the dispatch list. Inserting a
+   * Dispatcher that is already present yields a duplicate entry, which
+   * reduces efficiency but does not break anything.
+   *
+   * The Dispatcher is also put at the front of the fast-dispatch list when
+   * its ms_can_fast_dispatch_any() returns true, and ready() is invoked if
+   * the dispatch list was empty before this call.
+   *
+   * @param d The Dispatcher to insert into the list.
+   */
+  void add_dispatcher_head(Dispatcher *d) {
+    const bool was_empty = dispatchers.empty();
+    dispatchers.push_front(d);
+    if (d->ms_can_fast_dispatch_any()) {
+      fast_dispatchers.push_front(d);
+    }
+    if (was_empty) {
+      ready();
+    }
+  }
+  /**
+   * Append a Dispatcher to the end of the dispatch list. Appending a
+   * Dispatcher that is already present yields a duplicate entry, which
+   * reduces efficiency but does not break anything.
+   *
+   * The Dispatcher is also appended to the fast-dispatch list when its
+   * ms_can_fast_dispatch_any() returns true, and ready() is invoked if
+   * the dispatch list was empty before this call.
+   *
+   * @param d The Dispatcher to insert into the list.
+   */
+  void add_dispatcher_tail(Dispatcher *d) {
+    const bool was_empty = dispatchers.empty();
+    dispatchers.push_back(d);
+    if (d->ms_can_fast_dispatch_any()) {
+      fast_dispatchers.push_back(d);
+    }
+    if (was_empty) {
+      ready();
+    }
+  }
+  /**
+   * Bind the Messenger to a specific address. If bind_addr
+   * is not completely filled in the system will use the
+   * valid portions and cycle through the unset ones (eg, the port)
+   * in an unspecified order.
+   *
+   * @param bind_addr The address to bind to.
+   * @return 0 on success, or -1 on error, or -errno if
+   * we can be more specific about the failure.
+   */
+  virtual int bind(const entity_addr_t& bind_addr) = 0;
+
+  virtual int bindv(const entity_addrvec_t& addrs);
+
+  /**
+   * Perform a full restart of the Messenger component, whatever that
+   * means.  Other entities who connect to this Messenger post-rebind()
+   * should perceive it as a new entity which they have not previously
+   * contacted, and it MUST bind to a different address than it did
+   * previously.
+   *
+   * @param avoid_ports Additional ports to avoid binding to.
+   * @return -EOPNOTSUPP in this base implementation.
+   */
+  virtual int rebind(const std::set<int>& avoid_ports) {
+    return -EOPNOTSUPP;
+  }
+  /**
+   * Bind the 'client' Messenger to a specific address. The Messenger will
+   * bind to the address before connecting to others when the option
+   * ms_bind_before_connect is true.
+   * @param bind_addr The address to bind to.
+   * @return 0 on success, or -1 on error, or -errno if
+   * we can be more specific about the failure.
+   */
+  virtual int client_bind(const entity_addr_t& bind_addr) = 0;
+
+  /**
+   * reset the 'client' Messenger. Mark all the existing Connections down
+   * and update 'nonce'.
+   */
+  virtual int client_reset() = 0;
+
+
+  /// Base implementation: always reports false.
+  virtual bool should_use_msgr2() { return false; }
+
+  /**
+   * @} // Configuration
+   */
+
+  /**
+   * @defgroup Startup/Shutdown
+   * @{
+   */
+  /**
+   * Perform any resource allocation, thread startup, etc. that is required
+   * before attempting to connect to other Messengers or transmit messages.
+   * Once this function completes, started shall be set to true; the base
+   * implementation does nothing beyond setting that flag.
+   *
+   * @return 0 on success; -errno on failure.
+   */
+  virtual int start() {
+    started = true;
+    return 0;
+  }
+
+  // shutdown
+  /**
+   * Block until the Messenger has finished shutting down (according
+   * to the shutdown() function).
+   * It is valid to call this after calling shutdown(), but it must
+   * be called before deleting the Messenger.
+   */
+  virtual void wait() = 0;
+  /**
+   * Initiate a shutdown of the Messenger. The base implementation only
+   * clears the started flag.
+   *
+   * @return 0 on success, -errno otherwise.
+   */
+  virtual int shutdown() {
+    started = false;
+    return 0;
+  }
+  /**
+   * @} // Startup/Shutdown
+   */
+
+  /**
+   * @defgroup Messaging
+   * @{
+   */
+  /**
+   * Queue the given Message for the given entity.
+   * Success in this function does not guarantee Message delivery, only
+   * success in queueing the Message. Other guarantees may be provided based
+   * on the Connection policy associated with the dest.
+   *
+   * @param m The Message to send. The Messenger consumes a single reference
+   * when you pass it in.
+   * @param type The entity type of the destination.
+   * @param addr The address(es) of the entity to send the Message to.
+   *
+   * DEPRECATED: please do not use this interface for any new code;
+   * use the Connection* variant.
+   *
+   * @return 0 on success, or -errno on failure.
+   */
+  virtual int send_to(
+    Message *m,
+    int type,
+    const entity_addrvec_t& addr) = 0;
+  /// Convenience wrapper: send_to() with type CEPH_ENTITY_TYPE_MON.
+  int send_to_mon(Message *m, const entity_addrvec_t& addrs) {
+    return send_to(m, CEPH_ENTITY_TYPE_MON, addrs);
+  }
+  /// Convenience wrapper: send_to() with type CEPH_ENTITY_TYPE_MDS.
+  int send_to_mds(Message *m, const entity_addrvec_t& addrs) {
+    return send_to(m, CEPH_ENTITY_TYPE_MDS, addrs);
+  }
+
+  /**
+   * @} // Messaging
+   */
+  /**
+   * @defgroup Connection Management
+   * @{
+   */
+  /**
+   * Get the Connection object associated with a given entity. If a
+   * Connection does not exist, create one and establish a logical connection.
+   * The caller owns a reference when this returns. Call ->put() when you're
+   * done!
+   *
+   * @param dest The entity to get a connection for.
+   */
+  virtual ConnectionRef connect_to(
+    int type, const entity_addrvec_t& dest,
+    bool anon=false, bool not_local_dest=false) = 0;
+  /// connect_to() with the peer type fixed to CEPH_ENTITY_TYPE_MON.
+  ConnectionRef connect_to_mon(const entity_addrvec_t& dest,
+                               bool anon=false, bool not_local_dest=false) {
+    return connect_to(CEPH_ENTITY_TYPE_MON, dest, anon, not_local_dest);
+  }
+  /// connect_to() with the peer type fixed to CEPH_ENTITY_TYPE_MDS.
+  ConnectionRef connect_to_mds(const entity_addrvec_t& dest,
+                               bool anon=false, bool not_local_dest=false) {
+    return connect_to(CEPH_ENTITY_TYPE_MDS, dest, anon, not_local_dest);
+  }
+  /// connect_to() with the peer type fixed to CEPH_ENTITY_TYPE_OSD.
+  ConnectionRef connect_to_osd(const entity_addrvec_t& dest,
+                               bool anon=false, bool not_local_dest=false) {
+    return connect_to(CEPH_ENTITY_TYPE_OSD, dest, anon, not_local_dest);
+  }
+  /// connect_to() with the peer type fixed to CEPH_ENTITY_TYPE_MGR.
+  ConnectionRef connect_to_mgr(const entity_addrvec_t& dest,
+                               bool anon=false, bool not_local_dest=false) {
+    return connect_to(CEPH_ENTITY_TYPE_MGR, dest, anon, not_local_dest);
+  }
+
+  /**
+   * Get the Connection object associated with ourselves.
+   */
+  virtual ConnectionRef get_loopback_connection() = 0;
+  /**
+   * Mark down a Connection to a remote.
+   *
+   * This will cause us to discard our outgoing queue for them, and if
+   * reset detection is enabled in the policy and the endpoint tries
+   * to reconnect they will discard their queue when we inform them of
+   * the session reset.
+   *
+   * If there is no Connection to the given dest, it is a no-op.
+   *
+   * This generates a RESET notification to the Dispatcher.
+   *
+   * DEPRECATED: please do not use this interface for any new code;
+   * use the Connection* variant.
+   *
+   * @param a The address to mark down.
+   */
+  virtual void mark_down(const entity_addr_t& a) = 0;
+  /// addrvec variant of mark_down(): the base implementation forwards
+  /// a.legacy_addr() to mark_down().
+  virtual void mark_down_addrs(const entity_addrvec_t& a) {
+    const auto& legacy = a.legacy_addr();
+    mark_down(legacy);
+  }
+  /**
+   * Mark all the existing Connections down. This is equivalent
+   * to iterating over all Connections and calling mark_down()
+   * on each.
+   *
+   * This will generate a RESET event for each closed connection.
+   */
+  virtual void mark_down_all() = 0;
+  /**
+   * @} // Connection Management
+   */
+protected:
+  /**
+   * @defgroup Subclass Interfacing
+   * @{
+   */
+  /**
+   * Courtesy hook for Messenger implementations, called when we receive
+   * our first Dispatcher. The base implementation is a no-op.
+   */
+  virtual void ready() {}
+  /**
+   * @} // Subclass Interfacing
+   */
+public:
+#ifdef CEPH_USE_SIGPIPE_BLOCKER
+  /**
+   * We need to disable SIGPIPE on all platforms, and if they
+   * don't give us a better mechanism (read: are on Solaris) that
+   * means blocking the signal whenever we do a send or sendmsg...
+   * That means any implementations must invoke MSGR_SIGPIPE_STOPPER in-scope
+   * whenever doing so. On most systems that's blank, but on systems where
+   * it's needed we construct an RAII object to plug and un-plug the SIGPIPE.
+   * See http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html
+   *
+   * Construction: if a SIGPIPE is already pending nothing is changed
+   * (blocked == false); otherwise SIGPIPE is added to the calling thread's
+   * signal mask and the previous mask is saved in existing_mask.
+   * Destruction: if we blocked the signal, a SIGPIPE that became pending in
+   * the meantime is consumed with a zero-timeout sigtimedwait() and the
+   * saved mask is restored.
+   */
+  struct sigpipe_stopper {
+    bool blocked;
+    sigset_t existing_mask;
+    sigset_t pipe_mask;
+    sigpipe_stopper() {
+      sigemptyset(&pipe_mask);
+      sigaddset(&pipe_mask, SIGPIPE);
+      sigset_t signals;
+      sigemptyset(&signals);
+      sigpending(&signals);
+      if (sigismember(&signals, SIGPIPE)) {
+        blocked = false;
+      } else {
+        blocked = true;
+        int r = pthread_sigmask(SIG_BLOCK, &pipe_mask, &existing_mask);
+        ceph_assert(r == 0);
+      }
+    }
+    // The guard restores a saved signal mask; a copy would restore it twice.
+    sigpipe_stopper(const sigpipe_stopper&) = delete;
+    sigpipe_stopper& operator=(const sigpipe_stopper&) = delete;
+    ~sigpipe_stopper() {
+      if (blocked) {
+        struct timespec nowait{0};
+        // sigtimedwait() returns the signal number on success and -1 with
+        // errno set on failure; EAGAIN means no SIGPIPE was pending.
+        // Retry if the call itself was interrupted.
+        int r;
+        do {
+          r = sigtimedwait(&pipe_mask, 0, &nowait);
+        } while (r == -1 && errno == EINTR);
+        ceph_assert(r == SIGPIPE || (r == -1 && errno == EAGAIN));
+        r = pthread_sigmask(SIG_SETMASK, &existing_mask, 0);
+        ceph_assert(r == 0);
+      }
+    }
+  };
+  // No parentheses after the variable name: "stopper()" would be parsed as
+  // a function declaration and no guard object would ever be constructed.
+#  define MSGR_SIGPIPE_STOPPER Messenger::sigpipe_stopper stopper;
+#else
+#  define MSGR_SIGPIPE_STOPPER
+#endif
+  /**
+   * @defgroup Dispatcher Interfacing
+   * @{
+   */
+  /**
+   * Report whether a message can be fast-dispatched. Each fast Dispatcher
+   * is asked in list order, via ms_can_fast_dispatch2(), whether it is
+   * capable of handling the message via "fast dispatch".
+   *
+   * @param m The Message we are testing.
+   * @return true as soon as one Dispatcher accepts; false if none does.
+   */
+  bool ms_can_fast_dispatch(const ceph::cref_t<Message>& m) {
+    for (const auto& d : fast_dispatchers) {
+      if (d->ms_can_fast_dispatch2(m)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Deliver a single Message via "fast dispatch".
+   *
+   * Stamps the message with the current time, then hands it to the first
+   * fast Dispatcher whose ms_can_fast_dispatch2() accepts it.
+   * If none of our Dispatchers can handle it, ceph_abort().
+   *
+   * @param m The Message we are fast dispatching.
+   */
+  void ms_fast_dispatch(const ceph::ref_t<Message> &m) {
+    m->set_dispatch_stamp(ceph_clock_now());
+    for (const auto& d : fast_dispatchers) {
+      if (!d->ms_can_fast_dispatch2(m)) {
+        continue;
+      }
+      d->ms_fast_dispatch2(m);
+      return;
+    }
+    ceph_abort();
+  }
+  /// Raw-pointer overload: wraps @p m in a ref_t constructed with `false`
+  /// (taking over the caller's reference) and forwards to the overload above.
+  void ms_fast_dispatch(Message *m) {
+    ms_fast_dispatch(ceph::ref_t<Message>(m, false)); /* consume ref */
+  }
+  /**
+   * Call ms_fast_preprocess2() on every fast Dispatcher, in list order.
+   *
+   * @param m The Message handed to each Dispatcher.
+   */
+  void ms_fast_preprocess(const ceph::ref_t<Message> &m) {
+    for (const auto& d : fast_dispatchers)
+      d->ms_fast_preprocess2(m);
+  }
+  /**
+   *  Deliver a single Message. Stamps it with the current time and offers
+   *  it to each Dispatcher in sequence (ms_dispatch2()) until one of them
+   *  handles it.
+   *  If none of our Dispatchers handles it, the message is logged as
+   *  unhandled and an assertion fires when the ms_die_on_unhandled_msg
+   *  config option is set.
+   *
+   *  @param m The Message to deliver.
+   */
+  void ms_deliver_dispatch(const ceph::ref_t<Message> &m) {
+    m->set_dispatch_stamp(ceph_clock_now());
+    for (const auto& d : dispatchers) {
+      if (d->ms_dispatch2(m)) {
+        return;
+      }
+    }
+    lsubdout(cct, ms, 0) << "ms_deliver_dispatch: unhandled message "
+                         << m << " " << *m << " from "
+                         << m->get_source_inst() << dendl;
+    ceph_assert(!cct->_conf->ms_die_on_unhandled_msg);
+  }
+  /// Raw-pointer overload: wraps @p m in a ref_t constructed with `false`
+  /// (taking over the caller's reference) and forwards to the overload above.
+  void ms_deliver_dispatch(Message *m) {
+    ms_deliver_dispatch(ceph::ref_t<Message>(m, false)); /* consume ref */
+  }
+  /**
+   * Tell every Dispatcher about a new Connection by calling its
+   * ms_handle_connect(). Call this function whenever a new Connection is
+   * initiated or reconnects.
+   *
+   * @param con Pointer to the new Connection.
+   */
+  void ms_deliver_handle_connect(Connection *con) {
+    for (const auto& d : dispatchers)
+      d->ms_handle_connect(con);
+  }
+
+  /**
+   * Tell every fast Dispatcher about a new Connection by calling its
+   * ms_handle_fast_connect(). Call this function whenever a new Connection
+   * is initiated or reconnects.
+   *
+   * @param con Pointer to the new Connection.
+   */
+  void ms_deliver_handle_fast_connect(Connection *con) {
+    for (const auto& d : fast_dispatchers)
+      d->ms_handle_fast_connect(con);
+  }
+
+  /**
+   * Tell every Dispatcher about a new incoming Connection by calling its
+   * ms_handle_accept(). Call this function whenever a new Connection is
+   * accepted.
+   *
+   * @param con Pointer to the new Connection.
+   */
+  void ms_deliver_handle_accept(Connection *con) {
+    for (const auto& d : dispatchers)
+      d->ms_handle_accept(con);
+  }
+
+  /**
+   * Tell every fast Dispatcher about a new incoming Connection by calling
+   * its ms_handle_fast_accept(). Call this function whenever a new
+   * Connection is accepted.
+   *
+   * @param con Pointer to the new Connection.
+   */
+  void ms_deliver_handle_fast_accept(Connection *con) {
+    for (const auto& d : fast_dispatchers)
+      d->ms_handle_fast_accept(con);
+  }
+
+  /**
+   * Notify Dispatchers of a Connection which may have lost Messages.
+   * Dispatchers are called in list order via ms_handle_reset(); iteration
+   * stops at the first one that returns true. Call this function whenever
+   * you detect that a lossy Connection has been disconnected.
+   *
+   * @param con Pointer to the broken Connection.
+   */
+  void ms_deliver_handle_reset(Connection *con) {
+    for (const auto& d : dispatchers) {
+      if (d->ms_handle_reset(con)) {
+        break;
+      }
+    }
+  }
+  /**
+   * Notify every Dispatcher (via ms_handle_remote_reset()) of a Connection
+   * which has been "forgotten" about by the remote end, implying that
+   * messages have probably been lost.
+   * Call this function whenever you detect a reset.
+   *
+   * @param con Pointer to the broken Connection.
+   */
+  void ms_deliver_handle_remote_reset(Connection *con) {
+    for (const auto& d : dispatchers)
+      d->ms_handle_remote_reset(con);
+  }
+
+  /**
+   * Notify Dispatchers of a Connection for which reconnection attempts
+   * are being refused. Dispatchers are called in list order via
+   * ms_handle_refused(); iteration stops at the first one that returns
+   * true. Call this function whenever you detect that a lossy Connection
+   * has been disconnected and it's impossible to reconnect.
+   *
+   * @param con Pointer to the broken Connection.
+   */
+  void ms_deliver_handle_refused(Connection *con) {
+    for (const auto& d : dispatchers) {
+      if (d->ms_handle_refused(con)) {
+        break;
+      }
+    }
+  }
+
+  /// Store @p b in the require_authorizer flag.
+  void set_require_authorizer(bool b) { require_authorizer = b; }
+
+  /**
+   * @} // Dispatcher Interfacing
+   */
+};
+
+
+
+#endif
diff --git a/src/msg/Policy.h b/src/msg/Policy.h
new file mode 100644
index 000000000..10a426f2f
--- /dev/null
+++ b/src/msg/Policy.h
@@ -0,0 +1,129 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/ceph_features.h"
+
+namespace ceph::net {
+
+using peer_type_t = int;
+
+/**
+ * A Policy describes the rules of a Connection. Is there a limit on how
+ * much data this Connection can have locally? When the underlying connection
+ * experiences an error, does the Connection disappear? Can this Messenger
+ * re-establish the underlying connection?
+ *
+ * Every member carries its default as an in-class initializer; the
+ * constructors only set what differs. Instances with non-default flags are
+ * obtained through the static factory functions below.
+ */
+template<class ThrottleType>
+struct Policy {
+  /// If true, the Connection is tossed out on errors.
+  bool lossy = false;
+  /// If true, the underlying connection can't be re-established from this end.
+  bool server = false;
+  /// If true, we will standby when idle
+  bool standby = false;
+  /// If true, we will try to detect session resets
+  bool resetcheck = true;
+
+  /// Server: register lossy client connections.
+  // The net result of this is that a given client can only have one
+  // open connection with the server.  If a new connection is made,
+  // the old (registered) one is closed by the messenger during the accept
+  // process.
+  bool register_lossy_clients = true;
+
+  /**
+   *  The throttler is used to limit how much data is held by Messages from
+   *  the associated Connection(s). When reading in a new Message, the Messenger
+   *  will call throttler->throttle() for the size of the new Message.
+   */
+  ThrottleType* throttler_bytes = nullptr;
+  ThrottleType* throttler_messages = nullptr;
+
+  /// Specify features supported locally by the endpoint.
+  uint64_t features_supported = CEPH_FEATURES_SUPPORTED_DEFAULT;
+  /// Specify features any remotes must have to talk to this endpoint.
+  uint64_t features_required = 0;
+
+  /// Default policy: every member keeps its in-class default.
+  Policy() = default;
+private:
+  /// Used by the factories: sets the five behaviour flags and the required
+  /// feature bits; throttlers and supported features keep their defaults.
+  Policy(bool l, bool s, bool st, bool r, bool rlc, uint64_t req)
+    : lossy(l), server(s), standby(st), resetcheck(r),
+      register_lossy_clients(rlc),
+      features_required(req) {}
+
+public:
+  // Factory argument order: lossy, server, standby, resetcheck,
+  // register_lossy_clients, features_required.
+  static Policy stateful_server(uint64_t req) {
+    return Policy(false, true, true, true, true, req);
+  }
+  static Policy stateless_registered_server(uint64_t req) {
+    return Policy(true, true, false, false, true, req);
+  }
+  static Policy stateless_server(uint64_t req) {
+    return Policy(true, true, false, false, false, req);
+  }
+  static Policy lossless_peer(uint64_t req) {
+    return Policy(false, false, true, false, true, req);
+  }
+  static Policy lossless_peer_reuse(uint64_t req) {
+    return Policy(false, false, true, true, true, req);
+  }
+  static Policy lossy_client(uint64_t req) {
+    return Policy(true, false, false, false, true, req);
+  }
+  static Policy lossless_client(uint64_t req) {
+    return Policy(false, false, false, true, true, req);
+  }
+};
+
+/**
+ * Per-peer-type Policy table with a fallback: lookups for a peer type
+ * that has no entry in policy_map yield default_policy.
+ */
+template<class ThrottleType>
+class PolicySet {
+  using policy_t = Policy<ThrottleType>;
+  /// the default Policy we use for Pipes
+  policy_t default_policy;
+  /// map specifying different Policies for specific peer types
+  std::map<int, policy_t> policy_map; // entity_name_t::type -> Policy
+
+public:
+  /// @return the Policy registered for @p peer_type, or default_policy
+  /// when there is none.
+  const policy_t& get(peer_type_t peer_type) const {
+    const auto it = policy_map.find(peer_type);
+    return it == policy_map.end() ? default_policy : it->second;
+  }
+  /// Mutable counterpart of the const get() above; same fallback rule.
+  policy_t& get(peer_type_t peer_type) {
+    const auto it = policy_map.find(peer_type);
+    return it == policy_map.end() ? default_policy : it->second;
+  }
+  /// Register (or overwrite) the Policy for @p peer_type.
+  void set(peer_type_t peer_type, const policy_t& p) {
+    policy_map[peer_type] = p;
+  }
+  /// @return the fallback Policy.
+  const policy_t& get_default() const { return default_policy; }
+  /// Replace the fallback Policy.
+  void set_default(const policy_t& p) { default_policy = p; }
+  /// Install both throttler pointers on the Policy that get() yields for
+  /// @p peer_type. When that type has no entry of its own this modifies
+  /// default_policy.
+  void set_throttlers(peer_type_t peer_type,
+                      ThrottleType* byte_throttle,
+                      ThrottleType* msg_throttle) {
+    policy_t& target = get(peer_type);
+    target.throttler_bytes = byte_throttle;
+    target.throttler_messages = msg_throttle;
+  }
+};
+
+}
diff --git a/src/msg/SimplePolicyMessenger.h b/src/msg/SimplePolicyMessenger.h
new file mode 100644
index 000000000..3b25983a6
--- /dev/null
+++ b/src/msg/SimplePolicyMessenger.h
@@ -0,0 +1,99 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef SIMPLE_POLICY_MESSENGER_H
+#define SIMPLE_POLICY_MESSENGER_H
+
+#include "Messenger.h"
+#include "Policy.h"
+
+/**
+ * Messenger that implements the policy accessors on top of a
+ * ceph::net::PolicySet; every access takes policy_lock.
+ */
+class SimplePolicyMessenger : public Messenger
+{
+private:
+  /// lock protecting policy
+  ceph::mutex policy_lock =
+    ceph::make_mutex("SimplePolicyMessenger::policy_lock");
+  // entity_name_t::type -> Policy
+  ceph::net::PolicySet<Throttle> policy_set;
+
+public:
+
+  SimplePolicyMessenger(CephContext *cct, entity_name_t name)
+    : Messenger(cct, name)
+  {}
+
+  /**
+   * Get the Policy associated with a type of peer.
+   *
+   * @param t The peer type to look up.
+   * @return A copy of what policy_set.get(t) yields, taken while holding
+   * policy_lock.
+   */
+  Policy get_policy(int t) override {
+    std::lock_guard guard{policy_lock};
+    return policy_set.get(t);
+  }
+
+  /**
+   * Get the default Policy.
+   *
+   * @return A copy of the default Policy, taken while holding policy_lock.
+   */
+  Policy get_default_policy() override {
+    std::lock_guard guard{policy_lock};
+    return policy_set.get_default();
+  }
+
+  /**
+   * Set the policy applied to all peers that have no type-specific
+   * Policy.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param p The Policy to apply.
+   */
+  void set_default_policy(Policy p) override {
+    std::lock_guard guard{policy_lock};
+    policy_set.set_default(p);
+  }
+
+  /**
+   * Set the policy applied to all peers of the given type.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param type The peer type this policy applies to.
+   * @param p The policy to apply.
+   */
+  void set_policy(int type, Policy p) override {
+    std::lock_guard guard{policy_lock};
+    policy_set.set(type, p);
+  }
+
+  /**
+   * Set the Throttlers applied to all Messages from the given type of
+   * peer.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param type The peer type the Throttlers will apply to.
+   * @param byte_throttle The byte Throttle handed to the policy set.
+   * @param msg_throttle The message Throttle handed to the policy set.
+   * The messenger does not take ownership of these pointers, but you must
+   * not destroy them before you destroy the messenger.
+   */
+  void set_policy_throttlers(int type,
+                             Throttle* byte_throttle,
+                             Throttle* msg_throttle) override {
+    std::lock_guard guard{policy_lock};
+    policy_set.set_throttlers(type, byte_throttle, msg_throttle);
+  }
+
+}; /* SimplePolicyMessenger */
+
+#endif /* SIMPLE_POLICY_MESSENGER_H */
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc
new file mode 100644
index 000000000..5769c580e
--- /dev/null
+++ b/src/msg/async/AsyncConnection.cc
@@ -0,0 +1,786 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+
+#include "include/Context.h"
+#include "include/random.h"
+#include "common/errno.h"
+#include "AsyncMessenger.h"
+#include "AsyncConnection.h"
+
+#include "ProtocolV1.h"
+#include "ProtocolV2.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "common/EventTrace.h"
+
+// Constant to limit starting sequence number to 2^31.  Nothing special about it, just a big number.  PLR
+#define SEQ_MASK  0x7fffffff
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+std::ostream& AsyncConnection::_conn_prefix(std::ostream *_dout) {
+  return *_dout << "-- " << async_msgr->get_myaddrs() << " >> "
+		<< *peer_addrs << " conn(" << this
+		<< (msgr2 ? " msgr2=" : " legacy=")
+		<< protocol.get()
+		<< " " << ceph_con_mode_name(protocol->auth_meta->con_mode)
+                << " :" << port
+                << " s=" << get_state_name(state)
+                << " l=" << policy.lossy
+                << ").";
+}
+
+// Notes:
+// 1. Don't dispatch any event when closed! It may keep the AsyncConnection alive even after the AsyncMessenger is dead
+
+const uint32_t AsyncConnection::TCP_PREFETCH_MIN_SIZE = 512;
+
+class C_time_wakeup : public EventCallback {
+  AsyncConnectionRef conn;
+
+ public:
+  explicit C_time_wakeup(AsyncConnectionRef c): conn(c) {}
+  void do_request(uint64_t fd_or_id) override {
+    conn->wakeup_from(fd_or_id);
+  }
+};
+
+class C_handle_read : public EventCallback {
+  AsyncConnectionRef conn;
+
+ public:
+  explicit C_handle_read(AsyncConnectionRef c): conn(c) {}
+  void do_request(uint64_t fd_or_id) override {
+    conn->process();
+  }
+};
+
+class C_handle_write : public EventCallback {
+  AsyncConnectionRef conn;
+
+ public:
+  explicit C_handle_write(AsyncConnectionRef c): conn(c) {}
+  void do_request(uint64_t fd) override {
+    conn->handle_write();
+  }
+};
+
+class C_handle_write_callback : public EventCallback {
+  AsyncConnectionRef conn;
+
+public:
+  explicit C_handle_write_callback(AsyncConnectionRef c) : conn(c) {}
+  void do_request(uint64_t fd) override { conn->handle_write_callback(); }
+};
+
+class C_clean_handler : public EventCallback {
+  AsyncConnectionRef conn;
+ public:
+  explicit C_clean_handler(AsyncConnectionRef c): conn(c) {}
+  void do_request(uint64_t id) override {
+    conn->cleanup();
+    delete this;
+  }
+};
+
+class C_tick_wakeup : public EventCallback {
+  AsyncConnectionRef conn;
+
+ public:
+  explicit C_tick_wakeup(AsyncConnectionRef c): conn(c) {}
+  void do_request(uint64_t fd_or_id) override {
+    conn->tick(fd_or_id);
+  }
+};
+
+
+AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q,
+                                 Worker *w, bool m2, bool local)
+  : Connection(cct, m),
+    delay_state(NULL), async_msgr(m), conn_id(q->get_id()),
+    logger(w->get_perf_counter()),
+    state(STATE_NONE), port(-1),
+    dispatch_queue(q), recv_buf(NULL),
+    recv_max_prefetch(std::max<int64_t>(msgr->cct->_conf->ms_tcp_prefetch_max_size, TCP_PREFETCH_MIN_SIZE)),
+    recv_start(0), recv_end(0),
+    last_active(ceph::coarse_mono_clock::now()),
+    connect_timeout_us(cct->_conf->ms_connection_ready_timeout*1000*1000),
+    inactive_timeout_us(cct->_conf->ms_connection_idle_timeout*1000*1000),
+    msgr2(m2), state_offset(0),
+    worker(w), center(&w->center),read_buffer(nullptr)
+{
+#ifdef UNIT_TESTS_BUILT
+  this->interceptor = m->interceptor;
+#endif
+  read_handler = new C_handle_read(this);
+  write_handler = new C_handle_write(this);
+  write_callback_handler = new C_handle_write_callback(this);
+  wakeup_handler = new C_time_wakeup(this);
+  tick_handler = new C_tick_wakeup(this);
+  // double recv_max_prefetch see "read_until"
+  recv_buf = new char[2*recv_max_prefetch];
+  if (local) {
+    protocol = std::unique_ptr<Protocol>(new LoopbackProtocolV1(this));
+  } else if (m2) {
+    protocol = std::unique_ptr<Protocol>(new ProtocolV2(this));
+  } else {
+    protocol = std::unique_ptr<Protocol>(new ProtocolV1(this));
+  }
+  logger->inc(l_msgr_created_connections);
+}
+
+AsyncConnection::~AsyncConnection()
+{
+  // delete[] on a null pointer is a no-op, so no guard is needed here
+  delete[] recv_buf;
+  ceph_assert(!delay_state);
+}
+
+int AsyncConnection::get_con_mode() const
+{
+  return protocol->get_con_mode();
+}
+
+bool AsyncConnection::is_msgr2() const
+{
+  return protocol->proto_type == 2;
+}
+
+void AsyncConnection::maybe_start_delay_thread()
+{
+  // Create the delay queue once, and only if this peer's type name appears
+  // in the ms_inject_delay_type setting.
+  if (delay_state) return;
+  async_msgr->cct->_conf.with_val<std::string>(
+    "ms_inject_delay_type",
+    [this](const std::string& types) {
+      if (types.find(ceph_entity_type_name(peer_type)) == std::string::npos) {
+        return;
+      }
+      ldout(msgr->cct, 1) << __func__ << " setting up a delay queue" << dendl;
+      delay_state = new DelayedDelivery(async_msgr, center, dispatch_queue, conn_id);
+    });
+}
+
+
+ssize_t AsyncConnection::read(unsigned len, char *buffer,
+                              std::function<void(char *, ssize_t)> callback) {
+  ldout(async_msgr->cct, 20) << __func__
+                             << (pendingReadLen ? " continue" : " start")
+                             << " len=" << len << dendl;
+  ssize_t r = read_until(len, buffer);
+  if (r > 0) {
+    readCallback = callback;
+    pendingReadLen = len;
+    read_buffer = buffer;
+  }
+  return r;
+}
+
+// This function may be called multiple times to fill the requested
+// buffer, so the same buffer pointer must be passed in on every call.
+// Normally, only "read_message" will pass an existing bufferptr in.
+//
+// It uses a readahead method to reduce the overhead of small reads;
+// "recv_buf" is used to store the prefetched data.
+//
+// Returns the number of bytes still missing; 0 means the buffer is complete,
+// and a value < 0 means an error occurred.
+ssize_t AsyncConnection::read_until(unsigned len, char *p)
+{
+  ldout(async_msgr->cct, 25) << __func__ << " len is " << len << " state_offset is "
+                             << state_offset << dendl;
+
+  if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) {
+    if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl;
+      cs.shutdown();
+    }
+  }
+
+  ssize_t r = 0;
+  uint64_t left = len - state_offset;
+  if (recv_end > recv_start) {
+    uint64_t to_read = std::min<uint64_t>(recv_end - recv_start, left);
+    memcpy(p, recv_buf+recv_start, to_read);
+    recv_start += to_read;
+    left -= to_read;
+    ldout(async_msgr->cct, 25) << __func__ << " got " << to_read << " in buffer "
+                               << " left is " << left << " buffer still has "
+                               << recv_end - recv_start << dendl;
+    if (left == 0) {
+      return 0;
+    }
+    state_offset += to_read;
+  }
+
+  recv_end = recv_start = 0;
+  /* nothing left in the prefetch buffer */
+  if (left > (uint64_t)recv_max_prefetch) {
+    /* this was a large read, we don't prefetch for these */
+    do {
+      r = read_bulk(p+state_offset, left);
+      ldout(async_msgr->cct, 25) << __func__ << " read_bulk left is " << left << " got " << r << dendl;
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl;
+        return -1;
+      } else if (r == static_cast<int>(left)) {
+        state_offset = 0;
+        return 0;
+      }
+      state_offset += r;
+      left -= r;
+    } while (r > 0);
+  } else {
+    do {
+      r = read_bulk(recv_buf+recv_end, recv_max_prefetch);
+      ldout(async_msgr->cct, 25) << __func__ << " read_bulk recv_end is " << recv_end
+                                 << " left is " << left << " got " << r << dendl;
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl;
+        return -1;
+      }
+      recv_end += r;
+      if (r >= static_cast<int>(left)) {
+        recv_start = len - state_offset;
+        memcpy(p+state_offset, recv_buf, recv_start);
+        state_offset = 0;
+        return 0;
+      }
+      left -= r;
+    } while (r > 0);
+    memcpy(p+state_offset, recv_buf, recv_end-recv_start);
+    state_offset += (recv_end - recv_start);
+    recv_end = recv_start = 0;
+  }
+  ldout(async_msgr->cct, 25) << __func__ << " need len " << len << " remaining "
+                             << len - state_offset << " bytes" << dendl;
+  return len - state_offset;
+}
+
+/* Read up to `len` bytes from the connected socket into `buf`.
+ * Returns the byte count, 0 when the socket would block (EAGAIN; EINTR is
+ * retried), or -1 when the read failed or the peer closed the socket. */
+ssize_t AsyncConnection::read_bulk(char *buf, unsigned len)
+{
+  ssize_t nread;
+  do {
+    nread = cs.read(buf, len);
+  } while (nread == -EINTR);
+
+  if (nread == -EAGAIN) return 0;
+  if (nread < 0) {
+    // nread is a negated errno: format it with cpp_strerror(), as _try_send does
+    ldout(async_msgr->cct, 1) << __func__ << " reading from fd=" << cs.fd()
+                        << " : "<< nread << " " << cpp_strerror(nread) << dendl;
+    return -1;
+  }
+  if (nread == 0) {
+    ldout(async_msgr->cct, 1) << __func__ << " peer close file descriptor "
+                              << cs.fd() << dendl;
+    return -1;
+  }
+  return nread;
+}
+
+ssize_t AsyncConnection::write(ceph::buffer::list &bl,
+                               std::function<void(ssize_t)> callback,
+                               bool more) {
+
+    std::unique_lock<std::mutex> l(write_lock);
+    outgoing_bl.claim_append(bl);
+    ssize_t r = _try_send(more);
+    if (r > 0) {
+      writeCallback = callback;
+    }
+    return r;
+}
+
+// Returns the number of bytes still queued in outgoing_bl (which may be more
+// than the caller just appended), or a value < 0 on error.
+ssize_t AsyncConnection::_try_send(bool more)
+{
+  if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) {
+    if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl;
+      cs.shutdown();
+    }
+  }
+
+  ceph_assert(center->in_thread());
+  ldout(async_msgr->cct, 25) << __func__ << " cs.send " << outgoing_bl.length()
+                             << " bytes" << dendl;
+  ssize_t r = cs.send(outgoing_bl, more);
+  if (r < 0) {
+    ldout(async_msgr->cct, 1) << __func__ << " send error: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  ldout(async_msgr->cct, 10) << __func__ << " sent bytes " << r
+                             << " remaining bytes " << outgoing_bl.length() << dendl;
+
+  if (!open_write && is_queued()) {
+    center->create_file_event(cs.fd(), EVENT_WRITABLE, write_handler);
+    open_write = true;
+  }
+
+  if (open_write && !is_queued()) {
+    center->delete_file_event(cs.fd(), EVENT_WRITABLE);
+    open_write = false;
+    if (writeCallback) {
+      center->dispatch_event_external(write_callback_handler);
+    }
+  }
+
+  return outgoing_bl.length();
+}
+
+void AsyncConnection::inject_delay() {
+  if (async_msgr->cct->_conf->ms_inject_internal_delays) {
+    ldout(async_msgr->cct, 10) << __func__ << " sleep for " <<
+      async_msgr->cct->_conf->ms_inject_internal_delays << dendl;
+    utime_t t;
+    t.set_from_double(async_msgr->cct->_conf->ms_inject_internal_delays);
+    t.sleep();
+  }
+}
+
+void AsyncConnection::process() {
+  std::lock_guard<std::mutex> l(lock);
+  last_active = ceph::coarse_mono_clock::now();
+  recv_start_time = ceph::mono_clock::now();
+
+  ldout(async_msgr->cct, 20) << __func__ << dendl;
+
+  switch (state) {
+    case STATE_NONE: {
+      ldout(async_msgr->cct, 20) << __func__ << " enter none state" << dendl;
+      return;
+    }
+    case STATE_CLOSED: {
+      ldout(async_msgr->cct, 20) << __func__ << " socket closed" << dendl;
+      return;
+    }
+    case STATE_CONNECTING: {
+      ceph_assert(!policy.server);
+
+      // clear timer (if any) since we are connecting/re-connecting
+      if (last_tick_id) {
+        center->delete_time_event(last_tick_id);
+      }
+      last_connect_started = ceph::coarse_mono_clock::now();
+      last_tick_id = center->create_time_event(
+          connect_timeout_us, tick_handler);
+
+      if (cs) {
+        center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE);
+        cs.close();
+      }
+
+      SocketOptions opts;
+      opts.priority = async_msgr->get_socket_priority();
+      opts.connect_bind_addr = msgr->get_myaddrs().front();
+      ssize_t r = worker->connect(target_addr, opts, &cs);
+      if (r < 0) {
+        protocol->fault();
+        return;
+      }
+
+      center->create_file_event(cs.fd(), EVENT_READABLE, read_handler);
+      state = STATE_CONNECTING_RE;
+    }
+    case STATE_CONNECTING_RE: {
+      ssize_t r = cs.is_connected();
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " reconnect failed to "
+                                  << target_addr << dendl;
+        if (r == -ECONNREFUSED) {
+          ldout(async_msgr->cct, 2)
+              << __func__ << " connection refused!" << dendl;
+          dispatch_queue->queue_refused(this);
+        }
+        protocol->fault();
+        return;
+      } else if (r == 0) {
+        ldout(async_msgr->cct, 10)
+            << __func__ << " nonblock connect inprogress" << dendl;
+        if (async_msgr->get_stack()->nonblock_connect_need_writable_event()) {
+          center->create_file_event(cs.fd(), EVENT_WRITABLE,
+                                    read_handler);
+        }
+        logger->tinc(l_msgr_running_recv_time,
+               ceph::mono_clock::now() - recv_start_time);
+        return;
+      }
+
+      center->delete_file_event(cs.fd(), EVENT_WRITABLE);
+      ldout(async_msgr->cct, 10)
+          << __func__ << " connect successfully, ready to send banner" << dendl;
+      state = STATE_CONNECTION_ESTABLISHED;
+      break;
+    }
+
+    case STATE_ACCEPTING: {
+      center->create_file_event(cs.fd(), EVENT_READABLE, read_handler);
+      state = STATE_CONNECTION_ESTABLISHED;
+
+      break;
+    }
+
+    case STATE_CONNECTION_ESTABLISHED: {
+      if (pendingReadLen) {
+        ssize_t r = read(*pendingReadLen, read_buffer, readCallback);
+        if (r <= 0) { // read all bytes, or an error occured
+          pendingReadLen.reset();
+          char *buf_tmp = read_buffer;
+          read_buffer = nullptr;
+          readCallback(buf_tmp, r);
+        }
+	logger->tinc(l_msgr_running_recv_time,
+	    ceph::mono_clock::now() - recv_start_time);
+        return;
+      }
+      break;
+    }
+  }
+
+  protocol->read_event();
+
+  logger->tinc(l_msgr_running_recv_time,
+               ceph::mono_clock::now() - recv_start_time);
+}
+
+bool AsyncConnection::is_connected() {
+  return protocol->is_connected();
+}
+
+void AsyncConnection::connect(const entity_addrvec_t &addrs, int type,
+                              entity_addr_t &target) {
+
+  std::lock_guard<std::mutex> l(lock);
+  set_peer_type(type);
+  set_peer_addrs(addrs);
+  policy = msgr->get_policy(type);
+  target_addr = target;
+  _connect();
+}
+
+void AsyncConnection::_connect()
+{
+  ldout(async_msgr->cct, 10) << __func__ << dendl;
+
+  state = STATE_CONNECTING;
+  protocol->connect();
+  // rescheduler connection in order to avoid lock dep
+  // may called by external thread(send_message)
+  center->dispatch_event_external(read_handler);
+}
+
+void AsyncConnection::accept(ConnectedSocket socket,
+			     const entity_addr_t &listen_addr,
+			     const entity_addr_t &peer_addr)
+{
+  ldout(async_msgr->cct, 10) << __func__ << " sd=" << socket.fd()
+			     << " listen_addr " << listen_addr
+			     << " peer_addr " << peer_addr << dendl;
+  ceph_assert(socket.fd() >= 0);
+
+  std::lock_guard<std::mutex> l(lock);
+  cs = std::move(socket);
+  socket_addr = listen_addr;
+  target_addr = peer_addr; // until we know better
+  state = STATE_ACCEPTING;
+  protocol->accept();
+  // rescheduler connection in order to avoid lock dep
+  center->dispatch_event_external(read_handler);
+}
+
+int AsyncConnection::send_message(Message *m)
+{
+  // Hand `m` to the protocol (or to local delivery on a loopback connection).
+  // Always returns 0; when the message is dropped instead, m->put() is called.
+  FUNCTRACE(async_msgr->cct);
+  lgeneric_subdout(async_msgr->cct, ms, 1)
+    << "-- " << async_msgr->get_myaddrs() << " --> "
+    << get_peer_addrs() << " -- " << *m << " -- " << m << " con " << this
+    << dendl;
+
+  if (is_blackhole()) {
+    lgeneric_subdout(async_msgr->cct, ms, 0) << __func__ << " "
+      << ceph_entity_type_name(peer_type) << " blackhole " << *m << dendl;
+    m->put();
+    return 0;
+  }
+
+  // optimistically assume it is ok to encode (the connection may actually be broken by now)
+  if (!m->get_priority())
+    m->set_priority(async_msgr->get_default_send_priority());
+
+  m->get_header().src = async_msgr->get_myname();
+  m->set_connection(this);
+
+#if defined(WITH_EVENTTRACE)
+  if (m->get_type() == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_BEGIN", true);
+  else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_BEGIN", true);
+#endif
+
+  if (is_loopback) { //loopback connection
+    ldout(async_msgr->cct, 20) << __func__ << " " << *m << " local" << dendl;
+    std::lock_guard<std::mutex> l(write_lock);
+    if (protocol->is_connected()) {
+      dispatch_queue->local_delivery(m, m->get_priority());
+    } else {
+      ldout(async_msgr->cct, 10) << __func__ << " loopback connection closed."
+                                 << " Drop message " << m << dendl;
+      m->put();
+    }
+    return 0;
+  }
+
+  // local (loopback) messages are deliberately not counted here: they are so
+  // lightweight that including them in the counter may confuse users
+  logger->inc(l_msgr_send_messages);
+
+  protocol->send_message(m);
+  return 0;
+}
+
+entity_addr_t AsyncConnection::_infer_target_addr(const entity_addrvec_t& av)
+{
+  // Return the first non-legacy entry of `av` (an any: or v2: addr, we don't
+  // care which) whose address family matches socket_addr. If no entry
+  // qualifies, a default-constructed address is returned.
+  for (const auto& candidate : av.v) {
+    if (candidate.is_legacy() ||
+        candidate.get_family() != socket_addr.get_family()) {
+      continue;
+    }
+    ldout(async_msgr->cct,10) << __func__ << " " << av << " -> " << candidate << dendl;
+    return candidate;
+  }
+  ldout(async_msgr->cct,10) << __func__ << " " << av << " -> nothing to match "
+                            << socket_addr << dendl;
+  return {};
+}
+
+void AsyncConnection::fault()
+{
+  shutdown_socket();
+  open_write = false;
+
+  // queue delayed items immediately
+  if (delay_state)
+    delay_state->flush();
+
+  recv_start = recv_end = 0;
+  state_offset = 0;
+  outgoing_bl.clear();
+}
+
+void AsyncConnection::_stop() {
+  writeCallback.reset();
+  dispatch_queue->discard_queue(conn_id);
+  async_msgr->unregister_conn(this);
+  worker->release_worker();
+
+  state = STATE_CLOSED;
+  open_write = false;
+
+  state_offset = 0;
+  // Make sure in-queue events will been processed
+  center->dispatch_event_external(EventCallbackRef(new C_clean_handler(this)));
+}
+
+bool AsyncConnection::is_queued() const {
+  return outgoing_bl.length() != 0;  // true while outgoing_bl still holds bytes
+}
+
+void AsyncConnection::shutdown_socket() {
+  for (auto &&t : register_time_events) center->delete_time_event(t);
+  register_time_events.clear();
+  if (last_tick_id) {
+    center->delete_time_event(last_tick_id);
+    last_tick_id = 0;
+  }
+  if (cs) {
+    center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE);
+    cs.shutdown();
+    cs.close();
+  }
+}
+
+void AsyncConnection::DelayedDelivery::do_request(uint64_t id)
+{
+  Message *m = nullptr;
+  {
+    std::lock_guard<std::mutex> l(delay_lock);
+    register_time_events.erase(id);
+    if (stop_dispatch)
+      return ;
+    if (delay_queue.empty())
+      return ;
+    m = delay_queue.front();
+    delay_queue.pop_front();
+  }
+  if (msgr->ms_can_fast_dispatch(m)) {
+    dispatch_queue->fast_dispatch(m);
+  } else {
+    dispatch_queue->enqueue(m, m->get_priority(), conn_id);
+  }
+}
+
+void AsyncConnection::DelayedDelivery::discard() {
+  stop_dispatch = true;
+  center->submit_to(center->get_id(),
+                    [this]() mutable {
+                      std::lock_guard<std::mutex> l(delay_lock);
+                      while (!delay_queue.empty()) {
+                        Message *m = delay_queue.front();
+                        dispatch_queue->dispatch_throttle_release(
+                            m->get_dispatch_throttle_size());
+                        m->put();
+                        delay_queue.pop_front();
+                      }
+                      for (auto i : register_time_events)
+                        center->delete_time_event(i);
+                      register_time_events.clear();
+                      stop_dispatch = false;
+                    },
+                    true);
+}
+
+void AsyncConnection::DelayedDelivery::flush() {
+  stop_dispatch = true;
+  center->submit_to(
+      center->get_id(), [this] () mutable {
+    std::lock_guard<std::mutex> l(delay_lock);
+    while (!delay_queue.empty()) {
+      Message *m = delay_queue.front();
+      if (msgr->ms_can_fast_dispatch(m)) {
+        dispatch_queue->fast_dispatch(m);
+      } else {
+        dispatch_queue->enqueue(m, m->get_priority(), conn_id);
+      }
+      delay_queue.pop_front();
+    }
+    for (auto i : register_time_events)
+      center->delete_time_event(i);
+    register_time_events.clear();
+    stop_dispatch = false;
+  }, true);
+}
+
+void AsyncConnection::send_keepalive()
+{
+  protocol->send_keepalive();
+}
+
+void AsyncConnection::mark_down()
+{
+  ldout(async_msgr->cct, 1) << __func__ << dendl;
+  std::lock_guard<std::mutex> l(lock);
+  protocol->stop();
+}
+
+void AsyncConnection::handle_write()
+{
+  ldout(async_msgr->cct, 10) << __func__ << dendl;
+  protocol->write_event();
+}
+
+void AsyncConnection::handle_write_callback() {
+  std::lock_guard<std::mutex> l(lock);
+  last_active = ceph::coarse_mono_clock::now();
+  recv_start_time = ceph::mono_clock::now();
+
+  // Detach the pending callback while holding write_lock, but invoke it only
+  // after write_lock is released (the connection `lock` stays held).
+  std::optional<std::function<void(ssize_t)>> pending;
+  {
+    std::lock_guard<std::mutex> wl(write_lock);
+    pending.swap(writeCallback);
+  }
+  if (pending) (*pending)(0);
+}
+
+void AsyncConnection::stop(bool queue_reset) {
+  std::unique_lock<std::mutex> l(lock);
+  const bool need_queue_reset = queue_reset && (state != STATE_CLOSED);
+  protocol->stop();
+  l.unlock();  // queue_reset() is issued without holding the connection lock
+  if (need_queue_reset) dispatch_queue->queue_reset(this);
+}
+
+void AsyncConnection::cleanup() {
+  shutdown_socket();
+  delete read_handler;
+  delete write_handler;
+  delete write_callback_handler;
+  delete wakeup_handler;
+  delete tick_handler;
+  if (delay_state) {
+    delete delay_state;
+    delay_state = NULL;
+  }
+}
+
+void AsyncConnection::wakeup_from(uint64_t id) {
+  {  // forget the timer id under `lock`; process() takes the lock again itself
+    std::lock_guard<std::mutex> l(lock);
+    register_time_events.erase(id);
+  }
+  process();
+}
+
+void AsyncConnection::tick(uint64_t id)
+{
+  auto now = ceph::coarse_mono_clock::now();
+  ldout(async_msgr->cct, 20) << __func__ << " last_id=" << last_tick_id
+                             << " last_active=" << last_active << dendl;
+  std::lock_guard<std::mutex> l(lock);
+  last_tick_id = 0;
+  if (!is_connected()) {
+    if (connect_timeout_us <=
+        (uint64_t)std::chrono::duration_cast<std::chrono::microseconds>
+          (now - last_connect_started).count()) {
+      ldout(async_msgr->cct, 1) << __func__ << " see no progress in more than "
+                                << connect_timeout_us
+                                << " us during connecting, fault."
+                                << dendl;
+      protocol->fault();
+    } else {
+      last_tick_id = center->create_time_event(connect_timeout_us, tick_handler);
+    }
+  } else {
+    auto idle_period = std::chrono::duration_cast<std::chrono::microseconds>
+      (now - last_active).count();
+    if (inactive_timeout_us < (uint64_t)idle_period) {
+      ldout(async_msgr->cct, 1) << __func__ << " idle (" << idle_period
+                                << ") for more than " << inactive_timeout_us
+                                << " us, fault."
+                                << dendl;
+      protocol->fault();
+    } else {
+      last_tick_id = center->create_time_event(inactive_timeout_us, tick_handler);
+    }
+  }
+}
diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h
new file mode 100644
index 000000000..c7f0f9fe8
--- /dev/null
+++ b/src/msg/async/AsyncConnection.h
@@ -0,0 +1,251 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNCCONNECTION_H
+#define CEPH_MSG_ASYNCCONNECTION_H
+
+#include <atomic>
+#include <pthread.h>
+#include <climits>
+#include <list>
+#include <mutex>
+#include <map>
+#include <functional>
+#include <optional>
+
+#include "auth/AuthSessionHandler.h"
+#include "common/ceph_time.h"
+#include "common/perf_counters.h"
+#include "include/buffer.h"
+#include "msg/Connection.h"
+#include "msg/Messenger.h"
+
+#include "Event.h"
+#include "Stack.h"
+
+class AsyncMessenger;
+class DispatchQueue;
+class Worker;
+class Protocol;
+
+static const int ASYNC_IOV_MAX = (IOV_MAX >= 1024 ? IOV_MAX / 4 : IOV_MAX);
+
+/*
+ * AsyncConnection maintains a logical session between two endpoints. In other
+ * words, a pair of addresses maps to exactly one AsyncConnection. It handles
+ * network faults and read/write transactions. If the underlying file
+ * descriptor breaks, AsyncConnection keeps the message queue and sequence
+ * numbers and tries to reconnect to the peer endpoint.
+ */
+class AsyncConnection : public Connection {
+  ssize_t read(unsigned len, char *buffer,
+               std::function<void(char *, ssize_t)> callback);
+  ssize_t read_until(unsigned needed, char *p);
+  ssize_t read_bulk(char *buf, unsigned len);
+
+  ssize_t write(ceph::buffer::list &bl, std::function<void(ssize_t)> callback,
+                bool more=false);
+  ssize_t _try_send(bool more=false);
+
+  void _connect();
+  void _stop();
+  void fault();
+  void inject_delay();
+
+  bool is_queued() const;
+  void shutdown_socket();
+
+   /**
+   * The DelayedDelivery is for injecting delays into Message delivery off
+   * the socket. It is only enabled if delays are requested, and if they
+   * are then it pulls Messages off the DelayQueue and puts them into the
+   * AsyncMessenger event queue.
+   */
+  class DelayedDelivery : public EventCallback {
+    std::set<uint64_t> register_time_events; // need to delete it if stop
+    std::deque<Message*> delay_queue;
+    std::mutex delay_lock;
+    AsyncMessenger *msgr;
+    EventCenter *center;
+    DispatchQueue *dispatch_queue;
+    uint64_t conn_id;
+    std::atomic_bool stop_dispatch;
+
+   public:
+    explicit DelayedDelivery(AsyncMessenger *omsgr, EventCenter *c,
+                             DispatchQueue *q, uint64_t cid)
+      : msgr(omsgr), center(c), dispatch_queue(q), conn_id(cid),
+        stop_dispatch(false) { }
+    ~DelayedDelivery() override {
+      ceph_assert(register_time_events.empty());
+      ceph_assert(delay_queue.empty());
+    }
+    void set_center(EventCenter *c) { center = c; }
+    void do_request(uint64_t id) override;
+    void queue(double delay_period, Message *m) {
+      std::lock_guard<std::mutex> l(delay_lock);
+      delay_queue.push_back(m);
+      register_time_events.insert(center->create_time_event(delay_period*1000000, this));
+    }
+    void discard();
+    bool ready() const { return !stop_dispatch && delay_queue.empty() && register_time_events.empty(); }
+    void flush();
+  } *delay_state;
+
+private:
+  FRIEND_MAKE_REF(AsyncConnection);
+  AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q,
+		  Worker *w, bool is_msgr2, bool local);
+  ~AsyncConnection() override;
+  bool unregistered = false;
+public:
+  void maybe_start_delay_thread();
+
+  std::ostream& _conn_prefix(std::ostream *_dout);
+
+  bool is_connected() override;
+
+  // Only call right after the AsyncConnection is first constructed
+  void connect(const entity_addrvec_t& addrs, int type, entity_addr_t& target);
+
+  // Only call right after the AsyncConnection is first constructed
+  void accept(ConnectedSocket socket,
+	      const entity_addr_t &listen_addr,
+	      const entity_addr_t &peer_addr);
+  int send_message(Message *m) override;
+
+  void send_keepalive() override;
+  void mark_down() override;
+  void mark_disposable() override {
+    std::lock_guard<std::mutex> l(lock);
+    policy.lossy = true;
+  }
+
+  entity_addr_t get_peer_socket_addr() const override {
+    return target_addr;
+  }
+
+  int get_con_mode() const override;
+
+  bool is_unregistered() const {
+    return unregistered;
+  }
+
+  void unregister() {
+    unregistered = true;
+  }
+
+ private:
+  enum {
+    STATE_NONE,
+    STATE_CONNECTING,
+    STATE_CONNECTING_RE,
+    STATE_ACCEPTING,
+    STATE_CONNECTION_ESTABLISHED,
+    STATE_CLOSED
+  };
+
+  static const uint32_t TCP_PREFETCH_MIN_SIZE;
+  // Map a STATE_* value to its name for logging; "STATE_UNKNOWN" if out of range.
+  static const char *get_state_name(int state) {
+      static const char* const statenames[] = {
+          "STATE_NONE", "STATE_CONNECTING", "STATE_CONNECTING_RE",
+          "STATE_ACCEPTING", "STATE_CONNECTION_ESTABLISHED", "STATE_CLOSED"};
+      constexpr unsigned count = sizeof(statenames) / sizeof(statenames[0]);
+      // the unsigned cast also rejects negative values
+      return static_cast<unsigned>(state) < count ? statenames[state] : "STATE_UNKNOWN";
+  }
+
+  AsyncMessenger *async_msgr;
+  uint64_t conn_id;
+  PerfCounters *logger;
+  int state;
+  ConnectedSocket cs;
+  int port;
+public:
+  Messenger::Policy policy;
+private:
+
+  DispatchQueue *dispatch_queue;
+
+  // lockfree, only used in own thread
+  ceph::buffer::list outgoing_bl;
+  bool open_write = false;
+
+  std::mutex write_lock;
+
+  std::mutex lock;
+  EventCallbackRef read_handler;
+  EventCallbackRef write_handler;
+  EventCallbackRef write_callback_handler;
+  EventCallbackRef wakeup_handler;
+  EventCallbackRef tick_handler;
+  char *recv_buf;
+  uint32_t recv_max_prefetch;
+  uint32_t recv_start;
+  uint32_t recv_end;
+  std::set<uint64_t> register_time_events; // need to delete it if stop
+  ceph::coarse_mono_clock::time_point last_connect_started;
+  ceph::coarse_mono_clock::time_point last_active;
+  ceph::mono_clock::time_point recv_start_time;
+  uint64_t last_tick_id = 0;
+  const uint64_t connect_timeout_us;
+  const uint64_t inactive_timeout_us;
+
+  // This section holds temporary variables used by state transitions
+
+  // Accepting state
+  bool msgr2 = false;
+  entity_addr_t socket_addr;  ///< local socket addr
+  entity_addr_t target_addr;  ///< which of the peer_addrs we're connecting to (as clienet) or should reconnect to (as peer)
+
+  entity_addr_t _infer_target_addr(const entity_addrvec_t& av);
+
+  // used only by "read_until"
+  uint64_t state_offset;
+  Worker *worker;
+  EventCenter *center;
+
+  std::unique_ptr<Protocol> protocol;
+
+  std::optional<std::function<void(ssize_t)>> writeCallback;
+  std::function<void(char *, ssize_t)> readCallback;
+  std::optional<unsigned> pendingReadLen;
+  char *read_buffer;
+
+ public:
+  // used by eventcallback
+  void handle_write();
+  void handle_write_callback();
+  void process();
+  void wakeup_from(uint64_t id);
+  void tick(uint64_t id);
+  void stop(bool queue_reset);
+  void cleanup();
+  PerfCounters *get_perf_counter() {
+    return logger;
+  }
+
+  bool is_msgr2() const override;
+
+  friend class Protocol;
+  friend class ProtocolV1;
+  friend class ProtocolV2;
+}; /* AsyncConnection */
+
+using AsyncConnectionRef = ceph::ref_t<AsyncConnection>;
+
+#endif
diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc
new file mode 100644
index 000000000..f5dd03295
--- /dev/null
+++ b/src/msg/async/AsyncMessenger.cc
@@ -0,0 +1,944 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "acconfig.h"
+
+#include <iostream>
+#include <fstream>
+
+#include "AsyncMessenger.h"
+
+#include "common/config.h"
+#include "common/Timer.h"
+#include "common/errno.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "common/EventTrace.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+// Log prefix selected by dout_prefix inside AsyncMessenger members:
+// writes "-- <my addrs> " and hands the stream back for chaining.
+static std::ostream& _prefix(std::ostream *_dout, AsyncMessenger *m) {
+  std::ostream& out = *_dout;
+  out << "-- " << m->get_myaddrs() << " ";
+  return out;
+}
+
+// Log prefix selected by dout_prefix inside Processor members. The
+// Processor argument only picks this overload; it is not inspected.
+static std::ostream& _prefix(std::ostream *_dout, Processor *p) {
+  std::ostream& out = *_dout;
+  out << " Processor -- ";
+  return out;
+}
+
+
+/*******************
+ * Processor
+ */
+
+// Event callback that forwards a notification to Processor::accept().
+class Processor::C_processor_accept : public EventCallback {
+  Processor *processor;  // the Processor whose accept() is invoked
+
+ public:
+  explicit C_processor_accept(Processor *p)
+    : processor(p)
+  {
+  }
+
+  // The event id is not used.
+  void do_request(uint64_t id) override {
+    processor->accept();
+  }
+};
+
+// Build a Processor for messenger r that listens through worker w.
+// The accept callback allocated here is released by ~Processor().
+Processor::Processor(AsyncMessenger *r, Worker *w, CephContext *c)
+  : msgr(r),
+    net(c),
+    worker(w),
+    listen_handler(new C_processor_accept(this))
+{
+}
+
+// Bind one listening socket per address in bind_addrs.
+//
+// Each address is attempted up to ms_bind_retry_count times, sleeping
+// ms_bind_retry_delay seconds between attempts. An address that carries a
+// port is bound to exactly that port; an address with port 0 is tried on
+// every port in [ms_bind_port_min, ms_bind_port_max] that is not listed in
+// avoid_ports. The addresses actually used are written to *bound_addrs.
+//
+// Returns 0 on success. On failure returns the last negative result after
+// calling abort_accept() on the sockets bound for the earlier addresses.
+int Processor::bind(const entity_addrvec_t &bind_addrs,
+                    const std::set<int>& avoid_ports,
+                    entity_addrvec_t* bound_addrs)
+{
+  const auto& conf = msgr->cct->_conf;
+  // bind to socket(s)
+  ldout(msgr->cct, 10) << __func__ << " " << bind_addrs << dendl;
+
+  SocketOptions opts;
+  opts.nodelay = conf->ms_tcp_nodelay;
+  opts.rcbuf_size = conf->ms_tcp_rcvbuf;
+
+  listen_sockets.resize(bind_addrs.v.size());
+  *bound_addrs = bind_addrs;
+
+  for (unsigned slot = 0; slot < bind_addrs.v.size(); ++slot) {
+    auto& listen_addr = bound_addrs->v[slot];
+    int r = -1;
+
+    // Issue the listen through the worker's event center; the outcome is
+    // stored in r.
+    auto listen_on_worker = [this, slot, &listen_addr, &opts, &r]() {
+      worker->center.submit_to(
+        worker->center.get_id(),
+        [this, slot, &listen_addr, &opts, &r]() {
+          r = worker->listen(listen_addr, slot, opts, &listen_sockets[slot]);
+        }, false);
+    };
+
+    for (int attempt = 0; attempt < conf->ms_bind_retry_count; attempt++) {
+      if (attempt > 0) {
+        lderr(msgr->cct) << __func__ << " was unable to bind. Trying again in "
+                         << conf->ms_bind_retry_delay << " seconds " << dendl;
+        sleep(conf->ms_bind_retry_delay);
+      }
+
+      if (listen_addr.get_port()) {
+        // explicit port requested
+        listen_on_worker();
+        if (r < 0) {
+          lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr
+                           << ": " << cpp_strerror(r) << dendl;
+          continue;
+        }
+      } else {
+        // try a range of ports
+        for (int port = conf->ms_bind_port_min;
+             port <= conf->ms_bind_port_max;
+             port++) {
+          if (avoid_ports.count(port))
+            continue;
+
+          listen_addr.set_port(port);
+          listen_on_worker();
+          if (r == 0)
+            break;
+        }
+        if (r < 0) {
+          lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr
+                           << " on any port in range "
+                           << conf->ms_bind_port_min
+                           << "-" << conf->ms_bind_port_max << ": "
+                           << cpp_strerror(r) << dendl;
+          // Clear the port before retrying; otherwise the next attempt
+          // would take the explicit-port path with the last port tried.
+          listen_addr.set_port(0);
+          continue;
+        }
+        ldout(msgr->cct, 10) << __func__ << " bound on random port "
+                             << listen_addr << dendl;
+      }
+      if (r == 0) {
+        break;
+      }
+    }
+
+    // Every attempt for this address failed: undo the earlier binds and
+    // report the last error.
+    if (r < 0) {
+      lderr(msgr->cct) << __func__ << " was unable to bind after "
+                       << conf->ms_bind_retry_count
+                       << " attempts: " << cpp_strerror(r) << dendl;
+      for (unsigned done = 0; done < slot; ++done) {
+        listen_sockets[done].abort_accept();
+      }
+      return r;
+    }
+  }
+
+  ldout(msgr->cct, 10) << __func__ << " bound to " << *bound_addrs << dendl;
+  return 0;
+}
+
+// Register the accept handler as a readable-event callback for every bound
+// listening socket. The registration is submitted to the worker's event
+// center. If a socket reports fd -1 the condition is logged and no further
+// sockets are registered.
+void Processor::start()
+{
+  ldout(msgr->cct, 1) << __func__ << dendl;
+
+  // __func__ evaluated inside the lambda below names the closure's call
+  // operator rather than this function, so take the real name here.
+  const char *const caller = __func__;
+
+  worker->center.submit_to(worker->center.get_id(), [this, caller]() {
+      for (auto& listen_socket : listen_sockets) {
+        if (!listen_socket)
+          continue;
+        if (listen_socket.fd() == -1) {
+          ldout(msgr->cct, 1) << caller
+              << " Error: processor restart after listen_socket.fd closed. "
+              << this << dendl;
+          return;
+        }
+        worker->center.create_file_event(listen_socket.fd(), EVENT_READABLE,
+                                         listen_handler);
+      }
+    }, false);
+}
+
+// Drain pending connections from every listening socket.
+//
+// Each accepted socket is handed to the messenger through add_accept().
+// A socket's loop ends on -EAGAIN; -EINTR and -ECONNABORTED are simply
+// retried. Any other error is counted (the count restarts after a
+// successful accept) and the process aborts once the count exceeds
+// ms_max_accept_failures.
+void Processor::accept()
+{
+  SocketOptions opts;
+  opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay;
+  opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf;
+  opts.priority = msgr->get_socket_priority();
+
+  for (auto& listen_socket : listen_sockets) {
+    ldout(msgr->cct, 10) << __func__ << " listen_fd=" << listen_socket.fd()
+                         << dendl;
+    unsigned accept_error_num = 0;
+
+    for (;;) {
+      entity_addr_t addr;
+      ConnectedSocket cli_socket;
+      Worker *w = worker;
+      if (msgr->get_stack()->support_local_listen_table())
+        ++w->references;
+      else
+        w = msgr->get_stack()->get_worker();
+
+      const int r = listen_socket.accept(&cli_socket, opts, &addr, w);
+      if (r == 0) {
+        ldout(msgr->cct, 10) << __func__ << " accepted incoming on sd "
+                             << cli_socket.fd() << dendl;
+
+        msgr->add_accept(
+          w, std::move(cli_socket),
+          msgr->get_myaddrs().v[listen_socket.get_addr_slot()],
+          addr);
+        accept_error_num = 0;
+        continue;
+      }
+
+      // Nothing was accepted: give the worker reference back.
+      // NOTE(review): presumably get_worker() also takes a reference - confirm.
+      --w->references;
+
+      if (r == -EINTR)
+        continue;
+      if (r == -EAGAIN)
+        break;
+      if (r == -ECONNABORTED) {
+        ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd()
+                            << " errno " << r << " " << cpp_strerror(r) << dendl;
+        continue;
+      }
+
+      // Remaining errors count towards the abort threshold.
+      if (r == -EMFILE || r == -ENFILE) {
+        lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd()
+                         << " errno " << r << " " << cpp_strerror(r) << dendl;
+      } else {
+        lderr(msgr->cct) << __func__ << " no incoming connection?"
+                         << " errno " << r << " " << cpp_strerror(r) << dendl;
+      }
+      if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+        lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+        ceph_abort();
+      }
+    }
+  }
+}
+
+// For every open listening socket, unregister its readable event and abort
+// the accept. The work is submitted to the worker's event center.
+void Processor::stop()
+{
+  ldout(msgr->cct,10) << __func__ << dendl;
+
+  worker->center.submit_to(worker->center.get_id(), [this]() {
+      for (auto& sock : listen_sockets) {
+        if (!sock)
+          continue;
+        worker->center.delete_file_event(sock.fd(), EVENT_READABLE);
+        sock.abort_accept();
+      }
+    }, false);
+}
+
+
+// Holder for a NetworkStack; AsyncMessenger obtains it through
+// CephContext::lookup_or_create_singleton_object().
+struct StackSingleton {
+  CephContext *cct;
+  std::shared_ptr<NetworkStack> stack;  // empty until ready() is called
+
+  explicit StackSingleton(CephContext *c): cct(c) {}
+
+  // Create the stack for the given transport type on the first call;
+  // later calls leave the existing stack untouched.
+  void ready(std::string &type) {
+    if (!stack)
+      stack = NetworkStack::create(cct, type);
+  }
+
+  // Stop the stack if one was ever created.
+  ~StackSingleton() {
+    // ready() may never have been called, in which case there is no stack
+    // to stop and dereferencing it would crash.
+    if (stack)
+      stack->stop();
+  }
+};
+
+
+// Event callback that runs AsyncMessenger::reap_dead().
+class C_handle_reap : public EventCallback {
+  AsyncMessenger *messenger;
+
+  public:
+  explicit C_handle_reap(AsyncMessenger *m)
+    : messenger(m)
+  {
+  }
+
+  // The event id is not used.
+  void do_request(uint64_t id) override {
+    messenger->reap_dead();
+  }
+};
+
+/*******************
+ * AsyncMessenger
+ */
+
+// Construct the messenger: pick the transport from the type string, obtain
+// and start the network stack for it, set up the loopback connection and
+// the reap callback, and create the Processors (one per worker when the
+// stack supports a local listen table, otherwise a single one).
+AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name,
+                               const std::string &type, std::string mname, uint64_t _nonce)
+  : SimplePolicyMessenger(cct, name),
+    dispatch_queue(cct, this, mname),
+    nonce(_nonce)
+{
+  // "rdma" takes precedence over "dpdk"; anything else means "posix".
+  std::string transport_type;
+  if (type.find("rdma") != std::string::npos) {
+    transport_type = "rdma";
+  } else if (type.find("dpdk") != std::string::npos) {
+    transport_type = "dpdk";
+  } else {
+    transport_type = "posix";
+  }
+
+  auto& single = cct->lookup_or_create_singleton_object<StackSingleton>(
+    "AsyncMessenger::NetworkStack::" + transport_type, true, cct);
+  single.ready(transport_type);
+  stack = single.stack.get();
+  stack->start();
+
+  local_worker = stack->get_worker();
+  local_connection = ceph::make_ref<AsyncConnection>(cct, this, &dispatch_queue,
+                                                     local_worker, true, true);
+  init_local_connection();
+  reap_handler = new C_handle_reap(this);
+
+  const unsigned processor_num =
+    stack->support_local_listen_table() ? stack->get_num_worker() : 1;
+  for (unsigned idx = 0; idx < processor_num; ++idx) {
+    processors.push_back(new Processor(this, stack->get_worker(idx), cct));
+  }
+}
+
+/**
+ * Tear down the AsyncMessenger: free the reap callback and every
+ * Processor. The messenger must not be bound any more at this point.
+ */
+AsyncMessenger::~AsyncMessenger()
+{
+  delete reap_handler;
+  ceph_assert(!did_bind); // either we didn't bind or we shut down the Processor
+  for (auto *proc : processors) {
+    delete proc;
+  }
+}
+
+// Mark the network stack ready, carry out a bind that bindv() postponed
+// (aborting the process if it fails), then start the Processors and the
+// dispatch queue while holding the messenger lock.
+void AsyncMessenger::ready()
+{
+  ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl;
+
+  stack->ready();
+  if (pending_bind && bindv(pending_bind_addrs) != 0) {
+    lderr(cct) << __func__ << " postponed bind failed" << dendl;
+    ceph_abort();
+  }
+
+  std::lock_guard l{lock};
+  for (auto *proc : processors) {
+    proc->start();
+  }
+  dispatch_queue.start();
+}
+
+// Stop listening, mark every connection down, flag the messenger as stopped
+// (waking anyone blocked in wait()) and drain the network stack.
+// Always returns 0.
+int AsyncMessenger::shutdown()
+{
+  ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl;
+
+  // done!  clean up.
+  for (auto *proc : processors) {
+    proc->stop();
+  }
+  mark_down_all();
+  // break ref cycles on the loopback connection
+  local_connection->clear_priv();
+  local_connection->mark_down();
+  did_bind = false;
+  {
+    std::lock_guard l{lock};
+    stop_cond.notify_all();
+    stopped = true;
+  }
+  stack->drain();
+  return 0;
+}
+
+// Single-address convenience wrapper around bindv().
+int AsyncMessenger::bind(const entity_addr_t &bind_addr)
+{
+  ldout(cct,10) << __func__ << " " << bind_addr << dendl;
+  // The old bind() accepts a default-constructed entity_addr_t, while
+  // bindv() accepts a 0.0.0.0-like address only if its type and family are
+  // set, so fill those in here.
+  entity_addr_t addr = bind_addr;
+  if (addr == entity_addr_t()) {
+    addr.set_type(entity_addr_t::TYPE_LEGACY);
+    addr.set_family(cct->_conf->ms_bind_ipv6 ? AF_INET6 : AF_INET);
+  }
+  return bindv(entity_addrvec_t(addr));
+}
+
+// Bind every Processor to bind_addrs.
+//
+// Returns -1 if the messenger has already been started and no bind is
+// pending. If the network stack is not ready yet, the addresses are stored
+// as a pending bind and 0 is returned. Otherwise returns 0 on success or
+// the error reported by Processor::bind().
+int AsyncMessenger::bindv(const entity_addrvec_t &bind_addrs)
+{
+  {
+    std::lock_guard l{lock};
+
+    if (!pending_bind && started) {
+      ldout(cct,10) << __func__ << " already started" << dendl;
+      return -1;
+    }
+
+    ldout(cct,10) << __func__ << " " << bind_addrs << dendl;
+
+    if (!stack->is_ready()) {
+      ldout(cct, 10) << __func__ << " Network Stack is not ready for bind yet - postponed" << dendl;
+      pending_bind_addrs = bind_addrs;
+      pending_bind = true;
+      return 0;
+    }
+  }
+
+  // bind to a socket
+  std::set<int> avoid_ports;
+  entity_addrvec_t bound_addrs;
+  unsigned bound_count = 0;
+  for (auto *proc : processors) {
+    const int r = proc->bind(bind_addrs, avoid_ports, &bound_addrs);
+    if (r) {
+      // Note: this relates to the local TCP listen table problem.
+      // The posix backend (the default kernel implementation) shares one
+      // listen table in the kernel, so all threads use it naturally and
+      // only one thread needs to bind. Other backends (such as dpdk) keep
+      // a listen table per worker, so every worker has to bind/listen on
+      // the port. A failure of the first worker is an ordinary error
+      // (e.g. the port is in use) and is returned to the caller. A
+      // failure of a later worker after the first one succeeded is not
+      // expected, hence the assert.
+      ceph_assert(bound_count == 0);
+      return r;
+    }
+    ++bound_count;
+  }
+  _finish_bind(bind_addrs, bound_addrs);
+  return 0;
+}
+
+// Stop listening, move to a new nonce and bind again on fresh ports.
+//
+// The ports the messenger is currently bound to are added to the caller's
+// avoid_ports, so the new bind does not land on a port that was just given
+// up. On success the Processors are restarted and 0 is returned; otherwise
+// the error from Processor::bind() is returned.
+int AsyncMessenger::rebind(const std::set<int>& avoid_ports)
+{
+  ldout(cct,1) << __func__ << " rebind avoid " << avoid_ports << dendl;
+  ceph_assert(did_bind);
+
+  for (auto &&p : processors)
+    p->stop();
+  mark_down_all();
+
+  // adjust the nonce; we want our entity_addr_t to be truly unique.
+  nonce += 1000000;
+  ldout(cct, 10) << __func__ << " new nonce " << nonce
+                 << " and addr " << get_myaddrs() << dendl;
+
+  entity_addrvec_t bound_addrs;
+  entity_addrvec_t bind_addrs = get_myaddrs();
+  // Extend the caller's set with our current ports, and clear the port of
+  // each address so that Processor::bind() searches the port range.
+  std::set<int> new_avoid(avoid_ports);
+  for (auto& a : bind_addrs.v) {
+    new_avoid.insert(a.get_port());
+    a.set_port(0);
+  }
+  ldout(cct, 10) << __func__ << " will try " << bind_addrs
+                 << " and avoid ports " << new_avoid << dendl;
+  unsigned i = 0;
+  for (auto &&p : processors) {
+    // Use the extended set announced in the log line above; passing the
+    // caller's set alone would allow re-binding to a port just vacated.
+    int r = p->bind(bind_addrs, new_avoid, &bound_addrs);
+    if (r) {
+      // a bind failure is only expected from the first processor
+      ceph_assert(i == 0);
+      return r;
+    }
+    ++i;
+  }
+  _finish_bind(bind_addrs, bound_addrs);
+  for (auto &&p : processors) {
+    p->start();
+  }
+  return 0;
+}
+
+// Record bind_addr as our address when ms_bind_before_connect is enabled.
+// Returns 0 when the option is off, when the messenger is already bound, or
+// after storing the address; returns -1 if the messenger was already
+// started.
+int AsyncMessenger::client_bind(const entity_addr_t &bind_addr)
+{
+  if (!cct->_conf->ms_bind_before_connect) {
+    return 0;
+  }
+
+  std::lock_guard l{lock};
+  if (!did_bind) {
+    if (started) {
+      ldout(cct, 10) << __func__ << " already started" << dendl;
+      return -1;
+    }
+    ldout(cct, 10) << __func__ << " " << bind_addr << dendl;
+
+    set_myaddrs(entity_addrvec_t(bind_addr));
+  }
+  return 0;
+}
+
+// Publish the result of a successful bind: adopt bind_addrs as our
+// addresses (switching to listen_addrs when the first address still has
+// port 0), stamp every address with the current nonce, refresh the
+// loopback connection and set did_bind.
+void AsyncMessenger::_finish_bind(const entity_addrvec_t& bind_addrs,
+                                  const entity_addrvec_t& listen_addrs)
+{
+  set_myaddrs(bind_addrs);
+  for (const auto& addr : bind_addrs.v) {
+    if (addr.is_blank_ip())
+      continue;
+    learned_addr(addr);
+  }
+
+  if (get_myaddrs().front().get_port() == 0) {
+    set_myaddrs(listen_addrs);
+  }
+
+  entity_addrvec_t stamped = *my_addrs;
+  for (auto& addr : stamped.v) {
+    addr.set_nonce(nonce);
+  }
+  set_myaddrs(stamped);
+
+  init_local_connection();
+
+  ldout(cct,1) << __func__ << " bind my_addrs is " << get_myaddrs() << dendl;
+  did_bind = true;
+}
+
+// Mark every connection down, then move to a new nonce, re-stamp our
+// addresses with it and refresh the loopback connection. Always returns 0.
+int AsyncMessenger::client_reset()
+{
+  mark_down_all();
+
+  std::scoped_lock l{lock};
+  // adjust the nonce; we want our entity_addr_t to be truly unique.
+  nonce += 1000000;
+  ldout(cct, 10) << __func__ << " new nonce " << nonce << dendl;
+
+  entity_addrvec_t restamped = *my_addrs;
+  for (auto& addr : restamped.v) {
+    addr.set_nonce(nonce);
+  }
+  set_myaddrs(restamped);
+  _init_local_connection();
+  return 0;
+}
+
+// Mark the messenger as started. If it was never bound, our addresses are
+// stamped with the nonce and the loopback connection is refreshed here,
+// since no bind has done so. Always returns 0.
+int AsyncMessenger::start()
+{
+  std::scoped_lock l{lock};
+  ldout(cct,1) << __func__ << " start" << dendl;
+
+  // register at least one entity, first!
+  ceph_assert(my_name.type() >= 0);
+
+  ceph_assert(!started);
+  started = true;
+  stopped = false;
+
+  if (!did_bind) {
+    entity_addrvec_t newaddrs = *my_addrs;
+    for (auto& a : newaddrs.v) {
+      // use the setter, as every other place that stamps the nonce does
+      a.set_nonce(nonce);
+    }
+    set_myaddrs(newaddrs);
+    _init_local_connection();
+  }
+
+  return 0;
+}
+
+// Block until shutdown() has flagged the messenger as stopped, then shut
+// down the dispatch queue, close all connections and drain the network
+// stack. Returns immediately if the messenger was never started.
+void AsyncMessenger::wait()
+{
+  {
+    std::unique_lock locker{lock};
+    if (!started) {
+      return;
+    }
+    // A condition-variable wait can return without a notification, so
+    // re-check the flag in a loop instead of waiting only once.
+    while (!stopped) {
+      stop_cond.wait(locker);
+    }
+  }
+  dispatch_queue.shutdown();
+  if (dispatch_queue.is_started()) {
+    ldout(cct, 10) << __func__ << ": waiting for dispatch queue" << dendl;
+    dispatch_queue.wait();
+    dispatch_queue.discard_local();
+    ldout(cct, 10) << __func__ << ": dispatch queue is stopped" << dendl;
+  }
+
+  // close all connections
+  shutdown_connections(false);
+  stack->drain();
+
+  ldout(cct, 10) << __func__ << ": done." << dendl;
+  ldout(cct, 1) << __func__ << " complete." << dendl;
+  started = false;
+}
+
+// Wrap a freshly accepted socket in an AsyncConnection bound to worker w,
+// start its accept handling and track it in accepting_conns.
+void AsyncMessenger::add_accept(Worker *w, ConnectedSocket cli_socket,
+                                const entity_addr_t &listen_addr,
+                                const entity_addr_t &peer_addr)
+{
+  std::lock_guard l{lock};
+  const bool is_v2 = listen_addr.is_msgr2();
+  auto accepted = ceph::make_ref<AsyncConnection>(cct, this, &dispatch_queue, w,
+                                                  is_v2, false);
+  accepted->accept(std::move(cli_socket), listen_addr, peer_addr);
+  accepting_conns.insert(accepted);
+}
+
+// Create an outgoing connection to addrs and register it, either in
+// anon_conns (anon) or in conns keyed by addrs. The caller must hold the
+// messenger lock. Returns the new connection.
+AsyncConnectionRef AsyncMessenger::create_connect(
+  const entity_addrvec_t& addrs, int type, bool anon)
+{
+  ceph_assert(ceph_mutex_is_locked(lock));
+
+  ldout(cct, 10) << __func__ << " " << addrs
+      << ", creating connection and registering" << dendl;
+
+  // here is where we decide which of the addrs to connect to.  always prefer
+  // the first one, if we support it.
+  entity_addr_t target;
+  for (const auto& candidate : addrs.v) {
+    // FIXME: for ipv4 vs ipv6, check whether local host can handle ipv6 before
+    // trying it?  for now, just pick whichever is listed first.
+    if (candidate.is_msgr2() || candidate.is_legacy()) {
+      target = candidate;
+      break;
+    }
+  }
+
+  // create connection
+  Worker *w = stack->get_worker();
+  const bool is_v2 = target.is_msgr2();
+  auto conn = ceph::make_ref<AsyncConnection>(cct, this, &dispatch_queue, w,
+                                              is_v2, false);
+  conn->anon = anon;
+  conn->connect(addrs, type, target);
+  if (!anon) {
+    ceph_assert(!conns.count(addrs));
+    ldout(cct, 10) << __func__ << " " << conn << " " << addrs << " "
+                   << *conn->peer_addrs << dendl;
+    conns[addrs] = conn;
+  } else {
+    anon_conns.insert(conn);
+  }
+  w->get_perf_counter()->inc(l_msgr_active_connections);
+
+  return conn;
+}
+
+
+// Return the messenger's loopback (local) connection.
+ConnectionRef AsyncMessenger::get_loopback_connection()
+{
+  ConnectionRef loopback = local_connection;
+  return loopback;
+}
+
+// True when we are unbound, or when our bound addresses include msgr2.
+bool AsyncMessenger::should_use_msgr2()
+{
+  // If we are bound to v1 only and we are connecting to a v2 peer, we
+  // cannot use the peer's v2 address. Otherwise the connection would be
+  // asymmetrical: they would have to use v1 to connect to us while we use
+  // v2, and connection race detection etc. would totally break down (among
+  // other things). Or the other end would be confused that we advertise
+  // ourselves with a v1 address only (the one we bound to) but connected
+  // with protocol v2.
+  if (!did_bind) {
+    return true;
+  }
+  return get_myaddrs().has_msgr2();
+}
+
+// Return addrs unchanged when msgr2 may be used; otherwise return a copy
+// with every msgr2 address removed.
+entity_addrvec_t AsyncMessenger::_filter_addrs(const entity_addrvec_t& addrs)
+{
+  if (should_use_msgr2()) {
+    return addrs;
+  }
+
+  ldout(cct, 10) << __func__ << " " << addrs << " limiting to v1 ()" << dendl;
+  entity_addrvec_t v1_only;
+  for (const auto& addr : addrs.v) {
+    if (!addr.is_msgr2()) {
+      v1_only.v.push_back(addr);
+    }
+  }
+  return v1_only;
+}
+
+// Send message m to the peer at addrs over a registered (non-anonymous)
+// connection obtained from connect_to(). An empty addrs releases the
+// message with put() and returns -EINVAL; otherwise returns 0.
+int AsyncMessenger::send_to(Message *m, int type, const entity_addrvec_t& addrs)
+{
+  FUNCTRACE(cct);
+  ceph_assert(m);
+
+#if defined(WITH_EVENTTRACE)
+  if (m->get_type() == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE(((MOSDOp *)m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP");
+  else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE(((MOSDOpReply *)m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP_REPLY");
+#endif
+
+  ldout(cct, 1) << __func__ << "--> " << ceph_entity_type_name(type) << " "
+      << addrs << " -- " << *m << " -- ?+"
+      << m->get_data().length() << " " << m << dendl;
+
+  if (addrs.empty()) {
+    ldout(cct,0) << __func__ <<  " message " << *m
+        << " with empty dest " << addrs << dendl;
+    m->put();
+    return -EINVAL;
+  }
+
+  // Optional debugging aid: encode the message, hex-dump payload and data
+  // into the log entry, then clear the payload again.
+  if (cct->_conf->ms_dump_on_send) {
+    m->encode(-1, MSG_CRC_ALL);
+    ldout(cct, 0) << __func__ << " submit_message " << *m << "\n";
+    m->get_payload().hexdump(*_dout);
+    if (m->get_data().length() > 0) {
+      *_dout << " data:\n";
+      m->get_data().hexdump(*_dout);
+    }
+    *_dout << dendl;
+    m->clear_payload();
+  }
+
+  ConnectionRef conn = connect_to(type, addrs, false);
+  conn->send_message(m);
+  return 0;
+}
+
+// Return a connection to addrs. Unless not_local_dest is set, addresses
+// that are our own yield the loopback connection. Anonymous requests always
+// create a new connection; otherwise an existing registered connection is
+// reused, or a new one is created and registered.
+ConnectionRef AsyncMessenger::connect_to(int type,
+                                         const entity_addrvec_t& addrs,
+                                         bool anon, bool not_local_dest)
+{
+  if (!not_local_dest) {
+    const bool is_local =
+      *my_addrs == addrs ||
+      (addrs.v.size() == 1 && my_addrs->contains(addrs.front()));
+    if (is_local) {
+      // local
+      return local_connection;
+    }
+  }
+
+  auto av = _filter_addrs(addrs);
+  std::lock_guard l{lock};
+  if (anon) {
+    return create_connect(av, type, anon);
+  }
+
+  AsyncConnectionRef conn = _lookup_conn(av);
+  if (!conn) {
+    conn = create_connect(av, type, false);
+    ldout(cct, 10) << __func__ << " " << av << " new " << conn << dendl;
+  } else {
+    ldout(cct, 10) << __func__ << " " << av << " existing " << conn << dendl;
+  }
+
+  return conn;
+}
+
+/**
+ * Fill in the entries of my_addrs that have no IP set.
+ *
+ * Each such entry takes the first address in addrs with the same address
+ * family, while keeping its own type, port and nonce. Entries that
+ * already have an IP are left alone.
+ * Returns true if at least one entry was filled in, false otherwise.
+ */
+bool AsyncMessenger::set_addr_unknowns(const entity_addrvec_t &addrs)
+{
+  ldout(cct,1) << __func__ << " " << addrs << dendl;
+  bool changed = false;
+  std::lock_guard l{lock};
+
+  entity_addrvec_t newaddrs = *my_addrs;
+  for (auto& mine : newaddrs.v) {
+    if (!mine.is_blank_ip())
+      continue;
+
+    // values to restore after the address is overwritten
+    const int keep_type = mine.get_type();
+    const int keep_port = mine.get_port();
+    const uint32_t keep_nonce = mine.get_nonce();
+    for (const auto& theirs : addrs.v) {
+      if (mine.get_family() != theirs.get_family())
+        continue;
+      ldout(cct,1) << __func__ << " assuming my addr " << mine
+                   << " matches provided addr " << theirs << dendl;
+      mine = theirs;
+      mine.set_nonce(keep_nonce);
+      mine.set_type(keep_type);
+      mine.set_port(keep_port);
+      changed = true;
+      break;
+    }
+  }
+  set_myaddrs(newaddrs);
+  if (changed) {
+    _init_local_connection();
+  }
+  ldout(cct,1) << __func__ << " now " << *my_addrs << dendl;
+  return changed;
+}
+
+// Adopt addrs as our addresses after stamping each one with our nonce,
+// then refresh the loopback connection.
+void AsyncMessenger::set_addrs(const entity_addrvec_t &addrs)
+{
+  std::lock_guard l{lock};
+  entity_addrvec_t stamped = addrs;
+  for (auto& addr : stamped.v) {
+    addr.set_nonce(nonce);
+  }
+  set_myaddrs(stamped);
+  _init_local_connection();
+}
+
+// Stop every accepting, registered and anonymous connection (passing
+// queue_reset to each stop()) and empty the corresponding containers.
+// Connections queued in deleted_conns have their active-connection counter
+// decremented and are dropped from that set.
+void AsyncMessenger::shutdown_connections(bool queue_reset)
+{
+  ldout(cct,1) << __func__ << " " << dendl;
+  std::lock_guard l{lock};
+
+  for (const auto& conn : accepting_conns) {
+    ldout(cct, 5) << __func__ << " accepting_conn " << conn << dendl;
+    conn->stop(queue_reset);
+  }
+  accepting_conns.clear();
+
+  for (const auto& [peer, conn] : conns) {
+    ldout(cct, 5) << __func__ << " mark down " << peer << " " << conn << dendl;
+    conn->stop(queue_reset);
+  }
+  conns.clear();
+
+  for (const auto& conn : anon_conns) {
+    ldout(cct, 5) << __func__ << " mark down " << conn << dendl;
+    conn->stop(queue_reset);
+  }
+  anon_conns.clear();
+
+  {
+    std::lock_guard dl{deleted_lock};
+    for (const auto& conn : deleted_conns) {
+      ldout(cct, 5) << __func__ << " delete " << conn << dendl;
+      conn->get_perf_counter()->dec(l_msgr_active_connections);
+    }
+    deleted_conns.clear();
+  }
+}
+
+// Stop the registered connection to addrs, if there is one.
+void AsyncMessenger::mark_down_addrs(const entity_addrvec_t& addrs)
+{
+  std::lock_guard l{lock};
+  AsyncConnectionRef conn = _lookup_conn(addrs);
+  if (!conn) {
+    ldout(cct, 1) << __func__ << " " << addrs << " -- connection dne" << dendl;
+    return;
+  }
+  ldout(cct, 1) << __func__ << " " << addrs << " -- " << conn << dendl;
+  conn->stop(true);
+}
+
+// Select the protocol version to use with a peer of type peer_type.
+// Peers of our own entity type use cluster_protocol. Otherwise the
+// version follows the peer's type when connecting and our own type when
+// accepting; entity types other than OSD, MDS and MON yield 0.
+int AsyncMessenger::get_proto_version(int peer_type, bool connect) const
+{
+  const int my_type = my_name.type();
+
+  // set reply protocol version
+  if (peer_type == my_type) {
+    // internal
+    return cluster_protocol;
+  }
+
+  // public
+  const int selector = connect ? peer_type : my_type;
+  if (selector == CEPH_ENTITY_TYPE_OSD)
+    return CEPH_OSDC_PROTOCOL;
+  if (selector == CEPH_ENTITY_TYPE_MDS)
+    return CEPH_MDSC_PROTOCOL;
+  if (selector == CEPH_ENTITY_TYPE_MON)
+    return CEPH_MONC_PROTOCOL;
+  return 0;
+}
+
+// Register an accepted connection.
+//
+// Lossy clients of a server policy that does not register them are kept in
+// anon_conns. Otherwise conn is stored in conns under its peer addresses
+// and removed from accepting_conns. Returns -1 if a different, live
+// connection is already registered for those addresses, 0 otherwise.
+int AsyncMessenger::accept_conn(const AsyncConnectionRef& conn)
+{
+  std::lock_guard l{lock};
+
+  const auto& policy = conn->policy;
+  if (policy.server && policy.lossy && !policy.register_lossy_clients) {
+    anon_conns.insert(conn);
+    conn->get_perf_counter()->inc(l_msgr_active_connections);
+    return 0;
+  }
+
+  auto it = conns.find(*conn->peer_addrs);
+  if (it != conns.end()) {
+    auto& existing = it->second;
+
+    // lazy delete, see "deleted_conns"
+    // If conn is already registered we fall through and return 0.
+    std::lock_guard dl{deleted_lock};
+    if (deleted_conns.erase(existing)) {
+      // the old entry was only waiting to be reaped; drop it now
+      existing->get_perf_counter()->dec(l_msgr_active_connections);
+      conns.erase(it);
+    } else if (conn != existing) {
+      return -1;
+    }
+  }
+
+  ldout(cct, 10) << __func__ << " " << conn << " " << *conn->peer_addrs << dendl;
+  conns[*conn->peer_addrs] = conn;
+  conn->get_perf_counter()->inc(l_msgr_active_connections);
+  accepting_conns.erase(conn);
+  return 0;
+}
+
+
+// Learn our own address from what a peer reports it to be.
+//
+// Only acts while need_addr is set. With no addresses yet, a single
+// address is derived from peer_addr_for_me; otherwise every blank-IP
+// address of the same family is filled in from it. Returns true if this
+// call did the update, false if no address was needed any more.
+bool AsyncMessenger::learned_addr(const entity_addr_t &peer_addr_for_me)
+{
+  // be careful here: multiple threads may block here, and readers of
+  // my_addr do NOT hold any lock.
+
+  // this always goes from true -> false under the protection of the
+  // mutex.  if it is already false, we need not retake the mutex at
+  // all.
+  if (!need_addr)
+    return false;
+
+  std::lock_guard l(lock);
+  if (!need_addr) {
+    // another caller updated the address while we waited for the lock
+    return false;
+  }
+
+  if (my_addrs->empty()) {
+    entity_addr_t learned = peer_addr_for_me;
+    learned.set_type(entity_addr_t::TYPE_ANY);
+    learned.set_nonce(nonce);
+    if (!did_bind) {
+      learned.set_port(0);
+    }
+    set_myaddrs(entity_addrvec_t(learned));
+    ldout(cct,10) << __func__ << " had no addrs" << dendl;
+  } else {
+    // fix all addrs of the same family, regardless of type (msgr2 vs legacy)
+    entity_addrvec_t newaddrs = *my_addrs;
+    for (auto& mine : newaddrs.v) {
+      if (!mine.is_blank_ip() ||
+          mine.get_family() != peer_addr_for_me.get_family()) {
+        continue;
+      }
+      entity_addr_t fixed = peer_addr_for_me;
+      if (did_bind) {
+        fixed.set_type(mine.get_type());
+        fixed.set_port(mine.get_port());
+      } else {
+        fixed.set_type(entity_addr_t::TYPE_ANY);
+        fixed.set_port(0);
+      }
+      fixed.set_nonce(mine.get_nonce());
+      ldout(cct,10) << __func__ << " " << mine << " -> " << fixed << dendl;
+      mine = fixed;
+    }
+    set_myaddrs(newaddrs);
+  }
+
+  ldout(cct, 1) << __func__ << " learned my addr " << *my_addrs
+                << " (peer_addr_for_me " << peer_addr_for_me << ")" << dendl;
+  _init_local_connection();
+  need_addr = false;
+  return true;
+}
+
+// Remove every connection queued in deleted_conns from conns (only when
+// the entry still refers to that connection), accepting_conns and
+// anon_conns, decrement its active-connection counter, then empty
+// deleted_conns.
+void AsyncMessenger::reap_dead()
+{
+  ldout(cct, 1) << __func__ << " start" << dendl;
+
+  std::lock_guard l1{lock};
+  std::lock_guard l2{deleted_lock};
+
+  for (auto& dead : deleted_conns) {
+    ldout(cct, 5) << __func__ << " delete " << dead << dendl;
+    auto found = conns.find(*dead->peer_addrs);
+    if (found != conns.end() && found->second == dead) {
+      conns.erase(found);
+    }
+    accepting_conns.erase(dead);
+    anon_conns.erase(dead);
+    dead->get_perf_counter()->dec(l_msgr_active_connections);
+  }
+  deleted_conns.clear();
+}
diff --git a/src/msg/async/AsyncMessenger.h b/src/msg/async/AsyncMessenger.h
new file mode 100644
index 000000000..f1333307d
--- /dev/null
+++ b/src/msg/async/AsyncMessenger.h
@@ -0,0 +1,423 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNCMESSENGER_H
+#define CEPH_ASYNCMESSENGER_H
+
+#include <map>
+
+#include "include/types.h"
+#include "include/xlist.h"
+#include "include/spinlock.h"
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+
+#include "common/ceph_mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "msg/SimplePolicyMessenger.h"
+#include "msg/DispatchQueue.h"
+#include "AsyncConnection.h"
+#include "Event.h"
+
+#include "include/ceph_assert.h"
+
+class AsyncMessenger;
+
+/**
+ * If the Messenger binds to a specific address, the Processor runs
+ * and listens for incoming connections on the bound sockets.
+ */
+class Processor {
+  AsyncMessenger *msgr;  // messenger this processor belongs to (not owned)
+  ceph::NetHandler net;
+  Worker *worker;
+  std::vector<ServerSocket> listen_sockets;
+  EventCallbackRef listen_handler;  // raw pointer, deleted in ~Processor()
+
+  class C_processor_accept;
+
+ public:
+  Processor(AsyncMessenger *r, Worker *w, CephContext *c);
+  ~Processor() { delete listen_handler; };
+
+  void stop();
+  int bind(const entity_addrvec_t &bind_addrs,
+	   const std::set<int>& avoid_ports,
+	   entity_addrvec_t* bound_addrs);
+  void start();
+  void accept();
+};
+
+/*
+ * AsyncMessenger maintains a set of asynchronous connections. It may own
+ * a bind address, and the connections it accepts are managed by the
+ * AsyncMessenger itself.
+ *
+ */
+
+class AsyncMessenger : public SimplePolicyMessenger {
+  // First we have the public Messenger interface implementation...
+public:
+  /**
+   * Initialize the AsyncMessenger!
+   *
+   * @param cct The CephContext to use
+   * @param name The name to assign ourselves
+   * @param _nonce A unique ID to use for this AsyncMessenger. It should not
+   * be a value that will be repeated if the daemon restarts.
+   */
+  AsyncMessenger(CephContext *cct, entity_name_t name, const std::string &type,
+                 std::string mname, uint64_t _nonce);
+
+  /**
+   * Destroy the AsyncMessenger. Pretty simple since all the work is done
+   * elsewhere.
+   */
+  ~AsyncMessenger() override;
+
+  /** @defgroup Accessors
+   * @{
+   */
+  bool set_addr_unknowns(const entity_addrvec_t &addr) override;
+  void set_addrs(const entity_addrvec_t &addrs) override;
+
+  int get_dispatch_queue_len() override {
+    return dispatch_queue.get_queue_len();
+  }
+
+  double get_dispatch_queue_max_age(utime_t now) override {
+    return dispatch_queue.get_max_age(now);
+  }
+  /** @} Accessors */
+
+  /**
+   * @defgroup Configuration functions
+   * @{
+   */
+  void set_cluster_protocol(int p) override {
+    ceph_assert(!started && !did_bind);
+    cluster_protocol = p;
+  }
+
+  int bind(const entity_addr_t& bind_addr) override;
+  int rebind(const std::set<int>& avoid_ports) override;
+  int bindv(const entity_addrvec_t& bind_addrs) override;
+
+  int client_bind(const entity_addr_t& bind_addr) override;
+
+  int client_reset() override;
+
+  bool should_use_msgr2() override;
+
+  /** @} Configuration functions */
+
+  /**
+   * @defgroup Startup/Shutdown
+   * @{
+   */
+  int start() override;
+  void wait() override;
+  int shutdown() override;
+
+  /** @} // Startup/Shutdown */
+
+  /**
+   * @defgroup Messaging
+   * @{
+   */
+  int send_to(Message *m, int type, const entity_addrvec_t& addrs) override;
+
+  /** @} // Messaging */
+
+  /**
+   * @defgroup Connection Management
+   * @{
+   */
+  ConnectionRef connect_to(int type,
+			   const entity_addrvec_t& addrs,
+			   bool anon, bool not_local_dest=false) override;
+  ConnectionRef get_loopback_connection() override;
+  void mark_down(const entity_addr_t& addr) override {
+    mark_down_addrs(entity_addrvec_t(addr));
+  }
+  void mark_down_addrs(const entity_addrvec_t& addrs) override;
+  void mark_down_all() override {
+    shutdown_connections(true);
+  }
+  /** @} // Connection Management */
+
+  /**
+   * @defgroup Inner classes
+   * @{
+   */
+
+  /**
+   * @} // Inner classes
+   */
+
+protected:
+  /**
+   * @defgroup Messenger Interfaces
+   * @{
+   */
+  /**
+   * Start up the DispatchQueue thread once we have somebody to dispatch to.
+   */
+  void ready() override;
+  /** @} // Messenger Interfaces */
+
+private:
+
+  /**
+   * @defgroup Utility functions
+   * @{
+   */
+
+  /**
+   * Create a connection associated with the given entity (of the given type).
+   * Initiate the connection. (This function returning does not guarantee
+   * connection success.)
+   *
+   * @param addrs The address(es) of the entity to connect to.
+   * @param type The peer type of the entity at the address.
+   *
+   * @return a pointer to the newly-created connection. Caller does not own a
+   * reference; take one if you need it.
+   */
+  AsyncConnectionRef create_connect(const entity_addrvec_t& addrs, int type,
+				    bool anon);
+
+
+  void _finish_bind(const entity_addrvec_t& bind_addrs,
+		    const entity_addrvec_t& listen_addrs);
+
+  entity_addrvec_t _filter_addrs(const entity_addrvec_t& addrs);
+
+ private:
+  NetworkStack *stack;
+  std::vector<Processor*> processors;
+  friend class Processor;
+  DispatchQueue dispatch_queue;
+
+  // the worker run messenger's cron jobs
+  Worker *local_worker;
+
+  std::string ms_type;
+
+  /// overall lock used for AsyncMessenger data structures
+  ceph::mutex lock = ceph::make_mutex("AsyncMessenger::lock");
+  // AsyncMessenger stuff
+  /// approximately unique ID set by the Constructor for use in entity_addr_t
+  uint64_t nonce;
+
+  /// true, specifying we haven't learned our addr; set false when we find it.
+  // maybe this should be protected by the lock?
+  bool need_addr = true;
+
+  /**
+   * set to bind addresses if bind was called before NetworkStack was ready to
+   * bind
+   */
+  entity_addrvec_t pending_bind_addrs;
+
+  /**
+   * false; set to true if a pending bind exists
+   */
+  bool pending_bind = false;
+
+  /**
+   *  The following aren't lock-protected since you shouldn't be able to race
+   *  the only writers.
+   */
+
+  /**
+   *  false; set to true if the AsyncMessenger bound to a specific address;
+   *  and set false again by Accepter::stop().
+   */
+  bool did_bind = false;
+  /// counter for the global seq our connection protocol uses
+  __u32 global_seq = 0;
+  /// lock to protect the global_seq
+  ceph::spinlock global_seq_lock;
+
+  /**
+   * hash map from peer addresses to AsyncConnection
+   *
+   * NOTE: an AsyncConnection with state CLOSED may still be in the map, but it is
+   * considered invalid and can be replaced by anyone holding the msgr lock
+   */
+  ceph::unordered_map<entity_addrvec_t, AsyncConnectionRef> conns;
+
+  /**
+   * set of connections that are still in the process of being accepted
+   *
+   * These are not yet in the conns map.
+   */
+  std::set<AsyncConnectionRef> accepting_conns;
+
+  /// anonymous outgoing connections
+  std::set<AsyncConnectionRef> anon_conns;
+
+  /**
+   * Set of closed connections that still need to be cleaned up.
+   *
+   * AsyncMessenger and AsyncConnection follow a lock-ordering rule:
+   * AsyncMessenger::lock may be taken before AsyncConnection::lock, but
+   * never the other way around. This rule exists to avoid deadlock.
+   * So when an AsyncConnection wants to unregister itself from the
+   * AsyncMessenger, it just queues itself in this set and is deleted
+   * lazily later. "_lookup_conn" must never return an AsyncConnection
+   * that is in this set.
+   */
+  ceph::mutex deleted_lock = ceph::make_mutex("AsyncMessenger::deleted_lock");
+  std::set<AsyncConnectionRef> deleted_conns;
+
+  EventCallbackRef reap_handler;
+
+  /// internal cluster protocol version, if any, for talking to entities of the same type.
+  int cluster_protocol = 0;
+
+  ceph::condition_variable  stop_cond;
+  bool stopped = true;
+
+  /* The caller must hold this->lock for the whole time the result is used! */
+  const auto& _lookup_conn(const entity_addrvec_t& k) {
+    static const AsyncConnectionRef nullref;
+    ceph_assert(ceph_mutex_is_locked(lock));
+    auto entry = conns.find(k);
+    if (entry == conns.end())
+      return nullref;
+    auto& conn = entry->second;
+
+    // Lazy delete (see "deleted_conns"): a connection that unregistered
+    // itself and is still queued for reaping is purged here and reported
+    // as absent; Connection::send_message can handle this case.
+    if (conn->is_unregistered()) {
+      std::lock_guard l{deleted_lock};
+      if (deleted_conns.erase(conn)) {
+	conn->get_perf_counter()->dec(l_msgr_active_connections);
+	conns.erase(entry);
+	return nullref;
+      }
+    }
+    return conn;
+  }
+
+  void _init_local_connection() {  // (re)fill the loopback connection's identity
+    ceph_assert(ceph_mutex_is_locked(lock));  // caller must already hold `lock`
+    local_connection->peer_addrs = *my_addrs;  // the peer of the local connection is ourselves
+    local_connection->peer_type = my_name.type();
+    local_connection->set_features(CEPH_FEATURES_ALL);
+    ms_deliver_handle_fast_connect(local_connection.get());  // called last, after the fields above are set
+  }
+
+  void shutdown_connections(bool queue_reset);
+
+public:
+
+  /// con used for sending messages to ourselves
+  AsyncConnectionRef local_connection;
+
+  /**
+   * @defgroup AsyncMessenger internals
+   * @{
+   */
+  /// Locking front end for _lookup_conn(): takes `lock` for the lookup and
+  /// hands back a copy of the reference (empty if nothing was found).
+  AsyncConnectionRef lookup_conn(const entity_addrvec_t& k) {
+    std::lock_guard guard{lock};
+    AsyncConnectionRef found = _lookup_conn(k);  // copy taken under the lock
+    return found;
+  }
+
+  int accept_conn(const AsyncConnectionRef& conn);
+  bool learned_addr(const entity_addr_t &peer_addr_for_me);
+  void add_accept(Worker *w, ConnectedSocket cli_socket,
+		  const entity_addr_t &listen_addr,
+		  const entity_addr_t &peer_addr);
+  NetworkStack *get_stack() {
+    return stack;
+  }
+
+  uint64_t get_nonce() const {
+    return nonce;
+  }
+
+  /**
+   * Advance the global sequence of this AsyncMessenger and return it.
+   * Meant for the connect protocol, but harmless for any other caller.
+   *
+   * @param old a sequence value seen elsewhere; the counter is first
+   *            raised to it when it is ahead of ours
+   * @return a global sequence ID that nobody else has seen.
+   */
+  __u32 get_global_seq(__u32 old=0) {
+    std::lock_guard<ceph::spinlock> guard(global_seq_lock);
+
+    // continue from whichever is larger: our counter or the caller's value
+    const __u32 base = (old > global_seq) ? old : global_seq;
+    global_seq = base + 1;
+    return global_seq;
+  }
+  /**
+   * Get the protocol version we support for the given peer type: either
+   * a peer protocol (if it matches our own), the protocol version for the
+   * peer (if we're connecting), or our protocol version (if we're accepting).
+   */
+  int get_proto_version(int peer_type, bool connect) const;
+
+  /**
+   * Mark the local connection as loopback and, under `lock`, fill in its
+   * address and peer type; it is used for delivering messages to ourselves.
+   */
+  void init_local_connection() {
+    std::lock_guard l{lock};
+    local_connection->is_loopback = true;
+    _init_local_connection();  // asserts that `lock` is held
+  }
+
+  /**
+   * Queue a connection for lazy removal from `conns` and mark it
+   * unregistered.  Once the queue reaches "ms_async_reap_threshold" the
+   * reap handler is dispatched to the local worker.  See "deleted_conns".
+   */
+  void unregister_conn(const AsyncConnectionRef& conn) {
+    std::lock_guard l{deleted_lock};
+    // copy the ref: `conn` is const, so std::move() could never move from it
+    deleted_conns.emplace(conn);
+    conn->unregister();
+    if (deleted_conns.size() >= cct->_conf.get_val<uint64_t>("ms_async_reap_threshold")) {
+      local_worker->center.dispatch_event_external(reap_handler);
+    }
+  }
+
+  /**
+   * Reap dead connections queued in `deleted_conns`.
+   *
+   * Returns nothing; the queue is emptied as a side effect.
+   *
+   * See "deleted_conns"
+   */
+  void reap_dead();
+
+  /**
+   * @} // AsyncMessenger Internals
+   */
+} ;
+
+#endif /* CEPH_ASYNCMESSENGER_H */
diff --git a/src/msg/async/Event.cc b/src/msg/async/Event.cc
new file mode 100644
index 000000000..2c545c07b
--- /dev/null
+++ b/src/msg/async/Event.cc
@@ -0,0 +1,479 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/errno.h"
+#include "Event.h"
+
+#ifdef HAVE_DPDK
+#include "dpdk/EventDPDK.h"
+#endif
+
+#ifdef HAVE_EPOLL
+#include "EventEpoll.h"
+#else
+#ifdef HAVE_KQUEUE
+#include "EventKqueue.h"
+#else
+#include "EventSelect.h"
+#endif
+#endif
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "EventCallback "
+class C_handle_notify : public EventCallback {
+  EventCenter *center;
+  CephContext *cct;
+
+ public:
+  C_handle_notify(EventCenter *c, CephContext *cc): center(c), cct(cc) {}
+  // Drain the notify fd: keep reading until a read returns nothing more.
+  void do_request(uint64_t fd_or_id) override {
+    char scratch[256];
+    for (;;) {
+      #ifdef _WIN32
+      const int got = recv(fd_or_id, scratch, sizeof(scratch), 0);
+      #else
+      const int got = read(fd_or_id, scratch, sizeof(scratch));
+      #endif
+      if (got < 0 && ceph_sock_errno() != EAGAIN)
+        ldout(cct, 1) << __func__ << " read notify pipe failed: " << cpp_strerror(ceph_sock_errno()) << dendl;
+      if (got <= 0)
+        break;
+    }
+  }
+};
+
+#undef dout_prefix
+#define dout_prefix _event_prefix(_dout)
+
+/**
+ * Construct a Poller and append it to its EventCenter's poller list.
+ *
+ * \param center
+ *      EventCenter object through which the poller will be invoked; it is
+ *      dereferenced here, so it must not be null.
+ * \param name
+ *      Human readable name that can be printed out in debugging messages
+ *      about the poller. The name of the subclass is probably sufficient
+ *      for most cases.
+ */
+EventCenter::Poller::Poller(EventCenter* center, const std::string& name)
+    : owner(center), poller_name(name), slot(owner->pollers.size())  // slot = index we are about to occupy
+{
+  owner->pollers.push_back(this);
+}
+
+/**
+ * Destroy a Poller, unlinking it from its EventCenter.
+ */
+EventCenter::Poller::~Poller()
+{
+  // Swap-and-pop: overwrite our slot with the last poller, then shrink.
+  //
+  // Note: this approach is reentrant (it is safe to delete a
+  // poller from a poller callback, which means that the poll
+  // method is in the middle of scanning the list of all pollers;
+  // the worst that will happen is that the poller that got moved
+  // may not be invoked in the current scan).
+  auto& registry = owner->pollers;
+  registry[slot] = registry.back();
+  registry[slot]->slot = slot;
+  registry.pop_back();
+  slot = -1;
+}
+
+std::ostream& EventCenter::_event_prefix(std::ostream *_dout)
+{
+  *_dout << "Event(" << this << " nevent=" << nevent;  // log-line prefix
+  return *_dout << " time_id=" << time_event_next_id << ").";
+}
+
+int EventCenter::init(int nevent, unsigned center_id, const std::string &type)
+{
+  // init() may only run once: this->nevent stays 0 until the driver is set up
+  ceph_assert(this->nevent == 0);
+
+  this->type = type;
+  this->center_id = center_id;
+
+  if (type == "dpdk") {
+#ifdef HAVE_DPDK
+    driver = new DPDKDriver(cct);
+#endif
+  } else {
+#ifdef HAVE_EPOLL
+  driver = new EpollDriver(cct);
+#else
+#ifdef HAVE_KQUEUE
+  driver = new KqueueDriver(cct);
+#else
+  driver = new SelectDriver(cct);
+#endif
+#endif
+  }
+
+  if (!driver) {  // e.g. "dpdk" was requested but HAVE_DPDK is not defined
+    lderr(cct) << __func__ << " failed to create event driver " << dendl;
+    return -1;
+  }
+
+  int r = driver->init(this, nevent);
+  if (r < 0) {
+    lderr(cct) << __func__ << " failed to init event driver." << dendl;
+    return r;
+  }
+
+  file_events.resize(nevent);  // the table is indexed directly by fd
+  this->nevent = nevent;
+
+  if (!driver->need_wakeup())  // this driver needs no notify channel
+    return 0;
+
+  int fds[2];  // [0] becomes the receive end, [1] the send end
+
+  #ifdef _WIN32
+  if (win_socketpair(fds) < 0) {
+  #else
+  if (pipe_cloexec(fds, 0) < 0) {
+  #endif
+    int e = ceph_sock_errno();
+    lderr(cct) << __func__ << " can't create notify pipe: " << cpp_strerror(e) << dendl;
+    return -e;
+  }
+
+  notify_receive_fd = fds[0];
+  notify_send_fd = fds[1];
+
+  r = net.set_nonblock(notify_receive_fd);  // both ends are made non-blocking
+  if (r < 0) {
+    return r;
+  }
+  r = net.set_nonblock(notify_send_fd);
+  if (r < 0) {
+    return r;
+  }
+
+  return r;
+}
+
+EventCenter::~EventCenter()
+{
+  {
+    // run whatever external callbacks are still queued, oldest first
+    std::lock_guard<std::mutex> guard(external_lock);
+    for (; !external_events.empty(); external_events.pop_front()) {
+      EventCallbackRef pending = external_events.front();
+      if (pending != nullptr)
+        pending->do_request(0);
+    }
+  }
+  time_events.clear();  // pending timers are dropped without being run
+
+  // close both ends of the notify channel; -1 means it was never opened
+  if (notify_receive_fd >= 0)
+    compat_closesocket(notify_receive_fd);
+  if (notify_send_fd >= 0)
+    compat_closesocket(notify_send_fd);
+
+  delete driver;
+  if (notify_handler != nullptr)
+    delete notify_handler;
+}
+
+// Bind this center to the calling thread; the first call also registers it.
+void EventCenter::set_owner()
+{
+  owner = pthread_self();
+  ldout(cct, 2) << __func__ << " center_id=" << center_id << " owner=" << owner << dendl;
+  if (!global_centers) {
+    global_centers = &cct->lookup_or_create_singleton_object<
+      EventCenter::AssociatedCenters>(
+	"AsyncMessenger::EventCenter::global_center::" + type, true);
+    ceph_assert(global_centers && center_id < MAX_EVENTCENTER);  // centers[] is fixed-size
+    global_centers->centers[center_id] = this;
+    if (driver->need_wakeup()) {
+      notify_handler = new C_handle_notify(this, cct);
+      int r = create_file_event(notify_receive_fd, EVENT_READABLE, notify_handler);
+      ceph_assert(r == 0);
+    }
+  }
+}
+
+int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt)
+{
+  // Register ctxt for the `mask` events on fd (owner thread only), growing
+  // the event table on demand.  Returns 0, or -ERANGE if it cannot grow.
+  ceph_assert(in_thread() && fd >= 0);  // a negative fd would index out of range
+  int r = 0;
+  if (fd >= nevent) {
+    int new_size = nevent << 2;
+    while (fd >= new_size)
+      new_size <<= 2;
+    ldout(cct, 20) << __func__ << " event count exceed " << nevent << ", expand to " << new_size << dendl;
+    r = driver->resize_events(new_size);
+    if (r < 0) {
+      lderr(cct) << __func__ << " event count is exceed." << dendl;
+      return -ERANGE;
+    }
+    file_events.resize(new_size);
+    nevent = new_size;
+  }
+
+  EventCenter::FileEvent *event = _get_file_event(fd);
+  ldout(cct, 20) << __func__ << " create event started fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+  if (event->mask == mask)
+    return 0;
+
+  r = driver->add_event(fd, event->mask, mask);
+  if (r < 0) {
+    // Callers are not prepared to handle an error status here, so any
+    // failure is treated as fatal.  In practice add_event should never
+    // report an error; if it does, it must be an internal bug!
+    lderr(cct) << __func__ << " add event failed, ret=" << r << " fd=" << fd
+               << " mask=" << mask << " original mask is " << event->mask << dendl;
+    ceph_abort_msg("BUG!");
+    return r;
+  }
+
+  event->mask |= mask;
+  if (mask & EVENT_READABLE)
+    event->read_cb = ctxt;
+  if (mask & EVENT_WRITABLE)
+    event->write_cb = ctxt;
+  ldout(cct, 20) << __func__ << " create event end fd=" << fd << " mask=" << mask
+                 << " current mask is " << event->mask << dendl;
+  return 0;
+}
+
+void EventCenter::delete_file_event(int fd, int mask)
+{
+  // Stop watching the `mask` events on fd and drop their callbacks (owner
+  // thread only).  An fd outside the table or without events is a no-op.
+  ceph_assert(in_thread() && fd >= 0);
+  if (fd >= nevent) {
+    ldout(cct, 1) << __func__ << " delete event fd=" << fd << " is equal or greater than nevent=" << nevent
+                  << " mask=" << mask << dendl;
+    return ;
+  }
+  EventCenter::FileEvent *event = _get_file_event(fd);
+  ldout(cct, 30) << __func__ << " delete event started fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+  if (!event->mask)
+    return ;
+
+  int r = driver->del_event(fd, event->mask, mask);
+  if (r < 0) {
+    // see create_file_event: a driver failure is treated as an internal bug
+    ceph_abort_msg("BUG!");
+  }
+
+  if (mask & EVENT_READABLE)
+    event->read_cb = nullptr;
+  if (mask & EVENT_WRITABLE)
+    event->write_cb = nullptr;
+
+  event->mask = event->mask & (~mask);
+  ldout(cct, 30) << __func__ << " delete event end fd=" << fd << " mask=" << mask
+                 << " current mask is " << event->mask << dendl;
+}
+
+uint64_t EventCenter::create_time_event(uint64_t microseconds, EventCallbackRef ctxt)
+{
+  ceph_assert(in_thread());
+  const uint64_t new_id = time_event_next_id++;
+  ldout(cct, 30) << __func__ << " id=" << new_id << " trigger after " << microseconds << "us"<< dendl;
+
+  // describe a timer that expires `microseconds` from now
+  TimeEvent timer;
+  timer.id = new_id;
+  timer.time_cb = ctxt;
+  const clock_type::time_point deadline = clock_type::now() + std::chrono::microseconds(microseconds);
+
+  // time_events is keyed by deadline; event_map finds the entry again by id
+  event_map[new_id] = time_events.emplace(deadline, timer);
+  return new_id;
+}
+
+void EventCenter::delete_time_event(uint64_t id)
+{
+  ceph_assert(in_thread());
+  ldout(cct, 30) << __func__ << " id=" << id << dendl;
+  // ids start at 1 and are handed out sequentially; anything else is bogus
+  if (id == 0 || id >= time_event_next_id)
+    return;
+
+  auto entry = event_map.find(id);
+  if (entry != event_map.end()) {
+    time_events.erase(entry->second);
+    event_map.erase(entry);
+  } else {
+    ldout(cct, 10) << __func__ << " id=" << id << " not found" << dendl;
+  }
+}
+
+void EventCenter::wakeup()
+{
+  // With pollers registered the loop never blocks, and some drivers need
+  // no wakeup at all; in both cases there is nothing to do.
+  const bool sleeps = pollers.empty() && driver->need_wakeup();
+  if (!sleeps)
+    return;
+  ldout(cct, 20) << __func__ << dendl;
+  // a single byte on the notify fd is enough to interrupt "event_wait"
+  const char token = 'c';
+  #ifdef _WIN32
+  const int sent = send(notify_send_fd, &token, sizeof(token), 0);
+  #else
+  const int sent = write(notify_send_fd, &token, sizeof(token));
+  #endif
+  // EAGAIN is tolerated; any other write failure is fatal
+  if (sent < 0 && ceph_sock_errno() != EAGAIN) {
+    ldout(cct, 1) << __func__ << " write notify pipe failed: "
+                  << cpp_strerror(ceph_sock_errno()) << dendl;
+    ceph_abort();
+  }
+}
+
+int EventCenter::process_time_events()
+{
+  const clock_type::time_point now = clock_type::now();
+  using ceph::operator <<;
+  ldout(cct, 30) << __func__ << " cur time is " << now << dendl;
+
+  // time_events is sorted by deadline, so the earliest timer is always first;
+  // fire timers until the first one that is still in the future.
+  int fired = 0;
+  while (!time_events.empty()) {
+    auto earliest = time_events.begin();
+    if (earliest->first > now)
+      break;
+    // Copy what is needed and unlink the timer before running it, so the
+    // callback already sees the timer as removed.
+    const uint64_t id = earliest->second.id;
+    EventCallbackRef cb = earliest->second.time_cb;
+    time_events.erase(earliest);
+    event_map.erase(id);
+    ldout(cct, 30) << __func__ << " process time event: id=" << id << dendl;
+    ++fired;
+    cb->do_request(id);
+  }
+  return fired;
+}
+
+int EventCenter::process_events(unsigned timeout_microseconds,  ceph::timespan *working_dur)
+{
+  // One loop pass: wait for file events (at most the timeout), then run due
+  // timers, external events and, if idle, pollers.  Returns the event count.
+  struct timeval tv;
+  int numevents;
+  bool trigger_time = false;
+  auto now = clock_type::now();
+  clock_type::time_point end_time = now + std::chrono::microseconds(timeout_microseconds);
+
+  auto it = time_events.begin();
+  if (it != time_events.end() && end_time >= it->first) {
+    trigger_time = true;
+    end_time = it->first;
+    timeout_microseconds = 0;
+    if (end_time > now)
+      timeout_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(end_time - now).count();
+  }
+
+  bool blocking = pollers.empty() && !external_num_events.load();
+  if (!blocking)
+    timeout_microseconds = 0;
+  tv.tv_sec = timeout_microseconds / 1000000;
+  tv.tv_usec = timeout_microseconds % 1000000;
+
+  ldout(cct, 30) << __func__ << " wait second " << tv.tv_sec << " usec " << tv.tv_usec << dendl;
+  std::vector<FiredFileEvent> fired_events;
+  numevents = driver->event_wait(fired_events, &tv);
+  auto working_start = ceph::mono_clock::now();
+  for (int event_id = 0; event_id < numevents; event_id++) {
+    int rfired = 0;
+    EventCallbackRef cb;
+    FileEvent *event = _get_file_event(fired_events[event_id].fd);
+    /* note the event->mask & mask & ... code: an already processed event
+     * may have removed an element that fired but has not been processed
+     * yet, so we check that the event is still valid. */
+    if (event->mask & fired_events[event_id].mask & EVENT_READABLE) {
+      rfired = 1;
+      cb = event->read_cb;
+      cb->do_request(fired_events[event_id].fd);
+      // the callback may have grown file_events via create_file_event(),
+      // which invalidates `event`: look the slot up again before reuse
+      event = _get_file_event(fired_events[event_id].fd);
+    }
+
+    if (event->mask & fired_events[event_id].mask & EVENT_WRITABLE) {
+      if (!rfired || event->read_cb != event->write_cb) {
+        cb = event->write_cb;
+        cb->do_request(fired_events[event_id].fd);
+      }
+    }
+
+    ldout(cct, 30) << __func__ << " event_wq process is " << fired_events[event_id].fd
+                   << " mask is " << fired_events[event_id].mask << dendl;
+  }
+
+  if (trigger_time)
+    numevents += process_time_events();
+
+  if (external_num_events.load()) {
+    external_lock.lock();
+    std::deque<EventCallbackRef> cur_process;
+    cur_process.swap(external_events);
+    external_num_events.store(0);
+    external_lock.unlock();
+    numevents += cur_process.size();
+    while (!cur_process.empty()) {
+      EventCallbackRef e = cur_process.front();
+      ldout(cct, 30) << __func__ << " do " << e << dendl;
+      e->do_request(0);
+      cur_process.pop_front();
+    }
+  }
+
+  if (!numevents && !blocking) {
+    for (uint32_t i = 0; i < pollers.size(); i++)
+      numevents += pollers[i]->poll();
+  }
+
+  if (working_dur)
+    *working_dur = ceph::mono_clock::now() - working_start;  // time spent after the wait
+  return numevents;
+}
+
+void EventCenter::dispatch_event_external(EventCallbackRef e)
+{
+  uint64_t pending = 0;
+  {
+    std::lock_guard guard{external_lock};
+    // drop the request if the same callback is already last in the queue
+    if (external_num_events > 0 && external_events.back() == e)
+      return;
+    external_events.push_back(e);
+    pending = ++external_num_events;
+  }
+  // only the first pending event wakes the loop, and only from another thread
+  if (pending == 1 && !in_thread())
+    wakeup();
+  ldout(cct, 30) << __func__ << " " << e << " pending " << pending << dendl;
+}
diff --git a/src/msg/async/Event.h b/src/msg/async/Event.h
new file mode 100644
index 000000000..1812db3cd
--- /dev/null
+++ b/src/msg/async/Event.h
@@ -0,0 +1,265 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENT_H
+#define CEPH_MSG_EVENT_H
+
+#ifdef __APPLE__
+#include <AvailabilityMacros.h>
+#endif
+
+// We use epoll, kqueue, evport, select in descending order by performance.
+#if defined(__linux__)
+#define HAVE_EPOLL 1
+#endif
+
+#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__)
+#define HAVE_KQUEUE 1
+#endif
+
+#ifdef __sun
+#include <sys/feature_tests.h>
+#ifdef _DTRACE_VERSION
+#define HAVE_EVPORT 1
+#endif
+#endif
+
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+
+#include "common/ceph_time.h"
+#include "common/dout.h"
+#include "net_handler.h"
+
+#define EVENT_NONE 0
+#define EVENT_READABLE 1
+#define EVENT_WRITABLE 2
+
+class EventCenter;
+
+class EventCallback {
+ public:
+  // invoked with the fd (file events), the timer id (time events) or 0
+  virtual void do_request(uint64_t fd_or_id) = 0;
+  virtual ~EventCallback() = default;  // callbacks are deleted through base pointers
+};
+
+typedef EventCallback* EventCallbackRef;
+
+struct FiredFileEvent {  // one ready descriptor as reported by EventDriver::event_wait
+  int fd;    // descriptor the event belongs to
+  int mask;  // EVENT_READABLE / EVENT_WRITABLE bits that fired
+};
+
+/*
+ * EventDriver wraps the event mechanism offered by the underlying OS.
+ * For example, Linux will use epoll(2), BSD will use kqueue(2), and select
+ * is the fallback when neither is available.
+ */
+class EventDriver {
+ public:
+  virtual ~EventDriver() {}       // drivers are deleted through EventDriver pointers
+  virtual int init(EventCenter *center, int nevent) = 0;
+  virtual int add_event(int fd, int cur_mask, int mask) = 0;  // a negative result is treated as fatal by EventCenter
+  virtual int del_event(int fd, int cur_mask, int del_mask) = 0;
+  virtual int event_wait(std::vector<FiredFileEvent> &fired_events, struct timeval *tp) = 0;
+  virtual int resize_events(int newsize) = 0;
+  virtual bool need_wakeup() { return true; }  // false: EventCenter creates no notify channel
+};
+
+/*
+ * EventCenter maintain a set of file descriptor and handle registered events.
+ */
+class EventCenter {
+ public:
+  // should be enough;
+  static const int MAX_EVENTCENTER = 24;
+
+ private:
+  using clock_type = ceph::coarse_mono_clock;
+
+  struct AssociatedCenters {
+    EventCenter *centers[MAX_EVENTCENTER];
+    AssociatedCenters() {
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(centers, 0, MAX_EVENTCENTER * sizeof(EventCenter*));
+    }
+  };
+
+  struct FileEvent {
+    int mask;
+    EventCallbackRef read_cb;
+    EventCallbackRef write_cb;
+    FileEvent(): mask(0), read_cb(NULL), write_cb(NULL) {}
+  };
+
+  struct TimeEvent {
+    uint64_t id;
+    EventCallbackRef time_cb;
+
+    TimeEvent(): id(0), time_cb(NULL) {}
+  };
+
+ public:
+  /**
+     * A Poller object is invoked once each time through the dispatcher's
+     * inner polling loop.
+     */
+  class Poller {
+   public:
+    explicit Poller(EventCenter* center, const std::string& pollerName);
+    virtual ~Poller();
+
+    /**
+     * This method is defined by a subclass and invoked once by the
+     * center during each pass through its inner polling loop.
+     *
+     * \return
+     *      1 means that this poller did useful work during this call.
+     *      0 means that the poller found no work to do.
+     */
+    virtual int poll() = 0;
+
+   private:
+    /// The EventCenter object that owns this Poller.  NULL means the
+    /// EventCenter has been deleted.
+    EventCenter* owner;
+
+    /// Human-readable string name given to the poller to make it
+    /// easy to identify for debugging. For most pollers just passing
+    /// in the subclass name probably makes sense.
+    std::string poller_name;
+
+    /// Index of this Poller in EventCenter::pollers.  Allows deletion
+    /// without having to scan all the entries in pollers. -1 means
+    /// this poller isn't currently in EventCenter::pollers (set by
+    /// ~Poller once it has removed itself).
+    int slot;
+  };
+
+ private:
+  CephContext *cct;
+  std::string type;
+  int nevent;
+  // Used only to external event
+  pthread_t owner = 0;
+  std::mutex external_lock;
+  std::atomic_ulong external_num_events;
+  std::deque<EventCallbackRef> external_events;
+  std::vector<FileEvent> file_events;
+  EventDriver *driver;
+  std::multimap<clock_type::time_point, TimeEvent> time_events;
+  // Keeps track of all of the pollers currently defined.  We don't
+  // use an intrusive list here because it isn't reentrant: we need
+  // to add/remove elements while the center is traversing the list.
+  std::vector<Poller*> pollers;
+  std::map<uint64_t, std::multimap<clock_type::time_point, TimeEvent>::iterator> event_map;
+  uint64_t time_event_next_id;
+  int notify_receive_fd;
+  int notify_send_fd;
+  ceph::NetHandler net;
+  EventCallbackRef notify_handler;
+  unsigned center_id;
+  AssociatedCenters *global_centers = nullptr;
+
+  int process_time_events();
+  FileEvent *_get_file_event(int fd) {  // slot lookup; fd is the table index
+    ceph_assert(fd >= 0 && fd < nevent);  // check both bounds before indexing
+    return &file_events[fd];  // pointer is invalidated when file_events is resized
+  }
+
+ public:
+  explicit EventCenter(CephContext *c):
+    cct(c), nevent(0),
+    external_num_events(0),
+    driver(NULL), time_event_next_id(1),
+    notify_receive_fd(-1), notify_send_fd(-1), net(c),
+    notify_handler(NULL), center_id(0) { }
+  ~EventCenter();
+  std::ostream& _event_prefix(std::ostream *_dout);
+
+  int init(int nevent, unsigned center_id, const std::string &type);
+  void set_owner();
+  pthread_t get_owner() const { return owner; }
+  unsigned get_id() const { return center_id; }
+
+  EventDriver *get_driver() { return driver; }
+
+  // Used by internal thread (the definitions assert in_thread() where required)
+  int create_file_event(int fd, int mask, EventCallbackRef ctxt);
+  uint64_t create_time_event(uint64_t microseconds, EventCallbackRef ctxt);  // delay is in microseconds
+  void delete_file_event(int fd, int mask);
+  void delete_time_event(uint64_t id);
+  int process_events(unsigned timeout_microseconds, ceph::timespan *working_dur = nullptr);
+  void wakeup();
+
+  // Used by external thread
+  void dispatch_event_external(EventCallbackRef e);
+  inline bool in_thread() const {
+    return pthread_equal(pthread_self(), owner);
+  }
+
+ private:
+  template <typename func>
+  class C_submit_event : public EventCallback {  // wraps a callable so it can be queued as an event
+    std::mutex lock;
+    std::condition_variable cond;
+    bool done = false;  // set under `lock` once f() has run
+    func f;
+    bool nonwait;  // true: nobody calls wait(); the event deletes itself after running
+   public:
+    C_submit_event(func &&_f, bool nowait)
+      : f(std::move(_f)), nonwait(nowait) {}
+    void do_request(uint64_t id) override {
+      f();
+      lock.lock();
+      cond.notify_all();
+      done = true;
+      bool del = nonwait;  // copied before unlocking; presumably because a waiter may destroy the event afterwards
+      lock.unlock();
+      if (del)
+        delete this;
+    }
+    void wait() {  // block until do_request() has set `done`
+      ceph_assert(!nonwait);
+      std::unique_lock<std::mutex> l(lock);
+      while (!done)
+        cond.wait(l);
+    }
+  };
+
+ public:
+  template <typename func>
+  void submit_to(int i, func &&f, bool always_async = false) {  // run f on the center registered under id i
+    ceph_assert(i < MAX_EVENTCENTER && global_centers);
+    EventCenter *c = global_centers->centers[i];
+    ceph_assert(c);
+    if (always_async) {  // fire and forget: the heap-allocated event deletes itself after running
+      C_submit_event<func> *event = new C_submit_event<func>(std::move(f), true);
+      c->dispatch_event_external(event);
+    } else if (c->in_thread()) {  // already on the target center's thread: call directly
+      f();
+      return;
+    } else {  // other thread: queue a stack-allocated event and block until it has run
+      C_submit_event<func> event(std::move(f), false);
+      c->dispatch_event_external(&event);
+      event.wait();
+    }
+  };
+};
+
+#endif
diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc
new file mode 100644
index 000000000..7ed5321dc
--- /dev/null
+++ b/src/msg/async/EventEpoll.cc
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include <fcntl.h>
+#include "EventEpoll.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "EpollDriver."
+
+// Allocate the result buffer for `nevent` events and create the epoll
+// instance, marking it close-on-exec.
+// Returns 0 on success or a negative errno value on failure.
+int EpollDriver::init(EventCenter *c, int nevent)
+{
+  events = (struct epoll_event*)calloc(nevent, sizeof(struct epoll_event));
+  if (!events) {
+    lderr(cct) << __func__ << " unable to malloc memory. " << dendl;
+    return -ENOMEM;
+  }
+
+  epfd = epoll_create(1024); /* 1024 is just a hint for the kernel */
+  if (epfd == -1) {
+    // Save errno before logging; the logging path may overwrite it.
+    int e = errno;
+    lderr(cct) << __func__ << " unable to do epoll_create: "
+                       << cpp_strerror(e) << dendl;
+    return -e;
+  }
+  if (::fcntl(epfd, F_SETFD, FD_CLOEXEC) == -1) {
+    int e = errno;
+    ::close(epfd);
+    // Forget the descriptor so the destructor does not close it a second
+    // time (by then the number may belong to an unrelated file).
+    epfd = -1;
+    lderr(cct) << __func__ << " unable to set cloexec: "
+                       << cpp_strerror(e) << dendl;
+
+    return -e;
+  }
+
+  this->nevent = nevent;
+
+  return 0;
+}
+
+// Start monitoring `add_mask` on `fd` in addition to the already monitored
+// `cur_mask`. Events are registered edge-triggered (EPOLLET).
+// Returns 0 on success or the negative errno reported by epoll_ctl().
+int EpollDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask
+                 << " add_mask=" << add_mask << " to " << epfd << dendl;
+  struct epoll_event ee;
+  /* If the fd was already monitored for some event, we need a MOD
+   * operation. Otherwise we need an ADD operation. */
+  int op = cur_mask == EVENT_NONE ? EPOLL_CTL_ADD: EPOLL_CTL_MOD;
+
+  ee.events = EPOLLET;
+  add_mask |= cur_mask; /* Merge old events */
+  if (add_mask & EVENT_READABLE)
+    ee.events |= EPOLLIN;
+  if (add_mask & EVENT_WRITABLE)
+    ee.events |= EPOLLOUT;
+  ee.data.u64 = 0; /* avoid valgrind warning */
+  ee.data.fd = fd;
+  if (epoll_ctl(epfd, op, fd, &ee) == -1) {
+    // Save errno before logging; the logging path may overwrite it.
+    int e = errno;
+    lderr(cct) << __func__ << " epoll_ctl: add fd=" << fd << " failed. "
+               << cpp_strerror(e) << dendl;
+    return -e;
+  }
+
+  return 0;
+}
+
+// Stop monitoring `delmask` on `fd`. If other events from `cur_mask` remain
+// the registration is modified, otherwise the fd is removed from the epoll
+// set. Returns 0 on success or the negative errno reported by epoll_ctl().
+int EpollDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
+                 << " delmask=" << delmask << " to " << epfd << dendl;
+  struct epoll_event ee = {0};
+  int mask = cur_mask & (~delmask);
+
+  if (mask != EVENT_NONE) {
+    ee.events = EPOLLET;
+    ee.data.fd = fd;
+    if (mask & EVENT_READABLE)
+      ee.events |= EPOLLIN;
+    if (mask & EVENT_WRITABLE)
+      ee.events |= EPOLLOUT;
+
+    if (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee) < 0) {
+      // Save errno before logging; the logging path may overwrite it.
+      int e = errno;
+      lderr(cct) << __func__ << " epoll_ctl: modify fd=" << fd << " mask=" << mask
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  } else {
+    /* Note, Kernel < 2.6.9 requires a non null event pointer even for
+     * EPOLL_CTL_DEL. */
+    if (epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &ee) < 0) {
+      int e = errno;
+      lderr(cct) << __func__ << " epoll_ctl: delete fd=" << fd
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  }
+  return 0;
+}
+
+// No-op for this driver: `newsize` is ignored and 0 is always returned.
+int EpollDriver::resize_events(int newsize)
+{
+  const int rc = 0;
+  return rc;
+}
+
+// Wait for events with epoll_wait() and translate them into `fired_events`.
+// A null `tvp` blocks without a timeout; otherwise the timeval is converted
+// to milliseconds. Returns the number of fired events, or 0 when
+// epoll_wait() reports none or fails.
+int EpollDriver::event_wait(std::vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  const auto timeout_ms = tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1;
+
+  const int nready = epoll_wait(epfd, events, nevent, timeout_ms);
+  if (nready <= 0)
+    return 0;
+
+  fired_events.resize(nready);
+  for (int idx = 0; idx < nready; ++idx) {
+    const struct epoll_event &ev = events[idx];
+    int mask = 0;
+
+    if (ev.events & EPOLLIN)
+      mask |= EVENT_READABLE;
+    if (ev.events & EPOLLOUT)
+      mask |= EVENT_WRITABLE;
+    // errors and hang-ups are reported as both readable and writable
+    if (ev.events & (EPOLLERR | EPOLLHUP))
+      mask |= EVENT_READABLE|EVENT_WRITABLE;
+
+    fired_events[idx].fd = ev.data.fd;
+    fired_events[idx].mask = mask;
+  }
+  return nready;
+}
diff --git a/src/msg/async/EventEpoll.h b/src/msg/async/EventEpoll.h
new file mode 100644
index 000000000..454ecbc34
--- /dev/null
+++ b/src/msg/async/EventEpoll.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTEPOLL_H
+#define CEPH_MSG_EVENTEPOLL_H
+
+#include <unistd.h>
+#include <sys/epoll.h>
+
+#include "Event.h"
+
+// EventDriver implementation on top of epoll.
+class EpollDriver : public EventDriver {
+  int epfd;                    // epoll descriptor, -1 while none is open
+  struct epoll_event *events;  // result buffer passed to epoll_wait()
+  CephContext *cct;
+  int nevent;                  // number of entries in `events`
+
+ public:
+  explicit EpollDriver(CephContext *c)
+    : epfd(-1), events(nullptr), cct(c), nevent(0) {}
+  ~EpollDriver() override {
+    if (epfd != -1) {
+      close(epfd);
+    }
+    // free() accepts a null pointer, so no guard is needed
+    free(events);
+  }
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(std::vector<FiredFileEvent> &fired_events,
+		 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/EventKqueue.cc b/src/msg/async/EventKqueue.cc
new file mode 100644
index 000000000..d6ba4a3db
--- /dev/null
+++ b/src/msg/async/EventKqueue.cc
@@ -0,0 +1,267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "EventKqueue.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "KqueueDriver."
+
+#define KEVENT_NOWAIT 0
+
+// Probe `kqfd` with an empty kevent() call (no changes, no result slots).
+// Returns kqfd when the call succeeds, otherwise the negative errno.
+int KqueueDriver::test_kqfd() {
+  struct kevent ke[1];
+  if (kevent(kqfd, ke, 0, NULL, 0, KEVENT_NOWAIT) == -1) {
+    // Save errno before logging; the logging path may overwrite it.
+    int e = errno;
+    ldout(cct,0) << __func__ << " invalid kqfd = " << kqfd 
+                 << cpp_strerror(e) << dendl;
+    return -e;
+  }
+  return kqfd;
+}
+
+// Re-register on the current kqfd every fd/mask pair remembered in
+// `sav_events`. Returns 0 on success or the negative errno from kevent().
+int KqueueDriver::restore_events() {
+  struct kevent ke[2];
+  int i;
+
+  ldout(cct,30) << __func__ << " on kqfd = " << kqfd << dendl;
+  // Walk the whole bookkeeping table: it holds `sav_max` entries, which can
+  // exceed `size` (the capacity of res_events) once resize_events() has run.
+  for(i=0;i<sav_max;i++) {
+    int num = 0;
+    if (sav_events[i].mask == 0 )
+      continue;
+    ldout(cct,30) << __func__ << " restore kqfd = " << kqfd 
+                  << " fd = " << i << " mask " << sav_events[i].mask << dendl;
+    // Use the same flags as add_event() so a restored registration behaves
+    // like the original one.
+    if (sav_events[i].mask & EVENT_READABLE)
+      EV_SET(&ke[num++], i, EVFILT_READ, EV_ADD|EV_CLEAR, 0, 0, NULL);
+    if (sav_events[i].mask & EVENT_WRITABLE)
+      EV_SET(&ke[num++], i, EVFILT_WRITE, EV_ADD|EV_CLEAR, 0, 0, NULL);
+    if (num) {
+      if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) {
+        // Save errno before logging; the logging path may overwrite it.
+        int e = errno;
+        ldout(cct,0) << __func__ << " unable to add event: "
+                     << cpp_strerror(e) << dendl;
+        return -e;
+      }
+    }
+  }
+  return 0;
+}
+
+// Make sure `kqfd` is usable from the calling thread. If the thread differs
+// from the recorded one, or the existing kqfd fails test_kqfd(), a new
+// kqueue is created and the saved events are restored onto it.
+// `funcname` only labels the log output.
+// Returns 0 on success or a negative errno value.
+int KqueueDriver::test_thread_change(const char* funcname) {
+  // check to see if we changed thread, because that invalidates
+  // the kqfd and we need to restore that
+  int oldkqfd = kqfd;
+
+  if (!pthread_equal(mythread, pthread_self())) {
+    ldout(cct,20) << funcname << " We changed thread from " << mythread
+                  << " to " << pthread_self() << dendl;
+    mythread = pthread_self();
+    kqfd = -1;
+  } else if ((kqfd != -1) && (test_kqfd() < 0)) {
+    // should this ever happen?
+    // It would be strange to change kqfd with thread change.
+    // Might need to change this into a ceph_assert() in the future.
+    ldout(cct,0) << funcname << " Warning: Recreating old kqfd. "
+                 << "This should not happen!!!"  << dendl;
+    kqfd = -1;
+  }
+  if (kqfd == -1) {
+    kqfd = kqueue();
+    // Capture errno right away: the debug logging below may overwrite it.
+    int e = errno;
+    ldout(cct,30) << funcname << " kqueue: new kqfd = " << kqfd
+                  << " (was: " << oldkqfd << ")"
+                  << dendl;
+    if (kqfd < 0) {
+      lderr(cct) << funcname << " unable to do kqueue: "
+                             << cpp_strerror(e) << dendl;
+      return -e;
+    }
+    // Propagate the helper's own error code instead of re-reading errno,
+    // which may be stale (even 0) after the helper has logged.
+    int r = restore_events();
+    if (r < 0) {
+      lderr(cct) << funcname << " unable restore all events "
+                             << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+  return 0;
+}
+
+// Record the calling thread and allocate zero-filled arrays of `nevent`
+// entries for kevent() results and for the saved per-fd masks. The kqueue
+// descriptor itself is created later, on first use.
+// Returns 0 on success or -ENOMEM when an allocation fails.
+int KqueueDriver::init(EventCenter *c, int nevent)
+{
+  // keep track of possible changes of our thread
+  // because change of thread kills the kqfd
+  mythread = pthread_self();
+
+  // Reserve the space to accept the kevent return events.
+  // calloc() zero-fills and fails cleanly if count*size would overflow.
+  res_events = (struct kevent*)calloc(nevent, sizeof(struct kevent));
+  if (!res_events) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+                           << cpp_strerror(errno) << dendl;
+    return -ENOMEM;
+  }
+  size = nevent;
+
+  // Reserve the space to keep all of the events set, so it can be redone
+  // when we change thread ID.
+  sav_events = (struct SaveEvent*)calloc(nevent, sizeof(struct SaveEvent));
+  if (!sav_events) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+                           << cpp_strerror(errno) << dendl;
+    return -ENOMEM;
+  }
+  sav_max = nevent;
+
+  // Delay assigning a descriptor until it is really needed.
+  // kqfd = kqueue();
+  kqfd = -1;
+  return 0;
+}
+
+// Register `add_mask` for `fd` on the kqueue and remember the resulting mask
+// in `sav_events` so it can be restored later.
+// Returns 0 on success or a negative errno value.
+int KqueueDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  struct kevent ke[2];
+  int num = 0;
+
+  ldout(cct,30) << __func__ << " add event kqfd = " << kqfd << " fd = " << fd 
+	<< " cur_mask = " << cur_mask << " add_mask = " << add_mask 
+	<< dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (add_mask & EVENT_READABLE)
+    EV_SET(&ke[num++], fd, EVFILT_READ, EV_ADD|EV_CLEAR, 0, 0, NULL);
+  if (add_mask & EVENT_WRITABLE)
+    EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_ADD|EV_CLEAR, 0, 0, NULL);
+
+  if (num) {
+    if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) {
+      // Save errno before logging; the logging path may overwrite it.
+      int e = errno;
+      lderr(cct) << __func__ << " unable to add event: "
+                             << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  }
+  // keep what we set
+  if (fd >= sav_max) {
+    // Grow by the usual step, but always far enough to contain `fd`;
+    // a fixed step alone could still leave the index out of bounds.
+    int newsize = sav_max + 5000;
+    if (newsize <= fd)
+      newsize = fd + 1;
+    r = resize_events(newsize);
+    if (r < 0)
+      return r;
+  }
+  sav_events[fd].mask = cur_mask | add_mask;
+  return 0;
+}
+
+// Remove the filters in `del_mask` that are currently set for `fd` and
+// update the saved mask accordingly.
+// Returns 0 on success or a negative errno value.
+int KqueueDriver::del_event(int fd, int cur_mask, int del_mask)
+{
+  struct kevent ke[2];
+  int num = 0;
+  // only filters that are actually registered can be deleted
+  int mask = cur_mask & del_mask;
+
+  ldout(cct,30) << __func__ << " delete event kqfd = " << kqfd 
+	<< " fd = " << fd << " cur_mask = " << cur_mask 
+	<< " del_mask = " << del_mask << dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (mask & EVENT_READABLE)
+    EV_SET(&ke[num++], fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+  if (mask & EVENT_WRITABLE)
+    EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
+
+  if (num) {
+    if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) < 0) {
+      // Save errno before logging; the logging path may overwrite it.
+      int e = errno;
+      lderr(cct) << __func__ << " kevent: delete fd=" << fd << " mask=" << mask
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  }
+  // keep the administration
+  // An fd outside the table was never recorded, so there is nothing to
+  // update; writing there would corrupt memory.
+  if (fd >= 0 && fd < sav_max)
+    sav_events[fd].mask = cur_mask & ~del_mask;
+  return 0;
+}
+
+// Grow the saved-event table to `newsize` entries, zero-filling the newly
+// added tail. Requests that do not enlarge the table are ignored.
+// Returns 0; an allocation failure trips ceph_assert().
+int KqueueDriver::resize_events(int newsize)
+{
+  ldout(cct,30) << __func__ << " kqfd = " << kqfd << " newsize = " << newsize 
+                << dendl;
+  if (newsize > sav_max) {
+    // Keep the old pointer until realloc() has succeeded.
+    struct SaveEvent *grown =
+      (struct SaveEvent*)realloc(sav_events, sizeof(struct SaveEvent)*newsize);
+    if (!grown) {
+      lderr(cct) << __func__ << " unable to realloc memory: "
+                             << cpp_strerror(errno) << dendl;
+      ceph_assert(grown);
+      return -ENOMEM;
+    }
+    sav_events = grown;
+    // Clear only the new entries, i.e. [sav_max, newsize). Starting at
+    // `size` (the res_events capacity) would wipe live entries after the
+    // first resize and leave part of the new tail uninitialised.
+    memset(&sav_events[sav_max], 0, sizeof(struct SaveEvent)*(newsize-sav_max));
+    sav_max = newsize;
+  }
+  return 0;
+}
+
+// Wait for events with kevent() and translate them into `fired_events`.
+// With a non-null `tvp` the wait is bounded by that interval; with a null
+// `tvp` kevent() is called with a null timeout pointer.
+// Returns the number of fired events (0 on timeout) or a negative errno.
+int KqueueDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  int retval, numevents = 0;
+  // Zero-initialised so the timeout log below never prints indeterminate
+  // values when no timeval was supplied.
+  struct timespec timeout = {0, 0};
+
+  ldout(cct,10) << __func__ << " kqfd = " << kqfd << dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (tvp != NULL) {
+      timeout.tv_sec = tvp->tv_sec;
+      timeout.tv_nsec = tvp->tv_usec * 1000;
+      ldout(cct,20) << __func__ << " "
+		<< timeout.tv_sec << " sec "
+		<< timeout.tv_nsec << " nsec"
+		<< dendl;
+      retval = kevent(kqfd, NULL, 0, res_events, size, &timeout);
+  } else {
+      ldout(cct,30) << __func__ << " event_wait: " << " NULL" << dendl;
+      // KEVENT_NOWAIT expands to 0, i.e. a null timeout pointer.
+      retval = kevent(kqfd, NULL, 0, res_events, size, KEVENT_NOWAIT);
+  }
+  // Capture errno right away: the debug logging below may overwrite it.
+  int e = errno;
+
+  ldout(cct,25) << __func__ << " kevent retval: " << retval << dendl;
+  if (retval < 0) {
+    lderr(cct) << __func__ << " kqueue error: "
+                           << cpp_strerror(e) << dendl;
+    return -e;
+  } else if (retval == 0) {
+    ldout(cct,5) << __func__ << " Hit timeout("
+                 << timeout.tv_sec << " sec "
+                 << timeout.tv_nsec << " nsec"
+		 << ")." << dendl;
+  } else {
+    int j;
+
+    numevents = retval;
+    fired_events.resize(numevents);
+    for (j = 0; j < numevents; j++) {
+      int mask = 0;
+      struct kevent *e = res_events + j;
+
+      if (e->filter == EVFILT_READ) mask |= EVENT_READABLE;
+      if (e->filter == EVFILT_WRITE) mask |= EVENT_WRITABLE;
+      // an error on the entry is reported as both readable and writable
+      if (e->flags & EV_ERROR) mask |= EVENT_READABLE|EVENT_WRITABLE;
+      fired_events[j].fd = (int)e->ident;
+      fired_events[j].mask = mask;
+
+    }
+  }
+  return numevents;
+}
diff --git a/src/msg/async/EventKqueue.h b/src/msg/async/EventKqueue.h
new file mode 100644
index 000000000..24863a93d
--- /dev/null
+++ b/src/msg/async/EventKqueue.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTKQUEUE_H
+#define CEPH_MSG_EVENTKQUEUE_H
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <unistd.h>
+
+#include "Event.h"
+
+// EventDriver implementation on top of kqueue. Besides the kqueue descriptor
+// it keeps a per-fd table of the masks it registered (`sav_events`) so the
+// registrations can be replayed onto a new kqueue.
+class KqueueDriver : public EventDriver {
+  int kqfd;                    // kqueue descriptor, -1 while none is open
+  pthread_t mythread;          // thread recorded by init()/test_thread_change()
+  struct kevent *res_events;   // result buffer passed to kevent()
+  CephContext *cct;
+  int size;                    // number of entries in `res_events`
+
+  // Keep what we set on the kqfd
+  struct SaveEvent{
+    int fd;
+    int mask;
+  };
+  struct SaveEvent *sav_events; // indexed by fd
+  int sav_max;                  // number of entries in `sav_events`
+  int restore_events();
+  int test_kqfd();
+  int test_thread_change(const char* funcname);
+
+ public:
+  // sav_events must start out null: the destructor tests and frees it even
+  // when init() was never called or failed before allocating it.
+  explicit KqueueDriver(CephContext *c): kqfd(-1), res_events(NULL), cct(c), 
+		size(0), sav_events(NULL), sav_max(0) {}
+  virtual ~KqueueDriver() {
+    if (kqfd != -1)
+      close(kqfd);
+
+    if (res_events)
+      free(res_events);
+    size = 0;
+    if (sav_events)
+      free(sav_events);
+    sav_max = 0;
+  }
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(vector<FiredFileEvent> &fired_events,
+		 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/EventSelect.cc b/src/msg/async/EventSelect.cc
new file mode 100644
index 000000000..35000ccea
--- /dev/null
+++ b/src/msg/async/EventSelect.cc
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "EventSelect.h"
+
+#include <unistd.h>
+#include <sys/select.h>
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "SelectDriver."
+
+// Reset both interest sets and the highest-fd tracker. On non-Windows builds
+// a level-0 log line notes that this driver is not meant for production.
+// Always returns 0; `c` and `nevent` are unused.
+int SelectDriver::init(EventCenter *c, int nevent)
+{
+  #ifndef _WIN32
+  ldout(cct, 0) << "Select isn't suitable for production env, just avoid "
+                << "compiling error or special purpose" << dendl;
+  #endif
+  max_fd = 0;
+  FD_ZERO(&wfds);
+  FD_ZERO(&rfds);
+  return 0;
+}
+
+int SelectDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 10) << __func__ << " add event to fd=" << fd << " mask=" << add_mask
+                 << dendl;
+
+  int mask = cur_mask | add_mask;
+  if (mask & EVENT_READABLE)
+    FD_SET(fd, &rfds);
+  if (mask & EVENT_WRITABLE)
+    FD_SET(fd, &wfds);
+  if (fd > max_fd)
+      max_fd = fd;
+
+  return 0;
+}
+
+// Remove `fd` from the interest sets named in `delmask`. `cur_mask` is only
+// logged. Always returns 0.
+int SelectDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 10) << __func__ << " del event fd=" << fd << " cur mask=" << cur_mask
+                 << dendl;
+
+  const bool drop_read = (delmask & EVENT_READABLE) != 0;
+  const bool drop_write = (delmask & EVENT_WRITABLE) != 0;
+
+  if (drop_read) {
+    FD_CLR(fd, &rfds);
+  }
+  if (drop_write) {
+    FD_CLR(fd, &wfds);
+  }
+  return 0;
+}
+
+// No-op for this driver: `newsize` is ignored and 0 is always returned.
+int SelectDriver::resize_events(int newsize)
+{
+  const int rc = 0;
+  return rc;
+}
+
+// Run select() on copies of the interest sets and append one entry to
+// `fired_events` for every descriptor up to max_fd that is ready.
+// Returns the number of entries appended (0 when select() reports nothing
+// or fails).
+int SelectDriver::event_wait(std::vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  // select() overwrites the sets it is given, so work on scratch copies
+  _rfds = rfds;
+  _wfds = wfds;
+
+  const int nready = select(max_fd+1, &_rfds, &_wfds, NULL, tvp);
+  if (nready <= 0)
+    return 0;
+
+  int numevents = 0;
+  for (int cur = 0; cur <= max_fd; ++cur) {
+    int mask = 0;
+    struct FiredFileEvent fe;
+    if (FD_ISSET(cur, &_rfds))
+      mask |= EVENT_READABLE;
+    if (FD_ISSET(cur, &_wfds))
+      mask |= EVENT_WRITABLE;
+    if (!mask)
+      continue;
+
+    fe.fd = cur;
+    fe.mask = mask;
+    fired_events.push_back(fe);
+    ++numevents;
+  }
+  return numevents;
+}
diff --git a/src/msg/async/EventSelect.h b/src/msg/async/EventSelect.h
new file mode 100644
index 000000000..08af57bcf
--- /dev/null
+++ b/src/msg/async/EventSelect.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTSELECT_H
+#define CEPH_MSG_EVENTSELECT_H
+
+#include "Event.h"
+
+// EventDriver implementation on top of select().
+class SelectDriver : public EventDriver {
+  // descriptors currently monitored for reading / writing
+  fd_set rfds, wfds;
+  /* Scratch copies handed to select(), which modifies the sets it is
+   * given; the originals above must stay intact between calls. */
+  fd_set _rfds, _wfds;
+  int max_fd;        // highest descriptor ever added
+  CephContext *cct;
+
+ public:
+  explicit SelectDriver(CephContext *c)
+    : max_fd(0), cct(c) {}
+  ~SelectDriver() override = default;
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(std::vector<FiredFileEvent> &fired_events,
+		 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/PosixStack.cc b/src/msg/async/PosixStack.cc
new file mode 100644
index 000000000..a38e82cf3
--- /dev/null
+++ b/src/msg/async/PosixStack.cc
@@ -0,0 +1,338 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <errno.h>
+
+#include <algorithm>
+
+#include "PosixStack.h"
+
+#include "include/buffer.h"
+#include "include/str_list.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "common/dout.h"
+#include "msg/Messenger.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "PosixStack "
+
+// ConnectedSocketImpl backed by a plain socket descriptor.
+// `connected` is false for sockets created by a non-blocking connect until
+// is_connected() has seen handler.reconnect() succeed.
+class PosixConnectedSocketImpl final : public ConnectedSocketImpl {
+  ceph::NetHandler &handler;
+  int _fd;
+  entity_addr_t sa;
+  bool connected;
+
+ public:
+  explicit PosixConnectedSocketImpl(ceph::NetHandler &h, const entity_addr_t &sa,
+				    int f, bool connected)
+      : handler(h), _fd(f), sa(sa), connected(connected) {}
+
+  // Returns 1 when connected, 0 when the connection attempt is still
+  // pending, or the negative value reported by handler.reconnect().
+  int is_connected() override {
+    if (connected)
+      return 1;
+
+    int r = handler.reconnect(sa, _fd);
+    if (r == 0) {
+      connected = true;
+      return 1;
+    } else if (r < 0) {
+      return r;
+    } else {
+      return 0;
+    }
+  }
+
+  // Read up to `len` bytes into `buf`.
+  // Returns the byte count, or the negative socket errno on failure.
+  ssize_t read(char *buf, size_t len) override {
+    #ifdef _WIN32
+    ssize_t r = ::recv(_fd, buf, len, 0);
+    #else
+    ssize_t r = ::read(_fd, buf, len);
+    #endif
+    if (r < 0)
+      r = -ceph_sock_errno();
+    return r;
+  }
+
+  // return the sent length
+  // < 0 means error occurred
+  #ifndef _WIN32
+  // Push the iovecs in `msg` (`len` bytes in total) with sendmsg(), retrying
+  // on EINTR and advancing the iovec array after short writes. Stops early
+  // on EAGAIN and returns what was sent so far.
+  static ssize_t do_sendmsg(int fd, struct msghdr &msg, unsigned len, bool more)
+  {
+    size_t sent = 0;
+    while (1) {
+      MSGR_SIGPIPE_STOPPER;
+      ssize_t r;
+      r = ::sendmsg(fd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
+      if (r < 0) {
+        int err = ceph_sock_errno();
+        if (err == EINTR) {
+          continue;
+        } else if (err == EAGAIN) {
+          break;
+        }
+        return -err;
+      }
+
+      sent += r;
+      if (len == sent) break;
+
+      while (r > 0) {
+        if (msg.msg_iov[0].iov_len <= (size_t)r) {
+          // drain this whole item
+          r -= msg.msg_iov[0].iov_len;
+          msg.msg_iov++;
+          msg.msg_iovlen--;
+        } else {
+          // partially sent item: skip the part that went out
+          msg.msg_iov[0].iov_base = (char *)msg.msg_iov[0].iov_base + r;
+          msg.msg_iov[0].iov_len -= r;
+          break;
+        }
+      }
+    }
+    return (ssize_t)sent;
+  }
+
+  // Send as much of `bl` as the socket accepts, in batches of at most
+  // IOV_MAX buffers, and drop the sent prefix from `bl`.
+  // Returns the number of bytes sent or a negative errno.
+  ssize_t send(ceph::buffer::list &bl, bool more) override {
+    size_t sent_bytes = 0;
+    auto pb = std::cbegin(bl.buffers());
+    uint64_t left_pbrs = bl.get_num_buffers();
+    while (left_pbrs) {
+      struct msghdr msg;
+      struct iovec msgvec[IOV_MAX];
+      uint64_t size = std::min<uint64_t>(left_pbrs, IOV_MAX);
+      left_pbrs -= size;
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(&msg, 0, sizeof(msg));
+      msg.msg_iovlen = size;
+      msg.msg_iov = msgvec;
+      unsigned msglen = 0;
+      for (auto iov = msgvec; iov != msgvec + size; iov++) {
+        iov->iov_base = (void*)(pb->c_str());
+        iov->iov_len = pb->length();
+        msglen += pb->length();
+        ++pb;
+      }
+      ssize_t r = do_sendmsg(_fd, msg, msglen, left_pbrs || more);
+      if (r < 0)
+        return r;
+
+      // "r" is the number of bytes that were actually sent
+      sent_bytes += r;
+      // a short write means the socket would block: stop here
+      if (static_cast<unsigned>(r) < msglen)
+        break;
+      // the whole batch went out: continue with the next one
+    }
+
+    if (sent_bytes) {
+      ceph::buffer::list swapped;
+      if (sent_bytes < bl.length()) {
+        // keep only the unsent tail in `bl`
+        bl.splice(sent_bytes, bl.length()-sent_bytes, &swapped);
+        bl.swap(swapped);
+      } else {
+        bl.clear();
+      }
+    }
+
+    return static_cast<ssize_t>(sent_bytes);
+  }
+  #else
+  // Windows variant of send(): same contract, implemented with WSASend().
+  ssize_t send(bufferlist &bl, bool more) override
+  {
+    size_t total_sent_bytes = 0;
+    auto pb = std::cbegin(bl.buffers());
+    uint64_t left_pbrs = bl.get_num_buffers();
+    while (left_pbrs) {
+      WSABUF msgvec[IOV_MAX];
+      uint64_t size = std::min<uint64_t>(left_pbrs, IOV_MAX);
+      left_pbrs -= size;
+      unsigned msglen = 0;
+      for (auto iov = msgvec; iov != msgvec + size; iov++) {
+        iov->buf = const_cast<char*>(pb->c_str());
+        iov->len = pb->length();
+        msglen += pb->length();
+        ++pb;
+      }
+      DWORD sent_bytes = 0;
+      DWORD flags = 0;
+      if (more)
+        flags |= MSG_PARTIAL;
+
+      int ret_val = WSASend(_fd, msgvec, size, &sent_bytes, flags, NULL, NULL);
+      if (ret_val) {
+        // WSASend() signals failure with SOCKET_ERROR (-1); negating that
+        // would produce +1, which callers read as "one byte sent". Report
+        // the real socket error instead, as read() does.
+        int err = ceph_sock_errno();
+        // NOTE(review): assumes ceph_sock_errno() yields errno-style codes
+        // on Windows, as the EINTR/EAGAIN checks elsewhere in this file
+        // suggest -- confirm the mapping of WSAEWOULDBLOCK.
+        if (err == EAGAIN || err == EWOULDBLOCK)
+          break;  // would block: report what has been sent so far
+        return -err;
+      }
+
+      total_sent_bytes += sent_bytes;
+      if (static_cast<unsigned>(sent_bytes) < msglen)
+        break;
+    }
+
+    if (total_sent_bytes) {
+      bufferlist swapped;
+      if (total_sent_bytes < bl.length()) {
+        // keep only the unsent tail in `bl`
+        bl.splice(total_sent_bytes, bl.length()-total_sent_bytes, &swapped);
+        bl.swap(swapped);
+      } else {
+        bl.clear();
+      }
+    }
+
+    return static_cast<ssize_t>(total_sent_bytes);
+  }
+  #endif
+  void shutdown() override {
+    ::shutdown(_fd, SHUT_RDWR);
+  }
+  void close() override {
+    compat_closesocket(_fd);
+  }
+  int fd() const override {
+    return _fd;
+  }
+  friend class PosixServerSocketImpl;
+  friend class PosixNetworkStack;
+};
+
+// ServerSocketImpl backed by a listening socket descriptor.
+class PosixServerSocketImpl : public ServerSocketImpl {
+  ceph::NetHandler &handler;
+  int _fd;
+
+ public:
+  explicit PosixServerSocketImpl(ceph::NetHandler &h, int f,
+				 const entity_addr_t& listen_addr, unsigned slot)
+    : ServerSocketImpl(listen_addr.get_type(), slot),
+      handler(h), _fd(f) {}
+  int accept(ConnectedSocket *sock, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+  // Close the listening socket and mark it as gone.
+  void abort_accept() override {
+    // use the same socket-close helper as PosixConnectedSocketImpl::close()
+    compat_closesocket(_fd);
+    _fd = -1;
+  }
+  int fd() const override {
+    return _fd;
+  }
+};
+
+// Accept one pending connection on the listening socket, make it
+// non-blocking, apply the socket options from `opt`, store the peer address
+// in `*out` and hand the new socket back through `*sock`.
+// Returns 0 on success or a negative error code.
+int PosixServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) {
+  ceph_assert(sock);
+  // Check before a descriptor is created, so nothing is left open when the
+  // assertion fires.
+  ceph_assert(NULL != out); //out should not be NULL in accept connection
+  sockaddr_storage ss;
+  socklen_t slen = sizeof(ss);
+  int sd = accept_cloexec(_fd, (sockaddr*)&ss, &slen);
+  if (sd < 0) {
+    return -ceph_sock_errno();
+  }
+
+  // On failure return the helper's own (negative) result: errno may be
+  // stale, or even 0, after the close below.
+  int r = handler.set_nonblock(sd);
+  if (r < 0) {
+    compat_closesocket(sd);
+    return r;
+  }
+
+  r = handler.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    compat_closesocket(sd);
+    return r;
+  }
+
+  out->set_type(addr_type);
+  out->set_sockaddr((sockaddr*)&ss);
+  handler.set_priority(sd, opt.priority, out->get_family());
+
+  std::unique_ptr<PosixConnectedSocketImpl> csi(new PosixConnectedSocketImpl(handler, *out, sd, true));
+  *sock = ConnectedSocket(std::move(csi));
+  return 0;
+}
+
+// Nothing to set up for the POSIX worker.
+void PosixWorker::initialize()
+{
+  return;
+}
+
+// Create a non-blocking listening socket bound to `sa` and return it through
+// `*sock`. `addr_slot` is passed on to the PosixServerSocketImpl.
+// Returns 0 on success or a negative error code.
+int PosixWorker::listen(entity_addr_t &sa,
+			unsigned addr_slot,
+			const SocketOptions &opt,
+                        ServerSocket *sock)
+{
+  // For the helper calls below, return the helper's own negative result:
+  // errno may be stale, or even 0, once the helper has logged or the socket
+  // has been closed.
+  int listen_sd = net.create_socket(sa.get_family(), true);
+  if (listen_sd < 0) {
+    return listen_sd;
+  }
+
+  int r = net.set_nonblock(listen_sd);
+  if (r < 0) {
+    compat_closesocket(listen_sd);
+    return r;
+  }
+
+  r = net.set_socket_options(listen_sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    compat_closesocket(listen_sd);
+    return r;
+  }
+
+  r = ::bind(listen_sd, sa.get_sockaddr(), sa.get_sockaddr_len());
+  if (r < 0) {
+    r = -ceph_sock_errno();
+    // log the address itself, not the sockaddr pointer
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa
+                   << ": " << cpp_strerror(r) << dendl;
+    compat_closesocket(listen_sd);
+    return r;
+  }
+
+  r = ::listen(listen_sd, cct->_conf->ms_tcp_listen_backlog);
+  if (r < 0) {
+    r = -ceph_sock_errno();
+    lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(r) << dendl;
+    compat_closesocket(listen_sd);
+    return r;
+  }
+
+  *sock = ServerSocket(
+          std::unique_ptr<PosixServerSocketImpl>(
+	    new PosixServerSocketImpl(net, listen_sd, sa, addr_slot)));
+  return 0;
+}
+
+// Open a connection to `addr` (non-blocking when opts.nonblock is set) and
+// return it through `*socket`. A non-blocking socket starts out flagged as
+// not yet connected.
+// Returns 0 on success or a negative error code.
+int PosixWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) {
+  int sd;
+
+  if (opts.nonblock) {
+    sd = net.nonblock_connect(addr, opts.connect_bind_addr);
+  } else {
+    sd = net.connect(addr, opts.connect_bind_addr);
+  }
+
+  if (sd < 0) {
+    // Return the helper's own negative result: errno may be stale, or even
+    // 0, by the time the helper has returned.
+    return sd;
+  }
+
+  net.set_priority(sd, opts.priority, addr.get_family());
+  *socket = ConnectedSocket(
+      std::unique_ptr<PosixConnectedSocketImpl>(new PosixConnectedSocketImpl(net, addr, sd, !opts.nonblock)));
+  return 0;
+}
+
+// Forwards the context to NetworkStack; no further setup.
+PosixNetworkStack::PosixNetworkStack(CephContext *c)
+    : NetworkStack(c) {}
diff --git a/src/msg/async/PosixStack.h b/src/msg/async/PosixStack.h
new file mode 100644
index 000000000..a7e2f249c
--- /dev/null
+++ b/src/msg/async/PosixStack.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNC_POSIXSTACK_H
+#define CEPH_MSG_ASYNC_POSIXSTACK_H
+
+#include <thread>
+
+#include "msg/msg_types.h"
+#include "msg/async/net_handler.h"
+
+#include "Stack.h"
+
+// Worker that creates listening and connected sockets through a NetHandler.
+class PosixWorker : public Worker {
+  ceph::NetHandler net;
+
+  void initialize() override;
+
+ public:
+  PosixWorker(CephContext *c, unsigned i)
+      : Worker(c, i),
+        net(c) {}
+
+  int listen(entity_addr_t &sa,
+	     unsigned addr_slot,
+	     const SocketOptions &opt,
+	     ServerSocket *socks) override;
+
+  int connect(const entity_addr_t &addr,
+              const SocketOptions &opts,
+              ConnectedSocket *socket) override;
+};
+
+// NetworkStack that runs each worker on its own std::thread, stored at the
+// worker's index in `threads`.
+class PosixNetworkStack : public NetworkStack {
+  std::vector<std::thread> threads;
+
+  Worker* create_worker(CephContext *c, unsigned worker_id) override {
+    return new PosixWorker(c, worker_id);
+  }
+
+ public:
+  explicit PosixNetworkStack(CephContext *c);
+
+  // Start `func` on a new thread stored in slot `i`.
+  void spawn_worker(unsigned i, std::function<void ()> &&func) override {
+    // Only ever grow: an unconditional resize(i+1) would shrink the vector
+    // when a lower slot is (re)spawned and destroy the thread objects of the
+    // higher slots.
+    if (threads.size() <= i)
+      threads.resize(i+1);
+    threads[i] = std::thread(std::move(func));
+  }
+  // Join the thread in slot `i`; it must exist and be joinable.
+  void join_worker(unsigned i) override {
+    ceph_assert(threads.size() > i && threads[i].joinable());
+    threads[i].join();
+  }
+};
+
+#endif //CEPH_MSG_ASYNC_POSIXSTACK_H
diff --git a/src/msg/async/Protocol.cc b/src/msg/async/Protocol.cc
new file mode 100644
index 000000000..4bdc065eb
--- /dev/null
+++ b/src/msg/async/Protocol.cc
@@ -0,0 +1,14 @@
+#include "Protocol.h"
+
+#include "AsyncConnection.h"
+#include "AsyncMessenger.h"
+
+// Store the protocol type and connection, cache the messenger and
+// CephContext reachable through the connection, and start with a fresh
+// AuthConnectionMeta.
+Protocol::Protocol(int type, AsyncConnection *connection)
+  : proto_type(type),
+    connection(connection),
+    messenger(connection->async_msgr),
+    cct(connection->async_msgr->cct)
+{
+  auth_meta.reset(new AuthConnectionMeta());
+}
+
+Protocol::~Protocol() = default;
diff --git a/src/msg/async/Protocol.h b/src/msg/async/Protocol.h
new file mode 100644
index 000000000..10436307e
--- /dev/null
+++ b/src/msg/async/Protocol.h
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_
+#define _MSG_ASYNC_PROTOCOL_
+
+#include <list>
+#include <map>
+
+#include "AsyncConnection.h"
+#include "include/buffer.h"
+#include "include/msgr.h"
+
+/*
+ * Continuation Helper Classes
+ */
+
+#include <memory>
+#include <tuple>
+
+// Abstract continuation over a class C: call() performs one step on `foo`
+// and hands back the continuation to run next (a null pointer ends the chain
+// in CONTINUATION_RUN).
+template <class C>
+class Ct {
+public:
+  virtual ~Ct() = default;
+  virtual Ct<C> *call(C *foo) const = 0;
+};
+
+// Continuation that wraps a member function of C taking Args... and returning
+// the next continuation.  The arguments are stored with setParams() and
+// replayed when call() is invoked.
+template <class C, typename... Args>
+class CtFun : public Ct<C> {
+  using fn_t = Ct<C> *(C::*)(Args...);
+
+  fn_t _f;
+  std::tuple<Args...> _params;
+
+  // Expand the stored tuple into the argument list of the member function.
+  template <std::size_t... Idx>
+  Ct<C> *_call(C *foo, std::index_sequence<Idx...>) const {
+    return (foo->*_f)(std::get<Idx>(_params)...);
+  }
+
+public:
+  CtFun(fn_t f) : _f(f) {}
+
+  void setParams(Args... args) { _params = std::make_tuple(args...); }
+
+  Ct<C> *call(C *foo) const override {
+    return _call(foo, std::index_sequence_for<Args...>());
+  }
+};
+
+// Owning handle to a buffer ptr_node, released through ptr_node::disposer.
+using rx_buffer_t =
+  std::unique_ptr<ceph::buffer::ptr_node, ceph::buffer::ptr_node::disposer>;
+
+// Continuation for handlers that receive an rx_buffer_t plus a result code.
+// `node` is mutable because call() is const yet has to move the buffer out
+// when it invokes the handler.
+template <class C>
+class CtRxNode : public Ct<C> {
+  using fn_t = Ct<C> *(C::*)(rx_buffer_t&&, int r);
+
+  fn_t _f;
+
+public:
+  mutable rx_buffer_t node;
+  int r;
+
+  CtRxNode(fn_t f) : _f(f) {}
+
+  void setParams(rx_buffer_t &&node, int r) {
+    this->r = r;
+    this->node = std::move(node);
+  }
+
+  Ct<C> *call(C *foo) const override {
+    return (foo->*_f)(std::move(node), r);
+  }
+};
+
+// Aliases for the continuation shapes: no arguments, a single int, a
+// (char*, int) pair, and an rx_buffer_t node with an int.
+template <class C> using CONTINUATION_TYPE = CtFun<C>;
+template <class C> using CONTINUATION_TX_TYPE = CtFun<C, int>;
+template <class C> using CONTINUATION_RX_TYPE = CtFun<C, char*, int>;
+template <class C> using CONTINUATION_RXBPTR_TYPE = CtRxNode<C>;
+
+// Declares an object named F_cont: a CtFun bound to the member function
+// C::F, with any trailing macro arguments as its parameter types.
+#define CONTINUATION_DECL(C, F, ...)                    \
+  CtFun<C, ##__VA_ARGS__> F##_cont { (&C::F) };
+
+// CONTINUATION(F) names the F_cont object.  CONTINUE(F, args...) stores the
+// arguments in F_cont and evaluates to its address.
+#define CONTINUATION(F) F##_cont
+#define CONTINUE(F, ...) (F##_cont.setParams(__VA_ARGS__), &F##_cont)
+
+// Runs CT against `this`, then keeps running whatever continuation each
+// call() returns until one returns a null pointer.
+#define CONTINUATION_RUN(CT)                                      \
+  {                                                               \
+    Ct<std::remove_reference<decltype(*this)>::type> *_cont = &CT;\
+    do {                                                          \
+      _cont = _cont->call(this);                                  \
+    } while (_cont);                                              \
+  }
+
+// Declares F_cont for a handler taking (char *, int).
+#define READ_HANDLER_CONTINUATION_DECL(C, F) \
+  CONTINUATION_DECL(C, F, char *, int)
+
+// Declares F_cont as a CtRxNode, for a handler taking (rx_buffer_t&&, int).
+#define READ_BPTR_HANDLER_CONTINUATION_DECL(C, F) \
+  CtRxNode<C> F##_cont { (&C::F) };
+
+// Declares F_cont for a handler taking a single int.
+#define WRITE_HANDLER_CONTINUATION_DECL(C, F) CONTINUATION_DECL(C, F, int)
+
+//////////////////////////////////////////////////////////////////////
+
+class AsyncMessenger;
+
+// Abstract interface a wire protocol implements on behalf of an
+// AsyncConnection.  The constructor caches the connection, its messenger and
+// the CephContext; concrete protocols supply the operations below.
+class Protocol {
+public:
+  // protocol identifier passed to the constructor
+  const int proto_type;
+
+protected:
+  AsyncConnection *connection;
+  AsyncMessenger *messenger;
+  CephContext *cct;
+
+public:
+  std::shared_ptr<AuthConnectionMeta> auth_meta;
+
+  Protocol(int type, AsyncConnection *connection);
+  virtual ~Protocol();
+
+  /// prepare protocol for connecting to peer
+  virtual void connect() = 0;
+  /// prepare protocol for accepting peer connections
+  virtual void accept() = 0;
+  /// true -> protocol is ready for sending messages
+  virtual bool is_connected() = 0;
+  /// stop connection
+  virtual void stop() = 0;
+  /// signal and handle connection failure
+  virtual void fault() = 0;
+  /// send message
+  virtual void send_message(Message *m) = 0;
+  /// send keepalive
+  virtual void send_keepalive() = 0;
+
+  virtual void read_event() = 0;
+  virtual void write_event() = 0;
+  virtual bool is_queued() = 0;
+
+  /// connection mode recorded in the auth metadata
+  int get_con_mode() const { return auth_meta->con_mode; }
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_ */
diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc
new file mode 100644
index 000000000..1bc661a55
--- /dev/null
+++ b/src/msg/async/ProtocolV1.cc
@@ -0,0 +1,2617 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ProtocolV1.h"
+
+#include "common/errno.h"
+
+#include "AsyncConnection.h"
+#include "AsyncMessenger.h"
+#include "common/EventTrace.h"
+#include "include/random.h"
+#include "auth/AuthClient.h"
+#include "auth/AuthServer.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+// Writes the per-connection log prefix (local and peer addresses, connection
+// and protocol pointers, port, state name, sequence counters, lossy flag) to
+// the debug stream and returns that stream.
+std::ostream &ProtocolV1::_conn_prefix(std::ostream *_dout) {
+  std::ostream &out = *_dout;
+  out << "--1- " << messenger->get_myaddrs() << " >> "
+      << *connection->peer_addrs;
+  out << " conn(" << connection << " " << this;
+  out << " :" << connection->port << " s=" << get_state_name(state);
+  out << " pgs=" << peer_global_seq << " cs=" << connect_seq;
+  out << " l=" << connection->policy.lossy << ").";
+  return out;
+}
+
+// Shorthands that call write()/read() with the continuation object declared
+// for handler C.
+// WRITE: write buffer list B, then continue with C.
+#define WRITE(B, C) write(CONTINUATION(C), B)
+
+// READ: read L bytes (no buffer argument is passed), then continue with C.
+#define READ(L, C) read(CONTINUATION(C), L)
+
+// READB: read L bytes into the caller-supplied buffer B, then continue with C.
+#define READB(L, B, C) read(CONTINUATION(C), L, B)
+
+// Constant to limit starting sequence number to 2^31.  Nothing special about
+// it, just a big number.  PLR
+#define SEQ_MASK 0x7fffffff
+
+// NOTE(review): not referenced in the visible part of this file; presumably a
+// size threshold used when coalescing outgoing data -- confirm at use site.
+const int ASYNC_COALESCE_THRESHOLD = 256;
+
+using namespace std;
+
+// Append to `data` one page-aligned allocation able to hold `len` bytes whose
+// in-page position matches `off`: when `off` is not page aligned an extra
+// leading page is allocated and the buffer offset is moved so that the first
+// `head` bytes end exactly on a page boundary.
+static void alloc_aligned_buffer(ceph::buffer::list &data, unsigned len, unsigned off) {
+  const auto misalign = off & ~CEPH_PAGE_MASK;
+
+  unsigned head = 0;
+  unsigned alloc_len = len;
+  if (misalign) {
+    // bytes that fit between `off` and the next page boundary (capped at len)
+    head = std::min<uint64_t>(CEPH_PAGE_SIZE - misalign, len);
+    // one full page for the head plus whatever remains after it
+    alloc_len = CEPH_PAGE_SIZE + (len - head);
+  }
+
+  ceph::bufferptr ptr(ceph::buffer::create_small_page_aligned(alloc_len));
+  if (head) {
+    ptr.set_offset(CEPH_PAGE_SIZE - head);
+  }
+  data.push_back(std::move(ptr));
+}
+
+/**
+ * Protocol V1
+ **/
+
+// Construct a v1 protocol instance in state NONE with writes disabled and a
+// 4096-byte scratch buffer (released in the destructor) that read() falls
+// back to when the caller supplies no buffer.
+ProtocolV1::ProtocolV1(AsyncConnection *connection)
+    : Protocol(1, connection),
+      temp_buffer(new char[4096]),
+      can_write(WriteStatus::NOWRITE),
+      keepalive(false),
+      connect_seq(0),
+      peer_global_seq(0),
+      msg_left(0),
+      cur_msg_size(0),
+      replacing(false),
+      is_reset_from_peer(false),
+      once_ready(false),
+      state(NONE),
+      global_seq(0),
+      wait_for_seq(false) {}
+
+// Releases the scratch buffer.  Both the outgoing queue and the sent list
+// must already be empty when the protocol is destroyed.
+ProtocolV1::~ProtocolV1() {
+  ceph_assert(out_q.empty());
+  ceph_assert(sent.empty());
+
+  delete[] temp_buffer;
+}
+
+// Enter START_CONNECT, wipe the state left from any earlier connect attempt
+// and take a new global sequence number from the messenger.
+void ProtocolV1::connect() {
+  state = START_CONNECT;
+
+  // forget the previous attempt's authorizer and handshake structures
+  authorizer_buf.clear();
+  // FIPS zeroization audit 20191115: these memsets are not security related.
+  memset(&connect_reply, 0, sizeof(connect_reply));
+  memset(&connect_msg, 0, sizeof(connect_msg));
+
+  global_seq = messenger->get_global_seq();
+}
+
+// Enter START_ACCEPT; read_event() then starts by sending the server banner.
+void ProtocolV1::accept() {
+  state = START_ACCEPT;
+}
+
+// True only while the write status is CANWRITE.
+bool ProtocolV1::is_connected() {
+  const auto status = can_write.load();
+  return status == WriteStatus::CANWRITE;
+}
+
+// Permanently shut the protocol down: flush connection->delay_state when
+// present, reset receive state, discard the outgoing queue, stop the
+// connection and mark both the write status and the state as CLOSED.
+// Returns immediately when the state is already CLOSED.
+void ProtocolV1::stop() {
+  ldout(cct, 20) << __func__ << dendl;
+  if (state == CLOSED) {
+    return;
+  }
+
+  if (connection->delay_state) connection->delay_state->flush();
+
+  ldout(cct, 2) << __func__ << dendl;
+  // write_lock is held from here until the function returns
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  reset_recv_state();
+  discard_out_queue();
+
+  connection->_stop();
+
+  can_write = WriteStatus::CLOSED;
+  state = CLOSED;
+}
+
+// Handle a connection failure.  Depending on policy and current state this
+// either closes the connection (lossy channels, half-finished accepts), parks
+// it in STANDBY, or schedules/initiates a reconnect.
+// write_lock is taken and released manually here; every return path below
+// unlocks it before calling stop() or leaving the function.
+void ProtocolV1::fault() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (state == CLOSED || state == NONE) {
+    ldout(cct, 10) << __func__ << " connection is already closed" << dendl;
+    return;
+  }
+
+  // lossy channels are not retried once past the connecting states: close and
+  // queue a reset on the dispatch queue
+  if (connection->policy.lossy && state != START_CONNECT &&
+      state != CONNECTING) {
+    ldout(cct, 1) << __func__ << " on lossy channel, failing" << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return;
+  }
+
+  connection->write_lock.lock();
+  can_write = WriteStatus::NOWRITE;
+  is_reset_from_peer = false;
+
+  // requeue sent items
+  requeue_sent();
+
+  // an accept that never became ready, has nothing queued and is not being
+  // replaced is simply closed
+  if (!once_ready && out_q.empty() && state >= START_ACCEPT &&
+      state <= ACCEPTING_WAIT_CONNECT_MSG_AUTH && !replacing) {
+    ldout(cct, 10) << __func__ << " with nothing to send and in the half "
+                   << " accept state just closed" << dendl;
+    // stop() takes write_lock itself, so release it first
+    connection->write_lock.unlock();
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return;
+  }
+  replacing = false;
+
+  connection->fault();
+
+  reset_recv_state();
+
+  // standby policy with nothing pending (no queued messages, no keepalive)
+  if (connection->policy.standby && out_q.empty() && !keepalive &&
+      state != WAIT) {
+    ldout(cct, 10) << __func__ << " with nothing to send, going to standby"
+                   << dendl;
+    state = STANDBY;
+    connection->write_lock.unlock();
+    return;
+  }
+
+  connection->write_lock.unlock();
+
+  if ((state >= START_CONNECT && state <= CONNECTING_SEND_CONNECT_MSG) ||
+      state == WAIT) {
+    // backoff!
+    // WAIT jumps straight to the maximum; otherwise start at the initial
+    // value and double on each consecutive failure, capped at the maximum
+    if (state == WAIT) {
+      backoff.set_from_double(cct->_conf->ms_max_backoff);
+    } else if (backoff == utime_t()) {
+      backoff.set_from_double(cct->_conf->ms_initial_backoff);
+    } else {
+      backoff += backoff;
+      if (backoff > cct->_conf->ms_max_backoff)
+        backoff.set_from_double(cct->_conf->ms_max_backoff);
+    }
+
+    global_seq = messenger->get_global_seq();
+    state = START_CONNECT;
+    connection->state = AsyncConnection::STATE_CONNECTING;
+    ldout(cct, 10) << __func__ << " waiting " << backoff << dendl;
+    // woke up again;
+    // the timer takes microseconds, hence nanoseconds / 1000
+    connection->register_time_events.insert(
+        connection->center->create_time_event(backoff.to_nsec() / 1000,
+                                              connection->wakeup_handler));
+  } else {
+    // policy maybe empty when state is in accept
+    if (connection->policy.server) {
+      ldout(cct, 0) << __func__ << " server, going to standby" << dendl;
+      state = STANDBY;
+    } else {
+      ldout(cct, 0) << __func__ << " initiating reconnect" << dendl;
+      connect_seq++;
+      global_seq = messenger->get_global_seq();
+      state = START_CONNECT;
+      connection->state = AsyncConnection::STATE_CONNECTING;
+    }
+    // no delay on this path: clear the backoff and dispatch the read handler
+    backoff = utime_t();
+    connection->center->dispatch_event_external(connection->read_handler);
+  }
+}
+
+// Queue message `m` for sending.  Messages the messenger can fast-dispatch
+// are encoded up front, outside write_lock; that encoding is thrown away
+// again if writes are currently disabled or the connection features changed
+// in the meantime.  When the connection is CLOSED the message is dropped and
+// its reference released.
+void ProtocolV1::send_message(Message *m) {
+  ceph::buffer::list bl;
+  uint64_t f = connection->get_features();
+
+  // TODO: Currently not all messages supports reencode like MOSDMap, so here
+  // only let fast dispatch support messages prepare message
+  bool can_fast_prepare = messenger->ms_can_fast_dispatch(m);
+  if (can_fast_prepare) {
+    prepare_send_message(f, m, bl);
+  }
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  // "features" changes will change the payload encoding
+  if (can_fast_prepare &&
+      (can_write == WriteStatus::NOWRITE || connection->get_features() != f)) {
+    // ensure the correctness of message encoding
+    bl.clear();
+    m->clear_payload();
+    ldout(cct, 5) << __func__ << " clear encoded buffer previous " << f
+                  << " != " << connection->get_features() << dendl;
+  }
+  if (can_write == WriteStatus::CLOSED) {
+    ldout(cct, 10) << __func__ << " connection closed."
+                   << " Drop message " << m << dendl;
+    m->put();
+  } else {
+    m->queue_start = ceph::mono_clock::now();
+    m->trace.event("async enqueueing message");
+    // bl may be empty here; write_event() encodes such messages later
+    out_q[m->get_priority()].emplace_back(std::move(bl), m);
+    ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m
+                   << dendl;
+    // schedule the write handler unless one is already pending or the
+    // connection is being replaced
+    if (can_write != WriteStatus::REPLACING && !write_in_progress) {
+      write_in_progress = true;
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+}
+
+// Encode `m` for the given feature bits and append its payload, middle and
+// data sections, in that order, to `bl`.
+void ProtocolV1::prepare_send_message(uint64_t features, Message *m,
+                                      ceph::buffer::list &bl) {
+  ldout(cct, 20) << __func__ << " m " << *m << dendl;
+
+  // associate message with Connection (for benefit of encode_payload)
+  ldout(cct, 20) << __func__ << (m->empty_payload() ? " encoding features " : " half-reencoding features ")
+		 << features << " " << m  << " " << *m << dendl;
+
+  // encode and copy out of *m
+  // in write_message we update header.seq and need recalc crc
+  // so skip calc header in encode function.
+  m->encode(features, messenger->crcflags, true);
+
+  bl.append(m->get_payload());
+  bl.append(m->get_middle());
+  bl.append(m->get_data());
+}
+
+// Request a keepalive: set the flag under write_lock and dispatch the write
+// handler, which emits it from write_event().  Nothing is done once the
+// connection is CLOSED.
+void ProtocolV1::send_keepalive() {
+  ldout(cct, 10) << __func__ << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (can_write == WriteStatus::CLOSED) {
+    return;
+  }
+  keepalive = true;
+  connection->center->dispatch_event_external(connection->write_handler);
+}
+
+// Resume the read-side state machine: choose the continuation that matches
+// the current state and run it.  States without an entry are ignored.
+void ProtocolV1::read_event() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  CtPtr entry = nullptr;
+  switch (state) {
+    case START_CONNECT:
+      entry = &CONTINUATION(send_client_banner);
+      break;
+    case START_ACCEPT:
+      entry = &CONTINUATION(send_server_banner);
+      break;
+    case OPENED:
+      entry = &CONTINUATION(wait_message);
+      break;
+    case THROTTLE_MESSAGE:
+      entry = &CONTINUATION(throttle_message);
+      break;
+    case THROTTLE_BYTES:
+      entry = &CONTINUATION(throttle_bytes);
+      break;
+    case THROTTLE_DISPATCH_QUEUE:
+      entry = &CONTINUATION(throttle_dispatch_queue);
+      break;
+    default:
+      break;
+  }
+
+  // run_continuation() is a no-op for a null entry
+  run_continuation(entry);
+}
+
+// Write-handler entry point.
+// In CANWRITE: emit a pending keepalive, then drain the outgoing queue one
+// message at a time (write_lock is dropped around encoding and the actual
+// write), and finally send an ACK or flush what is still buffered.
+// Otherwise: start a connect when a non-server connection in STANDBY has
+// queued data, or try to flush the outgoing buffer if a socket exists.
+// Any negative send result ends in fault().
+void ProtocolV1::write_event() {
+  ldout(cct, 10) << __func__ << dendl;
+  ssize_t r = 0;
+
+  connection->write_lock.lock();
+  if (can_write == WriteStatus::CANWRITE) {
+    if (keepalive) {
+      append_keepalive_or_ack();
+      keepalive = false;
+    }
+
+    auto start = ceph::mono_clock::now();
+    bool more;
+    do {
+      ceph::buffer::list data;
+      Message *m = _get_next_outgoing(&data);
+      if (!m) {
+        break;
+      }
+
+      if (!connection->policy.lossy) {
+        // put on sent list
+        sent.push_back(m);
+        m->get();
+      }
+      // sampled under the lock; passed to write_message() below
+      more = !out_q.empty();
+      connection->write_lock.unlock();
+
+      // send_message or requeue messages may not encode message
+      if (!data.length()) {
+        prepare_send_message(connection->get_features(), m, data);
+      }
+
+      if (m->queue_start != ceph::mono_time()) {
+        connection->logger->tinc(l_msgr_send_messages_queue_lat,
+				 ceph::mono_clock::now() - m->queue_start);
+      }
+
+      r = write_message(m, data, more);
+
+      connection->write_lock.lock();
+      if (r == 0) {
+        ;
+      } else if (r < 0) {
+        ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+        break;
+      } else if (r > 0) {
+	// Outbound message in-progress, thread will be re-awoken
+	// when the outbound socket is writeable again
+	break;
+      }
+    } while (can_write == WriteStatus::CANWRITE);
+    write_in_progress = false;
+    connection->write_lock.unlock();
+
+    // r > 0 means data is still left to send, so there is no need to call
+    // _try_send here.
+    if (r == 0) {
+      uint64_t left = ack_left;
+      if (left) {
+        ceph_le64 s;
+        s = in_seq;
+        connection->outgoing_bl.append(CEPH_MSGR_TAG_ACK);
+        connection->outgoing_bl.append((char *)&s, sizeof(s));
+        ldout(cct, 10) << __func__ << " try send msg ack, acked " << left
+                       << " messages" << dendl;
+        // subtract only what was acknowledged above, then re-read ack_left
+        // and pass the remainder on to _try_send
+        ack_left -= left;
+        left = ack_left;
+        r = connection->_try_send(left);
+      } else if (is_queued()) {
+        r = connection->_try_send();
+      }
+    }
+
+    connection->logger->tinc(l_msgr_running_send_time,
+                             ceph::mono_clock::now() - start);
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+      // fault() is called with connection->lock held
+      connection->lock.lock();
+      fault();
+      connection->lock.unlock();
+      return;
+    }
+  } else {
+    write_in_progress = false;
+    // drop write_lock and re-acquire in the order lock -> write_lock
+    connection->write_lock.unlock();
+    connection->lock.lock();
+    connection->write_lock.lock();
+    if (state == STANDBY && !connection->policy.server && is_queued()) {
+      ldout(cct, 10) << __func__ << " policy.server is false" << dendl;
+      connection->_connect();
+    } else if (connection->cs && state != NONE && state != CLOSED &&
+               state != START_CONNECT) {
+      r = connection->_try_send();
+      if (r < 0) {
+        ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl;
+        // fault() locks write_lock itself, so release it before the call
+        connection->write_lock.unlock();
+        fault();
+        connection->lock.unlock();
+        return;
+      }
+    }
+    connection->write_lock.unlock();
+    connection->lock.unlock();
+  }
+}
+
+// True when a message is waiting in out_q or the connection itself reports
+// queued data.
+bool ProtocolV1::is_queued() {
+  if (!out_q.empty()) {
+    return true;
+  }
+  return connection->is_queued();
+}
+
+// Run a continuation chain starting at `pcontinuation`; a null pointer is
+// ignored.
+void ProtocolV1::run_continuation(CtPtr pcontinuation) {
+  if (!pcontinuation) {
+    return;
+  }
+  CONTINUATION_RUN(*pcontinuation);
+}
+
+// Ask the connection for `len` bytes, using temp_buffer when `buffer` is
+// null.  If connection->read() returns <= 0 the result is stored in `next`
+// and `next` is returned for the caller to run.  On a positive return the
+// callback handed to the connection stores the result and runs `next`
+// itself, and this function returns nullptr.
+CtPtr ProtocolV1::read(CONTINUATION_RX_TYPE<ProtocolV1> &next,
+                       int len, char *buffer) {
+  if (buffer == nullptr) {
+    buffer = temp_buffer;
+  }
+
+  ssize_t r = connection->read(len, buffer,
+                               [&next, this](char *buffer, int r) {
+                                 next.setParams(buffer, r);
+                                 CONTINUATION_RUN(next);
+                               });
+  if (r > 0) {
+    return nullptr;
+  }
+
+  next.setParams(buffer, r);
+  return &next;
+}
+
+// Hand `buffer` to the connection for writing.  If connection->write()
+// returns <= 0 the result is stored in `next` and `next` is returned for the
+// caller to run.  On a positive return the callback handed to the connection
+// stores the result and runs `next` itself, and this function returns
+// nullptr.
+CtPtr ProtocolV1::write(CONTINUATION_TX_TYPE<ProtocolV1> &next,
+                        ceph::buffer::list &buffer) {
+  ssize_t r = connection->write(buffer, [&next, this](int r) {
+    next.setParams(r);
+    CONTINUATION_RUN(next);
+  });
+  if (r > 0) {
+    return nullptr;
+  }
+
+  next.setParams(r);
+  return &next;
+}
+
+// Called once the handshake is complete: (re)arm the inactivity tick timer,
+// enable writes (dispatching the write handler if anything is already
+// queued), switch to OPENED and start waiting for the first message tag.
+CtPtr ProtocolV1::ready() {
+  ldout(cct, 25) << __func__ << dendl;
+
+  // make sure no pending tick timer
+  if (connection->last_tick_id) {
+    connection->center->delete_time_event(connection->last_tick_id);
+  }
+  connection->last_tick_id = connection->center->create_time_event(
+      connection->inactive_timeout_us, connection->tick_handler);
+
+  {
+    std::lock_guard<std::mutex> l(connection->write_lock);
+    can_write = WriteStatus::CANWRITE;
+    if (is_queued()) {
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+  connection->maybe_start_delay_thread();
+
+  state = OPENED;
+  return wait_message();
+}
+
+// Read the one-byte tag that starts the next incoming item.  Does nothing
+// when the state is no longer OPENED.
+CtPtr ProtocolV1::wait_message() {
+  // must have changed due to a replace
+  if (state != OPENED) {
+    return nullptr;
+  }
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  return READ(sizeof(char), handle_message);
+}
+
+// Dispatch on the tag byte just read: keepalive variants, ACK, a message
+// header or CLOSE.  A read error or an unknown tag faults the connection.
+CtPtr ProtocolV1::handle_message(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read tag failed" << dendl;
+    return _fault();
+  }
+
+  char tag = buffer[0];
+  ldout(cct, 20) << __func__ << " process tag " << (int)tag << dendl;
+
+  switch (tag) {
+    case CEPH_MSGR_TAG_KEEPALIVE:
+      ldout(cct, 20) << __func__ << " got KEEPALIVE" << dendl;
+      connection->set_last_keepalive(ceph_clock_now());
+      break;
+
+    case CEPH_MSGR_TAG_KEEPALIVE2:
+      return READ(sizeof(ceph_timespec), handle_keepalive2);
+
+    case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+      return READ(sizeof(ceph_timespec), handle_keepalive2_ack);
+
+    case CEPH_MSGR_TAG_ACK:
+      return READ(sizeof(ceph_le64), handle_tag_ack);
+
+    case CEPH_MSGR_TAG_MSG:
+      recv_stamp = ceph_clock_now();
+      ldout(cct, 20) << __func__ << " begin MSG" << dendl;
+      return READ(sizeof(ceph_msg_header), handle_message_header);
+
+    case CEPH_MSGR_TAG_CLOSE:
+      ldout(cct, 20) << __func__ << " got CLOSE" << dendl;
+      stop();
+      break;
+
+    default:
+      ldout(cct, 0) << __func__ << " bad tag " << (int)tag << dendl;
+      return _fault();
+  }
+
+  // plain KEEPALIVE and CLOSE end the current continuation chain
+  return nullptr;
+}
+
+// Handle the timestamp that follows a KEEPALIVE2 tag: queue a KEEPALIVE2_ACK
+// echoing it, record the keepalive time, dispatch the write handler when
+// connected, and go back to waiting for the next tag.
+CtPtr ProtocolV1::handle_keepalive2(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+
+  ceph_timespec *t = (ceph_timespec *)buffer;
+  utime_t kp_t = utime_t(*t);
+  {
+    // the outgoing buffer is only touched under write_lock
+    std::lock_guard<std::mutex> l(connection->write_lock);
+    append_keepalive_or_ack(true, &kp_t);
+  }
+
+  ldout(cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl;
+  connection->set_last_keepalive(ceph_clock_now());
+
+  if (is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// Append one keepalive item to the connection's outgoing buffer:
+//  - ack == true: KEEPALIVE2_ACK followed by the timestamp in *tp (required);
+//  - peer has CEPH_FEATURE_MSGR_KEEPALIVE2: KEEPALIVE2 followed by the
+//    current time;
+//  - otherwise: the bare KEEPALIVE tag.
+void ProtocolV1::append_keepalive_or_ack(bool ack, utime_t *tp) {
+  ldout(cct, 10) << __func__ << dendl;
+
+  auto &out = connection->outgoing_bl;
+  struct ceph_timespec ts;
+
+  if (ack) {
+    ceph_assert(tp);
+    tp->encode_timeval(&ts);
+    out.append(CEPH_MSGR_TAG_KEEPALIVE2_ACK);
+    out.append((char *)&ts, sizeof(ts));
+    return;
+  }
+
+  if (!connection->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+    out.append(CEPH_MSGR_TAG_KEEPALIVE);
+    return;
+  }
+
+  utime_t t = ceph_clock_now();
+  t.encode_timeval(&ts);
+  out.append(CEPH_MSGR_TAG_KEEPALIVE2);
+  out.append((char *)&ts, sizeof(ts));
+}
+
+// Record the timestamp carried by a KEEPALIVE2_ACK and resume waiting for
+// the next tag.
+CtPtr ProtocolV1::handle_keepalive2_ack(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
+    return _fault();
+  }
+
+  ceph_timespec *stamp = (ceph_timespec *)buffer;
+  connection->set_last_keepalive_ack(utime_t(*stamp));
+  ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
+
+  return CONTINUE(wait_message);
+}
+
+// Handle the sequence number that follows an ACK tag: remove acknowledged
+// messages from the front of `sent` and release their references.
+// At most max_pending messages are trimmed per ACK; further entries with
+// seq <= the acked value stay on `sent` until a later ACK arrives.
+CtPtr ProtocolV1::handle_tag_ack(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read ack seq failed" << dendl;
+    return _fault();
+  }
+
+  ceph_le64 seq;
+  seq = *(ceph_le64 *)buffer;
+  ldout(cct, 20) << __func__ << " got ACK" << dendl;
+
+  ldout(cct, 15) << __func__ << " got ack seq " << seq << dendl;
+  // trim sent list
+  static const int max_pending = 128;
+  int i = 0;
+  auto now = ceph::mono_clock::now();
+  Message *pending[max_pending];
+  // collect under write_lock, release the references after unlocking
+  connection->write_lock.lock();
+  while (!sent.empty() && sent.front()->get_seq() <= seq && i < max_pending) {
+    Message *m = sent.front();
+    sent.pop_front();
+    pending[i++] = m;
+    ldout(cct, 10) << __func__ << " got ack seq " << seq
+                   << " >= " << m->get_seq() << " on " << m << " " << *m
+                   << dendl;
+  }
+  connection->write_lock.unlock();
+  connection->logger->tinc(l_msgr_handle_ack_lat, ceph::mono_clock::now() - now);
+  for (int k = 0; k < i; k++) {
+    pending[k]->put();
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// Store the message header just read, verify its CRC when header CRCs are
+// enabled, clear the per-message receive buffers and move on to the
+// message-count throttle.
+CtPtr ProtocolV1::handle_message_header(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read message header failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got MSG header" << dendl;
+
+  current_header = *((ceph_msg_header *)buffer);
+
+  ldout(cct, 20) << __func__ << " got envelope type=" << current_header.type << " src "
+                 << entity_name_t(current_header.src) << " front=" << current_header.front_len
+                 << " data=" << current_header.data_len << " off " << current_header.data_off
+                 << dendl;
+
+  if (messenger->crcflags & MSG_CRC_HEADER) {
+    // the CRC covers the whole header except its trailing crc field
+    const __u32 header_crc =
+        ceph_crc32c(0, (unsigned char *)&current_header,
+                    sizeof(current_header) - sizeof(current_header.crc));
+    if (header_crc != current_header.crc) {
+      ldout(cct, 0) << __func__ << " got bad header crc " << header_crc
+                    << " != " << current_header.crc << dendl;
+      return _fault();
+    }
+  }
+
+  // start the new message with empty receive buffers
+  data_buf.clear();
+  front.clear();
+  middle.clear();
+  data.clear();
+
+  state = THROTTLE_MESSAGE;
+  return CONTINUE(throttle_message);
+}
+
+// Take one slot from the policy's message throttler, when one is configured.
+// If the throttler refuses, a 1000us wakeup timer is registered (unless a
+// time event is already registered) and the chain stops; read_event()
+// re-enters here because the state stays THROTTLE_MESSAGE.
+CtPtr ProtocolV1::throttle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto &msg_throttler = connection->policy.throttler_messages;
+  if (msg_throttler) {
+    ldout(cct, 10) << __func__ << " wants " << 1
+                   << " message from policy throttler "
+                   << msg_throttler->get_current() << "/"
+                   << msg_throttler->get_max() << dendl;
+    if (!msg_throttler->get_or_fail()) {
+      ldout(cct, 1) << __func__ << " wants 1 message from policy throttle "
+                    << msg_throttler->get_current() << "/"
+                    << msg_throttler->get_max() << " failed, just wait."
+                    << dendl;
+      // The consumers will not drain a full queue instantly, so retry from a
+      // timer in about a millisecond.
+      if (connection->register_time_events.empty()) {
+        connection->register_time_events.insert(
+            connection->center->create_time_event(1000,
+                                                  connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_BYTES;
+  return CONTINUE(throttle_bytes);
+}
+
+// Compute the message size (front + middle + data) and, when it is non-zero
+// and a byte throttler is configured, reserve that many bytes.  If the
+// throttler refuses, a 1000us wakeup timer is registered (unless a time event
+// is already registered) and the chain stops in state THROTTLE_BYTES.
+CtPtr ProtocolV1::throttle_bytes() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  cur_msg_size = current_header.front_len + current_header.middle_len +
+                 current_header.data_len;
+
+  auto &byte_throttler = connection->policy.throttler_bytes;
+  if (cur_msg_size && byte_throttler) {
+    ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                   << " bytes from policy throttler "
+                   << byte_throttler->get_current() << "/"
+                   << byte_throttler->get_max() << dendl;
+    if (!byte_throttler->get_or_fail(cur_msg_size)) {
+      ldout(cct, 1) << __func__ << " wants " << cur_msg_size
+                    << " bytes from policy throttler "
+                    << byte_throttler->get_current() << "/"
+                    << byte_throttler->get_max() << " failed, just wait."
+                    << dendl;
+      // The consumers will not drain a full queue instantly, so retry from a
+      // timer in about a millisecond.
+      if (connection->register_time_events.empty()) {
+        connection->register_time_events.insert(
+            connection->center->create_time_event(
+                1000, connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_DISPATCH_QUEUE;
+  return CONTINUE(throttle_dispatch_queue);
+}
+
+// Reserve cur_msg_size bytes from the dispatch queue throttler (skipped for
+// empty messages).  If the throttler refuses, a 1000us wakeup timer is
+// registered (unless a time event is already registered) and the chain stops.
+// On success the throttle timestamp is taken and reading the front begins.
+CtPtr ProtocolV1::throttle_dispatch_queue() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto &dq_throttler = connection->dispatch_queue->dispatch_throttler;
+  if (cur_msg_size && !dq_throttler.get_or_fail(cur_msg_size)) {
+    ldout(cct, 1)
+        << __func__ << " wants " << cur_msg_size
+        << " bytes from dispatch throttle "
+        << dq_throttler.get_current() << "/"
+        << dq_throttler.get_max()
+        << " failed, just wait." << dendl;
+    // The consumers will not drain a full queue instantly, so retry from a
+    // timer in about a millisecond.
+    if (connection->register_time_events.empty()) {
+      connection->register_time_events.insert(
+          connection->center->create_time_event(1000,
+                                                connection->wakeup_handler));
+    }
+    return nullptr;
+  }
+
+  throttle_stamp = ceph_clock_now();
+
+  state = READ_MESSAGE_FRONT;
+  return read_message_front();
+}
+
+// Read the front section announced by the header, allocating its buffer on
+// first use.  Skips straight to the middle section when front_len is zero.
+CtPtr ProtocolV1::read_message_front() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const unsigned front_len = current_header.front_len;
+  if (!front_len) {
+    return read_message_middle();
+  }
+
+  if (!front.length()) {
+    front.push_back(ceph::buffer::create(front_len));
+  }
+  return READB(front_len, front.c_str(), handle_message_front);
+}
+
+CtPtr ProtocolV1::handle_message_front(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read message front failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got front " << front.length() << dendl;
+
+  return read_message_middle();
+}
+
+// Read the middle section announced by the header, allocating its buffer on
+// first use.  Skips to data preparation when middle_len is zero.
+CtPtr ProtocolV1::read_message_middle() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (!current_header.middle_len) {
+    return read_message_data_prepare();
+  }
+
+  if (!middle.length()) {
+    middle.push_back(ceph::buffer::create(current_header.middle_len));
+  }
+  return READB(current_header.middle_len, middle.c_str(),
+               handle_message_middle);
+}
+
+CtPtr ProtocolV1::handle_message_middle(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read message middle failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got middle " << middle.length() << dendl;
+
+  return read_message_data_prepare();
+}
+
+// Prepare to receive the data section: when data_len is non-zero, allocate a
+// receive buffer aligned to the header's data_off and point data_blp at its
+// start.  msg_left is set to the number of data bytes still to read, then
+// reading continues in read_message_data().
+CtPtr ProtocolV1::read_message_data_prepare() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  unsigned data_len = current_header.data_len;
+  unsigned data_off = current_header.data_off;
+
+  if (data_len) {
+    // get a buffer
+#if 0
+    // rx_buffers is broken by design... see
+    //  http://tracker.ceph.com/issues/22480
+    map<ceph_tid_t, pair<ceph::buffer::list, int> >::iterator p =
+        connection->rx_buffers.find(current_header.tid);
+    if (p != connection->rx_buffers.end()) {
+      ldout(cct, 10) << __func__ << " seleting rx buffer v " << p->second.second
+                     << " at offset " << data_off << " len "
+                     << p->second.first.length() << dendl;
+      data_buf = p->second.first;
+      // make sure it's big enough
+      if (data_buf.length() < data_len)
+        data_buf.push_back(buffer::create(data_len - data_buf.length()));
+      data_blp = data_buf.begin();
+    } else {
+      ldout(cct, 20) << __func__ << " allocating new rx buffer at offset "
+                     << data_off << dendl;
+      alloc_aligned_buffer(data_buf, data_len, data_off);
+      data_blp = data_buf.begin();
+    }
+#else
+    // active path: always allocate a fresh aligned buffer
+    ldout(cct, 20) << __func__ << " allocating new rx buffer at offset "
+		   << data_off << dendl;
+    alloc_aligned_buffer(data_buf, data_len, data_off);
+    data_blp = data_buf.begin();
+#endif
+  }
+
+  msg_left = data_len;
+
+  return CONTINUE(read_message_data);
+}
+
+// Read the next piece of the data section into the buffer segment under
+// data_blp, limited to msg_left.  Moves on to the footer once no data bytes
+// remain.
+CtPtr ProtocolV1::read_message_data() {
+  ldout(cct, 20) << __func__ << " msg_left=" << msg_left << dendl;
+
+  if (msg_left <= 0) {
+    return read_message_footer();
+  }
+
+  auto segment = data_blp.get_current_ptr();
+  unsigned chunk = std::min(segment.length(), msg_left);
+
+  return READB(chunk, segment.c_str(), handle_message_data);
+}
+
+// Completion of one data read: recompute the length that was requested for
+// the current segment, advance data_blp past it, append those bytes to
+// `data`, reduce msg_left and loop back to read_message_data().
+CtPtr ProtocolV1::handle_message_data(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read data error " << dendl;
+    return _fault();
+  }
+
+  auto segment = data_blp.get_current_ptr();
+  unsigned read_len = std::min(segment.length(), msg_left);
+  ceph_assert(read_len <
+	      static_cast<unsigned>(std::numeric_limits<int>::max()));
+
+  // `segment` was captured before the iterator moves on
+  data_blp += read_len;
+  data.append(segment, 0, read_len);
+  msg_left -= read_len;
+
+  return CONTINUE(read_message_data);
+}
+
+// Switch to READ_FOOTER_AND_DISPATCH and read the message footer; its size
+// depends on whether the peer has CEPH_FEATURE_MSG_AUTH.
+CtPtr ProtocolV1::read_message_footer() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  state = READ_FOOTER_AND_DISPATCH;
+
+  const unsigned len = connection->has_feature(CEPH_FEATURE_MSG_AUTH)
+                           ? sizeof(ceph_msg_footer)
+                           : sizeof(ceph_msg_footer_old);
+
+  return READ(len, handle_message_footer);
+}
+
+CtPtr ProtocolV1::handle_message_footer(char *buffer, int r) {  /*
+   * Handler named by read_message_footer() for the footer READ; `buffer`
+   * holds the raw footer and a negative `r` means the read failed.
+   *
+   * Steps, in order:
+   *  - normalize the footer (an old-format footer is copied field by field
+   *    into a ceph_msg_footer with sig = 0);
+   *  - fault if CEPH_MSG_FOOTER_COMPLETE is not set, or if decode_message()
+   *    returns null;
+   *  - when session_security is set, check the signature (a non-zero result
+   *    is a failure: the message is put() and the connection faults);
+   *  - attach the policy throttlers, the dispatch-throttle size and the
+   *    receive timestamps to the message;
+   *  - drop messages whose seq is <= in_seq; log (and optionally assert on)
+   *    a gap in the sequence;
+   *  - record in_seq, bump ack_left for non-lossy policies, then hand the
+   *    message to the delay queue, to fast dispatch, or to the dispatch
+   *    queue;
+   *  - clear the per-message buffers and continue with wait_message.
+   */
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read footer data error " << dendl;
+    return _fault();
+  }
+
+  ceph_msg_footer footer;
+  ceph_msg_footer_old old_footer;
+
+  if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+    footer = *((ceph_msg_footer *)buffer);  // footer arrived in the layout that carries sig
+  } else {
+    old_footer = *((ceph_msg_footer_old *)buffer);  // old layout: convert below
+    footer.front_crc = old_footer.front_crc;
+    footer.middle_crc = old_footer.middle_crc;
+    footer.data_crc = old_footer.data_crc;
+    footer.sig = 0;
+    footer.flags = old_footer.flags;
+  }
+
+  int aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;  // sender did not flag the message as complete
+  ldout(cct, 10) << __func__ << " aborted = " << aborted << dendl;
+  if (aborted) {
+    ldout(cct, 0) << __func__ << " got " << front.length() << " + "
+                  << middle.length() << " + " << data.length()
+                  << " byte message.. ABORTED" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got " << front.length() << " + "
+                 << middle.length() << " + " << data.length() << " byte message"
+                 << dendl;
+  Message *message = decode_message(cct, messenger->crcflags, current_header,
+                                    footer, front, middle, data, connection);
+  if (!message) {
+    ldout(cct, 1) << __func__ << " decode message failed " << dendl;
+    return _fault();
+  }
+
+  //
+  //  Check the signature if one should be present.  A zero return indicates
+  //  success. PLR
+  //
+
+  if (session_security.get() == NULL) {
+    ldout(cct, 10) << __func__ << " no session security set" << dendl;
+  } else {
+    if (session_security->check_message_signature(message)) {
+      ldout(cct, 0) << __func__ << " Signature check failed" << dendl;
+      message->put();  // drop our reference before faulting
+      return _fault();
+    }
+  }
+  message->set_byte_throttler(connection->policy.throttler_bytes);
+  message->set_message_throttler(connection->policy.throttler_messages);
+
+  // store reservation size in message, so we don't get confused
+  // by messages entering the dispatch queue through other paths.
+  message->set_dispatch_throttle_size(cur_msg_size);
+
+  message->set_recv_stamp(recv_stamp);
+  message->set_throttle_stamp(throttle_stamp);
+  message->set_recv_complete_stamp(ceph_clock_now());
+
+  // check received seq#.  if it is old, drop the message.
+  // note that incoming messages may skip ahead.  this is convenient for the
+  // client side queueing because messages can't be renumbered, but the (kernel)
+  // client will occasionally pull a message out of the sent queue to send
+  // elsewhere.  in that case it doesn't matter if we "got" it or not.
+  uint64_t cur_seq = in_seq;
+  if (message->get_seq() <= cur_seq) {
+    ldout(cct, 0) << __func__ << " got old message " << message->get_seq()
+                  << " <= " << cur_seq << " " << message << " " << *message
+                  << ", discarding" << dendl;
+    message->put();
+    if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
+        cct->_conf->ms_die_on_old_message) {
+      ceph_assert(0 == "old msgs despite reconnect_seq feature");
+    }
+    return nullptr;  // NOTE(review): no continuation is returned and the buffers cleared at `out:` are left as-is here - confirm this is intended
+  }
+  if (message->get_seq() > cur_seq + 1) {
+    ldout(cct, 0) << __func__ << " missed message?  skipped from seq "
+                  << cur_seq << " to " << message->get_seq() << dendl;
+    if (cct->_conf->ms_die_on_skipped_message) {
+      ceph_assert(0 == "skipped incoming seq");
+    }
+  }
+
+#if defined(WITH_EVENTTRACE)
+  if (message->get_type() == CEPH_MSG_OSD_OP ||
+      message->get_type() == CEPH_MSG_OSD_OPREPLY) {
+    utime_t ltt_processed_stamp = ceph_clock_now();
+    double usecs_elapsed =
+      ((double)(ltt_processed_stamp.to_nsec() - recv_stamp.to_nsec())) / 1000;
+    ostringstream buf;
+    if (message->get_type() == CEPH_MSG_OSD_OP)
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP",
+                           false);
+    else
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY",
+                           false);
+  }
+#endif
+
+  // note last received message.
+  in_seq = message->get_seq();
+  ldout(cct, 5) << " rx " << message->get_source() << " seq "
+                << message->get_seq() << " " << message << " " << *message
+                << dendl;
+
+  bool need_dispatch_writer = false;
+  if (!connection->policy.lossy) {
+    ack_left++;
+    need_dispatch_writer = true;
+  }
+
+  state = OPENED;
+
+  ceph::mono_time fast_dispatch_time;  // presumably declared uninitialized here so `goto out` does not jump past an initialization
+
+  if (connection->is_blackhole()) {
+    ldout(cct, 10) << __func__ << " blackhole " << *message << dendl;
+    message->put();
+    goto out;  // skip accounting and dispatch, but still run the cleanup below
+  }
+
+  connection->logger->inc(l_msgr_recv_messages);
+  connection->logger->inc(
+      l_msgr_recv_bytes,
+      cur_msg_size + sizeof(ceph_msg_header) + sizeof(ceph_msg_footer));
+
+  messenger->ms_fast_preprocess(message);
+  fast_dispatch_time = ceph::mono_clock::now();
+  connection->logger->tinc(l_msgr_running_recv_time,
+			   fast_dispatch_time - connection->recv_start_time);
+  if (connection->delay_state) {
+    double delay_period = 0;
+    if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) {
+      delay_period =
+          cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
+      ldout(cct, 1) << "queue_received will delay after "
+                    << (ceph_clock_now() + delay_period) << " on " << message
+                    << " " << *message << dendl;
+    }
+    connection->delay_state->queue(delay_period, message);
+  } else if (messenger->ms_can_fast_dispatch(message)) {
+    connection->lock.unlock();  // fast_dispatch() is invoked with connection->lock released
+    connection->dispatch_queue->fast_dispatch(message);
+    connection->recv_start_time = ceph::mono_clock::now();
+    connection->logger->tinc(l_msgr_running_fast_dispatch_time,
+                             connection->recv_start_time - fast_dispatch_time);
+    connection->lock.lock();
+  } else {
+    connection->dispatch_queue->enqueue(message, message->get_priority(),
+                                        connection->conn_id);
+  }
+
+ out:
+  // clean up local buffer references
+  data_buf.clear();
+  front.clear();
+  middle.clear();
+  data.clear();
+
+  if (need_dispatch_writer && connection->is_connected()) {  // ack_left was bumped above for a non-lossy policy
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// Discards all per-session state: pending delayed and queued incoming
+// messages, everything in the outgoing queues, and the sequence/ack
+// counters.  A remote reset is queued on the dispatch queue and out_seq is
+// re-seeded.  connection->write_lock is held for the whole call.
+void ProtocolV1::session_reset() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  std::lock_guard<std::mutex> write_guard(connection->write_lock);
+
+  // drop whatever is still waiting to be delivered ...
+  if (connection->delay_state) {
+    connection->delay_state->discard();
+  }
+  auto &dq = connection->dispatch_queue;
+  dq->discard_queue(connection->conn_id);
+
+  // ... and whatever is still waiting to be sent
+  discard_out_queue();
+  // note: we need to clear outgoing_bl here, but session_reset may be
+  // called by other thread, so let caller clear this itself!
+  // outgoing_bl.clear();
+
+  dq->queue_remote_reset(connection);
+
+  randomize_out_seq();
+
+  // start the session counters over
+  in_seq = 0;
+  connect_seq = 0;
+  // it's safe to directly set 0, double locked
+  ack_left = 0;
+  once_ready = false;
+  can_write = WriteStatus::NOWRITE;
+}
+
+// Chooses the starting value of out_seq.  Without CEPH_FEATURE_MSG_AUTH the
+// sequence starts at 0; with it, a random value in [0, SEQ_MASK] is used.
+void ProtocolV1::randomize_out_seq() {
+  if (!(connection->get_features() & CEPH_FEATURE_MSG_AUTH)) {
+    // previously, seq #'s always started at 0.
+    out_seq = 0;
+    return;
+  }
+
+  // Set out_seq to a random value, so CRC won't be predictable.
+  const auto rand_seq =
+      ceph::util::generate_random_number<uint64_t>(0, SEQ_MASK);
+  ldout(cct, 10) << __func__ << " randomize_out_seq " << rand_seq << dendl;
+  out_seq = rand_seq;
+}
+
+// Frames one message onto connection->outgoing_bl and attempts to send it.
+//
+//  m    - message to send; it is given the next out_seq, optionally signed,
+//         and put() before returning.
+//  bl   - encoded payload; either copied buffer by buffer or claimed into the
+//         outgoing buffer list.
+//  more - forwarded unchanged to connection->_try_send().
+//
+// Returns the result of connection->_try_send(): negative on error.
+// Must run on the connection's event-center thread (asserted).
+ssize_t ProtocolV1::write_message(Message *m, ceph::buffer::list &bl, bool more) {
+  FUNCTRACE(cct);
+  ceph_assert(connection->center->in_thread());
+  m->set_seq(++out_seq);
+
+  if (messenger->crcflags & MSG_CRC_HEADER) {
+    m->calc_header_crc();
+  }
+
+  ceph_msg_header &header = m->get_header();
+  ceph_msg_footer &footer = m->get_footer();
+
+  // TODO: let sign_message could be reentry?
+  // Now that we have all the crcs calculated, handle the
+  // digital signature for the message, if the AsyncConnection has session
+  // security set up.  Some session security options do not
+  // actually calculate and check the signature, but they should
+  // handle the calls to sign_message and check_signature.  PLR
+  if (session_security.get() != NULL) {
+    if (session_security->sign_message(m)) {
+      ldout(cct, 20) << __func__ << " failed to sign m=" << m
+                     << "): sig = " << footer.sig << dendl;
+    } else {
+      ldout(cct, 20) << __func__ << " signed m=" << m
+                     << "): sig = " << footer.sig << dendl;
+    }
+  } else {
+    ldout(cct, 20) << __func__ << " no session security" << dendl;
+  }
+
+  auto &out_bl = connection->outgoing_bl;
+
+  // tag + header
+  out_bl.append(CEPH_MSGR_TAG_MSG);
+  out_bl.append((char *)&header, sizeof(header));
+
+  ldout(cct, 20) << __func__ << " sending message type=" << header.type
+                 << " src " << entity_name_t(header.src)
+                 << " front=" << header.front_len << " data=" << header.data_len
+                 << " off " << header.data_off << dendl;
+
+  // payload: small multi-buffer payloads are copied in, anything else is
+  // claimed wholesale
+  const bool copy_in =
+      (bl.length() <= ASYNC_COALESCE_THRESHOLD) && (bl.get_num_buffers() > 1);
+  if (copy_in) {
+    for (const auto &pb : bl.buffers()) {
+      out_bl.append((char *)pb.c_str(), pb.length());
+    }
+  } else {
+    out_bl.claim_append(bl);
+  }
+
+  // send footer; if receiver doesn't support signatures, use the old footer
+  // format
+  if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+    out_bl.append((char *)&footer, sizeof(footer));
+  } else {
+    ceph_msg_footer_old old_footer;
+    if (messenger->crcflags & MSG_CRC_HEADER) {
+      old_footer.front_crc = footer.front_crc;
+      old_footer.middle_crc = footer.middle_crc;
+    } else {
+      old_footer.middle_crc = 0;
+      old_footer.front_crc = 0;
+    }
+    if (messenger->crcflags & MSG_CRC_DATA) {
+      old_footer.data_crc = footer.data_crc;
+    } else {
+      old_footer.data_crc = 0;
+    }
+    old_footer.flags = footer.flags;
+    out_bl.append((char *)&old_footer, sizeof(old_footer));
+  }
+
+  m->trace.event("async writing message");
+  ldout(cct, 20) << __func__ << " sending " << m->get_seq() << " " << m
+                 << dendl;
+
+  // bytes queued before the send attempt; the difference afterwards is what
+  // gets accounted as sent
+  const ssize_t queued_before = out_bl.length();
+  const ssize_t rc = connection->_try_send(more);
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " error sending " << m << ", "
+                  << cpp_strerror(rc) << dendl;
+  } else {
+    connection->logger->inc(l_msgr_send_bytes,
+                            queued_before - out_bl.length());
+    ldout(cct, 10) << __func__ << " sending " << m
+                   << (rc ? " continuely." : " done.") << dendl;
+  }
+
+#if defined(WITH_EVENTTRACE)
+  if (m->get_type() == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false);
+  else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false);
+#endif
+  m->put();
+
+  return rc;
+}
+
+// Moves every message in `sent` back to the front of the
+// CEPH_MSG_PRIO_HIGHEST out queue, preserving their order, and rewinds
+// out_seq by the number of messages moved.  The payload of each message is
+// cleared and it is queued with an empty buffer list.  Always clears
+// write_in_progress.
+void ProtocolV1::requeue_sent() {
+  write_in_progress = false;
+  if (sent.empty()) {
+    return;
+  }
+
+  auto &requeue = out_q[CEPH_MSG_PRIO_HIGHEST];
+  out_seq -= sent.size();
+
+  // walk `sent` newest-first and push_front, so the oldest ends up first
+  for (; !sent.empty(); sent.pop_back()) {
+    Message *msg = sent.back();
+    ldout(cct, 10) << __func__ << " " << *msg << " for resend "
+                   << " (" << msg->get_seq() << ")" << dendl;
+    msg->clear_payload();
+    requeue.emplace_front(ceph::buffer::list(), msg);
+  }
+}
+
+// Drops requeued messages the peer has already acknowledged.
+//
+//  out_seq - caller's current outgoing sequence number (note: this parameter
+//            shadows the member of the same name).
+//  seq     - highest sequence number the peer reports having received.
+//
+// Messages are removed from the front of the CEPH_MSG_PRIO_HIGHEST queue
+// while their seq is non-zero and <= seq; each removed message is put().
+// Returns `seq` when there is no such queue, otherwise `out_seq` plus the
+// number of messages discarded.  Takes connection->write_lock.
+uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
+  ldout(cct, 10) << __func__ << " " << seq << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  // single lookup instead of count() + operator[] + erase(key)
+  auto highest = out_q.find(CEPH_MSG_PRIO_HIGHEST);
+  if (highest == out_q.end()) {
+    return seq;
+  }
+
+  auto &rq = highest->second;
+  uint64_t count = out_seq;
+  while (!rq.empty()) {
+    // only the message pointer is needed; do not copy the queued
+    // (buffer list, message) pair on every iteration
+    Message *const m = rq.front().second;
+    if (m->get_seq() == 0 || m->get_seq() > seq) break;
+    ldout(cct, 10) << __func__ << " " << *m << " for resend seq "
+                   << m->get_seq() << " <= " << seq << ", discarding"
+                   << dendl;
+    m->put();
+    rq.pop_front();
+    count++;
+  }
+  if (rq.empty()) out_q.erase(highest);
+  return count;
+}
+
+/*
+ * Drops every outgoing message: each message in `sent` and in every out_q
+ * priority list is put(), both containers are emptied and
+ * write_in_progress is cleared.
+ *
+ * The caller must hold write_lock.
+ */
+void ProtocolV1::discard_out_queue() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  // messages already written but still tracked in `sent`
+  for (Message *m : sent) {
+    ldout(cct, 20) << __func__ << " discard " << m << dendl;
+    m->put();
+  }
+  sent.clear();
+
+  // messages still queued, across all priorities
+  for (auto &prio_entry : out_q) {
+    for (auto &queued : prio_entry.second) {
+      ldout(cct, 20) << __func__ << " discard " << queued.second << dendl;
+      queued.second->put();
+    }
+  }
+  out_q.clear();
+
+  write_in_progress = false;
+}
+
+// Returns the authentication state to its initial condition: a fresh
+// AuthConnectionMeta replaces the current one, any stored challenge payload
+// in authorizer_more is dropped, and the session security handler is
+// released.
+void ProtocolV1::reset_security() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  auth_meta.reset(new AuthConnectionMeta);
+  authorizer_more.clear();
+  session_security.reset();
+}
+
+void ProtocolV1::reset_recv_state()
+{  /*
+   * Resets receive-side state:
+   *  - reset_security() is run on the connection's event-center thread
+   *    (directly when already on it, otherwise submitted to it);
+   *  - the pending read length and write callback are cleared;
+   *  - depending on how far `state` had progressed through the throttle
+   *    states (up to READ_FOOTER_AND_DISPATCH), the message-count, byte and
+   *    dispatch-queue throttle reservations are released.
+   */
+  ldout(cct, 5) << __func__ << dendl;
+
+  // execute in the same thread that uses the `session_security`.
+  // We need to do the warp because holding `write_lock` is not
+  // enough as `write_event()` releases it just before calling
+  // `write_message()`. `submit_to()` here is NOT blocking.
+  if (!connection->center->in_thread()) {
+    connection->center->submit_to(connection->center->get_id(), [this] {
+      ldout(cct, 5) << "reset_recv_state (warped) reseting security handlers"
+                    << dendl;
+      // Possibly unnecessary. See the comment in `deactivate_existing`.
+      std::lock_guard<std::mutex> l(connection->lock);
+      std::lock_guard<std::mutex> wl(connection->write_lock);
+      reset_security();
+    }, /* always_async = */true);
+  } else {
+    reset_security();
+  }
+
+  // clean read and write callbacks
+  connection->pendingReadLen.reset();
+  connection->writeCallback.reset();
+
+  if (state > THROTTLE_MESSAGE && state <= READ_FOOTER_AND_DISPATCH &&
+      connection->policy.throttler_messages) {  // one message slot is released
+    ldout(cct, 10) << __func__ << " releasing " << 1
+                   << " message to policy throttler "
+                   << connection->policy.throttler_messages->get_current()
+                   << "/" << connection->policy.throttler_messages->get_max()
+                   << dendl;
+    connection->policy.throttler_messages->put();
+  }
+  if (state > THROTTLE_BYTES && state <= READ_FOOTER_AND_DISPATCH) {
+    if (connection->policy.throttler_bytes) {  // cur_msg_size bytes are released
+      ldout(cct, 10) << __func__ << " releasing " << cur_msg_size
+                     << " bytes to policy throttler "
+                     << connection->policy.throttler_bytes->get_current() << "/"
+                     << connection->policy.throttler_bytes->get_max() << dendl;
+      connection->policy.throttler_bytes->put(cur_msg_size);
+    }
+  }
+  if (state > THROTTLE_DISPATCH_QUEUE && state <= READ_FOOTER_AND_DISPATCH) {  // dispatch-queue throttle has no null check: it is a member, not a pointer
+    ldout(cct, 10)
+        << __func__ << " releasing " << cur_msg_size
+        << " bytes to dispatch_queue throttler "
+        << connection->dispatch_queue->dispatch_throttler.get_current() << "/"
+        << connection->dispatch_queue->dispatch_throttler.get_max() << dendl;
+    connection->dispatch_queue->dispatch_throttle_release(cur_msg_size);
+  }
+}
+
+// Removes and returns the next message to send: the first entry of the
+// out_q list with the largest key.  When that entry carries a non-empty
+// buffer list and `bl` is non-null, the buffer list is swapped into *bl
+// (which is expected to be empty).  A list that becomes empty is erased from
+// out_q.  Returns null when out_q is empty.
+Message *ProtocolV1::_get_next_outgoing(ceph::buffer::list *bl) {
+  if (out_q.empty()) {
+    return nullptr;
+  }
+
+  // out_q is ordered by key, so rbegin() is the largest one
+  auto it = out_q.rbegin();
+  ceph_assert(!it->second.empty());
+
+  auto &pending = it->second;
+  auto &head = pending.front();
+  Message *next = head.second;
+  if (head.first.length() && bl) {
+    assert(bl->length() == 0);
+    bl->swap(head.first);
+  }
+
+  pending.pop_front();
+  if (pending.empty()) {
+    out_q.erase(it->first);
+  }
+  return next;
+}
+
+/**
+ * Client Protocol V1
+ **/
+
+// First step of the client side: enters CONNECTING and writes CEPH_BANNER
+// (without a terminating NUL) to the peer, naming
+// handle_client_banner_write as the write handler.
+CtPtr ProtocolV1::send_client_banner() {
+  ldout(cct, 20) << __func__ << dendl;
+  state = CONNECTING;
+
+  ceph::buffer::list banner_bl;
+  banner_bl.append(CEPH_BANNER, strlen(CEPH_BANNER));
+
+  return WRITE(banner_bl, handle_client_banner_write);
+}
+
+// Write handler for the client banner.  On success (r >= 0) waits for the
+// server's banner; on failure faults the connection.
+CtPtr ProtocolV1::handle_client_banner_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    ldout(cct, 10) << __func__ << " connect write banner done: "
+                   << connection->get_peer_addr() << dendl;
+    return wait_server_banner();
+  }
+
+  ldout(cct, 1) << __func__ << " write client banner failed" << dendl;
+  return _fault();
+}
+
+// Enters CONNECTING_WAIT_BANNER_AND_IDENTIFY and reads the server's reply to
+// our banner: CEPH_BANNER followed by two ceph_entity_addr-sized addresses.
+// handle_server_banner_and_identify is named as the read handler.
+CtPtr ProtocolV1::wait_server_banner() {
+  state = CONNECTING_WAIT_BANNER_AND_IDENTIFY;
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  // (the unused local buffer list that used to be declared here was removed)
+  unsigned banner_len = strlen(CEPH_BANNER);
+  unsigned need_len = banner_len + sizeof(ceph_entity_addr) * 2;
+  return READ(need_len, handle_server_banner_and_identify);
+}
+
+// Read handler for the server's banner and the two addresses that follow it.
+//
+// Validates the banner, decodes the address the peer claims for itself and
+// the address it sees us as, and faults if the peer's claimed address does
+// not match the one we connected to (a blank IP with matching port and nonce
+// is tolerated).  If the messenger has no usable address of its own yet, one
+// is learned - from the peer's view or from the local socket, depending on
+// ms_learn_addr_from_peer - with connection->lock released around
+// learned_addr().  Finally our legacy address is written to the peer, with
+// handle_my_addr_write as the write handler.
+//
+// Returns nullptr if `state` changed while the lock was released.
+CtPtr ProtocolV1::handle_server_banner_and_identify(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read banner and identify addresses failed"
+                  << dendl;
+    return _fault();
+  }
+
+  unsigned banner_len = strlen(CEPH_BANNER);
+  if (memcmp(buffer, CEPH_BANNER, banner_len)) {
+    ldout(cct, 0) << __func__ << " connect protocol error (bad banner) on peer "
+                  << connection->get_peer_addr() << dendl;
+    return _fault();
+  }
+
+  ceph::buffer::list bl;
+  entity_addr_t paddr, peer_addr_for_me;
+
+  bl.append(buffer + banner_len, sizeof(ceph_entity_addr) * 2);
+  auto p = bl.cbegin();
+  try {
+    decode(paddr, p);
+    decode(peer_addr_for_me, p);
+  } catch (const ceph::buffer::error &e) {
+    lderr(cct) << __func__ << " decode peer addr failed " << dendl;
+    return _fault();
+  }
+  ldout(cct, 20) << __func__ << " connect read peer addr " << paddr
+                 << " on socket " << connection->cs.fd() << dendl;
+
+  entity_addr_t peer_addr = connection->peer_addrs->legacy_addr();
+  if (peer_addr != paddr) {
+    if (paddr.is_blank_ip() && peer_addr.get_port() == paddr.get_port() &&
+        peer_addr.get_nonce() == paddr.get_nonce()) {
+      ldout(cct, 0) << __func__ << " connect claims to be " << paddr << " not "
+                    << peer_addr << " - presumably this is the same node!"
+                    << dendl;
+    } else {
+      ldout(cct, 10) << __func__ << " connect claims to be " << paddr << " not "
+                     << peer_addr << dendl;
+      return _fault();
+    }
+  }
+
+  ldout(cct, 20) << __func__ << " connect peer addr for me is "
+                 << peer_addr_for_me << dendl;
+  if (messenger->get_myaddrs().empty() ||
+      messenger->get_myaddrs().front().is_blank_ip()) {
+    // Zero-initialize so that a failed getsockname() leaves a well-defined
+    // (empty) address behind instead of indeterminate bytes that would then
+    // be logged and possibly learned as our own address.
+    sockaddr_storage ss = {};
+    socklen_t len = sizeof(ss);
+    if (getsockname(connection->cs.fd(), (sockaddr *)&ss, &len) < 0) {
+      // Not fatal: the descriptor may not be a plain socket, and the
+      // address may still be learned from the peer below.
+      ldout(cct, 10) << __func__ << " getsockname failed on socket "
+                     << connection->cs.fd() << dendl;
+    }
+    entity_addr_t a;
+    if (cct->_conf->ms_learn_addr_from_peer) {
+      ldout(cct, 1) << __func__ << " peer " << connection->target_addr
+                    << " says I am " << peer_addr_for_me << " (socket says "
+                    << (sockaddr*)&ss << ")" << dendl;
+      a = peer_addr_for_me;
+    } else {
+      ldout(cct, 1) << __func__ << " socket to  " << connection->target_addr
+                    << " says I am " << (sockaddr*)&ss
+                    << " (peer says " << peer_addr_for_me << ")" << dendl;
+      a.set_sockaddr((sockaddr *)&ss);
+    }
+    a.set_type(entity_addr_t::TYPE_LEGACY); // anything but NONE; learned_addr ignores this
+    a.set_port(0);
+
+    // learned_addr() is called without connection->lock held
+    connection->lock.unlock();
+    messenger->learned_addr(a);
+    if (cct->_conf->ms_inject_internal_delays &&
+        cct->_conf->ms_inject_socket_failures) {
+      if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+        ldout(cct, 10) << __func__ << " sleep for "
+                       << cct->_conf->ms_inject_internal_delays << dendl;
+        utime_t t;
+        t.set_from_double(cct->_conf->ms_inject_internal_delays);
+        t.sleep();
+      }
+    }
+    connection->lock.lock();
+
+    // the connection may have been marked down or replaced meanwhile
+    if (state != CONNECTING_WAIT_BANNER_AND_IDENTIFY) {
+      ldout(cct, 1) << __func__
+                  << " state changed while learned_addr, mark_down or "
+                    << " replacing must be happened just now" << dendl;
+      return nullptr;
+    }
+  }
+
+  ceph::buffer::list myaddrbl;
+  encode(messenger->get_myaddr_legacy(), myaddrbl, 0);  // legacy
+  return WRITE(myaddrbl, handle_my_addr_write);
+}
+
+// Write handler for our legacy address.  On success continues with
+// send_connect_message; on failure logs the error and faults.
+CtPtr ProtocolV1::handle_my_addr_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    ldout(cct, 10) << __func__ << " connect sent my addr "
+                   << messenger->get_myaddr_legacy() << dendl;
+    return CONTINUE(send_connect_message);
+  }
+
+  ldout(cct, 2) << __func__ << " connect couldn't write my addr, "
+                << cpp_strerror(r) << dendl;
+  return _fault();
+}
+
+CtPtr ProtocolV1::send_connect_message()
+{  /*
+   * Enters CONNECTING_SEND_CONNECT_MSG, fills in a ceph_msg_connect
+   * (features, host type, global/connect seq, protocol version, authorizer
+   * protocol/length, flags) and writes it, followed by the authorizer
+   * payload when there is one.  handle_connect_message_write is named as the
+   * write handler.
+   *
+   * An authorizer payload is only prepared when the peer is not a monitor or
+   * we are a monitor ourselves.  It is either the stored authorizer_more or
+   * a fresh request obtained from messenger->auth_client with
+   * connection->lock released.
+   */
+  state = CONNECTING_SEND_CONNECT_MSG;
+
+  ldout(cct, 20) << __func__ << dendl;
+  ceph_assert(messenger->auth_client);
+
+  ceph::buffer::list auth_bl;
+  vector<uint32_t> preferred_modes;
+
+  if (connection->peer_type != CEPH_ENTITY_TYPE_MON ||
+      messenger->get_myname().type() == CEPH_ENTITY_TYPE_MON) {
+    if (authorizer_more.length()) {
+      ldout(cct,10) << __func__ << " using augmented (challenge) auth payload"
+		    << dendl;
+      auth_bl = authorizer_more;  // payload stored by handle_connect_reply_auth
+    } else {
+      auto am = auth_meta;  // presumably a local copy so the meta stays usable while the lock is dropped
+      authorizer_more.clear();
+      connection->lock.unlock();  // get_auth_request() runs without connection->lock
+      int r = messenger->auth_client->get_auth_request(
+	connection, am.get(),
+	&am->auth_method, &preferred_modes, &auth_bl);
+      connection->lock.lock();
+      if (r < 0) {
+	return _fault();
+      }
+      if (state != CONNECTING_SEND_CONNECT_MSG) {  // state may have changed while unlocked
+	ldout(cct, 1) << __func__ << " state changed!" << dendl;
+	return _fault();
+      }
+    }
+  }
+
+  ceph_msg_connect connect;  // every field is assigned below
+  connect.features = connection->policy.features_supported;
+  connect.host_type = messenger->get_myname().type();
+  connect.global_seq = global_seq;
+  connect.connect_seq = connect_seq;
+  connect.protocol_version =
+      messenger->get_proto_version(connection->peer_type, true);
+  if (auth_bl.length()) {
+    ldout(cct, 10) << __func__
+                   << " connect_msg.authorizer_len=" << auth_bl.length()
+                   << " protocol=" << auth_meta->auth_method << dendl;
+    connect.authorizer_protocol = auth_meta->auth_method;
+    connect.authorizer_len = auth_bl.length();
+  } else {
+    connect.authorizer_protocol = 0;
+    connect.authorizer_len = 0;
+  }
+
+  connect.flags = 0;
+  if (connection->policy.lossy) {
+    connect.flags |=
+        CEPH_MSG_CONNECT_LOSSY;  // this is fyi, actually, server decides!
+  }
+
+  ceph::buffer::list bl;
+  bl.append((char *)&connect, sizeof(connect));  // fixed-size connect struct first
+  if (auth_bl.length()) {
+    bl.append(auth_bl.c_str(), auth_bl.length());  // then the authorizer bytes
+  }
+
+  ldout(cct, 10) << __func__ << " connect sending gseq=" << global_seq
+                 << " cseq=" << connect_seq
+                 << " proto=" << connect.protocol_version << dendl;
+
+  return WRITE(bl, handle_connect_message_write);
+}
+
+// Write handler for the connect message.  On success waits for the peer's
+// connect reply; on failure logs the error and faults.
+CtPtr ProtocolV1::handle_connect_message_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    ldout(cct, 20) << __func__
+                   << " connect wrote (self +) cseq, waiting for reply"
+                   << dendl;
+    return wait_connect_reply();
+  }
+
+  ldout(cct, 2) << __func__ << " connect couldn't send reply "
+                << cpp_strerror(r) << dendl;
+  return _fault();
+}
+
+// Zeroes the stored connect_reply and reads a fresh one from the peer, with
+// handle_connect_reply_1 named as the read handler.
+CtPtr ProtocolV1::wait_connect_reply() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const auto reply_len = sizeof(connect_reply);
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&connect_reply, 0, reply_len);
+
+  return READ(reply_len, handle_connect_reply_1);
+}
+
+// Read handler for the fixed-size connect reply.  Stores the reply in
+// connect_reply and, if it announces an authorizer payload, reads that
+// payload next; otherwise goes straight to handle_connect_reply_2.
+CtPtr ProtocolV1::handle_connect_reply_1(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect reply failed" << dendl;
+    return _fault();
+  }
+
+  connect_reply = *((ceph_msg_connect_reply *)buffer);
+
+  ldout(cct, 20) << __func__ << " connect got reply tag "
+                 << (int)connect_reply.tag << " connect_seq "
+                 << connect_reply.connect_seq << " global_seq "
+                 << connect_reply.global_seq << " proto "
+                 << connect_reply.protocol_version << " flags "
+                 << (int)connect_reply.flags << " features "
+                 << connect_reply.features << dendl;
+
+  // a non-zero authorizer_len means more bytes follow the reply
+  return connect_reply.authorizer_len ? wait_connect_reply_auth()
+                                      : handle_connect_reply_2();
+}
+
+// Reads the authorizer payload announced by connect_reply.authorizer_len,
+// with handle_connect_reply_auth named as the read handler.
+//
+// The length comes from the peer, so an out-of-range value is treated as a
+// protocol error and faults the connection rather than asserting: a
+// misbehaving peer must not be able to abort this process.  The accepted
+// range (< 4096) is unchanged.
+CtPtr ProtocolV1::wait_connect_reply_auth() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  ldout(cct, 10) << __func__
+                 << " reply.authorizer_len=" << connect_reply.authorizer_len
+                 << dendl;
+
+  if (!(connect_reply.authorizer_len < 4096)) {
+    ldout(cct, 0) << __func__ << " reply.authorizer_len="
+                  << connect_reply.authorizer_len
+                  << " is too large, failing connection" << dendl;
+    return _fault();
+  }
+
+  return READ(connect_reply.authorizer_len, handle_connect_reply_auth);
+}
+
+CtPtr ProtocolV1::handle_connect_reply_auth(char *buffer, int r) {  /*
+   * Read handler for the authorizer payload of the connect reply.
+   *
+   * Unless the peer is a monitor and we are not, the payload is passed to
+   * messenger->auth_client with connection->lock released:
+   * handle_auth_reply_more() when the reply tag is
+   * CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER, handle_auth_done() otherwise.
+   * A challenge that returns 0 stores the retry payload in authorizer_more
+   * and re-sends the connect message; any other success falls through to
+   * handle_connect_reply_2().  A negative result or a state change while
+   * unlocked faults the connection.
+   */
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect reply authorizer failed"
+                  << dendl;
+    return _fault();
+  }
+
+  ceph::buffer::list authorizer_reply;
+  authorizer_reply.append(buffer, connect_reply.authorizer_len);
+
+  if (connection->peer_type != CEPH_ENTITY_TYPE_MON ||
+      messenger->get_myname().type() == CEPH_ENTITY_TYPE_MON) {
+    auto am = auth_meta;  // presumably a local copy so the meta stays usable while the lock is dropped
+    bool more = (connect_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER);
+    ceph::buffer::list auth_retry_bl;
+    int r;  // shadows the parameter `r` for the auth result
+    connection->lock.unlock();  // auth_client callbacks run without connection->lock
+    if (more) {
+      r = messenger->auth_client->handle_auth_reply_more(
+	connection, am.get(), authorizer_reply, &auth_retry_bl);
+    } else {
+      // these aren't used for v1
+      CryptoKey skey;
+      string con_secret;
+      r = messenger->auth_client->handle_auth_done(
+	connection, am.get(),
+	0 /* global id */, 0 /* con mode */,
+	authorizer_reply,
+	&skey, &con_secret);
+    }
+    connection->lock.lock();
+    if (state != CONNECTING_SEND_CONNECT_MSG) {  // state may have changed while unlocked
+      ldout(cct, 1) << __func__ << " state changed" << dendl;
+      return _fault();
+    }
+    if (r < 0) {
+      return _fault();
+    }
+    if (more && r == 0) {
+      authorizer_more = auth_retry_bl;  // picked up by the next send_connect_message()
+      return CONTINUE(send_connect_message);
+    }
+  }
+
+  return handle_connect_reply_2();
+}
+
+// Acts on the tag of the stored connect_reply.
+//
+//  FEATURES, BADPROTOVER, BADAUTHORIZER -> log and fault
+//  RESETSESSION  -> reset the session and re-send the connect message
+//  RETRY_GLOBAL  -> pick a new global_seq and re-send
+//  RETRY_SESSION -> adopt the peer's connect_seq and re-send
+//  WAIT          -> enter WAIT and fault
+//
+// For any other tag the peer's features are checked against the required
+// set first; then SEQ leads to the ack-seq exchange and everything else
+// (including READY) to client_ready().
+CtPtr ProtocolV1::handle_connect_reply_2() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  switch (connect_reply.tag) {
+  case CEPH_MSGR_TAG_FEATURES: {
+    ldout(cct, 0) << __func__ << " connect protocol feature mismatch, my "
+                  << std::hex << connection->policy.features_supported
+                  << " < peer " << connect_reply.features << " missing "
+                  << (connect_reply.features &
+                      ~connection->policy.features_supported)
+                  << std::dec << dendl;
+    return _fault();
+  }
+
+  case CEPH_MSGR_TAG_BADPROTOVER: {
+    ldout(cct, 0) << __func__ << " connect protocol version mismatch, my "
+                  << messenger->get_proto_version(connection->peer_type, true)
+                  << " != " << connect_reply.protocol_version << dendl;
+    return _fault();
+  }
+
+  case CEPH_MSGR_TAG_BADAUTHORIZER: {
+    ldout(cct, 0) << __func__ << " connect got BADAUTHORIZER" << dendl;
+    authorizer_more.clear();
+    return _fault();
+  }
+
+  case CEPH_MSGR_TAG_RESETSESSION: {
+    ldout(cct, 0) << __func__ << " connect got RESETSESSION" << dendl;
+    session_reset();
+    connect_seq = 0;
+
+    // see session_reset
+    connection->outgoing_bl.clear();
+
+    return CONTINUE(send_connect_message);
+  }
+
+  case CEPH_MSGR_TAG_RETRY_GLOBAL: {
+    global_seq = messenger->get_global_seq(connect_reply.global_seq);
+    ldout(cct, 5) << __func__ << " connect got RETRY_GLOBAL "
+                  << connect_reply.global_seq << " chose new " << global_seq
+                  << dendl;
+    return CONTINUE(send_connect_message);
+  }
+
+  case CEPH_MSGR_TAG_RETRY_SESSION: {
+    ceph_assert(connect_reply.connect_seq > connect_seq);
+    ldout(cct, 5) << __func__ << " connect got RETRY_SESSION " << connect_seq
+                  << " -> " << connect_reply.connect_seq << dendl;
+    connect_seq = connect_reply.connect_seq;
+    return CONTINUE(send_connect_message);
+  }
+
+  case CEPH_MSGR_TAG_WAIT: {
+    ldout(cct, 1) << __func__ << " connect got WAIT (connection race)" << dendl;
+    state = WAIT;
+    return _fault();
+  }
+
+  default:
+    break;
+  }
+
+  // the remaining tags require the peer to offer everything we need
+  const uint64_t feat_missing =
+      connection->policy.features_required & ~(uint64_t)connect_reply.features;
+  if (feat_missing) {
+    ldout(cct, 1) << __func__ << " missing required features " << std::hex
+                  << feat_missing << std::dec << dendl;
+    return _fault();
+  }
+
+  if (connect_reply.tag == CEPH_MSGR_TAG_SEQ) {
+    ldout(cct, 10)
+        << __func__
+        << " got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq"
+        << dendl;
+
+    return wait_ack_seq();
+  }
+
+  if (connect_reply.tag == CEPH_MSGR_TAG_READY) {
+    ldout(cct, 10) << __func__ << " got CEPH_MSGR_TAG_READY " << dendl;
+  }
+
+  return client_ready();
+}
+
+// Reads the 64-bit sequence number the peer sends after a
+// CEPH_MSGR_TAG_SEQ reply, with handle_ack_seq named as the read handler.
+CtPtr ProtocolV1::wait_ack_seq() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const auto seq_len = sizeof(uint64_t);
+  return READ(seq_len, handle_ack_seq);
+}
+
+// Read handler for the peer's acked sequence number.  Requeued messages up
+// to that sequence are discarded (updating out_seq with the result), then
+// our own in_seq is written back to the peer with handle_in_seq_write as the
+// write handler.
+CtPtr ProtocolV1::handle_ack_seq(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect ack seq failed" << dendl;
+    return _fault();
+  }
+
+  const uint64_t newly_acked_seq = *((uint64_t *)buffer);
+  ldout(cct, 2) << __func__ << " got newly_acked_seq " << newly_acked_seq
+                << " vs out_seq " << out_seq << dendl;
+  out_seq = discard_requeued_up_to(out_seq, newly_acked_seq);
+
+  // reply with the last sequence number we received
+  uint64_t my_in_seq = in_seq;
+  ceph::buffer::list reply_bl;
+  reply_bl.append((char *)&my_in_seq, sizeof(my_in_seq));
+
+  return WRITE(reply_bl, handle_in_seq_write);
+}
+
+// Write handler for our in_seq.  On success the client side is ready; on
+// failure the connection faults.
+CtPtr ProtocolV1::handle_in_seq_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    ldout(cct, 10) << __func__ << " send in_seq done " << dendl;
+    return client_ready();
+  }
+
+  ldout(cct, 10) << __func__ << " failed to send in_seq " << dendl;
+  return _fault();
+}
+
+// Final step of a successful client-side connect.  Adopts the values the
+// peer sent in connect_reply (global seq, lossy flag, feature intersection),
+// bumps connect_seq, clears the backoff, installs or clears the session
+// security handler depending on whether an authorizer is present, queues
+// the connect notifications and moves on to ready().
+CtPtr ProtocolV1::client_ready() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // hooray!
+  peer_global_seq = connect_reply.global_seq;
+  connection->policy.lossy = connect_reply.flags & CEPH_MSG_CONNECT_LOSSY;
+
+  once_ready = true;
+  connect_seq += 1;
+  ceph_assert(connect_seq == connect_reply.connect_seq);
+  backoff = utime_t();
+
+  // only features both sides support are enabled on the connection
+  const uint64_t common_features =
+      (uint64_t)connect_reply.features &
+      (uint64_t)connection->policy.features_supported;
+  connection->set_features(common_features);
+  ldout(cct, 10) << __func__ << " connect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy << ", features "
+                 << connection->get_features() << dendl;
+
+  if (!auth_meta->authorizer) {
+    // We have no authorizer, so we shouldn't be applying security to messages
+    // in this AsyncConnection.  PLR
+    ldout(cct, 10) << __func__ << " no authorizer, clearing session_security"
+		   << dendl;
+    session_security.reset();
+  } else {
+    // If we have an authorizer, get a new AuthSessionHandler to deal with
+    // ongoing security of the connection.  PLR
+    ldout(cct, 10) << __func__ << " setting up session_security with auth "
+		   << auth_meta->authorizer.get() << dendl;
+    session_security.reset(get_auth_session_handler(
+        cct, auth_meta->authorizer->protocol,
+        auth_meta->session_key,
+        connection->get_features()));
+  }
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+/**
+ * Server Protocol V1
+ **/
+
+// Enter ACCEPTING and send the banner followed by two legacy-encoded
+// addresses: our own legacy address, then connection->target_addr.
+// Completion is handled by handle_server_banner_write().
+CtPtr ProtocolV1::send_server_banner() {
+  ldout(cct, 20) << __func__ << dendl;
+  state = ACCEPTING;
+
+  // as a server, we should have a legacy addr if we accepted this connection.
+  auto my_legacy_addr = messenger->get_myaddrs().legacy_addr();
+  connection->port = my_legacy_addr.get_port();
+
+  ceph::buffer::list banner_bl;
+  banner_bl.append(CEPH_BANNER, strlen(CEPH_BANNER));
+  encode(my_legacy_addr, banner_bl, 0);           // legacy
+  encode(connection->target_addr, banner_bl, 0);  // legacy
+
+  ldout(cct, 1) << __func__ << " sd=" << connection->cs.fd()
+                << " legacy " << my_legacy_addr
+                << " socket_addr " << connection->socket_addr
+                << " target_addr " << connection->target_addr
+                << dendl;
+
+  return WRITE(banner_bl, handle_server_banner_write);
+}
+
+/**
+ * Write-completion continuation for send_server_banner().
+ *
+ * @param r  write result; a negative value means the write failed
+ * @return wait_client_banner() on success, the result of _fault() otherwise
+ */
+CtPtr ProtocolV1::handle_server_banner_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    // prefix with the function name like the other failure logs in this file
+    ldout(cct, 1) << __func__ << " write server banner failed" << dendl;
+    return _fault();
+  }
+  ldout(cct, 10) << __func__ << " write banner and addr done: "
+                 << connection->get_peer_addr() << dendl;
+
+  return wait_client_banner();
+}
+
+// Arm a read for the peer's banner plus the ceph_entity_addr that follows
+// it; the bytes are delivered to handle_client_banner().
+CtPtr ProtocolV1::wait_client_banner() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const auto banner_len = strlen(CEPH_BANNER);
+  const auto addr_len = sizeof(ceph_entity_addr);
+  return READ(banner_len + addr_len, handle_client_banner);
+}
+
+/**
+ * Continuation for wait_client_banner(): check the banner the peer sent,
+ * decode the address that follows it and record it as the peer address.
+ *
+ * @param buffer  bytes delivered by the read: strlen(CEPH_BANNER) banner
+ *                bytes followed by sizeof(ceph_entity_addr) address bytes
+ * @param r       read result; a negative value means the read failed
+ * @return continuation into wait_connect_message, or the result of _fault()
+ *         on a failed read, a bad banner or an undecodable address
+ */
+CtPtr ProtocolV1::handle_client_banner(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner and addr failed" << dendl;
+    return _fault();
+  }
+
+  const size_t banner_len = strlen(CEPH_BANNER);
+  if (memcmp(buffer, CEPH_BANNER, banner_len)) {
+    // Nothing here NUL-terminates buffer, and its content comes from the
+    // peer: log only the banner-sized prefix instead of streaming it as a
+    // C string, which would read until whatever zero byte happens to follow.
+    ldout(cct, 1) << __func__ << " accept peer sent bad banner '"
+                  << std::string(buffer, banner_len)
+                  << "' (should be '" << CEPH_BANNER << "')" << dendl;
+    return _fault();
+  }
+
+  ceph::buffer::list addr_bl;
+  entity_addr_t peer_addr;
+
+  // the encoded address sits right behind the banner
+  addr_bl.append(buffer + banner_len, sizeof(ceph_entity_addr));
+  try {
+    auto ti = addr_bl.cbegin();
+    decode(peer_addr, ti);
+  } catch (const ceph::buffer::error &e) {
+    lderr(cct) << __func__ << " decode peer_addr failed " << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 10) << __func__ << " accept peer addr is " << peer_addr << dendl;
+  if (peer_addr.is_blank_ip()) {
+    // peer apparently doesn't know what ip they have; figure it out for them.
+    // Keep the port the peer announced, take the rest from target_addr.
+    int port = peer_addr.get_port();
+    peer_addr.set_sockaddr(connection->target_addr.get_sockaddr());
+    peer_addr.set_port(port);
+
+    ldout(cct, 0) << __func__ << " accept peer addr is really " << peer_addr
+                  << " (socket is " << connection->target_addr << ")" << dendl;
+  }
+  connection->set_peer_addr(peer_addr);  // so that connection_state gets set up
+  connection->target_addr = peer_addr;
+
+  return CONTINUE(wait_connect_message);
+}
+
+// Zero the stored connect message and arm a read for a fresh one; the bytes
+// are delivered to handle_connect_message_1().
+CtPtr ProtocolV1::wait_connect_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  constexpr auto connect_msg_len = sizeof(connect_msg);
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&connect_msg, 0, connect_msg_len);
+
+  return READ(connect_msg_len, handle_connect_message_1);
+}
+
+// Continuation for wait_connect_message(): store the received connect
+// message, then either read the authorizer payload it announces or go
+// straight to handle_connect_message_2().
+CtPtr ProtocolV1::handle_connect_message_1(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect msg failed" << dendl;
+    return _fault();
+  }
+
+  connect_msg = *reinterpret_cast<ceph_msg_connect *>(buffer);
+  state = ACCEPTING_WAIT_CONNECT_MSG_AUTH;
+
+  // a non-zero authorizer_len means an authorizer follows on the wire
+  return connect_msg.authorizer_len ? wait_connect_message_auth()
+                                    : handle_connect_message_2();
+}
+
+// Replace authorizer_buf with a buffer of connect_msg.authorizer_len bytes
+// and arm a read that fills it; completion goes to
+// handle_connect_message_auth().
+CtPtr ProtocolV1::wait_connect_message_auth() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const unsigned auth_len = connect_msg.authorizer_len;
+  authorizer_buf.clear();
+  authorizer_buf.push_back(ceph::buffer::create(auth_len));
+
+  return READB(auth_len, authorizer_buf.c_str(), handle_connect_message_auth);
+}
+
+// Continuation for wait_connect_message_auth(): once the authorizer has
+// been read, carry on with handle_connect_message_2(); fault on a failed
+// read. The buffer argument is not used here.
+CtPtr ProtocolV1::handle_connect_message_auth(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    return handle_connect_message_2();
+  }
+
+  ldout(cct, 1) << __func__ << " read connect authorizer failed" << dendl;
+  return _fault();
+}
+
+/**
+ * Act on the connect message (and optional authorizer) stored in
+ * connect_msg / authorizer_buf and decide how to answer the peer.
+ *
+ * In order, this:
+ *  - takes the peer type and connection policy from connect_msg.host_type;
+ *  - answers BADPROTOVER when the protocol versions differ;
+ *  - for cephx, adds the feature bits required by configuration and
+ *    answers FEATURES when the peer lacks a required feature;
+ *  - runs the auth server's handle_auth_request() with connection->lock
+ *    released, answering BADAUTHORIZER (r < 0) or CHALLENGE_AUTHORIZER
+ *    (r == 0);
+ *  - looks up an existing connection for the same peer addresses and,
+ *    depending on its state and on the global/connect sequence numbers,
+ *    either replaces it, answers RETRY_GLOBAL / RETRY_SESSION / WAIT /
+ *    RESETSESSION, or opens a new session.
+ *
+ * connection->lock is unlocked part-way through and re-locked before any
+ * reply is sent, so it is expected to be held on entry. After each re-lock
+ * that follows a blocking step, the state is re-checked and the function
+ * faults if it changed in the meantime.
+ *
+ * @return the continuation produced by send_connect_message_reply(),
+ *         open(), replace() or _fault()
+ */
+CtPtr ProtocolV1::handle_connect_message_2() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  ldout(cct, 20) << __func__ << " accept got peer connect_seq "
+                 << connect_msg.connect_seq << " global_seq "
+                 << connect_msg.global_seq << dendl;
+
+  // the policy applied to this connection depends on the peer's entity type
+  connection->set_peer_type(connect_msg.host_type);
+  connection->policy = messenger->get_policy(connect_msg.host_type);
+
+  ldout(cct, 10) << __func__ << " accept of host_type " << connect_msg.host_type
+                 << ", policy.lossy=" << connection->policy.lossy
+                 << " policy.server=" << connection->policy.server
+                 << " policy.standby=" << connection->policy.standby
+                 << " policy.resetcheck=" << connection->policy.resetcheck
+		 << " features 0x" << std::hex << (uint64_t)connect_msg.features
+		 << std::dec
+                 << dendl;
+
+  ceph_msg_connect_reply reply;
+  ceph::buffer::list authorizer_reply;
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&reply, 0, sizeof(reply));
+  reply.protocol_version =
+      messenger->get_proto_version(connection->peer_type, false);
+
+  // mismatch?
+  ldout(cct, 10) << __func__ << " accept my proto " << reply.protocol_version
+                 << ", their proto " << connect_msg.protocol_version << dendl;
+
+  if (connect_msg.protocol_version != reply.protocol_version) {
+    return send_connect_message_reply(CEPH_MSGR_TAG_BADPROTOVER, reply,
+                                      authorizer_reply);
+  }
+
+  // require signatures for cephx?
+  // OSD/MDS/MGR peers are checked against the "cluster" options, every
+  // other peer type against the "service" options.
+  if (connect_msg.authorizer_protocol == CEPH_AUTH_CEPHX) {
+    if (connection->peer_type == CEPH_ENTITY_TYPE_OSD ||
+        connection->peer_type == CEPH_ENTITY_TYPE_MDS ||
+        connection->peer_type == CEPH_ENTITY_TYPE_MGR) {
+      if (cct->_conf->cephx_require_signatures ||
+          cct->_conf->cephx_cluster_require_signatures) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring MSG_AUTH feature bit for cluster"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+      }
+      if (cct->_conf->cephx_require_version >= 2 ||
+          cct->_conf->cephx_cluster_require_version >= 2) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring cephx v2 feature bit for cluster"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+      }
+    } else {
+      if (cct->_conf->cephx_require_signatures ||
+          cct->_conf->cephx_service_require_signatures) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring MSG_AUTH feature bit for service"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+      }
+      if (cct->_conf->cephx_require_version >= 2 ||
+          cct->_conf->cephx_service_require_version >= 2) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring cephx v2 feature bit for service"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+      }
+    }
+  }
+
+  // required feature bits that the peer did not offer
+  uint64_t feat_missing =
+      connection->policy.features_required & ~(uint64_t)connect_msg.features;
+  if (feat_missing) {
+    ldout(cct, 1) << __func__ << " peer missing required features " << std::hex
+                  << feat_missing << std::dec << dendl;
+    return send_connect_message_reply(CEPH_MSGR_TAG_FEATURES, reply,
+                                      authorizer_reply);
+  }
+
+  // Work on copies (authorizer bytes, auth_meta handle) because the
+  // connection lock is released for the duration of the auth request.
+  ceph::buffer::list auth_bl_copy = authorizer_buf;
+  auto am = auth_meta;
+  am->auth_method = connect_msg.authorizer_protocol;
+  if (!HAVE_FEATURE((uint64_t)connect_msg.features, CEPHX_V2)) {
+    // peer doesn't support it and we won't get here if we require it
+    am->skip_authorizer_challenge = true;
+  }
+  connection->lock.unlock();
+  ldout(cct,10) << __func__ << " authorizor_protocol "
+		<< connect_msg.authorizer_protocol
+		<< " len " << auth_bl_copy.length()
+		<< dendl;
+  bool more = (bool)auth_meta->authorizer_challenge;
+  int r = messenger->auth_server->handle_auth_request(
+    connection,
+    am.get(),
+    more,
+    am->auth_method,
+    auth_bl_copy,
+    &authorizer_reply);
+  if (r < 0) {
+    // re-take the lock and make sure nothing changed the state meanwhile
+    connection->lock.lock();
+    if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+      ldout(cct, 1) << __func__ << " state changed" << dendl;
+      return _fault();
+    }
+    ldout(cct, 0) << __func__ << ": got bad authorizer, auth_reply_len="
+		  << authorizer_reply.length() << dendl;
+    session_security.reset();
+    return send_connect_message_reply(CEPH_MSGR_TAG_BADAUTHORIZER, reply,
+				      authorizer_reply);
+  }
+  if (r == 0) {
+    connection->lock.lock();
+    if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+      ldout(cct, 1) << __func__ << " state changed" << dendl;
+      return _fault();
+    }
+    ldout(cct, 10) << __func__ << ": challenging authorizer" << dendl;
+    ceph_assert(authorizer_reply.length());
+    return send_connect_message_reply(CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER,
+				      reply, authorizer_reply);
+  }
+
+  // From here on r > 0 and connection->lock is still released.
+
+  // We've verified the authorizer for this AsyncConnection, so set up the
+  // session security structure.  PLR
+  ldout(cct, 10) << __func__ << " accept setting up session_security." << dendl;
+
+  if (connection->policy.server &&
+      connection->policy.lossy &&
+      !connection->policy.register_lossy_clients) {
+    // incoming lossy client, no need to register this connection
+    // new session
+    ldout(cct, 10) << __func__ << " accept new session" << dendl;
+    connection->lock.lock();
+    return open(reply, authorizer_reply);
+  }
+
+  // is there already a connection registered for these peer addresses?
+  AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+    ldout(cct, 1) << __func__ << " state changed" << dendl;
+    return _fault();
+  }
+
+  // finding ourselves is the same as finding nothing
+  if (existing == connection) {
+    existing = nullptr;
+  }
+  // an existing connection that is not using this protocol version is
+  // marked down and then ignored
+  if (existing && existing->protocol->proto_type != 1) {
+    ldout(cct,1) << __func__ << " existing " << existing << " proto "
+		 << existing->protocol.get() << " version is "
+		 << existing->protocol->proto_type << ", marking down" << dendl;
+    existing->mark_down();
+    existing = nullptr;
+  }
+
+  if (existing) {
+    // It is not possible for the existing connection to acquire this
+    // connection's lock
+    existing->lock.lock();  // skip lockdep check (we are locking a second
+                            // AsyncConnection here)
+
+    // NOTE: every path below either unlocks existing->lock itself or hands
+    // `existing` to replace(), which unlocks it.
+
+    ldout(cct,10) << __func__ << " existing=" << existing << " exproto="
+		  << existing->protocol.get() << dendl;
+    ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get());
+    ceph_assert(exproto);
+    ceph_assert(exproto->proto_type == 1);
+
+    if (exproto->state == CLOSED) {
+      ldout(cct, 1) << __func__ << " existing " << existing
+		    << " already closed." << dendl;
+      existing->lock.unlock();
+      existing = nullptr;
+
+      return open(reply, authorizer_reply);
+    }
+
+    // the existing connection is itself in the middle of being replaced:
+    // tell the peer to retry with the global_seq we already know
+    if (exproto->replacing) {
+      ldout(cct, 1) << __func__
+                    << " existing racing replace happened while replacing."
+                    << " existing_state="
+                    << connection->get_state_name(existing->state) << dendl;
+      reply.global_seq = exproto->peer_global_seq;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply,
+                                        authorizer_reply);
+    }
+
+    if (connect_msg.global_seq < exproto->peer_global_seq) {
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq "
+                     << exproto->peer_global_seq << " > "
+                     << connect_msg.global_seq << ", RETRY_GLOBAL" << dendl;
+      reply.global_seq = exproto->peer_global_seq;  // so we can send it below..
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply,
+                                        authorizer_reply);
+    } else {
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq "
+                     << exproto->peer_global_seq
+                     << " <= " << connect_msg.global_seq << ", looks ok"
+                     << dendl;
+    }
+
+    // a lossy existing connection is reset and replaced
+    if (existing->policy.lossy) {
+      ldout(cct, 0)
+          << __func__
+          << " accept replacing existing (lossy) channel (new one lossy="
+          << connection->policy.lossy << ")" << dendl;
+      exproto->session_reset();
+      return replace(existing, reply, authorizer_reply);
+    }
+
+    ldout(cct, 1) << __func__ << " accept connect_seq "
+                  << connect_msg.connect_seq
+                  << " vs existing csq=" << exproto->connect_seq
+                  << " existing_state="
+                  << connection->get_state_name(existing->state) << dendl;
+
+    if (connect_msg.connect_seq == 0 && exproto->connect_seq > 0) {
+      ldout(cct, 0)
+          << __func__
+          << " accept peer reset, then tried to connect to us, replacing"
+          << dendl;
+      // this is a hard reset from peer
+      is_reset_from_peer = true;
+      if (connection->policy.resetcheck) {
+        exproto->session_reset();  // this resets out_queue, msg_ and
+                                   // connect_seq #'s
+      }
+      return replace(existing, reply, authorizer_reply);
+    }
+
+    if (connect_msg.connect_seq < exproto->connect_seq) {
+      // old attempt, or we sent READY but they didn't get it.
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".cseq "
+                     << exproto->connect_seq << " > " << connect_msg.connect_seq
+                     << ", RETRY_SESSION" << dendl;
+      reply.connect_seq = exproto->connect_seq + 1;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply,
+                                        authorizer_reply);
+    }
+
+    if (connect_msg.connect_seq == exproto->connect_seq) {
+      // if the existing connection successfully opened, and/or
+      // subsequently went to standby, then the peer should bump
+      // their connect_seq and retry: this is not a connection race
+      // we need to resolve here.
+      if (exproto->state == OPENED || exproto->state == STANDBY) {
+        ldout(cct, 10) << __func__ << " accept connection race, existing "
+                       << existing << ".cseq " << exproto->connect_seq
+                       << " == " << connect_msg.connect_seq
+                       << ", OPEN|STANDBY, RETRY_SESSION " << dendl;
+        // if both connect_seq values are zero, don't get stuck in a
+        // deadlock; it's ok to replace
+        if (connection->policy.resetcheck && exproto->connect_seq == 0) {
+          return replace(existing, reply, authorizer_reply);
+        }
+
+        reply.connect_seq = exproto->connect_seq + 1;
+        existing->lock.unlock();
+        return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply,
+                                          authorizer_reply);
+      }
+
+      // connection race?
+      // The incoming attempt wins when the peer's legacy address orders
+      // before ours, or when the existing connection's policy is "server".
+      if (connection->peer_addrs->legacy_addr() < messenger->get_myaddr_legacy() ||
+          existing->policy.server) {
+        // incoming wins
+        ldout(cct, 10) << __func__ << " accept connection race, existing "
+                       << existing << ".cseq " << exproto->connect_seq
+                       << " == " << connect_msg.connect_seq
+                       << ", or we are server, replacing my attempt" << dendl;
+        return replace(existing, reply, authorizer_reply);
+      } else {
+        // our existing outgoing wins
+        ldout(messenger->cct, 10)
+            << __func__ << " accept connection race, existing " << existing
+            << ".cseq " << exproto->connect_seq
+            << " == " << connect_msg.connect_seq << ", sending WAIT" << dendl;
+        ceph_assert(connection->peer_addrs->legacy_addr() >
+                    messenger->get_myaddr_legacy());
+        existing->lock.unlock();
+	// make sure we follow through with opening the existing
+	// connection (if it isn't yet open) since we know the peer
+	// has something to send to us.
+	existing->send_keepalive();
+        return send_connect_message_reply(CEPH_MSGR_TAG_WAIT, reply,
+                                          authorizer_reply);
+      }
+    }
+
+    // the smaller and equal cases all returned above
+    ceph_assert(connect_msg.connect_seq > exproto->connect_seq);
+    ceph_assert(connect_msg.global_seq >= exproto->peer_global_seq);
+    if (connection->policy.resetcheck &&  // RESETSESSION only used by servers;
+                                          // peers do not reset each other
+        exproto->connect_seq == 0) {
+      ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq "
+                    << connect_msg.connect_seq << ", " << existing
+                    << ".cseq = " << exproto->connect_seq
+                    << "), sending RESETSESSION " << dendl;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply,
+                                        authorizer_reply);
+    }
+
+    // reconnect
+    ldout(cct, 10) << __func__ << " accept peer sent cseq "
+                   << connect_msg.connect_seq << " > " << exproto->connect_seq
+                   << dendl;
+    return replace(existing, reply, authorizer_reply);
+  }  // existing
+  else if (!replacing && connect_msg.connect_seq > 0) {
+    // we reset, and they are opening a new session
+    ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq "
+                  << connect_msg.connect_seq << "), sending RESETSESSION"
+                  << dendl;
+    return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply,
+                                      authorizer_reply);
+  } else {
+    // new session
+    ldout(cct, 10) << __func__ << " accept new session" << dendl;
+    existing = nullptr;
+    return open(reply, authorizer_reply);
+  }
+}
+
+// Fill in the tag, feature bits and authorizer length of `reply`, then write
+// the reply followed by the authorizer payload (if any). authorizer_reply is
+// cleared once its bytes have been appended. Completion is handled by
+// handle_connect_message_reply_write().
+CtPtr ProtocolV1::send_connect_message_reply(char tag,
+                                             ceph_msg_connect_reply &reply,
+                                             ceph::buffer::list &authorizer_reply) {
+  ldout(cct, 20) << __func__ << dendl;
+
+  reply.tag = tag;
+  // features the peer offered that we support, plus everything we require
+  reply.features =
+      ((uint64_t)connect_msg.features & connection->policy.features_supported) |
+      connection->policy.features_required;
+  reply.authorizer_len = authorizer_reply.length();
+
+  ceph::buffer::list out_bl;
+  out_bl.append((char *)&reply, sizeof(reply));
+
+  ldout(cct, 10) << __func__ << " reply features 0x" << std::hex
+                 << reply.features << " = (policy sup 0x"
+                 << connection->policy.features_supported
+                 << " & connect 0x" << (uint64_t)connect_msg.features
+                 << ") | policy req 0x"
+                 << connection->policy.features_required
+                 << dendl;
+
+  if (reply.authorizer_len) {
+    out_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
+    authorizer_reply.clear();
+  }
+
+  return WRITE(out_bl, handle_connect_message_reply_write);
+}
+
+/**
+ * Write-completion continuation for send_connect_message_reply().
+ *
+ * @param r  write result; a negative value means the write failed
+ * @return continuation into wait_connect_message on success (the peer gets
+ *         to send another connect message), the result of _fault() otherwise
+ */
+CtPtr ProtocolV1::handle_connect_message_reply_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    // prefix with the function name like the other failure logs in this file
+    ldout(cct, 1) << __func__ << " write connect message reply failed"
+                  << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+
+  return CONTINUE(wait_connect_message);
+}
+
+/**
+ * Resolve a conflict between this newly accepted connection and `existing`
+ * in favour of the incoming socket.
+ *
+ * existing->lock is unlocked on every path through this function, so the
+ * caller is expected to hold it.
+ *
+ * Two cases:
+ *  - `existing` is lossy: stop its protocol, queue a reset for it, and
+ *    continue opening this connection via open().
+ *  - otherwise: move this connection's socket out, stop this connection and
+ *    queue a reset for it, then hand the socket over to `existing`. The
+ *    hand-over runs as submitted events (first on existing's current event
+ *    center, then on the one it is moved to) and ends with `existing`
+ *    sending a RETRY_GLOBAL reply on the adopted socket. Nothing further
+ *    runs on this connection, hence the nullptr return.
+ *
+ * @param existing          the previously registered connection to the peer
+ * @param reply             reply under construction; copied into the
+ *                          deferred hand-over in the non-lossy case
+ * @param authorizer_reply  authorizer payload to send with the reply
+ * @return the result of open() in the lossy case, nullptr otherwise
+ */
+CtPtr ProtocolV1::replace(const AsyncConnectionRef& existing,
+                          ceph_msg_connect_reply &reply,
+                          ceph::buffer::list &authorizer_reply) {
+  ldout(cct, 10) << __func__ << " accept replacing " << existing << dendl;
+
+  connection->inject_delay();
+  if (existing->policy.lossy) {
+    // disconnect from the Connection
+    ldout(cct, 1) << __func__ << " replacing on lossy channel, failing existing"
+                  << dendl;
+    existing->protocol->stop();
+    existing->dispatch_queue->queue_reset(existing.get());
+  } else {
+    ceph_assert(can_write == WriteStatus::NOWRITE);
+    existing->write_lock.lock();
+
+    ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get());
+
+    // reset the in_seq if this is a hard reset from peer,
+    // otherwise we respect our original connection's value
+    if (is_reset_from_peer) {
+      exproto->is_reset_from_peer = true;
+    }
+
+    // stop watching our socket; it is about to be handed to `existing`
+    connection->center->delete_file_event(connection->cs.fd(),
+                                          EVENT_READABLE | EVENT_WRITABLE);
+
+    if (existing->delay_state) {
+      existing->delay_state->flush();
+      ceph_assert(!connection->delay_state);
+    }
+    exproto->reset_recv_state();
+
+    exproto->connect_msg.features = connect_msg.features;
+
+    // take the socket (and remember our center/worker) before stop(), so
+    // that stopping this connection does not act on the socket being moved
+    auto temp_cs = std::move(connection->cs);
+    EventCenter *new_center = connection->center;
+    Worker *new_worker = connection->worker;
+    // avoid _stop shutdown replacing socket
+    // queue a reset on the new connection, which we're dumping for the old
+    stop();
+
+    connection->dispatch_queue->queue_reset(connection);
+    ldout(messenger->cct, 1)
+        << __func__ << " stop myself to swap existing" << dendl;
+    exproto->can_write = WriteStatus::REPLACING;
+    exproto->replacing = true;
+    exproto->write_in_progress = false;
+    existing->state_offset = 0;
+    // avoid previous thread modify event
+    exproto->state = NONE;
+    existing->state = AsyncConnection::STATE_NONE;
+    // Discard existing prefetch buffer in `recv_buf`
+    existing->recv_start = existing->recv_end = 0;
+    // there shouldn't exist any buffer
+    ceph_assert(connection->recv_start == connection->recv_end);
+
+    // Step 1 (submitted to existing's current center below): requeue what
+    // `existing` had in flight, then either adopt the new socket together
+    // with the new worker/center, or - if `existing` got closed in the
+    // meantime - arrange for the new socket to be closed.
+    auto deactivate_existing = std::bind(
+        [existing, new_worker, new_center, exproto, reply,
+         authorizer_reply](ConnectedSocket &cs) mutable {
+          // we need to delete time event in original thread
+          {
+            std::lock_guard<std::mutex> l(existing->lock);
+            existing->write_lock.lock();
+            exproto->requeue_sent();
+            existing->outgoing_bl.clear();
+            existing->open_write = false;
+            existing->write_lock.unlock();
+            if (exproto->state == NONE) {
+              existing->shutdown_socket();
+              existing->cs = std::move(cs);
+              existing->worker->references--;
+              new_worker->references++;
+              existing->logger = new_worker->get_perf_counter();
+              existing->worker = new_worker;
+              existing->center = new_center;
+              if (existing->delay_state)
+                existing->delay_state->set_center(new_center);
+            } else if (exproto->state == CLOSED) {
+              // existing was closed before we got here: close the socket we
+              // were given via an event on the new center and give up
+              auto back_to_close =
+                  std::bind([](ConnectedSocket &cs) mutable { cs.close(); },
+                            std::move(cs));
+              new_center->submit_to(new_center->get_id(),
+                                    std::move(back_to_close), true);
+              return;
+            } else {
+              ceph_abort();
+            }
+          }
+
+          // Before existing->center was changed, events may already have
+          // been queued on the old center. If `existing` were marked down
+          // now, the cleanup would run in another thread and such an
+          // earlier event could then crash with a segmentation fault.
+          // Step 2 (runs on the center `existing` now belongs to): restart
+          // the connect timer, return to ACCEPTING, watch the adopted
+          // socket and send RETRY_GLOBAL.
+          auto transfer_existing = [existing, exproto, reply,
+                                    authorizer_reply]() mutable {
+            std::lock_guard<std::mutex> l(existing->lock);
+            if (exproto->state == CLOSED) return;
+            ceph_assert(exproto->state == NONE);
+
+            // we have called shutdown_socket above
+            ceph_assert(existing->last_tick_id == 0);
+            // restart timer since we are going to re-build connection
+            existing->last_connect_started = ceph::coarse_mono_clock::now();
+            existing->last_tick_id = existing->center->create_time_event(
+              existing->connect_timeout_us, existing->tick_handler);
+            existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED;
+            exproto->state = ACCEPTING;
+
+            existing->center->create_file_event(
+                existing->cs.fd(), EVENT_READABLE, existing->read_handler);
+            reply.global_seq = exproto->peer_global_seq;
+            exproto->run_continuation(exproto->send_connect_message_reply(
+                CEPH_MSGR_TAG_RETRY_GLOBAL, reply, authorizer_reply));
+          };
+          if (existing->center->in_thread())
+            transfer_existing();
+          else
+            existing->center->submit_to(existing->center->get_id(),
+                                        std::move(transfer_existing), true);
+        },
+        std::move(temp_cs));
+
+    existing->center->submit_to(existing->center->get_id(),
+                                std::move(deactivate_existing), true);
+    existing->write_lock.unlock();
+    existing->lock.unlock();
+    return nullptr;
+  }
+  // lossy case only: existing has been stopped, continue with our own accept
+  existing->lock.unlock();
+
+  return open(reply, authorizer_reply);
+}
+
+/**
+ * Accept the session: fill in the final connect reply (tag READY or SEQ),
+ * set the negotiated features and the session security handler, register
+ * the connection with the messenger and write the reply.
+ *
+ * connection->lock is released around messenger->accept_conn() and taken
+ * again afterwards, so it is expected to be held on entry. After re-locking
+ * the state is re-checked; if it changed, the connection is unregistered
+ * again and the function faults.
+ *
+ * @param reply             reply structure to complete and send
+ * @param authorizer_reply  authorizer payload appended after the reply
+ * @return the continuation of the reply write
+ *         (handle_ready_connect_message_reply_write), or the result of
+ *         _fault() when registration failed or the state changed
+ */
+CtPtr ProtocolV1::open(ceph_msg_connect_reply &reply,
+                       ceph::buffer::list &authorizer_reply) {
+  ldout(cct, 20) << __func__ << dendl;
+
+  connect_seq = connect_msg.connect_seq + 1;
+  peer_global_seq = connect_msg.global_seq;
+  ldout(cct, 10) << __func__ << " accept success, connect_seq = " << connect_seq
+                 << " in_seq=" << in_seq << ", sending READY" << dendl;
+
+  // if it is a hard reset from peer, we don't need a round-trip to negotiate
+  // in/out sequence
+  if ((connect_msg.features & CEPH_FEATURE_RECONNECT_SEQ) &&
+      !is_reset_from_peer) {
+    reply.tag = CEPH_MSGR_TAG_SEQ;
+    wait_for_seq = true;
+  } else {
+    reply.tag = CEPH_MSGR_TAG_READY;
+    wait_for_seq = false;
+    out_seq = discard_requeued_up_to(out_seq, 0);
+    is_reset_from_peer = false;
+    in_seq = 0;
+  }
+
+  // fill in the rest of the reply (its tag, READY or SEQ, was chosen above)
+  reply.features = connection->policy.features_supported;
+  reply.global_seq = messenger->get_global_seq();
+  reply.connect_seq = connect_seq;
+  reply.flags = 0;
+  reply.authorizer_len = authorizer_reply.length();
+  if (connection->policy.lossy) {
+    reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  // the connection runs with the features both sides have in common
+  connection->set_features((uint64_t)reply.features &
+                           (uint64_t)connect_msg.features);
+  ldout(cct, 10) << __func__ << " accept features "
+                 << connection->get_features()
+		 << " authorizer_protocol "
+		 << connect_msg.authorizer_protocol << dendl;
+
+  session_security.reset(
+    get_auth_session_handler(cct, auth_meta->auth_method,
+			     auth_meta->session_key,
+			     connection->get_features()));
+
+  // wire layout: reply struct, then the authorizer payload (if any), then
+  // our in_seq when the SEQ tag is used
+  ceph::buffer::list reply_bl;
+  reply_bl.append((char *)&reply, sizeof(reply));
+
+  if (reply.authorizer_len) {
+    reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
+  }
+
+  if (reply.tag == CEPH_MSGR_TAG_SEQ) {
+    uint64_t s = in_seq;
+    reply_bl.append((char *)&s, sizeof(s));
+  }
+
+  connection->lock.unlock();
+  // Because "replacing" prevents other connections from preempting this
+  // address, it is safe not to hold the Connection's lock here
+  ssize_t r = messenger->accept_conn(connection);
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  replacing = false;
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+                  << connection->peer_addrs->legacy_addr()
+                  << " just fail later one(this)" << dendl;
+    ldout(cct, 10) << "accept fault after register" << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+  // the lock was dropped above, so the state may have changed meanwhile
+  if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept_conn, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED || state == NONE);
+    ldout(cct, 10) << "accept fault after register" << dendl;
+    // undo the registration done by accept_conn() above
+    messenger->unregister_conn(connection);
+    connection->inject_delay();
+    return _fault();
+  }
+
+  return WRITE(reply_bl, handle_ready_connect_message_reply_write);
+}
+
+// Write-completion continuation for the reply sent by open(): announce the
+// accepted connection, move to ACCEPTING_HANDLED_CONNECT_MSG, then either
+// wait for the peer's sequence number or finish with server_ready().
+CtPtr ProtocolV1::handle_ready_connect_message_reply_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool write_failed = r < 0;
+  if (write_failed) {
+    ldout(cct, 1) << __func__ << " write ready connect message reply failed"
+                  << dendl;
+    return _fault();
+  }
+
+  // notify
+  connection->dispatch_queue->queue_accept(connection);
+  messenger->ms_deliver_handle_fast_accept(connection);
+  once_ready = true;
+
+  state = ACCEPTING_HANDLED_CONNECT_MSG;
+
+  return wait_for_seq ? wait_seq() : server_ready();
+}
+
+// Arm a read for one 64-bit sequence value; the bytes are delivered to
+// handle_seq().
+CtPtr ProtocolV1::wait_seq() {
+  constexpr int seq_len = sizeof(uint64_t);
+
+  ldout(cct, 20) << __func__ << dendl;
+  return READ(seq_len, handle_seq);
+}
+
+/**
+ * Continuation for wait_seq(): take the sequence number the peer has
+ * acknowledged, discard requeued messages up to it, then finish the accept.
+ *
+ * @param buffer  bytes delivered by the read (sizeof(uint64_t) were requested)
+ * @param r       read result; a negative value means the read failed
+ * @return server_ready() on success, the result of _fault() otherwise
+ */
+CtPtr ProtocolV1::handle_seq(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read ack seq failed" << dendl;
+    return _fault();
+  }
+
+  // buffer is only typed as char*, so nothing guarantees it is suitably
+  // aligned for uint64_t; copy the bytes out instead of dereferencing a
+  // casted pointer (which is also a strict-aliasing violation).
+  uint64_t newly_acked_seq = 0;
+  memcpy(&newly_acked_seq, buffer, sizeof(newly_acked_seq));
+  ldout(cct, 2) << __func__ << " accept get newly_acked_seq " << newly_acked_seq
+                << dendl;
+  out_seq = discard_requeued_up_to(out_seq, newly_acked_seq);
+
+  return server_ready();
+}
+
+// Last step of the accept path: zero the stored connect message, check that
+// any delay state is ready, and continue with ready().
+CtPtr ProtocolV1::server_ready() {
+  ldout(cct, 20) << __func__ << " session_security is "
+                 << session_security
+                 << dendl;
+  ldout(cct, 20) << __func__ << " accept done" << dendl;
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&connect_msg, 0, sizeof(connect_msg));
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+  return ready();
+}
diff --git a/src/msg/async/ProtocolV1.h b/src/msg/async/ProtocolV1.h
new file mode 100644
index 000000000..b23860e8a
--- /dev/null
+++ b/src/msg/async/ProtocolV1.h
@@ -0,0 +1,303 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_V1_
+#define _MSG_ASYNC_PROTOCOL_V1_
+
+#include "Protocol.h"
+
+class ProtocolV1;
+using CtPtr = Ct<ProtocolV1>*;
+
+/**
+ * Version 1 of the messenger wire protocol, driven by an AsyncConnection.
+ *
+ * The handshake and the message exchange are written as a chain of
+ * continuations (the *_CONTINUATION_DECL members below); the diagram that
+ * follows shows how the individual steps lead into each other.
+ */
+class ProtocolV1 : public Protocol {
+/*
+ *  ProtocolV1 State Machine
+ *
+
+    send_server_banner                             send_client_banner
+            |                                              |
+            v                                              v
+    wait_client_banner                              wait_server_banner
+            |                                              |
+            |                                              v
+            v                                 handle_server_banner_and_identify
+    wait_connect_message <---------\                       |
+      |     |                      |                       v
+      |  wait_connect_message_auth |           send_connect_message <----------\
+      |     |                      |                       |                   |
+      v     v                      |                       |                   |
+handle_connect_message_2           |                       v                   |
+        |           |              |            wait_connect_reply             |
+        v           v              |              |        |                   |
+     replace -> send_connect_message_reply        |        V                   |
+        |                                         |   wait_connect_reply_auth  |
+        |                                         |        |                   |
+        v                                         v        v                   |
+      open ---\                                 handle_connect_reply_2 --------/
+        |     |                                            |
+        |     v                                            v
+        |   wait_seq                                  wait_ack_seq
+        |     |                                            |
+        v     v                                            v
+    server_ready                                      client_ready
+            |                                              |
+            \------------------> wait_message <------------/
+                                 |  ^   |  ^
+        /------------------------/  |   |  |
+        |                           |   |  \----------------- ------------\
+        v                /----------/   v                                 |
+handle_keepalive2        |        handle_message_header      read_message_footer
+handle_keepalive2_ack    |              |                                 ^
+handle_tag_ack           |              v                                 |
+        |                |        throttle_message             read_message_data
+        \----------------/              |                                 ^
+                                        v                                 |
+                             read_message_front --> read_message_middle --/
+*/
+
+protected:
+
+  // Connection states.  The order of the enumerators must match the name
+  // table in get_state_name().
+  enum State {
+    NONE = 0,
+    START_CONNECT,
+    CONNECTING,
+    CONNECTING_WAIT_BANNER_AND_IDENTIFY,
+    CONNECTING_SEND_CONNECT_MSG,
+    START_ACCEPT,
+    ACCEPTING,
+    ACCEPTING_WAIT_CONNECT_MSG_AUTH,
+    ACCEPTING_HANDLED_CONNECT_MSG,
+    OPENED,
+    THROTTLE_MESSAGE,
+    THROTTLE_BYTES,
+    THROTTLE_DISPATCH_QUEUE,
+    READ_MESSAGE_FRONT,
+    READ_FOOTER_AND_DISPATCH,
+    CLOSED,
+    WAIT,
+    STANDBY
+  };
+
+  // Returns the printable name of a State value.  The table lists the names
+  // in exactly the order of the State enumerators; a value outside the table
+  // yields "UNKNOWN" instead of reading past the end of the array.
+  static const char *get_state_name(int state) {
+    static const char *const statenames[] = {"NONE",
+                                             "START_CONNECT",
+                                             "CONNECTING",
+                                             "CONNECTING_WAIT_BANNER_AND_IDENTIFY",
+                                             "CONNECTING_SEND_CONNECT_MSG",
+                                             "START_ACCEPT",
+                                             "ACCEPTING",
+                                             "ACCEPTING_WAIT_CONNECT_MSG_AUTH",
+                                             "ACCEPTING_HANDLED_CONNECT_MSG",
+                                             "OPENED",
+                                             "THROTTLE_MESSAGE",
+                                             "THROTTLE_BYTES",
+                                             "THROTTLE_DISPATCH_QUEUE",
+                                             "READ_MESSAGE_FRONT",
+                                             "READ_FOOTER_AND_DISPATCH",
+                                             "CLOSED",
+                                             "WAIT",
+                                             "STANDBY"};
+    constexpr int num_states =
+        static_cast<int>(sizeof(statenames) / sizeof(statenames[0]));
+    static_assert(num_states == STANDBY + 1,
+                  "statenames must have one entry per State enumerator");
+    if (state < 0 || state >= num_states) {
+      return "UNKNOWN";
+    }
+    return statenames[state];
+  }
+
+  char *temp_buffer;
+
+  enum class WriteStatus { NOWRITE, REPLACING, CANWRITE, CLOSED };
+  std::atomic<WriteStatus> can_write;
+  std::list<Message *> sent;  // the first ceph::buffer::list need to inject seq
+  // priority queue for outbound msgs
+  std::map<int, std::list<std::pair<ceph::buffer::list, Message *>>> out_q;
+  bool keepalive;
+  bool write_in_progress = false;
+
+  __u32 connect_seq, peer_global_seq;
+  std::atomic<uint64_t> in_seq{0};
+  std::atomic<uint64_t> out_seq{0};
+  std::atomic<uint64_t> ack_left{0};
+
+  std::shared_ptr<AuthSessionHandler> session_security;
+
+  // Open state
+  ceph_msg_connect connect_msg;
+  ceph_msg_connect_reply connect_reply;
+  ceph::buffer::list authorizer_buf;  // auth(orizer) payload read off the wire
+  ceph::buffer::list authorizer_more;  // connect-side auth retry (we added challenge)
+
+  utime_t backoff;  // backoff time
+  utime_t recv_stamp;
+  utime_t throttle_stamp;
+  unsigned msg_left;
+  uint64_t cur_msg_size;
+  ceph_msg_header current_header;
+  ceph::buffer::list data_buf;
+  ceph::buffer::list::iterator data_blp;
+  ceph::buffer::list front, middle, data;
+
+  bool replacing;  // when replacing process happened, we will reply connect
+                   // side with RETRY tag and accept side will clear replaced
+                   // connection. So when connect side reissue connect_msg,
+                   // there won't exists conflicting connection so we use
+                   // "replacing" to skip RESETSESSION to avoid detect wrong
+                   // presentation
+  bool is_reset_from_peer;
+  bool once_ready;
+
+  State state;
+
+  void run_continuation(CtPtr pcontinuation);
+  CtPtr read(CONTINUATION_RX_TYPE<ProtocolV1> &next, int len,
+             char *buffer = nullptr);
+  CtPtr write(CONTINUATION_TX_TYPE<ProtocolV1> &next,ceph::buffer::list &bl);
+  // Calls fault() and returns nullptr so that handlers can write
+  // `return _fault();` to stop the continuation chain.
+  inline CtPtr _fault() {  // helper fault method that stops continuation
+    fault();
+    return nullptr;
+  }
+
+  // Continuations shared by the client and the server side.
+  CONTINUATION_DECL(ProtocolV1, wait_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2_ack);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_tag_ack);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_header);
+  CONTINUATION_DECL(ProtocolV1, throttle_message);
+  CONTINUATION_DECL(ProtocolV1, throttle_bytes);
+  CONTINUATION_DECL(ProtocolV1, throttle_dispatch_queue);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_front);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_middle);
+  CONTINUATION_DECL(ProtocolV1, read_message_data);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_data);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_footer);
+
+  CtPtr ready();
+  CtPtr wait_message();
+  CtPtr handle_message(char *buffer, int r);
+
+  CtPtr handle_keepalive2(char *buffer, int r);
+  void append_keepalive_or_ack(bool ack = false, utime_t *t = nullptr);
+  CtPtr handle_keepalive2_ack(char *buffer, int r);
+  CtPtr handle_tag_ack(char *buffer, int r);
+
+  CtPtr handle_message_header(char *buffer, int r);
+  CtPtr throttle_message();
+  CtPtr throttle_bytes();
+  CtPtr throttle_dispatch_queue();
+  CtPtr read_message_front();
+  CtPtr handle_message_front(char *buffer, int r);
+  CtPtr read_message_middle();
+  CtPtr handle_message_middle(char *buffer, int r);
+  CtPtr read_message_data_prepare();
+  CtPtr read_message_data();
+  CtPtr handle_message_data(char *buffer, int r);
+  CtPtr read_message_footer();
+  CtPtr handle_message_footer(char *buffer, int r);
+
+  void session_reset();
+  void randomize_out_seq();
+
+  Message *_get_next_outgoing(ceph::buffer::list *bl);
+
+  void prepare_send_message(uint64_t features, Message *m, ceph::buffer::list &bl);
+  ssize_t write_message(Message *m, ceph::buffer::list &bl, bool more);
+
+  void requeue_sent();
+  uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq);
+  void discard_out_queue();
+
+  void reset_recv_state();
+  void reset_security();
+
+  std::ostream& _conn_prefix(std::ostream *_dout);
+
+public:
+  ProtocolV1(AsyncConnection *connection);
+  virtual ~ProtocolV1();
+
+  virtual void connect() override;
+  virtual void accept() override;
+  virtual bool is_connected() override;
+  virtual void stop() override;
+  virtual void fault() override;
+  virtual void send_message(Message *m) override;
+  virtual void send_keepalive() override;
+
+  virtual void read_event() override;
+  virtual void write_event() override;
+  virtual bool is_queued() override;
+
+  // Client Protocol
+private:
+  int global_seq;
+
+  CONTINUATION_DECL(ProtocolV1, send_client_banner);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_and_identify);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_my_addr_write);
+  CONTINUATION_DECL(ProtocolV1, send_connect_message);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_1);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_auth);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_ack_seq);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_in_seq_write);
+
+  CtPtr send_client_banner();
+  CtPtr handle_client_banner_write(int r);
+  CtPtr wait_server_banner();
+  CtPtr handle_server_banner_and_identify(char *buffer, int r);
+  CtPtr handle_my_addr_write(int r);
+  CtPtr send_connect_message();
+  CtPtr handle_connect_message_write(int r);
+  CtPtr wait_connect_reply();
+  CtPtr handle_connect_reply_1(char *buffer, int r);
+  CtPtr wait_connect_reply_auth();
+  CtPtr handle_connect_reply_auth(char *buffer, int r);
+  CtPtr handle_connect_reply_2();
+  CtPtr wait_ack_seq();
+  CtPtr handle_ack_seq(char *buffer, int r);
+  CtPtr handle_in_seq_write(int r);
+  CtPtr client_ready();
+
+  // Server Protocol
+protected:
+  // When set, the accept path reads a sequence number (wait_seq) before
+  // server_ready().
+  bool wait_for_seq;
+
+  CONTINUATION_DECL(ProtocolV1, send_server_banner);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner);
+  CONTINUATION_DECL(ProtocolV1, wait_connect_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_1);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_auth);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1,
+                                  handle_connect_message_reply_write);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1,
+                                  handle_ready_connect_message_reply_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_seq);
+
+  CtPtr send_server_banner();
+  CtPtr handle_server_banner_write(int r);
+  CtPtr wait_client_banner();
+  CtPtr handle_client_banner(char *buffer, int r);
+  CtPtr wait_connect_message();
+  CtPtr handle_connect_message_1(char *buffer, int r);
+  CtPtr wait_connect_message_auth();
+  CtPtr handle_connect_message_auth(char *buffer, int r);
+  CtPtr handle_connect_message_2();
+  CtPtr send_connect_message_reply(char tag, ceph_msg_connect_reply &reply,
+                                   ceph::buffer::list &authorizer_reply);
+  CtPtr handle_connect_message_reply_write(int r);
+  CtPtr replace(const AsyncConnectionRef& existing, ceph_msg_connect_reply &reply,
+                ceph::buffer::list &authorizer_reply);
+  CtPtr open(ceph_msg_connect_reply &reply, ceph::buffer::list &authorizer_reply);
+  CtPtr handle_ready_connect_message_reply_write(int r);
+  CtPtr wait_seq();
+  CtPtr handle_seq(char *buffer, int r);
+  CtPtr server_ready();
+};
+
+// ProtocolV1 variant whose write status is CANWRITE as soon as it has been
+// constructed.
+class LoopbackProtocolV1 : public ProtocolV1 {
+public:
+  LoopbackProtocolV1(AsyncConnection *connection) : ProtocolV1(connection) {
+    can_write.store(WriteStatus::CANWRITE);
+  }
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_V1_ */
diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc
new file mode 100644
index 000000000..e51d84ec2
--- /dev/null
+++ b/src/msg/async/ProtocolV2.cc
@@ -0,0 +1,2899 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <type_traits>
+
+#include "ProtocolV2.h"
+#include "AsyncMessenger.h"
+
+#include "common/EventTrace.h"
+#include "common/ceph_crypto.h"
+#include "common/errno.h"
+#include "include/random.h"
+#include "auth/AuthClient.h"
+#include "auth/AuthServer.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+// Writes the per-connection log prefix used through dout_prefix in this file:
+// local and peer addresses, connection and protocol pointers, connection
+// mode, port, state name, peer global seq, connect seq, lossy flag, msgr2
+// revision-1 support and the rx/tx stream handler pointers.
+std::ostream &ProtocolV2::_conn_prefix(std::ostream *_dout) {
+  std::ostream &out = *_dout;
+
+  out << "--2- " << messenger->get_myaddrs() << " >> "
+      << *connection->peer_addrs;
+  out << " conn(" << connection << " " << this;
+  out << " " << ceph_con_mode_name(auth_meta->con_mode);
+  out << " :" << connection->port;
+  out << " s=" << get_state_name(state);
+  out << " pgs=" << peer_global_seq;
+  out << " cs=" << connect_seq;
+  out << " l=" << connection->policy.lossy;
+  out << " rev1="
+      << HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+  out << " rx=" << session_stream_handlers.rx.get();
+  out << " tx=" << session_stream_handlers.tx.get();
+  out << ").";
+
+  return out;
+}
+
+using namespace ceph::msgr::v2;
+
+using CtPtr = Ct<ProtocolV2> *;
+using CtRef = Ct<ProtocolV2> &;
+
+// Pointer overload: a null continuation means there is nothing to run.
+void ProtocolV2::run_continuation(CtPtr pcontinuation) {
+  if (!pcontinuation) {
+    return;
+  }
+  run_continuation(*pcontinuation);
+}
+
+// Runs the continuation chain starting at `continuation` and turns the
+// exceptions it may raise into log messages.  Buffer decoding errors and
+// message authentication errors additionally fault the connection.
+void ProtocolV2::run_continuation(CtRef continuation) {
+  try {
+    CONTINUATION_RUN(continuation)
+  } catch (const ceph::buffer::error &e) {
+    lderr(cct) << __func__ << " failed decoding of frame header: " << e.what()
+               << dendl;
+    _fault();
+  } catch (const ceph::crypto::onwire::MsgAuthError &e) {
+    lderr(cct) << __func__ << " " << e.what() << dendl;
+    _fault();
+  } catch (const DecryptionError &) {
+    // NOTE(review): unlike the two handlers above this one only logs and does
+    // not call _fault() -- confirm that leaving the connection as-is after a
+    // decryption failure is intended.
+    lderr(cct) << __func__ << " failed to decrypt frame payload" << dendl;
+  }
+}
+
+// WRITE(B, D, C) forwards to write(D, CONTINUATION(C), B); C names the
+// continuation member.  Presumably B is the buffer and D a description --
+// confirm against the declaration of write().
+#define WRITE(B, D, C) write(D, CONTINUATION(C), B)
+
+// READ(L, C) creates a fresh buffer of length L, wraps it in a ptr_node and
+// passes it to read() together with continuation C.
+#define READ(L, C) read(CONTINUATION(C), ceph::buffer::ptr_node::create(ceph::buffer::create(L)))
+
+// READ_RXBUF(B, C) passes the caller-supplied buffer B to read() together
+// with continuation C.
+#define READ_RXBUF(B, C) read(CONTINUATION(C), B)
+
+#ifdef UNIT_TESTS_BUILT
+
+// INTERCEPT(S): when the connection has an interceptor, ask it what to do at
+// step S.  FAIL makes the enclosing function return _fault(); STOP stops the
+// protocol, queues a reset on the dispatch queue and returns nullptr.  Any
+// other action falls through.  Only usable inside functions returning CtPtr.
+#define INTERCEPT(S) { \
+if(connection->interceptor) { \
+  auto a = connection->interceptor->intercept(connection, (S)); \
+  if (a == Interceptor::ACTION::FAIL) { \
+    return _fault(); \
+  } else if (a == Interceptor::ACTION::STOP) { \
+    stop(); \
+    connection->dispatch_queue->queue_reset(connection); \
+    return nullptr; \
+  }}}
+  
+#else
+// Without UNIT_TESTS_BUILT the interception points expand to nothing.
+#define INTERCEPT(S)
+#endif
+
+// Constructs a protocol object for `connection` in state NONE with all
+// cookies, sequence numbers and flags cleared.  The base class is passed 2 as
+// its first argument; both frame assemblers are bound to
+// session_stream_handlers.
+ProtocolV2::ProtocolV2(AsyncConnection *connection)
+    : Protocol(2, connection),
+      state(NONE),
+      peer_supported_features(0),
+      client_cookie(0),
+      server_cookie(0),
+      global_seq(0),
+      connect_seq(0),
+      peer_global_seq(0),
+      message_seq(0),
+      reconnecting(false),
+      replacing(false),
+      can_write(false),
+      bannerExchangeCallback(nullptr),
+      tx_frame_asm(&session_stream_handlers, false),
+      rx_frame_asm(&session_stream_handlers, false),
+      next_tag(static_cast<Tag>(0)),
+      keepalive(false) {
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+ProtocolV2::~ProtocolV2() = default;
+
+// Marks the protocol as about to start an outgoing connection and enables
+// pre_auth.
+void ProtocolV2::connect() {
+  // Log first: the log prefix prints `state`, so this line shows the old one.
+  ldout(cct, 1) << __func__ << dendl;
+  state = START_CONNECT;
+  pre_auth.enabled = true;
+}
+
+// Marks the protocol as about to handle an incoming connection.
+void ProtocolV2::accept() {
+  // Log first: the log prefix prints `state`, so this line shows the old one.
+  ldout(cct, 1) << __func__ << dendl;
+  state = START_ACCEPT;
+}
+
+// The connection counts as connected exactly while can_write is set.
+bool ProtocolV2::is_connected() { return can_write; }
+
+/*
+ * Drops every message that is waiting on this connection: the references
+ * held by the sent list and by all priority levels of out_queue are released
+ * and both containers are emptied.  The caller must hold write_lock.
+ */
+void ProtocolV2::discard_out_queue() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  // Messages already written but still kept in the sent list.
+  for (const auto& sent_msg : sent) {
+    ldout(cct, 20) << __func__ << " discard " << sent_msg << dendl;
+    sent_msg->put();
+  }
+  sent.clear();
+
+  // Messages still queued, whatever their priority.
+  for (auto& per_priority : out_queue) {
+    for (auto& queued : per_priority.second) {
+      ldout(cct, 20) << __func__ << " discard " << *queued.m << dendl;
+      queued.m->put();
+    }
+  }
+  out_queue.clear();
+
+  write_in_progress = false;
+}
+
+// Throws away all per-session state: pending delayed and queued messages,
+// outgoing data, sequence numbers and cookies.  A remote-reset event is
+// queued on the dispatch queue.  Takes write_lock for the whole operation.
+void ProtocolV2::reset_session() {
+  ldout(cct, 1) << __func__ << dendl;
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (connection->delay_state) {
+    connection->delay_state->discard();
+  }
+
+  // Drop messages queued for dispatch and everything waiting to be sent.
+  connection->dispatch_queue->discard_queue(connection->conn_id);
+  discard_out_queue();
+  connection->outgoing_bl.clear();
+
+  connection->dispatch_queue->queue_remote_reset(connection);
+
+  // Start over with a clean slate of counters and cookies.
+  out_seq = 0;
+  in_seq = 0;
+  client_cookie = 0;
+  server_cookie = 0;
+  connect_seq = 0;
+  peer_global_seq = 0;
+  message_seq = 0;
+  ack_left = 0;
+  can_write = false;
+}
+
+// Shuts the protocol down and moves it to CLOSED.  Calling it again once the
+// state is CLOSED is a no-op.
+void ProtocolV2::stop() {
+  ldout(cct, 1) << __func__ << dendl;
+  if (state == CLOSED) {
+    return;
+  }
+
+  // The delay_state flush happens before write_lock is taken.
+  if (connection->delay_state) connection->delay_state->flush();
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  // Both helpers below expect write_lock to be held.
+  reset_recv_state();
+  discard_out_queue();
+
+  connection->_stop();
+
+  can_write = false;
+  state = CLOSED;
+}
+
+// Public entry point for fault handling; forwards to _fault() and drops its
+// return value.
+void ProtocolV2::fault() { _fault(); }
+
+// Moves every message from the sent list back to the front of the
+// highest-priority out queue, keeping the original order, and rewinds
+// out_seq by the number of messages moved.  Also clears write_in_progress.
+void ProtocolV2::requeue_sent() {
+  write_in_progress = false;
+
+  const bool nothing_sent = sent.empty();
+  if (nothing_sent) {
+    return;
+  }
+
+  auto& requeue_list = out_queue[CEPH_MSG_PRIO_HIGHEST];
+  out_seq -= sent.size();
+
+  // Walk the sent list back to front and push to the front of the queue, so
+  // the oldest message ends up first again.
+  while (!sent.empty()) {
+    Message *msg = sent.back();
+    sent.pop_back();
+    ldout(cct, 5) << __func__ << " requeueing message m=" << msg
+                  << " seq=" << msg->get_seq() << " type=" << msg->get_type() << " "
+                  << *msg << dendl;
+    // The encoded payload is dropped; the entry is queued as not prepared.
+    msg->clear_payload();
+    requeue_list.emplace_front(out_queue_entry_t{false, msg});
+  }
+}
+
+// Drops requeued messages from the front of the highest-priority out queue
+// whose sequence number is non-zero and not greater than `seq`.
+//
+// Returns `seq` when there is no highest-priority queue at all, otherwise
+// `out_seq` plus the number of messages dropped.  Takes write_lock.
+uint64_t ProtocolV2::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
+  ldout(cct, 10) << __func__ << " " << seq << dendl;
+  std::lock_guard<std::mutex> wl(connection->write_lock);
+
+  const auto highest = out_queue.find(CEPH_MSG_PRIO_HIGHEST);
+  if (highest == out_queue.end()) {
+    return seq;
+  }
+
+  auto& requeued = highest->second;
+  uint64_t next_seq = out_seq;
+  while (!requeued.empty()) {
+    Message* const msg = requeued.front().m;
+    // Stop at the first message that has no seq yet or lies beyond seq.
+    if (msg->get_seq() == 0 || msg->get_seq() > seq) {
+      break;
+    }
+    ldout(cct, 5) << __func__ << " discarding message m=" << msg
+                  << " seq=" << msg->get_seq() << " ack_seq=" << seq << " "
+                  << *msg << dendl;
+    msg->put();
+    requeued.pop_front();
+    ++next_seq;
+  }
+
+  // Do not leave an empty priority level behind.
+  if (requeued.empty()) {
+    out_queue.erase(highest);
+  }
+  return next_seq;
+}
+
+// Replaces auth_meta with a fresh AuthConnectionMeta, drops both stream
+// handlers and clears the pre-auth rx/tx buffers.
+void ProtocolV2::reset_security() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  auth_meta.reset(new AuthConnectionMeta);
+  session_stream_handlers.rx.reset(nullptr);
+  session_stream_handlers.tx.reset(nullptr);
+  pre_auth.rxbuf.clear();
+  pre_auth.txbuf.clear();
+}
+
+// Resets the receive side of the protocol: security state, pending read and
+// write callbacks, the next expected tag and any throttle that is held.
+// It's expected the `write_lock` is held while calling this method.
+void ProtocolV2::reset_recv_state() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  if (!connection->center->in_thread()) {
+    // Execute in the same thread that uses the rx/tx handlers. The reset is
+    // handed over to that thread because holding `write_lock` is not enough:
+    // `write_event()` unlocks it just before calling `write_message()`.
+    // `submit_to()` here is NOT blocking.
+    connection->center->submit_to(connection->center->get_id(), [this] {
+      ldout(cct, 5) << "reset_recv_state (warped) reseting crypto handlers"
+                    << dendl;
+      // Possibly unnecessary. See the comment in `deactivate_existing`.
+      std::lock_guard<std::mutex> l(connection->lock);
+      std::lock_guard<std::mutex> wl(connection->write_lock);
+      reset_security();
+    }, /* always_async = */true);
+  } else {
+    // Already on the connection's event center thread: reset directly.
+    reset_security();
+  }
+
+  // clean read and write callbacks
+  connection->pendingReadLen.reset();
+  connection->writeCallback.reset();
+
+  next_tag = static_cast<Tag>(0);
+
+  reset_throttle();
+}
+
+// Returns the summed logical length of all segments of the frame currently
+// held by rx_frame_asm, leaving out the first one.  Asserts that the frame
+// has at least one segment.
+size_t ProtocolV2::get_current_msg_size() const {
+  ceph_assert(rx_frame_asm.get_num_segments() > 0);
+
+  // Index 0 (SegmentIndex::Msg::HEADER) is deliberately not counted.
+  size_t total = 0;
+  size_t segment = 1;
+  while (segment < rx_frame_asm.get_num_segments()) {
+    total += rx_frame_asm.get_segment_logical_len(segment);
+    segment++;
+  }
+  return total;
+}
+
+// Gives back whatever throttle budget the message currently being received
+// holds.  Which throttlers are released depends on how far `state` has
+// advanced through the THROTTLE_* states (up to and including THROTTLE_DONE).
+void ProtocolV2::reset_throttle() {
+  // Past THROTTLE_MESSAGE: one message slot is returned to the policy
+  // message throttler, if there is one.
+  if (state > THROTTLE_MESSAGE && state <= THROTTLE_DONE &&
+      connection->policy.throttler_messages) {
+    ldout(cct, 10) << __func__ << " releasing " << 1
+                   << " message to policy throttler "
+                   << connection->policy.throttler_messages->get_current()
+                   << "/" << connection->policy.throttler_messages->get_max()
+                   << dendl;
+    connection->policy.throttler_messages->put();
+  }
+  // Past THROTTLE_BYTES: the message size is returned to the policy byte
+  // throttler, if there is one.
+  if (state > THROTTLE_BYTES && state <= THROTTLE_DONE) {
+    if (connection->policy.throttler_bytes) {
+      const size_t cur_msg_size = get_current_msg_size();
+      ldout(cct, 10) << __func__ << " releasing " << cur_msg_size
+                     << " bytes to policy throttler "
+                     << connection->policy.throttler_bytes->get_current() << "/"
+                     << connection->policy.throttler_bytes->get_max() << dendl;
+      connection->policy.throttler_bytes->put(cur_msg_size);
+    }
+  }
+  // Past THROTTLE_DISPATCH_QUEUE: the message size is returned to the
+  // dispatch queue throttler.
+  if (state > THROTTLE_DISPATCH_QUEUE && state <= THROTTLE_DONE) {
+    const size_t cur_msg_size = get_current_msg_size();
+    ldout(cct, 10)
+        << __func__ << " releasing " << cur_msg_size
+        << " bytes to dispatch_queue throttler "
+        << connection->dispatch_queue->dispatch_throttler.get_current() << "/"
+        << connection->dispatch_queue->dispatch_throttler.get_max() << dendl;
+    connection->dispatch_queue->dispatch_throttle_release(cur_msg_size);
+  }
+}
+
+// Central failure handler.  Depending on the connection policy and the
+// current state it closes the connection, parks it in STANDBY, or schedules
+// a reconnect (immediately or after a backoff).  Always returns nullptr so
+// that callers can `return _fault();` to end the continuation chain.
+CtPtr ProtocolV2::_fault() {
+  ldout(cct, 10) << __func__ << dendl;
+
+  if (state == CLOSED || state == NONE) {
+    ldout(cct, 10) << __func__ << " connection is already closed" << dendl;
+    return nullptr;
+  }
+
+  // Lossy connections are closed and a reset is queued, unless the state is
+  // still in the START_CONNECT..SESSION_RECONNECTING range.
+  if (connection->policy.lossy &&
+      !(state >= START_CONNECT && state <= SESSION_RECONNECTING)) {
+    ldout(cct, 2) << __func__ << " on lossy channel, failing" << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  // write_lock is locked manually; every return path below that still holds
+  // it unlocks it first.
+  connection->write_lock.lock();
+
+  can_write = false;
+  // requeue sent items
+  requeue_sent();
+
+  // Fault in the START_ACCEPT..SESSION_ACCEPTING range with nothing queued
+  // and no replacement in progress: close the connection.
+  if (out_queue.empty() && state >= START_ACCEPT &&
+      state <= SESSION_ACCEPTING && !replacing) {
+    ldout(cct, 2) << __func__ << " with nothing to send and in the half "
+                   << " accept state just closed" << dendl;
+    // stop() takes write_lock itself, so release it first.
+    connection->write_lock.unlock();
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  replacing = false;
+  connection->fault();
+  reset_recv_state();
+
+  reconnecting = false;
+
+  if (connection->policy.standby && out_queue.empty() && !keepalive &&
+      state != WAIT) {
+    ldout(cct, 1) << __func__ << " with nothing to send, going to standby"
+                  << dendl;
+    state = STANDBY;
+    connection->write_lock.unlock();
+    return nullptr;
+  }
+  if (connection->policy.server) {
+    ldout(cct, 1) << __func__ << " server, going to standby, even though i have stuff queued" << dendl;
+    state = STANDBY;
+    connection->write_lock.unlock();
+    return nullptr;
+  }
+
+  connection->write_lock.unlock();
+
+  // From here on a reconnect is scheduled.
+  if (!(state >= START_CONNECT && state <= SESSION_RECONNECTING) &&
+      state != WAIT &&
+      state != SESSION_ACCEPTING /* due to connection race */) {
+    // policy maybe empty when state is in accept
+    if (connection->policy.server) {
+      ldout(cct, 1) << __func__ << " server, going to standby" << dendl;
+      state = STANDBY;
+    } else {
+      ldout(cct, 1) << __func__ << " initiating reconnect" << dendl;
+      connect_seq++;
+      global_seq = messenger->get_global_seq();
+      state = START_CONNECT;
+      pre_auth.enabled = true;
+      connection->state = AsyncConnection::STATE_CONNECTING;
+    }
+    // Reset the backoff and dispatch the read handler right away.
+    backoff = utime_t();
+    connection->center->dispatch_event_external(connection->read_handler);
+  } else {
+    // The fault hit while connecting or in WAIT: retry after a backoff that
+    // starts at ms_initial_backoff, doubles on each further fault and is
+    // capped at ms_max_backoff.  In WAIT the maximum is used directly.
+    if (state == WAIT) {
+      backoff.set_from_double(cct->_conf->ms_max_backoff);
+    } else if (backoff == utime_t()) {
+      backoff.set_from_double(cct->_conf->ms_initial_backoff);
+    } else {
+      backoff += backoff;
+      if (backoff > cct->_conf->ms_max_backoff)
+        backoff.set_from_double(cct->_conf->ms_max_backoff);
+    }
+
+    // connect_seq is bumped only when a server cookie is set.
+    if (server_cookie) {
+      connect_seq++;
+    }
+
+    global_seq = messenger->get_global_seq();
+    state = START_CONNECT;
+    pre_auth.enabled = true;
+    connection->state = AsyncConnection::STATE_CONNECTING;
+    ldout(cct, 1) << __func__ << " waiting " << backoff << dendl;
+    // woke up again;
+    // (create_time_event is given backoff converted from ns to us)
+    connection->register_time_events.insert(
+        connection->center->create_time_event(backoff.to_nsec() / 1000,
+                                              connection->wakeup_handler));
+  }
+  return nullptr;
+}
+
+// Encodes message `m` for a peer with the given feature bits so that it is
+// ready to be framed.
+void ProtocolV2::prepare_send_message(uint64_t features,
+				      Message *m) {
+  ldout(cct, 20) << __func__ << " m=" << *m << dendl;
+
+  // Log whether this is a first encoding (payload still empty) or a
+  // re-encoding of a message that already carries a payload.
+  ldout(cct, 20) << __func__ << (m->empty_payload() ? " encoding features " : " half-reencoding features ")
+		 << features << " " << m  << " " << *m << dendl;
+
+  // encode and copy out of *m
+  m->encode(features, 0);
+}
+
+// Queues message `m` for sending.  Messages the messenger can fast-dispatch
+// are encoded up front, outside write_lock; everything else is encoded later
+// by the writer.  If the state is CLOSED the message is dropped and its
+// reference released.
+void ProtocolV2::send_message(Message *m) {
+  uint64_t f = connection->get_features();
+
+  // TODO: Currently not all messages supports reencode like MOSDMap, so here
+  // only let fast dispatch support messages prepare message
+  const bool can_fast_prepare = messenger->ms_can_fast_dispatch(m);
+  if (can_fast_prepare) {
+    prepare_send_message(f, m);
+  }
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  bool is_prepared = can_fast_prepare;
+  // "features" changes will change the payload encoding
+  // The early encoding is also thrown away when the connection cannot write.
+  if (can_fast_prepare && (!can_write || connection->get_features() != f)) {
+    // ensure the correctness of message encoding
+    m->clear_payload();
+    is_prepared = false;
+    ldout(cct, 10) << __func__ << " clear encoded buffer previous " << f
+                   << " != " << connection->get_features() << dendl;
+  }
+  if (state == CLOSED) {
+    ldout(cct, 10) << __func__ << " connection closed."
+                   << " Drop message " << m << dendl;
+    m->put();
+  } else {
+    ldout(cct, 5) << __func__ << " enqueueing message m=" << m
+                  << " type=" << m->get_type() << " " << *m << dendl;
+    m->queue_start = ceph::mono_clock::now();
+    m->trace.event("async enqueueing message");
+    out_queue[m->get_priority()].emplace_back(
+      out_queue_entry_t{is_prepared, m});
+    // NOTE(review): this line is logged for every queued message, not only
+    // when an inline write was actually refused.
+    ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m
+                   << dendl;
+    // Wake the writer unless a write is already in flight; write_in_progress
+    // keeps this from dispatching the write handler repeatedly.
+    if (((!replacing && can_write) || state == STANDBY) && !write_in_progress) {
+      write_in_progress = true;
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+}
+
+// Requests a keepalive: sets the keepalive flag under write_lock and
+// dispatches the write handler.  Does nothing once the state is CLOSED.
+void ProtocolV2::send_keepalive() {
+  ldout(cct, 10) << __func__ << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (state != CLOSED) {
+    keepalive = true;
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+}
+
+// Read-event entry point: starts or resumes the continuation that belongs to
+// the current state.  States not listed here are ignored.
+void ProtocolV2::read_event() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  switch (state) {
+    case START_CONNECT:
+      run_continuation(CONTINUATION(start_client_banner_exchange));
+      break;
+    case START_ACCEPT:
+      run_continuation(CONTINUATION(start_server_banner_exchange));
+      break;
+    case READY:
+      run_continuation(CONTINUATION(read_frame));
+      break;
+    // The three throttle states re-enter the matching throttle step.
+    case THROTTLE_MESSAGE:
+      run_continuation(CONTINUATION(throttle_message));
+      break;
+    case THROTTLE_BYTES:
+      run_continuation(CONTINUATION(throttle_bytes));
+      break;
+    case THROTTLE_DISPATCH_QUEUE:
+      run_continuation(CONTINUATION(throttle_dispatch_queue));
+      break;
+    default:
+      break;
+  }
+}
+
+// Pops the next entry to send: the front of the list stored under the
+// greatest key of out_queue.  A priority level that becomes empty is removed.
+// Returns a default-constructed entry when nothing is queued.
+ProtocolV2::out_queue_entry_t ProtocolV2::_get_next_outgoing() {
+  out_queue_entry_t next;
+  if (out_queue.empty()) {
+    return next;
+  }
+
+  // rbegin() is the entry with the greatest key.
+  auto highest = out_queue.rbegin();
+  auto& pending = highest->second;
+  ceph_assert(!pending.empty());
+
+  next = pending.front();
+  pending.pop_front();
+  if (pending.empty()) {
+    out_queue.erase(highest->first);
+  }
+  return next;
+}
+
+// Assigns the next outgoing sequence number to `m`, wraps it in a message
+// frame, appends the frame to the connection's outgoing buffer and tries to
+// send it.  Must run on the connection's event center thread.
+//
+// `more` is passed through to _try_send().  Returns -EILSEQ when the frame
+// cannot be assembled, otherwise the result of _try_send().  The reference
+// on `m` is released in every case.
+ssize_t ProtocolV2::write_message(Message *m, bool more) {
+  FUNCTRACE(cct);
+  ceph_assert(connection->center->in_thread());
+  m->set_seq(++out_seq);
+
+  // Snapshot the incoming sequence number to send as the ack and clear
+  // ack_left, both under the connection lock.
+  connection->lock.lock();
+  uint64_t ack_seq = in_seq;
+  ack_left = 0;
+  connection->lock.unlock();
+
+  ceph_msg_header &header = m->get_header();
+  ceph_msg_footer &footer = m->get_footer();
+
+  // Build the header from the message's header and footer; ack_seq is filled
+  // in from the snapshot above.
+  ceph_msg_header2 header2{header.seq,        header.tid,
+                           header.type,       header.priority,
+                           header.version,
+                           init_le32(0),      header.data_off,
+                           init_le64(ack_seq),
+                           footer.flags,      header.compat_version,
+                           header.reserved};
+
+  auto message = MessageFrame::Encode(
+			     header2,
+			     m->get_payload(),
+			     m->get_middle(),
+			     m->get_data());
+  if (!append_frame(message)) {
+    m->put();
+    return -EILSEQ;
+  }
+
+  ldout(cct, 5) << __func__ << " sending message m=" << m
+                << " seq=" << m->get_seq() << " " << *m << dendl;
+
+  m->trace.event("async writing message");
+  ldout(cct, 20) << __func__ << " sending m=" << m << " seq=" << m->get_seq()
+                 << " src=" << entity_name_t(messenger->get_myname())
+                 << " off=" << header2.data_off
+                 << dendl;
+  // Remember the buffer length before sending so the number of bytes that
+  // actually left can be accounted for below.
+  ssize_t total_send_size = connection->outgoing_bl.length();
+  ssize_t rc = connection->_try_send(more);
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " error sending " << m << ", "
+                  << cpp_strerror(rc) << dendl;
+  } else {
+    connection->logger->inc(
+        l_msgr_send_bytes, total_send_size - connection->outgoing_bl.length());
+    ldout(cct, 10) << __func__ << " sending " << m
+                   << (rc ? " continuely." : " done.") << dendl;
+  }
+
+#if defined(WITH_EVENTTRACE)
+  if (m->get_type() == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false);
+  else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false);
+#endif
+  m->put();
+
+  return rc;
+}
+
+// Assembles `frame` with tx_frame_asm and appends the resulting bytes to the
+// connection's outgoing buffer.  Returns false (and appends nothing) when the
+// assembly raises TxHandlerError, true otherwise.
+template <class F>
+bool ProtocolV2::append_frame(F& frame) {
+  ceph::bufferlist bl;
+  try {
+    bl = frame.get_buffer(tx_frame_asm);
+  } catch (ceph::crypto::onwire::TxHandlerError &e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return false;
+  }
+
+  ldout(cct, 25) << __func__ << " assembled frame " << bl.length()
+                 << " bytes " << tx_frame_asm << dendl;
+  connection->outgoing_bl.append(bl);
+  return true;
+}
+
+// Releases messages on the `sent` list that the peer has acknowledged.
+//
+// Does nothing for lossy connections. Otherwise, under write_lock, pops up to
+// max_pending messages whose sequence number is <= `seq` from the front of
+// `sent`, and drops their references once the lock has been released.
+void ProtocolV2::handle_message_ack(uint64_t seq) {
+  // lossy connections don't keep sent messages
+  if (connection->policy.lossy) {
+    return;
+  }
+
+  ldout(cct, 15) << __func__ << " seq=" << seq << dendl;
+
+  // trim sent list
+  static const int max_pending = 128;
+  Message *acked[max_pending];
+  int n_acked = 0;
+  const auto started = ceph::mono_clock::now();
+  {
+    std::lock_guard<std::mutex> wl(connection->write_lock);
+    while (!sent.empty() && n_acked < max_pending) {
+      Message *head = sent.front();
+      if (head->get_seq() > seq) {
+        break;
+      }
+      sent.pop_front();
+      acked[n_acked++] = head;
+      ldout(cct, 10) << __func__ << " got ack seq " << seq
+                     << " >= " << head->get_seq() << " on " << head << " "
+                     << *head << dendl;
+    }
+  }
+  connection->logger->tinc(l_msgr_handle_ack_lat,
+                           ceph::mono_clock::now() - started);
+
+  // The references are dropped only after write_lock has been released.
+  for (int idx = 0; idx < n_acked; ++idx) {
+    acked[idx]->put();
+  }
+}
+
+// Write-side event handler: pushes queued outgoing data to the socket.
+//
+// When can_write is set, this optionally appends a keepalive frame, then
+// drains messages from _get_next_outgoing() through write_message() until no
+// message is left, an error occurs, or write_message() reports that data is
+// still pending (r > 0). Afterwards it sends a pending ack frame or flushes
+// whatever is still queued. When can_write is not set, it either starts a
+// reconnect from STANDBY or tries to flush the connection's outgoing buffer.
+// Every failure path ends with fault() being called under connection->lock.
+void ProtocolV2::write_event() {
+  ldout(cct, 10) << __func__ << dendl;
+  ssize_t r = 0;
+
+  connection->write_lock.lock();
+  if (can_write) {
+    if (keepalive) {
+      ldout(cct, 10) << __func__ << " appending keepalive" << dendl;
+      auto keepalive_frame = KeepAliveFrame::Encode();
+      if (!append_frame(keepalive_frame)) {
+        // Frame assembly failed: swap write_lock for connection->lock
+        // before faulting.
+        connection->write_lock.unlock();
+        connection->lock.lock();
+        fault();
+        connection->lock.unlock();
+        return;
+      }
+      keepalive = false;
+    }
+
+    auto start = ceph::mono_clock::now();
+    bool more;
+    do {
+      const auto out_entry = _get_next_outgoing();
+      if (!out_entry.m) {
+        break;
+      }
+
+      if (!connection->policy.lossy) {
+        // Keep the message on the sent list (with its own reference) until
+        // it is acknowledged.
+        sent.push_back(out_entry.m);
+        out_entry.m->get();
+      }
+      more = !out_queue.empty();
+      // write_lock is released while the message is prepared and written.
+      connection->write_lock.unlock();
+
+      // send_message or requeue messages may not encode message
+      if (!out_entry.is_prepared) {
+        prepare_send_message(connection->get_features(), out_entry.m);
+      }
+
+      // Only account queue latency when a queue_start stamp was recorded.
+      if (out_entry.m->queue_start != ceph::mono_time()) {
+        connection->logger->tinc(l_msgr_send_messages_queue_lat,
+				 ceph::mono_clock::now() -
+				 out_entry.m->queue_start);
+      }
+
+      r = write_message(out_entry.m, more);
+
+      connection->write_lock.lock();
+      if (r == 0) {
+        ;
+      } else if (r < 0) {
+        ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+        break;
+      } else if (r > 0) {
+        // Outbound message in progress; this handler will be woken up again
+        // when the outbound socket becomes writeable.
+        break;
+      }
+    } while (can_write);
+    write_in_progress = false;
+
+    // r > 0 means data is still pending, so there is no need to call
+    // _try_send again here.
+    if (r == 0) {
+      uint64_t left = ack_left;
+      if (left) {
+        ldout(cct, 10) << __func__ << " try send msg ack, acked " << left
+                       << " messages" << dendl;
+        auto ack_frame = AckFrame::Encode(in_seq);
+        if (append_frame(ack_frame)) {
+          // Subtract only what was observed above; ack_left may have grown
+          // in the meantime, in which case `left` is non-zero and is passed
+          // on to _try_send.
+          ack_left -= left;
+          left = ack_left;
+          r = connection->_try_send(left);
+        } else {
+          r = -EILSEQ;
+        }
+      } else if (is_queued()) {
+        r = connection->_try_send();
+      }
+    }
+    connection->write_lock.unlock();
+
+    connection->logger->tinc(l_msgr_running_send_time,
+                             ceph::mono_clock::now() - start);
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+      connection->lock.lock();
+      fault();
+      connection->lock.unlock();
+      return;
+    }
+  } else {
+    write_in_progress = false;
+    // Re-acquire in the order connection->lock, then write_lock.
+    connection->write_lock.unlock();
+    connection->lock.lock();
+    connection->write_lock.lock();
+    if (state == STANDBY && !connection->policy.server && is_queued()) {
+      ldout(cct, 10) << __func__ << " policy.server is false" << dendl;
+      if (server_cookie) {  // only increment connect_seq if there is a session
+        connect_seq++;
+      }
+      connection->_connect();
+    } else if (connection->cs && state != NONE && state != CLOSED &&
+               state != START_CONNECT) {
+      r = connection->_try_send();
+      if (r < 0) {
+        ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl;
+        // fault() is called with connection->lock still held but without
+        // write_lock.
+        connection->write_lock.unlock();
+        fault();
+        connection->lock.unlock();
+        return;
+      }
+    }
+    connection->write_lock.unlock();
+    connection->lock.unlock();
+  }
+}
+
+// Reports whether anything is still waiting to go out: either messages in
+// out_queue or data the connection itself reports as queued.
+bool ProtocolV2::is_queued() {
+  if (!out_queue.empty()) {
+    return true;
+  }
+  return connection->is_queued();
+}
+
+// Starts a read of buffer->length() bytes into `buffer`; ownership of the
+// buffer is handed to the continuation (next.node).
+//
+// Returns &next when connection->read() returned r <= 0 (error or done
+// synchronously; next.r holds that result), or nullptr otherwise, in which
+// case `next` is run from the completion callback. While pre_auth recording
+// is enabled, successfully read bytes are also appended to pre_auth.rxbuf.
+CtPtr ProtocolV2::read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next,
+                       rx_buffer_t &&buffer) {
+  // Capture size and raw pointer before the buffer is moved away.
+  const auto want = buffer->length();
+  const auto dst = buffer->c_str();
+  next.node = std::move(buffer);
+
+  const ssize_t rc = connection->read(want, dst,
+    [&next, this](char * /*data*/, int result) {
+      if (unlikely(pre_auth.enabled) && result >= 0) {
+        pre_auth.rxbuf.append(*next.node);
+        ceph_assert(!cct->_conf->ms_die_on_bug ||
+                    pre_auth.rxbuf.length() < 20000000);
+      }
+      next.r = result;
+      run_continuation(next);
+    });
+
+  if (rc > 0) {
+    return nullptr;
+  }
+
+  // error or done synchronously
+  if (unlikely(pre_auth.enabled) && rc == 0) {
+    pre_auth.rxbuf.append(*next.node);
+    ceph_assert(!cct->_conf->ms_die_on_bug ||
+                pre_auth.rxbuf.length() < 20000000);
+  }
+  next.r = rc;
+  return &next;
+}
+
+// Assembles `frame` with the tx frame assembler and writes the resulting
+// bytes through the bufferlist overload of write(), continuing with `next`.
+// If frame assembly throws a TxHandlerError, the error is logged and the
+// result of _fault() is returned instead.
+template <class F>
+CtPtr ProtocolV2::write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        F &frame) {
+  ceph::bufferlist assembled;
+  try {
+    assembled = frame.get_buffer(tx_frame_asm);
+  } catch (const ceph::crypto::onwire::TxHandlerError &err) {
+    ldout(cct, 1) << __func__ << " " << err.what() << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 25) << __func__ << " assembled frame " << assembled.length()
+                 << " bytes " << tx_frame_asm << dendl;
+  return write(desc, next, assembled);
+}
+
+// Writes `buffer` to the connection and continues with `next`.
+//
+// Returns &next when connection->write() returned 0, nullptr when it
+// returned a positive value (the callback runs `next` later), and the result
+// of _fault() on a negative return. `desc` only labels log messages. While
+// pre_auth recording is enabled, the bytes are also appended to
+// pre_auth.txbuf.
+CtPtr ProtocolV2::write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        ceph::bufferlist &buffer) {
+  if (unlikely(pre_auth.enabled)) {
+    pre_auth.txbuf.append(buffer);
+    ceph_assert(!cct->_conf->ms_die_on_bug ||
+                pre_auth.txbuf.length() < 20000000);
+  }
+
+  const ssize_t rc =
+      connection->write(buffer, [&next, desc, this](int result) {
+        if (result < 0) {
+          ldout(cct, 1) << __func__ << " " << desc
+                        << " write failed r=" << result << " ("
+                        << cpp_strerror(result) << ")" << dendl;
+          connection->inject_delay();
+          _fault();
+        }
+        run_continuation(next);
+      });
+
+  if (rc == 0) {
+    next.setParams();
+    return &next;
+  }
+  if (rc > 0) {
+    return nullptr;
+  }
+
+  ldout(cct, 1) << __func__ << " " << desc << " write failed r=" << rc
+                << " (" << cpp_strerror(rc) << ")" << dendl;
+  return _fault();
+}
+
+// Sends our banner: the v2 banner prefix, a 16-bit payload length, and a
+// payload holding the supported and required msgr2 feature masks.
+// `callback` is remembered in bannerExchangeCallback; the next step is
+// _wait_for_peer_banner.
+CtPtr ProtocolV2::_banner_exchange(CtRef callback) {
+  ldout(cct, 20) << __func__ << dendl;
+  bannerExchangeCallback = &callback;
+
+  using ceph::encode;
+
+  // Payload: supported features followed by required features.
+  ceph::bufferlist features;
+  encode(static_cast<uint64_t>(CEPH_MSGR2_SUPPORTED_FEATURES), features, 0);
+  encode(static_cast<uint64_t>(CEPH_MSGR2_REQUIRED_FEATURES), features, 0);
+
+  // Banner: prefix, payload length, payload.
+  ceph::bufferlist banner;
+  banner.append(CEPH_BANNER_V2_PREFIX, strlen(CEPH_BANNER_V2_PREFIX));
+  encode(static_cast<uint16_t>(features.length()), banner, 0);
+  banner.claim_append(features);
+
+  INTERCEPT(state == BANNER_CONNECTING ? 3 : 4);
+
+  return WRITE(banner, "banner", _wait_for_peer_banner);
+}
+
+// Reads the fixed-size head of the peer's banner: the v2 banner prefix
+// followed by a 16-bit payload length.
+CtPtr ProtocolV2::_wait_for_peer_banner() {
+  const unsigned head_len = strlen(CEPH_BANNER_V2_PREFIX) + sizeof(ceph_le16);
+  return READ(head_len, _handle_peer_banner);
+}
+
+// Validates the head of the peer's banner and schedules a read of its
+// payload.
+//
+// Faults when the read failed, when the prefix is not the v2 banner prefix
+// (a V1 banner gets a dedicated error message), or when the payload length
+// cannot be decoded.
+CtPtr ProtocolV2::_handle_peer_banner(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner failed r=" << r << " ("
+                  << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  const unsigned prefix_len = strlen(CEPH_BANNER_V2_PREFIX);
+
+  const bool is_v2 =
+      memcmp(buffer->c_str(), CEPH_BANNER_V2_PREFIX, prefix_len) == 0;
+  if (!is_v2) {
+    const bool is_v1 =
+        memcmp(buffer->c_str(), CEPH_BANNER, strlen(CEPH_BANNER)) == 0;
+    if (is_v1) {
+      lderr(cct) << __func__ << " peer " << *connection->peer_addrs
+                 << " is using msgr V1 protocol" << dendl;
+    } else {
+      ldout(cct, 1) << __func__ << " accept peer sent bad banner" << dendl;
+    }
+    return _fault();
+  }
+
+  // Narrow the buffer to the 16-bit length that follows the prefix.
+  buffer->set_offset(prefix_len);
+  buffer->set_length(sizeof(ceph_le16));
+
+  ceph::bufferlist len_bl;
+  len_bl.push_back(std::move(buffer));
+
+  uint16_t payload_len;
+  using ceph::decode;
+  auto it = len_bl.cbegin();
+  try {
+    decode(payload_len, it);
+  } catch (const ceph::buffer::error &e) {
+    lderr(cct) << __func__ << " decode banner payload len failed " << dendl;
+    return _fault();
+  }
+
+  INTERCEPT(state == BANNER_CONNECTING ? 5 : 6);
+
+  return READ(payload_len, _handle_peer_banner_payload);
+}
+
+// Decodes the peer's banner payload (supported and required feature masks),
+// checks them against our own masks, configures the frame assemblers for
+// msgr2.1 when the peer supports it, advances to the matching HELLO state
+// and sends our hello frame.
+//
+// Faults on read or decode errors. On a feature mismatch the connection is
+// stopped, a reset is queued and nullptr is returned.
+CtPtr ProtocolV2::_handle_peer_banner_payload(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner payload failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  uint64_t peer_supported;
+  uint64_t peer_required;
+
+  using ceph::decode;
+  ceph::bufferlist payload;
+  payload.push_back(std::move(buffer));
+  auto it = payload.cbegin();
+  try {
+    decode(peer_supported, it);
+    decode(peer_required, it);
+  } catch (const ceph::buffer::error &e) {
+    lderr(cct) << __func__ << " decode banner payload failed " << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 1) << __func__ << " supported=" << std::hex
+                << peer_supported << " required=" << std::hex
+                << peer_required << std::dec << dendl;
+
+  // Check feature bit compatibility
+
+  const uint64_t our_supported = CEPH_MSGR2_SUPPORTED_FEATURES;
+  const uint64_t our_required = CEPH_MSGR2_REQUIRED_FEATURES;
+
+  // Any required bit of ours that the peer lacks is a mismatch.
+  if ((our_required & ~peer_supported) != 0) {
+    ldout(cct, 1) << __func__ << " peer does not support all required features"
+                  << " required=" << std::hex << our_required
+                  << " supported=" << std::hex << peer_supported
+                  << std::dec << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+  // Likewise for any bit the peer requires that we lack.
+  if ((peer_required & ~our_supported) != 0) {
+    ldout(cct, 1) << __func__ << " we do not support all peer required features"
+                  << " required=" << std::hex << peer_required
+                  << " supported=" << our_supported << std::dec << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  this->peer_supported_features = peer_supported;
+  if (peer_required == 0) {
+    this->connection_features = msgr2_required;
+  }
+
+  // if the peer supports msgr2.1, switch to it
+  const bool rev1 = HAVE_MSGR2_FEATURE(peer_supported, REVISION_1);
+  tx_frame_asm.set_is_rev1(rev1);
+  rx_frame_asm.set_is_rev1(rev1);
+
+  const bool connecting = (state == BANNER_CONNECTING);
+  if (!connecting) {
+    ceph_assert(state == BANNER_ACCEPTING);
+  }
+  state = connecting ? HELLO_CONNECTING : HELLO_ACCEPTING;
+
+  auto hello = HelloFrame::Encode(messenger->get_mytype(),
+                                  connection->target_addr);
+
+  INTERCEPT(state == HELLO_CONNECTING ? 7 : 8);
+
+  return WRITE(hello, "hello frame", read_frame);
+}
+
+// Processes the peer's HELLO frame.
+//
+// Checks that we are in a hello-exchange state, records (accepting side) or
+// verifies (connecting side) the peer's entity type and, when the messenger
+// has no usable address yet, learns our own address either from what the
+// peer reports or from the local socket name.
+//
+// Returns the continuation saved in bannerExchangeCallback on success, the
+// result of _fault() when the state is wrong, or nullptr when the connection
+// was stopped or its state changed while connection->lock was released.
+CtPtr ProtocolV2::handle_hello(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != HELLO_CONNECTING && state != HELLO_ACCEPTING) {
+    lderr(cct) << __func__ << " not in hello exchange state!" << dendl;
+    return _fault();
+  }
+
+  auto hello = HelloFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__ << " received hello:"
+                << " peer_type=" << (int)hello.entity_type()
+                << " peer_addr_for_me=" << hello.peer_addr() << dendl;
+
+  // Zero-initialize so that a failed getsockname() leaves a well-defined,
+  // zeroed address instead of indeterminate stack contents that would then
+  // be logged and possibly learned as our address.
+  sockaddr_storage ss{};
+  socklen_t len = sizeof(ss);
+  if (getsockname(connection->cs.fd(), (sockaddr *)&ss, &len) < 0) {
+    const int err = errno;  // save before logging can clobber it
+    ldout(cct, 1) << __func__ << " getsockname failed: " << cpp_strerror(err)
+                  << dendl;
+  }
+  ldout(cct, 5) << __func__ << " getsockname says I am " << (sockaddr *)&ss
+                << " when talking to " << connection->target_addr << dendl;
+
+  if (connection->get_peer_type() == -1) {
+    // Peer type not known yet: adopt what the peer advertises and pick up
+    // the matching policy.
+    connection->set_peer_type(hello.entity_type());
+
+    ceph_assert(state == HELLO_ACCEPTING);
+    connection->policy = messenger->get_policy(hello.entity_type());
+    ldout(cct, 10) << __func__ << " accept of host_type "
+                   << (int)hello.entity_type()
+                   << ", policy.lossy=" << connection->policy.lossy
+                   << " policy.server=" << connection->policy.server
+                   << " policy.standby=" << connection->policy.standby
+                   << " policy.resetcheck=" << connection->policy.resetcheck
+                   << dendl;
+  } else {
+    ceph_assert(state == HELLO_CONNECTING);
+    if (connection->get_peer_type() != hello.entity_type()) {
+      ldout(cct, 1) << __func__ << " connection peer type does not match what"
+                    << " peer advertises " << connection->get_peer_type()
+                    << " != " << (int)hello.entity_type() << dendl;
+      stop();
+      connection->dispatch_queue->queue_reset(connection);
+      return nullptr;
+    }
+  }
+
+  if (messenger->get_myaddrs().empty() ||
+      messenger->get_myaddrs().front().is_blank_ip()) {
+    entity_addr_t a;
+    if (cct->_conf->ms_learn_addr_from_peer) {
+      ldout(cct, 1) << __func__ << " peer " << connection->target_addr
+                    << " says I am " << hello.peer_addr() << " (socket says "
+                    << (sockaddr*)&ss << ")" << dendl;
+      a = hello.peer_addr();
+    } else {
+      ldout(cct, 1) << __func__ << " socket to  " << connection->target_addr
+                    << " says I am " << (sockaddr*)&ss
+                    << " (peer says " << hello.peer_addr() << ")" << dendl;
+      a.set_sockaddr((sockaddr *)&ss);
+    }
+    a.set_type(entity_addr_t::TYPE_MSGR2); // anything but NONE; learned_addr ignores this
+    a.set_port(0);
+    // connection->lock is released around learned_addr() and the optional
+    // injected delay, so the state has to be re-checked afterwards.
+    connection->lock.unlock();
+    messenger->learned_addr(a);
+    if (cct->_conf->ms_inject_internal_delays &&
+        cct->_conf->ms_inject_socket_failures) {
+      if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+        ldout(cct, 10) << __func__ << " sleep for "
+                       << cct->_conf->ms_inject_internal_delays << dendl;
+        utime_t t;
+        t.set_from_double(cct->_conf->ms_inject_internal_delays);
+        t.sleep();
+      }
+    }
+    connection->lock.lock();
+    // NOTE(review): an accepting connection (HELLO_ACCEPTING) that reaches
+    // this block also bails out here -- confirm that this is intended.
+    if (state != HELLO_CONNECTING) {
+      ldout(cct, 1) << __func__
+                    << " state changed while learned_addr, mark_down or "
+                    << " replacing must be happened just now" << dendl;
+      return nullptr;
+    }
+  }
+
+  // Hand control back to whoever started the banner exchange.
+  CtPtr callback;
+  callback = bannerExchangeCallback;
+  bannerExchangeCallback = nullptr;
+  ceph_assert(callback);
+  return callback;
+}
+
+// Begins reading the next frame: clears the per-frame receive buffers and
+// schedules a read of the frame preamble. Returns nullptr without reading
+// when the connection is CLOSED.
+CtPtr ProtocolV2::read_frame() {
+  if (state == CLOSED) {
+    return nullptr;
+  }
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  // Start from a clean slate for this frame.
+  rx_segments_data.clear();
+  rx_epilogue.clear();
+  rx_preamble.clear();
+
+  return READ(rx_frame_asm.get_preamble_onwire_len(),
+              handle_read_frame_preamble_main);
+}
+
+// Completion handler for the preamble read started by read_frame().
+//
+// Stores the preamble, lets the rx frame assembler disassemble it to obtain
+// the frame tag, and then either enters the throttling sequence (MESSAGE
+// frames, which are only accepted in READY state) or proceeds to read the
+// frame's segments. Faults on read errors, on FrameError / MsgAuthError from
+// the assembler, and on MESSAGE frames received outside READY.
+CtPtr ProtocolV2::handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read frame preamble failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  rx_preamble.push_back(std::move(buffer));
+
+  // The hexdump is streamed into the log entry opened by ldout above.
+  ldout(cct, 30) << __func__ << " preamble\n";
+  rx_preamble.hexdump(*_dout);
+  *_dout << dendl;
+
+  try {
+    next_tag = rx_frame_asm.disassemble_preamble(rx_preamble);
+  } catch (const FrameError& e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  } catch (const ceph::crypto::onwire::MsgAuthError&) {
+    // Keep a space after the function name so the log line stays readable.
+    ldout(cct, 1) << __func__ << " bad auth tag" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 25) << __func__ << " disassembled preamble " << rx_frame_asm
+                 << dendl;
+
+  if (session_stream_handlers.rx) {
+    ldout(cct, 30) << __func__ << " preamble after decrypt\n";
+    rx_preamble.hexdump(*_dout);
+    *_dout << dendl;
+  }
+
+  // does it need throttle?
+  if (next_tag == Tag::MESSAGE) {
+    if (state != READY) {
+      lderr(cct) << __func__ << " not in ready state!" << dendl;
+      return _fault();
+    }
+    recv_stamp = ceph_clock_now();
+    state = THROTTLE_MESSAGE;
+    return CONTINUE(throttle_message);
+  } else {
+    return read_frame_segment();
+  }
+}
+
+// Routes a fully read frame according to next_tag: MESSAGE frames go to
+// handle_message(), every other known tag goes to handle_frame_payload(),
+// and an unknown tag is logged and faults the connection.
+CtPtr ProtocolV2::handle_read_frame_dispatch() {
+  const auto tag_value = static_cast<uint32_t>(next_tag);
+  ldout(cct, 10) << __func__ << " tag=" << tag_value << dendl;
+
+  if (next_tag == Tag::MESSAGE) {
+    return handle_message();
+  }
+
+  switch (next_tag) {
+    case Tag::HELLO:
+    case Tag::AUTH_REQUEST:
+    case Tag::AUTH_BAD_METHOD:
+    case Tag::AUTH_REPLY_MORE:
+    case Tag::AUTH_REQUEST_MORE:
+    case Tag::AUTH_DONE:
+    case Tag::AUTH_SIGNATURE:
+    case Tag::CLIENT_IDENT:
+    case Tag::SERVER_IDENT:
+    case Tag::IDENT_MISSING_FEATURES:
+    case Tag::SESSION_RECONNECT:
+    case Tag::SESSION_RESET:
+    case Tag::SESSION_RETRY:
+    case Tag::SESSION_RETRY_GLOBAL:
+    case Tag::SESSION_RECONNECT_OK:
+    case Tag::KEEPALIVE2:
+    case Tag::KEEPALIVE2_ACK:
+    case Tag::ACK:
+    case Tag::WAIT:
+      return handle_frame_payload();
+    default:
+      break;
+  }
+
+  lderr(cct) << __func__
+             << " received unknown tag=" << static_cast<uint32_t>(next_tag)
+             << dendl;
+  return _fault();
+}
+
+// Reads the next segment of the current frame.
+//
+// Appends an empty slot to rx_segments_data for the segment. A segment with
+// an on-wire length of zero is treated as already read; otherwise an aligned
+// buffer is allocated and a read into it is scheduled. Faults if the aligned
+// allocation fails.
+CtPtr ProtocolV2::read_frame_segment() {
+  const size_t idx = rx_segments_data.size();
+  ldout(cct, 20) << __func__ << " seg_idx=" << idx << dendl;
+  rx_segments_data.emplace_back();
+
+  const uint32_t seg_len = rx_frame_asm.get_segment_onwire_len(idx);
+  if (seg_len == 0) {
+    return _handle_read_frame_segment();
+  }
+
+  const uint16_t seg_align = rx_frame_asm.get_segment_align(idx);
+  rx_buffer_t seg_buf;
+  try {
+    seg_buf = ceph::buffer::ptr_node::create(
+        ceph::buffer::create_aligned(seg_len, seg_align));
+  } catch (const ceph::buffer::bad_alloc&) {
+    // Catching because of potential issues with satisfying alignment.
+    ldout(cct, 1) << __func__ << " can't allocate aligned rx_buffer"
+                  << " len=" << seg_len
+                  << " align=" << seg_align
+                  << dendl;
+    return _fault();
+  }
+
+  return READ_RXBUF(std::move(seg_buf), handle_read_frame_segment);
+}
+
+// Completion handler for a segment read: stores the received buffer in the
+// most recently added segment slot and moves on. Faults on read errors.
+CtPtr ProtocolV2::handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read frame segment failed r=" << r << " ("
+                  << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  auto &current_segment = rx_segments_data.back();
+  current_segment.push_back(std::move(rx_buffer));
+  return _handle_read_frame_segment();
+}
+
+// Decides what follows a segment: read the next segment while some are still
+// missing, otherwise read the epilogue (or go straight to epilogue handling
+// when its on-wire length is zero).
+CtPtr ProtocolV2::_handle_read_frame_segment() {
+  if (rx_segments_data.size() != rx_frame_asm.get_num_segments()) {
+    // TODO: for makeshift only. This will be more generic and throttled
+    return read_frame_segment();
+  }
+
+  // OK, all segments planned to read are read. Can go with epilogue.
+  const uint32_t epilogue_len = rx_frame_asm.get_epilogue_onwire_len();
+  if (epilogue_len != 0) {
+    return READ(epilogue_len, handle_read_frame_epilogue_main);
+  }
+  return _handle_read_frame_epilogue_main();
+}
+
+// Dispatches the payload of a control frame (the last received segment) to
+// the handler that matches next_tag. Aborts on a tag without a handler.
+CtPtr ProtocolV2::handle_frame_payload() {
+  ceph_assert(!rx_segments_data.empty());
+  auto& segment = rx_segments_data.back();
+
+  // The hexdump is streamed into the log entry opened by ldout.
+  ldout(cct, 30) << __func__ << "\n";
+  segment.hexdump(*_dout);
+  *_dout << dendl;
+
+  switch (next_tag) {
+    case Tag::HELLO:                  return handle_hello(segment);
+    case Tag::AUTH_REQUEST:           return handle_auth_request(segment);
+    case Tag::AUTH_BAD_METHOD:        return handle_auth_bad_method(segment);
+    case Tag::AUTH_REPLY_MORE:        return handle_auth_reply_more(segment);
+    case Tag::AUTH_REQUEST_MORE:      return handle_auth_request_more(segment);
+    case Tag::AUTH_DONE:              return handle_auth_done(segment);
+    case Tag::AUTH_SIGNATURE:         return handle_auth_signature(segment);
+    case Tag::CLIENT_IDENT:           return handle_client_ident(segment);
+    case Tag::SERVER_IDENT:           return handle_server_ident(segment);
+    case Tag::IDENT_MISSING_FEATURES: return handle_ident_missing_features(segment);
+    case Tag::SESSION_RECONNECT:      return handle_reconnect(segment);
+    case Tag::SESSION_RESET:          return handle_session_reset(segment);
+    case Tag::SESSION_RETRY:          return handle_session_retry(segment);
+    case Tag::SESSION_RETRY_GLOBAL:   return handle_session_retry_global(segment);
+    case Tag::SESSION_RECONNECT_OK:   return handle_reconnect_ok(segment);
+    case Tag::KEEPALIVE2:             return handle_keepalive2(segment);
+    case Tag::KEEPALIVE2_ACK:         return handle_keepalive2_ack(segment);
+    case Tag::ACK:                    return handle_message_ack(segment);
+    case Tag::WAIT:                   return handle_wait(segment);
+    default:
+      ceph_abort();
+  }
+  return nullptr;
+}
+
+// Moves the connection into READY state.
+//
+// Clears the reconnecting/replacing flags, re-arms the inactivity tick
+// timer, enables writing (kicking the write handler when messages are
+// already queued), then resumes reading frames.
+CtPtr ProtocolV2::ready() {
+  ldout(cct, 25) << __func__ << dendl;
+
+  reconnecting = false;
+  replacing = false;
+
+  // make sure no pending tick timer
+  if (connection->last_tick_id) {
+    connection->center->delete_time_event(connection->last_tick_id);
+  }
+  const auto tick_id = connection->center->create_time_event(
+      connection->inactive_timeout_us, connection->tick_handler);
+  connection->last_tick_id = tick_id;
+
+  {
+    std::lock_guard<std::mutex> write_guard(connection->write_lock);
+    can_write = true;
+    const bool have_pending = !out_queue.empty();
+    if (have_pending) {
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+
+  connection->maybe_start_delay_thread();
+
+  state = READY;
+  ldout(cct, 1) << __func__ << " entity=" << peer_name << " client_cookie="
+                << std::hex << client_cookie << " server_cookie="
+                << server_cookie << std::dec << " in_seq=" << in_seq
+                << " out_seq=" << out_seq << dendl;
+
+  INTERCEPT(15);
+
+  return CONTINUE(read_frame);
+}
+
+// Completion handler for the epilogue read: stores the epilogue bytes and
+// continues with frame disassembly. Faults on read errors.
+CtPtr ProtocolV2::handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r)
+{
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool read_failed = (r < 0);
+  if (read_failed) {
+    ldout(cct, 1) << __func__ << " read frame epilogue failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  rx_epilogue.push_back(std::move(buffer));
+
+  return _handle_read_frame_epilogue_main();
+}
+
+// Finishes receiving a frame: lets the rx frame assembler disassemble the
+// first and the remaining segments (using the epilogue), then either
+// dispatches the frame or, when the sender aborted it, releases the throttle
+// and goes back to reading the next frame. Faults on FrameError or
+// MsgAuthError raised by the assembler.
+CtPtr ProtocolV2::_handle_read_frame_epilogue_main() {
+  bool aborted = false;
+  try {
+    rx_frame_asm.disassemble_first_segment(rx_preamble, rx_segments_data[0]);
+    aborted = !rx_frame_asm.disassemble_remaining_segments(
+        rx_segments_data.data(), rx_epilogue);
+  } catch (const FrameError& e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  } catch (const ceph::crypto::onwire::MsgAuthError&) {
+    // Keep a space after the function name so the log line stays readable.
+    ldout(cct, 1) << __func__ << " bad auth tag" << dendl;
+    return _fault();
+  }
+
+  // we do have a mechanism that allows transmitter to start sending message
+  // and abort after putting entire data field on wire. This will be used by
+  // the kernel client to avoid unnecessary buffering.
+  if (aborted) {
+    reset_throttle();
+    state = READY;
+    return CONTINUE(read_frame);
+  }
+  return handle_read_frame_dispatch();
+}
+
+// Handles a fully received MESSAGE frame.
+//
+// Decodes the frame into a Message, attaches throttlers and timestamps,
+// drops it when its sequence number is not newer than in_seq, and otherwise
+// hands it to the delay queue, fast dispatch or the dispatch queue. It then
+// processes the ack sequence carried in the header, schedules the write
+// handler when an ack has to be sent back, and resumes reading frames.
+//
+// Expects state == THROTTLE_DONE on entry. Returns the result of _fault()
+// when decoding fails, nullptr when an old message was discarded or the
+// state changed during fast dispatch, and the read_frame continuation
+// otherwise.
+CtPtr ProtocolV2::handle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+  ceph_assert(state == THROTTLE_DONE);
+
+  const size_t cur_msg_size = get_current_msg_size();
+  auto msg_frame = MessageFrame::Decode(rx_segments_data);
+
+  // XXX: paranoid copy just to avoid oops
+  ceph_msg_header2 current_header = msg_frame.header();
+
+  ldout(cct, 5) << __func__
+		<< " got " << msg_frame.front_len()
+		<< " + " << msg_frame.middle_len()
+		<< " + " << msg_frame.data_len()
+		<< " byte message."
+		<< " envelope type=" << current_header.type
+		<< " src " << peer_name
+		<< " off " << current_header.data_off
+                << dendl;
+
+  INTERCEPT(16);
+  // Build the header/footer pair that decode_message() takes from the
+  // ceph_msg_header2 fields and the frame's segment lengths.
+  ceph_msg_header header{current_header.seq,
+                         current_header.tid,
+                         current_header.type,
+                         current_header.priority,
+                         current_header.version,
+                         init_le32(msg_frame.front_len()),
+                         init_le32(msg_frame.middle_len()),
+                         init_le32(msg_frame.data_len()),
+                         current_header.data_off,
+                         peer_name,
+                         current_header.compat_version,
+                         current_header.reserved,
+                         init_le32(0)};
+  ceph_msg_footer footer{init_le32(0), init_le32(0),
+	                 init_le32(0), init_le64(0), current_header.flags};
+
+  Message *message = decode_message(cct, 0, header, footer,
+      msg_frame.front(),
+      msg_frame.middle(),
+      msg_frame.data(),
+      connection);
+  if (!message) {
+    ldout(cct, 1) << __func__ << " decode message failed " << dendl;
+    return _fault();
+  } else {
+    state = READ_MESSAGE_COMPLETE;
+  }
+
+  INTERCEPT(17);
+
+  message->set_byte_throttler(connection->policy.throttler_bytes);
+  message->set_message_throttler(connection->policy.throttler_messages);
+
+  // store reservation size in message, so we don't get confused
+  // by messages entering the dispatch queue through other paths.
+  message->set_dispatch_throttle_size(cur_msg_size);
+
+  message->set_recv_stamp(recv_stamp);
+  message->set_throttle_stamp(throttle_stamp);
+  message->set_recv_complete_stamp(ceph_clock_now());
+
+  // check received seq#.  if it is old, drop the message.
+  // note that incoming messages may skip ahead.  this is convenient for the
+  // client side queueing because messages can't be renumbered, but the (kernel)
+  // client will occasionally pull a message out of the sent queue to send
+  // elsewhere.  in that case it doesn't matter if we "got" it or not.
+  uint64_t cur_seq = in_seq;
+  if (message->get_seq() <= cur_seq) {
+    ldout(cct, 0) << __func__ << " got old message " << message->get_seq()
+                  << " <= " << cur_seq << " " << message << " " << *message
+                  << ", discarding" << dendl;
+    message->put();
+    if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
+        cct->_conf->ms_die_on_old_message) {
+      ceph_assert(0 == "old msgs despite reconnect_seq feature");
+    }
+    // NOTE(review): this path returns without continuing to read_frame and
+    // leaves state at READ_MESSAGE_COMPLETE -- confirm this is intended.
+    return nullptr;
+  }
+  if (message->get_seq() > cur_seq + 1) {
+    ldout(cct, 0) << __func__ << " missed message?  skipped from seq "
+                  << cur_seq << " to " << message->get_seq() << dendl;
+    if (cct->_conf->ms_die_on_skipped_message) {
+      ceph_assert(0 == "skipped incoming seq");
+    }
+  }
+
+#if defined(WITH_EVENTTRACE)
+  if (message->get_type() == CEPH_MSG_OSD_OP ||
+      message->get_type() == CEPH_MSG_OSD_OPREPLY) {
+    utime_t ltt_processed_stamp = ceph_clock_now();
+    double usecs_elapsed =
+        (ltt_processed_stamp.to_nsec() - recv_stamp.to_nsec()) / 1000;
+    ostringstream buf;
+    if (message->get_type() == CEPH_MSG_OSD_OP)
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP",
+                           false);
+    else
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY",
+                           false);
+  }
+#endif
+
+  // note last received message.
+  in_seq = message->get_seq();
+  ldout(cct, 5) << __func__ << " received message m=" << message
+                << " seq=" << message->get_seq()
+                << " from=" << message->get_source() << " type=" << header.type
+                << " " << *message << dendl;
+
+  // On non-lossy connections every received message is counted in ack_left,
+  // and the write handler is scheduled at the end of this function.
+  bool need_dispatch_writer = false;
+  if (!connection->policy.lossy) {
+    ack_left++;
+    need_dispatch_writer = true;
+  }
+
+  state = READY;
+
+  // Declared before the goto below so the jump does not cross its
+  // initialization.
+  ceph::mono_time fast_dispatch_time;
+
+  if (connection->is_blackhole()) {
+    // Blackholed connection: drop the message without dispatching it.
+    ldout(cct, 10) << __func__ << " blackhole " << *message << dendl;
+    message->put();
+    goto out;
+  }
+
+  connection->logger->inc(l_msgr_recv_messages);
+  connection->logger->inc(l_msgr_recv_bytes,
+                          rx_frame_asm.get_frame_onwire_len());
+
+  messenger->ms_fast_preprocess(message);
+  fast_dispatch_time = ceph::mono_clock::now();
+  connection->logger->tinc(l_msgr_running_recv_time,
+			   fast_dispatch_time - connection->recv_start_time);
+  if (connection->delay_state) {
+    // Delay injection is active: queue the message with a possibly random
+    // delay.
+    double delay_period = 0;
+    if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) {
+      delay_period =
+          cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
+      ldout(cct, 1) << "queue_received will delay after "
+                    << (ceph_clock_now() + delay_period) << " on " << message
+                    << " " << *message << dendl;
+    }
+    connection->delay_state->queue(delay_period, message);
+  } else if (messenger->ms_can_fast_dispatch(message)) {
+    // Fast dispatch runs with connection->lock released.
+    connection->lock.unlock();
+    connection->dispatch_queue->fast_dispatch(message);
+    connection->recv_start_time = ceph::mono_clock::now();
+    connection->logger->tinc(l_msgr_running_fast_dispatch_time,
+                             connection->recv_start_time - fast_dispatch_time);
+    connection->lock.lock();
+    // we might have been reused by another connection
+    // let's check if that is the case
+    if (state != READY) {
+      // yes, that was the case, let's do nothing
+      return nullptr;
+    }
+  } else {
+    connection->dispatch_queue->enqueue(message, message->get_priority(),
+                                        connection->conn_id);
+  }
+
+  // The message header also carries the peer's ack of messages we sent.
+  handle_message_ack(current_header.ack_seq);
+
+ out:
+  if (need_dispatch_writer && connection->is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(read_frame);
+}
+
+
+// First throttling step for an incoming message: takes one slot from the
+// policy's message throttler, if one is configured.
+//
+// When the throttler refuses, a 1000-unit wakeup time event is registered
+// (unless one is already pending) and nullptr is returned; otherwise the
+// state advances to THROTTLE_BYTES.
+CtPtr ProtocolV2::throttle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto &msg_throttle = connection->policy.throttler_messages;
+  if (msg_throttle) {
+    ldout(cct, 10) << __func__ << " wants " << 1
+                   << " message from policy throttler "
+                   << msg_throttle->get_current()
+                   << "/" << msg_throttle->get_max()
+                   << dendl;
+    const bool granted = msg_throttle->get_or_fail();
+    if (!granted) {
+      ldout(cct, 1) << __func__ << " wants 1 message from policy throttle "
+                    << msg_throttle->get_current()
+                    << "/" << msg_throttle->get_max()
+                    << " failed, just wait." << dendl;
+      // Draining a full message queue downstream is not quick, so it is
+      // fine to wait a bit before retrying.
+      if (connection->register_time_events.empty()) {
+        const auto wakeup_id = connection->center->create_time_event(
+            1000, connection->wakeup_handler);
+        connection->register_time_events.insert(wakeup_id);
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_BYTES;
+  return CONTINUE(throttle_bytes);
+}
+
+// Second throttling step for an incoming message: reserves the message's
+// size from the policy's byte throttler, if one is configured and the size
+// is non-zero.
+//
+// When the throttler refuses, a 1000-unit wakeup time event is registered
+// (unless one is already pending) and nullptr is returned; otherwise the
+// state advances to THROTTLE_DISPATCH_QUEUE.
+CtPtr ProtocolV2::throttle_bytes() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t need = get_current_msg_size();
+  auto &byte_throttle = connection->policy.throttler_bytes;
+  if (need != 0 && byte_throttle) {
+    ldout(cct, 10) << __func__ << " wants " << need
+                   << " bytes from policy throttler "
+                   << byte_throttle->get_current() << "/"
+                   << byte_throttle->get_max() << dendl;
+    const bool granted = byte_throttle->get_or_fail(need);
+    if (!granted) {
+      ldout(cct, 1) << __func__ << " wants " << need
+                    << " bytes from policy throttler "
+                    << byte_throttle->get_current()
+                    << "/" << byte_throttle->get_max()
+                    << " failed, just wait." << dendl;
+      // Draining a full message queue downstream is not quick, so it is
+      // fine to wait a bit before retrying.
+      if (connection->register_time_events.empty()) {
+        const auto wakeup_id = connection->center->create_time_event(
+            1000, connection->wakeup_handler);
+        connection->register_time_events.insert(wakeup_id);
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_DISPATCH_QUEUE;
+  return CONTINUE(throttle_dispatch_queue);
+}
+
+// Third throttle stage: reserve the current message's byte count from the
+// dispatch queue's throttler. Returns nullptr when the reservation fails;
+// otherwise stamps the time, marks THROTTLE_DONE and goes on to read the
+// frame segment.
+CtPtr ProtocolV2::throttle_dispatch_queue() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t msg_bytes = get_current_msg_size();
+  if (msg_bytes &&
+      !connection->dispatch_queue->dispatch_throttler.get_or_fail(msg_bytes)) {
+    auto &dq_throttler = connection->dispatch_queue->dispatch_throttler;
+    ldout(cct, 1) << __func__ << " wants " << msg_bytes
+                  << " bytes from dispatch throttle "
+                  << dq_throttler.get_current() << "/"
+                  << dq_throttler.get_max() << " failed, just wait." << dendl;
+    // Draining a full message queue takes a while, so schedule a wakeup
+    // time event -- but only when none is already registered.
+    auto &pending_timers = connection->register_time_events;
+    if (pending_timers.empty()) {
+      pending_timers.insert(connection->center->create_time_event(
+          1000, connection->wakeup_handler));
+    }
+    return nullptr;
+  }
+
+  throttle_stamp = ceph_clock_now();
+  state = THROTTLE_DONE;
+
+  return read_frame_segment();
+}
+
+// Handles an incoming KEEPALIVE2 frame.
+//
+// Faults unless the protocol is in READY state. Queues a KeepAliveAck frame
+// echoing the received timestamp, records the time of this keepalive on the
+// connection, kicks the write handler if we are connected, and resumes
+// reading frames.
+//
+// payload: the raw frame payload to decode.
+// Returns the continuation to run next (the result of _fault() on error).
+CtPtr ProtocolV2::handle_keepalive2(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != READY) {
+    lderr(cct) << __func__ << " not in ready state!" << dendl;
+    return _fault();
+  }
+
+  auto keepalive_frame = KeepAliveFrame::Decode(payload);
+
+  ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+
+  // Hold write_lock only while the ack is encoded and appended. The guard
+  // releases it on every exit path (including an exception from
+  // Encode/append_frame), and _fault() is still invoked without the write
+  // lock held, exactly as before.
+  bool ack_queued = false;
+  {
+    std::lock_guard<decltype(connection->write_lock)> wl(
+        connection->write_lock);
+    auto keepalive_ack_frame =
+        KeepAliveFrameAck::Encode(keepalive_frame.timestamp());
+    ack_queued = append_frame(keepalive_ack_frame);
+  }
+  if (!ack_queued) {
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got KEEPALIVE2 "
+                 << keepalive_frame.timestamp() << dendl;
+  connection->set_last_keepalive(ceph_clock_now());
+
+  if (is_connected()) {
+    // let the write handler run so the queued ack gets sent
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(read_frame);
+}
+
+// Handles a KEEPALIVE2 ack frame: faults unless in READY state, otherwise
+// stores the acked timestamp on the connection and keeps reading frames.
+CtPtr ProtocolV2::handle_keepalive2_ack(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != READY) {
+    lderr(cct) << __func__ << " not in ready state!" << dendl;
+    return _fault();
+  }
+
+  auto ack_frame = KeepAliveFrameAck::Decode(payload);
+  connection->set_last_keepalive_ack(ack_frame.timestamp());
+  ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
+
+  return CONTINUE(read_frame);
+}
+
+// Handles a standalone ACK frame: faults unless in READY state, otherwise
+// forwards the acknowledged sequence number to the seq-based
+// handle_message_ack() overload and keeps reading frames.
+CtPtr ProtocolV2::handle_message_ack(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != READY) {
+    lderr(cct) << __func__ << " not in ready state!" << dendl;
+    return _fault();
+  }
+
+  auto ack_frame = AckFrame::Decode(payload);
+  handle_message_ack(ack_frame.seq());
+
+  return CONTINUE(read_frame);
+}
+
+/* Client Protocol Methods */
+
+// Client entry point: enters BANNER_CONNECTING, fetches a global sequence
+// number from the messenger and starts the banner exchange, continuing with
+// post_client_banner_exchange afterwards.
+CtPtr ProtocolV2::start_client_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // test hook; must stay ahead of the state change below
+  INTERCEPT(1);
+
+  state = BANNER_CONNECTING;
+  global_seq = messenger->get_global_seq();
+
+  return _banner_exchange(CONTINUATION(post_client_banner_exchange));
+}
+
+// Runs once the client-side banner exchange has finished: switches to
+// AUTH_CONNECTING and sends the first auth request.
+CtPtr ProtocolV2::post_client_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+  state = AUTH_CONNECTING;
+  return send_auth_request();
+}
+
+// Client side: asks the auth client for an auth request and writes it to the
+// peer as an AuthRequest frame.
+//
+// allowed_methods: not consulted in this body; kept for the callers that pass
+//                  the method list received from the server.
+//
+// connection->lock is dropped around the call into the auth client and
+// re-taken afterwards, so the state is re-validated before continuing.
+// Returns the write continuation, the result of _fault() if the state
+// changed meanwhile, or nullptr after stopping the connection when the auth
+// client reports an error.
+CtPtr ProtocolV2::send_auth_request(std::vector<uint32_t> &allowed_methods) {
+  ceph_assert(messenger->auth_client);
+  ldout(cct, 20) << __func__ << " peer_type " << (int)connection->peer_type
+                 << " auth_client " << messenger->auth_client << dendl;
+
+  ceph::bufferlist bl;
+  std::vector<uint32_t> preferred_modes;
+  // local copy of the auth metadata handle, used while the lock is released
+  auto am = auth_meta;
+  connection->lock.unlock();
+  int r = messenger->auth_client->get_auth_request(
+    connection, am.get(),
+    &am->auth_method, &preferred_modes, &bl);
+  connection->lock.lock();
+  if (state != AUTH_CONNECTING) {
+    ldout(cct, 1) << __func__ << " state changed!" << dendl;
+    return _fault();
+  }
+  if (r < 0) {
+    // name the function that was actually called
+    ldout(cct, 0) << __func__ << " get_auth_request returned " << r
+                  << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  INTERCEPT(9);
+
+  auto frame = AuthRequestFrame::Encode(auth_meta->auth_method, preferred_modes,
+                                        bl);
+  return WRITE(frame, "auth request", read_frame);
+}
+
+// Client side: the server rejected our auth method. Faults unless in
+// AUTH_CONNECTING. Passes the rejection details to the auth client (with
+// connection->lock released for the duration of that call) and, if the state
+// is unchanged and the auth client did not report an error, sends a fresh
+// auth request.
+CtPtr ProtocolV2::handle_auth_bad_method(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != AUTH_CONNECTING) {
+    lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+    return _fault();
+  }
+
+  auto frame = AuthBadMethodFrame::Decode(payload);
+  ldout(cct, 1) << __func__ << " method=" << frame.method()
+                << " result " << cpp_strerror(frame.result())
+                << ", allowed methods=" << frame.allowed_methods()
+                << ", allowed modes=" << frame.allowed_modes()
+                << dendl;
+  ceph_assert(messenger->auth_client);
+
+  // local copy of the auth metadata handle, used while the lock is released
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  int rc = messenger->auth_client->handle_auth_bad_method(
+      connection, meta.get(),
+      frame.method(), frame.result(),
+      frame.allowed_methods(), frame.allowed_modes());
+  connection->lock.lock();
+
+  if (state != AUTH_CONNECTING) {
+    return _fault();
+  }
+  if (rc < 0) {
+    return _fault();
+  }
+  return send_auth_request(frame.allowed_methods());
+}
+
+// Client side: the server needs another authentication round. Faults unless
+// in AUTH_CONNECTING. Feeds the server's payload to the auth client (with
+// connection->lock released for the duration of that call) and writes the
+// produced reply back as an AuthRequestMore frame.
+CtPtr ProtocolV2::handle_auth_reply_more(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != AUTH_CONNECTING) {
+    lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+    return _fault();
+  }
+
+  auto more_frame = AuthReplyMoreFrame::Decode(payload);
+  ldout(cct, 5) << __func__
+                << " auth reply more len=" << more_frame.auth_payload().length()
+                << dendl;
+  ceph_assert(messenger->auth_client);
+
+  ceph::bufferlist answer;
+  // local copy of the auth metadata handle, used while the lock is released
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  int rc = messenger->auth_client->handle_auth_reply_more(
+      connection, meta.get(), more_frame.auth_payload(), &answer);
+  connection->lock.lock();
+
+  // the lock was dropped, so the state must be checked again
+  if (state != AUTH_CONNECTING) {
+    ldout(cct, 1) << __func__ << " state changed!" << dendl;
+    return _fault();
+  }
+  if (rc < 0) {
+    lderr(cct) << __func__ << " auth_client handle_auth_reply_more returned "
+               << rc << dendl;
+    return _fault();
+  }
+
+  auto request_more = AuthRequestMoreFrame::Encode(answer);
+  return WRITE(request_more, "auth request more", read_frame);
+}
+
+// Client side: handles the server's AuthDone frame.
+//
+// Faults unless in AUTH_CONNECTING. Hands the frame contents to the auth
+// client with connection->lock released, re-validates the state once the lock
+// is re-taken, then records the connection mode, creates the on-wire stream
+// handlers, moves to AUTH_CONNECTING_SIGN and writes an AuthSignature frame.
+CtPtr ProtocolV2::handle_auth_done(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+		 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != AUTH_CONNECTING) {
+    lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+    return _fault();
+  }
+
+  auto auth_done = AuthDoneFrame::Decode(payload);
+
+  ceph_assert(messenger->auth_client);
+  // local copy of the auth metadata handle, used while the lock is released
+  auto am = auth_meta;
+  connection->lock.unlock();
+  int r = messenger->auth_client->handle_auth_done(
+    connection,
+    am.get(),
+    auth_done.global_id(),
+    auth_done.con_mode(),
+    auth_done.auth_payload(),
+    &am->session_key,
+    &am->connection_secret);
+  connection->lock.lock();
+  // the lock was dropped above, so the state may have changed meanwhile
+  if (state != AUTH_CONNECTING) {
+    ldout(cct, 1) << __func__ << " state changed!" << dendl;
+    return _fault();
+  }
+  if (r < 0) {
+    return _fault();
+  }
+  auth_meta->con_mode = auth_done.con_mode();
+  bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+  session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+      cct, *auth_meta, /*new_nonce_format=*/is_rev1, /*crossed=*/false);
+
+  state = AUTH_CONNECTING_SIGN;
+
+  // Signature over the bytes captured in pre_auth.rxbuf: HMAC-SHA256 keyed
+  // with the session key, or a default-constructed digest when there is no
+  // session key.
+  const auto sig = auth_meta->session_key.empty() ? sha256_digest_t() :
+    auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf);
+  auto sig_frame = AuthSignatureFrame::Encode(sig);
+  // pre-auth capture is switched off and the rx buffer dropped once signed
+  pre_auth.enabled = false;
+  pre_auth.rxbuf.clear();
+  return WRITE(sig_frame, "auth signature", read_frame);
+}
+
+// Client side, after authentication: resume the previous session when a
+// server cookie is known, otherwise open a new session by sending our
+// client ident.
+CtPtr ProtocolV2::finish_client_auth() {
+  if (server_cookie) {
+    // reconnecting to previous session
+    state = SESSION_RECONNECTING;
+    ceph_assert(connect_seq > 0);
+    return send_reconnect();
+  }
+
+  // no previous session
+  ceph_assert(connect_seq == 0);
+  state = SESSION_CONNECTING;
+  return send_client_ident();
+}
+
+// Client side: builds and writes the ClientIdent frame that opens a new
+// session. A non-lossy connection that has no client cookie yet gets a
+// random non-zero one first.
+CtPtr ProtocolV2::send_client_ident() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const bool lossy = connection->policy.lossy;
+  if (!lossy && !client_cookie) {
+    client_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);
+  }
+
+  uint64_t flags = 0;
+  if (lossy) {
+    flags |= CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  // features we insist on: the policy's requirements plus msgr2's own
+  const auto required = connection->policy.features_required | msgr2_required;
+
+  auto ident_frame = ClientIdentFrame::Encode(
+      messenger->get_myaddrs(),
+      connection->target_addr,
+      messenger->get_myname().num(),
+      global_seq,
+      connection->policy.features_supported,
+      required,
+      flags,
+      client_cookie);
+
+  ldout(cct, 5) << __func__ << " sending identification: "
+                << "addrs=" << messenger->get_myaddrs()
+                << " target=" << connection->target_addr
+                << " gid=" << messenger->get_myname().num()
+                << " global_seq=" << global_seq
+                << " features_supported=" << std::hex
+                << connection->policy.features_supported
+                << " features_required=" << required
+                << " flags=" << flags
+                << " cookie=" << client_cookie << std::dec << dendl;
+
+  INTERCEPT(11);
+
+  return WRITE(ident_frame, "client ident", read_frame);
+}
+
+// Client side: builds and writes a Reconnect frame carrying our addresses,
+// both session cookies, the global and connect sequence numbers, and in_seq.
+CtPtr ProtocolV2::send_reconnect() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto reconnect_frame = ReconnectFrame::Encode(
+      messenger->get_myaddrs(), client_cookie, server_cookie,
+      global_seq, connect_seq, in_seq);
+
+  ldout(cct, 5) << __func__ << " reconnect to session: client_cookie="
+                << std::hex << client_cookie
+                << " server_cookie=" << server_cookie << std::dec
+                << " gs=" << global_seq
+                << " cs=" << connect_seq
+                << " ms=" << in_seq << dendl;
+
+  INTERCEPT(13);
+
+  return WRITE(reconnect_frame, "reconnect", read_frame);
+}
+
+// Client side: the server reports features it requires that we lack. Logs
+// the missing feature bits and faults; this handler always ends in _fault().
+CtPtr ProtocolV2::handle_ident_missing_features(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_CONNECTING) {
+    lderr(cct) << __func__ << " not in session connect state!" << dendl;
+    return _fault();
+  }
+
+  auto missing_frame = IdentMissingFeaturesFrame::Decode(payload);
+  lderr(cct) << __func__
+             << " client does not support all server features: " << std::hex
+             << missing_frame.features() << std::dec << dendl;
+
+  return _fault();
+}
+
+// Client side: the server answered our reconnect with a session reset.
+// Faults unless in SESSION_RECONNECTING. A full reset calls reset_session();
+// a partial one only zeroes the server cookie, connect_seq and in_seq.
+// Either way a new session is then opened with a client ident.
+CtPtr ProtocolV2::handle_session_reset(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto reset_frame = ResetFrame::Decode(payload);
+
+  ldout(cct, 1) << __func__ << " received session reset full="
+                << reset_frame.full() << dendl;
+  if (!reset_frame.full()) {
+    server_cookie = 0;
+    connect_seq = 0;
+    in_seq = 0;
+  } else {
+    reset_session();
+  }
+
+  state = SESSION_CONNECTING;
+  return send_client_ident();
+}
+
+// Client side: the server asked us to retry the reconnect. Faults unless in
+// SESSION_RECONNECTING; otherwise sets connect_seq to one past the value the
+// server sent and reconnects again.
+CtPtr ProtocolV2::handle_session_retry(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto retry_frame = RetryFrame::Decode(payload);
+  connect_seq = retry_frame.connect_seq() + 1;
+
+  ldout(cct, 1) << __func__ << " received session retry connect_seq="
+                << retry_frame.connect_seq()
+                << ", inc to cs=" << connect_seq << dendl;
+
+  return send_reconnect();
+}
+
+// Client side: the server asked us to retry the reconnect with another
+// global sequence. Faults unless in SESSION_RECONNECTING; otherwise gets a
+// new global_seq from the messenger (passing it the server's value) and
+// reconnects again.
+CtPtr ProtocolV2::handle_session_retry_global(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto retry_frame = RetryGlobalFrame::Decode(payload);
+  global_seq = messenger->get_global_seq(retry_frame.global_seq());
+
+  ldout(cct, 1) << __func__ << " received session retry global global_seq="
+                << retry_frame.global_seq()
+                << ", choose new gs=" << global_seq << dendl;
+
+  return send_reconnect();
+}
+
+// Client side: the server sent a WAIT frame (logged below as a connection
+// race). Faults unless in SESSION_CONNECTING or SESSION_RECONNECTING;
+// otherwise switches to WAIT state, decodes the frame and goes through
+// _fault().
+CtPtr ProtocolV2::handle_wait(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__
+		 << " received WAIT (connection race)"
+		 << " payload.length()=" << payload.length()
+		 << dendl;
+
+  if (state != SESSION_CONNECTING && state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session (re)connect state!" << dendl;
+    return _fault();
+  }
+
+  // NOTE(review): state is set to WAIT before calling _fault(), presumably so
+  // that _fault() handles this case specially -- confirm against _fault().
+  state = WAIT;
+  // the decoded frame is not used; the call only parses the payload
+  WaitFrame::Decode(payload);
+  return _fault();
+}
+
+// Client side: the server accepted our reconnect. Faults unless in
+// SESSION_RECONNECTING. Passes the server's message sequence to
+// discard_requeued_up_to() to update out_seq, clears the backoff, announces
+// the connect to the dispatch queue and messenger, and moves on to ready().
+CtPtr ProtocolV2::handle_reconnect_ok(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto ok_frame = ReconnectOkFrame::Decode(payload);
+  ldout(cct, 5) << __func__
+                << " reconnect accepted: sms=" << ok_frame.msg_seq()
+                << dendl;
+
+  out_seq = discard_requeued_up_to(out_seq, ok_frame.msg_seq());
+
+  // successful (re)connect: start over with an empty backoff
+  backoff = utime_t();
+  ldout(cct, 10) << __func__ << " reconnect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy
+                 << ", features " << connection->get_features() << dendl;
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+// Client side: handles the ServerIdent frame that completes a new session.
+// Faults unless in SESSION_CONNECTING, or when the server's advertised
+// addresses do not include the address we were targeting. Otherwise stores
+// the server's cookie, addresses, name, negotiated features, global_seq and
+// lossy flag, announces the connect, and moves on to ready().
+CtPtr ProtocolV2::handle_server_ident(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_CONNECTING) {
+    lderr(cct) << __func__ << " not in session connect state!" << dendl;
+    return _fault();
+  }
+
+  auto ident = ServerIdentFrame::Decode(payload);
+  ldout(cct, 5) << __func__ << " received server identification:"
+                << " addrs=" << ident.addrs()
+                << " gid=" << ident.gid()
+                << " global_seq=" << ident.global_seq()
+                << " features_supported=" << std::hex
+                << ident.supported_features()
+                << " features_required=" << ident.required_features()
+                << " flags=" << ident.flags()
+                << " cookie=" << ident.cookie() << std::dec << dendl;
+
+  // Make sure we reached the peer we meant to. Only containment of our
+  // target address is required, since we may be connecting based on
+  // addresses parsed out of mon_host or something.
+  if (!ident.addrs().contains(connection->target_addr)) {
+    ldout(cct,1) << __func__ << " peer identifies as " << ident.addrs()
+                 << ", does not include " << connection->target_addr << dendl;
+    return _fault();
+  }
+
+  server_cookie = ident.cookie();
+
+  connection->set_peer_addrs(ident.addrs());
+  peer_name = entity_name_t(connection->get_peer_type(), ident.gid());
+  // usable features are those both sides support
+  connection->set_features(ident.supported_features() &
+                           connection->policy.features_supported);
+  peer_global_seq = ident.global_seq();
+
+  // the lossy setting follows what the server announced in its flags
+  connection->policy.lossy = ident.flags() & CEPH_MSG_CONNECT_LOSSY;
+
+  backoff = utime_t();
+  ldout(cct, 10) << __func__ << " connect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy
+                 << ", features " << connection->get_features() << dendl;
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+/* Server Protocol Methods */
+
+// Server entry point: enters BANNER_ACCEPTING and starts the banner
+// exchange, continuing with post_server_banner_exchange afterwards.
+CtPtr ProtocolV2::start_server_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // test hook; must stay ahead of the state change below
+  INTERCEPT(2);
+
+  state = BANNER_ACCEPTING;
+  return _banner_exchange(CONTINUATION(post_server_banner_exchange));
+}
+
+// Runs once the server-side banner exchange has finished: switches to
+// AUTH_ACCEPTING and reads the next frame from the client.
+CtPtr ProtocolV2::post_server_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+  state = AUTH_ACCEPTING;
+  return CONTINUE(read_frame);
+}
+
+// Server side: handles the client's initial AuthRequest frame.
+//
+// Faults unless in AUTH_ACCEPTING, or when the messenger has no auth server.
+// Records the requested auth method, asks the auth server to pick a
+// connection mode from the client's preferred modes, and either rejects the
+// method (no usable mode) or passes the auth payload on to
+// _handle_auth_request().
+//
+// payload: the raw frame payload to decode.
+// Returns the continuation to run next.
+CtPtr ProtocolV2::handle_auth_request(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != AUTH_ACCEPTING) {
+    lderr(cct) << __func__ << " not in auth accept state!" << dendl;
+    return _fault();
+  }
+
+  // auth_server is dereferenced below (and in _auth_bad_method), before
+  // _handle_auth_request() gets a chance to apply its own null check, so
+  // guard here as well.
+  if (!messenger->auth_server) {
+    lderr(cct) << __func__ << " no auth_server available" << dendl;
+    return _fault();
+  }
+
+  auto request = AuthRequestFrame::Decode(payload);
+  ldout(cct, 10) << __func__ << " AuthRequest(method=" << request.method()
+                 << ", preferred_modes=" << request.preferred_modes()
+                 << ", payload_len=" << request.auth_payload().length() << ")"
+                 << dendl;
+  auth_meta->auth_method = request.method();
+  auth_meta->con_mode = messenger->auth_server->pick_con_mode(
+    connection->get_peer_type(), auth_meta->auth_method,
+    request.preferred_modes());
+  if (auth_meta->con_mode == CEPH_CON_MODE_UNKNOWN) {
+    // no connection mode could be picked for this method
+    return _auth_bad_method(-EOPNOTSUPP);
+  }
+  return _handle_auth_request(request.auth_payload(), false);
+}
+
+// Server side: tells the client its auth method was rejected. Sends an
+// AuthBadMethod frame with the (negative) error code r and the methods and
+// modes the auth server supports for this peer type, then goes back to
+// reading frames.
+CtPtr ProtocolV2::_auth_bad_method(int r)
+{
+  ceph_assert(r < 0);
+
+  std::vector<uint32_t> methods;
+  std::vector<uint32_t> modes;
+  messenger->auth_server->get_supported_auth_methods(
+      connection->get_peer_type(), &methods, &modes);
+
+  ldout(cct, 1) << __func__ << " auth_method " << auth_meta->auth_method
+                << " r " << cpp_strerror(r)
+                << ", allowed_methods " << methods
+                << ", allowed_modes " << modes
+                << dendl;
+
+  auto reject_frame = AuthBadMethodFrame::Encode(auth_meta->auth_method, r,
+                                                 methods, modes);
+  return WRITE(reject_frame, "bad auth method", read_frame);
+}
+
+// Server side: runs one authentication step through the auth server (with
+// connection->lock released for the duration of that call) and acts on its
+// result:
+//   1       -> authentication finished: send AuthDone, then finish_auth
+//   0       -> another round needed: send AuthReplyMore
+//   -EBUSY  -> fault
+//   other   -> reject the method via _auth_bad_method()
+//
+// `more` is forwarded unchanged to the auth server's handle_auth_request().
+CtPtr ProtocolV2::_handle_auth_request(ceph::bufferlist& auth_payload, bool more)
+{
+  if (!messenger->auth_server) {
+    return _fault();
+  }
+
+  ceph::bufferlist reply;
+  // local copy of the auth metadata handle, used while the lock is released
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  int r = messenger->auth_server->handle_auth_request(
+      connection, meta.get(),
+      more, meta->auth_method, auth_payload,
+      &reply);
+  connection->lock.lock();
+
+  // the lock was dropped, so the state must be checked again
+  if (state != AUTH_ACCEPTING && state != AUTH_ACCEPTING_MORE) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  switch (r) {
+  case 1: {
+    INTERCEPT(10);
+    state = AUTH_ACCEPTING_SIGN;
+
+    auto done_frame = AuthDoneFrame::Encode(connection->peer_global_id,
+                                            auth_meta->con_mode,
+                                            reply);
+    return WRITE(done_frame, "auth done", finish_auth);
+  }
+  case 0: {
+    state = AUTH_ACCEPTING_MORE;
+
+    auto more_frame = AuthReplyMoreFrame::Encode(reply);
+    return WRITE(more_frame, "auth reply more", read_frame);
+  }
+  case -EBUSY:
+    // kick the client and maybe they'll come back later
+    return _fault();
+  default:
+    return _auth_bad_method(r);
+  }
+}
+
+// Server side, run after AuthDone has been written: creates the on-wire
+// stream handlers (crossed=true here) and sends our AuthSignature frame over
+// the bytes captured in pre_auth.rxbuf.
+CtPtr ProtocolV2::finish_auth()
+{
+  ceph_assert(auth_meta);
+  // TODO: having a possibility to check whether we're server or client could
+  // allow reusing finish_auth().
+  bool rev1_peer = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+  session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+      cct, *auth_meta, /*new_nonce_format=*/rev1_peer, /*crossed=*/true);
+
+  // HMAC-SHA256 keyed with the session key, or a default-constructed digest
+  // when there is no session key
+  const auto signature = auth_meta->session_key.empty()
+      ? sha256_digest_t()
+      : auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf);
+  auto signature_frame = AuthSignatureFrame::Encode(signature);
+
+  // pre-auth capture is switched off and the rx buffer dropped once signed
+  pre_auth.enabled = false;
+  pre_auth.rxbuf.clear();
+
+  return WRITE(signature_frame, "auth signature", read_frame);
+}
+
+// Server side: handles a follow-up AuthRequestMore frame. Faults unless in
+// AUTH_ACCEPTING_MORE; otherwise runs the next authentication step with
+// more=true.
+CtPtr ProtocolV2::handle_auth_request_more(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != AUTH_ACCEPTING_MORE) {
+    lderr(cct) << __func__ << " not in auth accept more state!" << dendl;
+    return _fault();
+  }
+
+  auto more_frame = AuthRequestMoreFrame::Decode(payload);
+  return _handle_auth_request(more_frame.auth_payload(), true);
+}
+
+// Both sides: verifies the peer's AuthSignature frame against our own
+// signature over pre_auth.txbuf. Faults in any state other than
+// AUTH_ACCEPTING_SIGN / AUTH_CONNECTING_SIGN and on a signature mismatch.
+// On success the server starts accepting sessions and the client goes on
+// with finish_client_auth().
+CtPtr ProtocolV2::handle_auth_signature(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != AUTH_ACCEPTING_SIGN && state != AUTH_CONNECTING_SIGN) {
+    lderr(cct) << __func__
+               << " pre-auth verification signature seen in wrong state!"
+               << dendl;
+    return _fault();
+  }
+
+  auto peer_sig = AuthSignatureFrame::Decode(payload);
+
+  // HMAC-SHA256 keyed with the session key, or a default-constructed digest
+  // when there is no session key
+  const auto expected_sig = auth_meta->session_key.empty()
+      ? sha256_digest_t()
+      : auth_meta->session_key.hmac_sha256(cct, pre_auth.txbuf);
+
+  if (peer_sig.signature() != expected_sig) {
+    ldout(cct, 2) << __func__ << " pre-auth signature mismatch"
+                  << " actual_tx_sig=" << expected_sig
+                  << " sig_frame.signature()=" << peer_sig.signature()
+                  << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " pre-auth signature success"
+                 << " sig_frame.signature()=" << peer_sig.signature()
+                 << dendl;
+  pre_auth.txbuf.clear();
+
+  if (state == AUTH_CONNECTING_SIGN) {
+    // this happened at client side
+    return finish_client_auth();
+  }
+  if (state == AUTH_ACCEPTING_SIGN) {
+    // server had sent AuthDone and client responded with correct pre-auth
+    // signature. we can start accepting new sessions/reconnects.
+    state = SESSION_ACCEPTING;
+    return CONTINUE(read_frame);
+  }
+  ceph_abort("state corruption");
+}
+
+// Server side: handles the ClientIdent frame that opens a new session.
+//
+// Faults unless in SESSION_ACCEPTING, when the client's address list is
+// empty or starts with a blank address, or when the address the client was
+// targeting is not one of ours. Records the peer's addresses, name, id and
+// cookie. If the client lacks required features an IdentMissingFeatures
+// frame is sent instead of continuing. Unless this is a lossy client on a
+// server policy that does not register lossy clients, an existing connection
+// to the same peer is looked up (with connection->lock released) and, if
+// found, handed to handle_existing_connection(). Otherwise replies with our
+// server ident.
+CtPtr ProtocolV2::handle_client_ident(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+		 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_ACCEPTING) {
+    lderr(cct) << __func__ << " not in session accept state!" << dendl;
+    return _fault();
+  }
+
+  auto client_ident = ClientIdentFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__ << " received client identification:"
+                << " addrs=" << client_ident.addrs()
+		            << " target=" << client_ident.target_addr()
+                << " gid=" << client_ident.gid()
+                << " global_seq=" << client_ident.global_seq()
+                << " features_supported=" << std::hex
+                << client_ident.supported_features()
+                << " features_required=" << client_ident.required_features()
+                << " flags=" << client_ident.flags()
+                << " cookie=" << client_ident.cookie() << std::dec << dendl;
+
+  // reject an empty address list as well as one whose first entry is a
+  // default-constructed (blank) address
+  if (client_ident.addrs().empty() ||
+      client_ident.addrs().front() == entity_addr_t()) {
+    ldout(cct,5) << __func__ << " oops, client_ident.addrs() is empty" << dendl;
+    return _fault();  // a v2 peer should never do this
+  }
+  // the client must have been aiming at one of our own addresses
+  if (!messenger->get_myaddrs().contains(client_ident.target_addr())) {
+    ldout(cct,5) << __func__ << " peer is trying to reach "
+		 << client_ident.target_addr()
+		 << " which is not us (" << messenger->get_myaddrs() << ")"
+		 << dendl;
+    return _fault();
+  }
+
+  connection->set_peer_addrs(client_ident.addrs());
+  connection->target_addr = connection->_infer_target_addr(client_ident.addrs());
+
+  peer_name = entity_name_t(connection->get_peer_type(), client_ident.gid());
+  connection->set_peer_id(client_ident.gid());
+
+  client_cookie = client_ident.cookie();
+
+  // features we require (policy + msgr2) that the client does not support
+  uint64_t feat_missing =
+    (connection->policy.features_required | msgr2_required) &
+    ~(uint64_t)client_ident.supported_features();
+  if (feat_missing) {
+    ldout(cct, 1) << __func__ << " peer missing required features " << std::hex
+                  << feat_missing << std::dec << dendl;
+    auto ident_missing_features =
+        IdentMissingFeaturesFrame::Encode(feat_missing);
+
+    return WRITE(ident_missing_features, "ident missing features", read_frame);
+  }
+
+  // usable features are those both sides support
+  connection_features =
+      client_ident.supported_features() & connection->policy.features_supported;
+
+  peer_global_seq = client_ident.global_seq();
+
+  if (connection->policy.server &&
+      connection->policy.lossy &&
+      !connection->policy.register_lossy_clients) {
+    // incoming lossy client, no need to register this connection
+  } else {
+    // Looks good so far, let's check if there is already an existing connection
+    // to this peer.
+    // connection->lock is released for the lookup and re-taken below
+    connection->lock.unlock();
+    AsyncConnectionRef existing = messenger->lookup_conn(
+      *connection->peer_addrs);
+
+    // an existing connection using another protocol version is marked down
+    // and then ignored
+    if (existing &&
+	existing->protocol->proto_type != 2) {
+      ldout(cct,1) << __func__ << " existing " << existing << " proto "
+		   << existing->protocol.get() << " version is "
+		   << existing->protocol->proto_type << ", marking down"
+		   << dendl;
+      existing->mark_down();
+      existing = nullptr;
+    }
+
+    connection->inject_delay();
+
+    connection->lock.lock();
+    // the lock was dropped above, so the state may have changed meanwhile
+    if (state != SESSION_ACCEPTING) {
+      ldout(cct, 1) << __func__
+		    << " state changed while accept, it must be mark_down"
+		    << dendl;
+      ceph_assert(state == CLOSED);
+      return _fault();
+    }
+
+    if (existing) {
+      return handle_existing_connection(existing);
+    }
+  }
+
+  // if everything is OK reply with server identification
+  return send_server_ident();
+}
+
+// Server side: handles a Reconnect frame, i.e. a client trying to resume a
+// previous session.
+//
+// Faults unless in SESSION_ACCEPTING. Looks up the existing connection for
+// the peer's addresses (with connection->lock released) and then answers, in
+// this order:
+//   - no existing connection, or it is CLOSED   -> full session reset
+//   - existing connection is being replaced     -> retry-global
+//   - client cookie mismatch                    -> reset (full if the policy
+//                                                  has resetcheck set)
+//   - existing has no server cookie             -> partial reset
+//   - client's global_seq is stale              -> retry-global
+//   - client's connect_seq is stale             -> retry
+//   - equal connect_seq (reconnect race)        -> wait, or fall through when
+//                                                  this connection wins
+// If none of the above ends the exchange, the existing connection takes over
+// the client's connect/message sequence numbers and is reused through
+// reuse_connection().
+CtPtr ProtocolV2::handle_reconnect(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+		 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_ACCEPTING) {
+    lderr(cct) << __func__ << " not in session accept state!" << dendl;
+    return _fault();
+  }
+
+  auto reconnect = ReconnectFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__
+                << " received reconnect:" 
+                << " client_cookie=" << std::hex << reconnect.client_cookie()
+                << " server_cookie=" << reconnect.server_cookie() << std::dec
+                << " gs=" << reconnect.global_seq()
+                << " cs=" << reconnect.connect_seq()
+                << " ms=" << reconnect.msg_seq()
+		            << dendl;
+
+  // Should we check if one of the ident.addrs match connection->target_addr
+  // as we do in ProtocolV1?
+  connection->set_peer_addrs(reconnect.addrs());
+  connection->target_addr = connection->_infer_target_addr(reconnect.addrs());
+  peer_global_seq = reconnect.global_seq();
+
+  // connection->lock is released for the lookup and re-taken below
+  connection->lock.unlock();
+  AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+  // an existing connection using another protocol version is marked down and
+  // then ignored
+  if (existing &&
+      existing->protocol->proto_type != 2) {
+    ldout(cct,1) << __func__ << " existing " << existing << " proto "
+		 << existing->protocol.get() << " version is "
+		 << existing->protocol->proto_type << ", marking down" << dendl;
+    existing->mark_down();
+    existing = nullptr;
+  }
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  // the lock was dropped above, so the state may have changed meanwhile
+  if (state != SESSION_ACCEPTING) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  if (!existing) {
+    // there is no existing connection therefore cannot reconnect to previous
+    // session
+    ldout(cct, 0) << __func__
+                  << " no existing connection exists, reseting client" << dendl;
+    auto reset = ResetFrame::Encode(true);
+    return WRITE(reset, "session reset", read_frame);
+  }
+
+  // from here on the existing connection's lock is held as well, until this
+  // function returns
+  std::lock_guard<std::mutex> l(existing->lock);
+
+  ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get());
+  if (!exproto) {
+    ldout(cct, 1) << __func__ << " existing=" << existing << dendl;
+    ceph_assert(false);
+  }
+
+  if (exproto->state == CLOSED) {
+    ldout(cct, 5) << __func__ << " existing " << existing
+                  << " already closed. Reseting client" << dendl;
+    auto reset = ResetFrame::Encode(true);
+    return WRITE(reset, "session reset", read_frame);
+  }
+
+  if (exproto->replacing) {
+    ldout(cct, 1) << __func__
+                  << " existing racing replace happened while replacing."
+                  << " existing=" << existing << dendl;
+    auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq);
+    return WRITE(retry, "session retry", read_frame);
+  }
+
+  if (exproto->client_cookie != reconnect.client_cookie()) {
+    ldout(cct, 1) << __func__ << " existing=" << existing
+                  << " client cookie mismatch, I must have reseted:"
+                  << " cc=" << std::hex << exproto->client_cookie
+                  << " rcc=" << reconnect.client_cookie()
+                  << ", reseting client." << std::dec
+                  << dendl;
+    // whether the reset is a full one follows the policy's resetcheck flag
+    auto reset = ResetFrame::Encode(connection->policy.resetcheck);
+    return WRITE(reset, "session reset", read_frame);
+  } else if (exproto->server_cookie == 0) {
+    // this happens when:
+    //   - a connects to b
+    //   - a sends client_ident
+    //   - b gets client_ident, sends server_ident and sets cookie X
+    //   - connection fault
+    //   - b reconnects to a with cookie X, connect_seq=1
+    //   - a has cookie==0
+    ldout(cct, 1) << __func__ << " I was a client and didn't received the"
+                  << " server_ident. Asking peer to resume session"
+                  << " establishment" << dendl;
+    auto reset = ResetFrame::Encode(false);
+    return WRITE(reset, "session reset", read_frame);
+  }
+
+  if (exproto->peer_global_seq > reconnect.global_seq()) {
+    ldout(cct, 5) << __func__
+                  << " stale global_seq: sgs=" << exproto->peer_global_seq
+                  << " cgs=" << reconnect.global_seq()
+                  << ", ask client to retry global" << dendl;
+    auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq);
+
+    INTERCEPT(18);
+
+    return WRITE(retry, "session retry", read_frame);
+  }
+
+  if (exproto->connect_seq > reconnect.connect_seq()) {
+    ldout(cct, 5) << __func__
+                  << " stale connect_seq scs=" << exproto->connect_seq
+                  << " ccs=" << reconnect.connect_seq()
+                  << " , ask client to retry" << dendl;
+    auto retry = RetryFrame::Encode(exproto->connect_seq);
+    return WRITE(retry, "session retry", read_frame);
+  }
+
+  if (exproto->connect_seq == reconnect.connect_seq()) {
+    // reconnect race: both peers are sending reconnect messages
+    // the existing connection wins only when the peer's msgr2 address
+    // compares greater than ours and the existing connection's policy is not
+    // a server policy
+    if (existing->peer_addrs->msgr2_addr() >
+            messenger->get_myaddrs().msgr2_addr() &&
+        !existing->policy.server) {
+      // the existing connection wins
+      ldout(cct, 1)
+          << __func__
+          << " reconnect race detected, this connection loses to existing="
+          << existing << dendl;
+
+      auto wait = WaitFrame::Encode();
+      return WRITE(wait, "wait", read_frame);
+    } else {
+      // this connection wins
+      ldout(cct, 1) << __func__
+                    << " reconnect race detected, replacing existing="
+                    << existing << " socket by this connection's socket"
+                    << dendl;
+    }
+  }
+
+  ldout(cct, 1) << __func__ << " reconnect to existing=" << existing << dendl;
+
+  reconnecting = true;
+
+  // everything looks good
+  // the existing session adopts the sequence numbers the client sent
+  exproto->connect_seq = reconnect.connect_seq();
+  exproto->message_seq = reconnect.msg_seq();
+
+  return reuse_connection(existing, exproto);
+}
+
+CtPtr ProtocolV2::handle_existing_connection(const AsyncConnectionRef& existing) {
+  ldout(cct, 20) << __func__ << " existing=" << existing << dendl;
+  /*
+   * Resolves this connection against an already known connection
+   * `existing`.  Depending on the state of `existing` the outcome is one of:
+   *   - continue on this connection via send_server_ident(),
+   *   - ask the peer to wait (WaitFrame) and keep reading frames,
+   *   - stop this connection as stale and return nullptr, or
+   *   - move this connection's socket into `existing` (reuse_connection()).
+   */
+  std::unique_lock<std::mutex> l(existing->lock);  // released on return or by l.unlock() below
+
+  ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get());
+  if (!exproto) {
+    ldout(cct, 1) << __func__ << " existing=" << existing << dendl;
+    ceph_assert(false);  // existing is not driven by a ProtocolV2 instance
+  }
+
+  if (exproto->state == CLOSED) {
+    ldout(cct, 1) << __func__ << " existing " << existing << " already closed."
+                  << dendl;
+    l.unlock();  // drop existing->lock before continuing on this connection
+    return send_server_ident();
+  }
+  /* reuse_connection() sets `replacing` on the connection it is replacing. */
+  if (exproto->replacing) {
+    ldout(cct, 1) << __func__
+                  << " existing racing replace happened while replacing."
+                  << " existing=" << existing << dendl;
+    auto wait = WaitFrame::Encode();
+    return WRITE(wait, "wait", read_frame);
+  }
+
+  if (exproto->peer_global_seq > peer_global_seq) {
+    ldout(cct, 1) << __func__ << " this is a stale connection, peer_global_seq="
+                  << peer_global_seq
+                  << " existing->peer_global_seq=" << exproto->peer_global_seq
+                  << ", stopping this connection." << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;  // nothing further to run for this connection
+  }
+
+  if (existing->policy.lossy) {
+    // existing connection can be thrown out in favor of this one
+    ldout(cct, 1)
+        << __func__ << " existing=" << existing
+        << " is a lossy channel. Stopping existing in favor of this connection"
+        << dendl;
+    existing->protocol->stop();
+    existing->dispatch_queue->queue_reset(existing.get());
+    l.unlock();  // drop existing->lock before continuing on this connection
+    return send_server_ident();
+  }
+
+  if (exproto->server_cookie && exproto->client_cookie &&
+      exproto->client_cookie != client_cookie) {
+    // Found previous session
+    // peer has reset and we're going to reuse the existing connection
+    // by replacing the communication socket
+    ldout(cct, 1) << __func__ << " found previous session existing=" << existing
+                  << ", peer must have reseted." << dendl;
+    if (connection->policy.resetcheck) {
+      exproto->reset_session();
+    }
+    return reuse_connection(existing, exproto);
+  }
+
+  if (exproto->client_cookie == client_cookie) {
+    // session establishment interrupted between client_ident and server_ident,
+    // continuing...
+    ldout(cct, 1) << __func__ << " found previous session existing=" << existing
+                  << ", continuing session establishment." << dendl;
+    return reuse_connection(existing, exproto);
+  }
+
+  if (exproto->state == READY || exproto->state == STANDBY) {
+    ldout(cct, 1) << __func__ << " existing=" << existing
+                  << " is READY/STANDBY, lets reuse it" << dendl;
+    return reuse_connection(existing, exproto);
+  }
+
+  // Looks like a connection race: server and client are both connecting to
+  // each other at the same time.
+  if (connection->peer_addrs->msgr2_addr() <
+          messenger->get_myaddrs().msgr2_addr() ||
+      existing->policy.server) {
+    // this connection wins
+    ldout(cct, 1) << __func__
+                  << " connection race detected, replacing existing="
+                  << existing << " socket by this connection's socket" << dendl;
+    return reuse_connection(existing, exproto);
+  } else {
+    // the existing connection wins
+    ldout(cct, 1)
+        << __func__
+        << " connection race detected, this connection loses to existing="
+        << existing << dendl;
+    ceph_assert(connection->peer_addrs->msgr2_addr() >
+                messenger->get_myaddrs().msgr2_addr());
+
+    // make sure we follow through with opening the existing
+    // connection (if it isn't yet open) since we know the peer
+    // has something to send to us.
+    existing->send_keepalive();
+    auto wait = WaitFrame::Encode();
+    return WRITE(wait, "wait", read_frame);
+  }
+}
+
+CtPtr ProtocolV2::reuse_connection(const AsyncConnectionRef& existing,
+                                   ProtocolV2 *exproto) {
+  ldout(cct, 20) << __func__ << " existing=" << existing
+                 << " reconnect=" << reconnecting << dendl;
+  /*
+   * Hands this connection's socket and session stream handlers over to
+   * `existing` (whose protocol object is `exproto`), stops this connection
+   * and schedules the actual swap on existing's event center.
+   * Always returns nullptr.  The callers visible in this file invoke it
+   * with existing->lock already held.
+   */
+  connection->inject_delay();
+
+  std::lock_guard<std::mutex> l(existing->write_lock);
+
+  connection->center->delete_file_event(connection->cs.fd(),
+                                        EVENT_READABLE | EVENT_WRITABLE);
+
+  if (existing->delay_state) {
+    existing->delay_state->flush();
+    ceph_assert(!connection->delay_state);
+  }
+  exproto->reset_recv_state();
+  exproto->pre_auth.enabled = false;
+
+  if (!reconnecting) {  // not a reconnect: copy the identity negotiated on this connection
+    exproto->client_cookie = client_cookie;
+    exproto->peer_name = peer_name;
+    exproto->connection_features = connection_features;
+    existing->set_features(connection_features);
+    exproto->peer_supported_features = peer_supported_features;
+  }
+  exproto->peer_global_seq = peer_global_seq;
+
+  ceph_assert(connection->center->in_thread());
+  auto temp_cs = std::move(connection->cs);  // socket that will be moved into existing
+  EventCenter *new_center = connection->center;
+  Worker *new_worker = connection->worker;
+  // we can steal the session_stream_handlers under the assumption
+  // this happens in the event center's thread as there should be
+  // no user outside its boundaries (similarly to e.g. outgoing_bl).
+  auto temp_stream_handlers = std::move(session_stream_handlers);
+  exproto->auth_meta = auth_meta;
+
+  ldout(messenger->cct, 5) << __func__ << " stop myself to swap existing"
+                           << dendl;
+
+  // avoid _stop shutdown replacing socket
+  // queue a reset on the new connection, which we're dumping for the old
+  stop();
+
+  connection->dispatch_queue->queue_reset(connection);
+
+  exproto->can_write = false;
+  exproto->write_in_progress = false;
+  exproto->reconnecting = reconnecting;
+  exproto->replacing = true;  // checked by handle_reconnect/handle_existing_connection
+  existing->state_offset = 0;
+  // avoid previous thread modify event
+  exproto->state = NONE;
+  existing->state = AsyncConnection::STATE_NONE;
+  // Discard existing prefetch buffer in `recv_buf`
+  existing->recv_start = existing->recv_end = 0;
+  // there shouldn't exist any buffer
+  ceph_assert(connection->recv_start == connection->recv_end);
+  /* Step 1, submitted to existing's current center: swap in the new socket. */
+  auto deactivate_existing = std::bind(
+      [ existing,
+        new_worker,
+        new_center,
+        exproto,
+        reconnecting=reconnecting,
+        tx_is_rev1=tx_frame_asm.get_is_rev1(),
+        rx_is_rev1=rx_frame_asm.get_is_rev1(),
+        temp_stream_handlers=std::move(temp_stream_handlers)
+      ](ConnectedSocket &cs) mutable {
+        // we need to delete time event in original thread
+        {
+          std::lock_guard<std::mutex> l(existing->lock);
+          existing->write_lock.lock();  // paired with the unlock() below
+          exproto->requeue_sent();
+          // XXX: do we really need the locking for `outgoing_bl`? There is
+          // a comment just above its definition saying "lockfree, only used
+          // in own thread". I'm following lockfull schema just in the case.
+          // From performance point of view it should be fine – this happens
+          // far away from hot paths.
+          existing->outgoing_bl.clear();
+          existing->open_write = false;
+          exproto->session_stream_handlers = std::move(temp_stream_handlers);
+          if (!reconnecting) {
+            exproto->tx_frame_asm.set_is_rev1(tx_is_rev1);
+            exproto->rx_frame_asm.set_is_rev1(rx_is_rev1);
+          }
+          existing->write_lock.unlock();
+          if (exproto->state == NONE) {  // still the state set by reuse_connection()
+            existing->shutdown_socket();
+            existing->cs = std::move(cs);
+            existing->worker->references--;
+            new_worker->references++;
+            existing->logger = new_worker->get_perf_counter();
+            existing->worker = new_worker;
+            existing->center = new_center;
+            if (existing->delay_state)
+              existing->delay_state->set_center(new_center);
+          } else if (exproto->state == CLOSED) {  // existing was closed meanwhile
+            auto back_to_close = std::bind(
+                [](ConnectedSocket &cs) mutable { cs.close(); }, std::move(cs));
+            new_center->submit_to(new_center->get_id(),
+                                  std::move(back_to_close), true);
+            return;
+          } else {
+            ceph_abort();
+          }
+        }
+
+        // Before changing existing->center, there may already be some
+        // events in existing->center's queue. If we then mark down
+        // `existing`, it will execute in another thread and clean up the
+        // connection. A previously queued event would then segfault.
+        auto transfer_existing = [existing, exproto]() mutable {
+          std::lock_guard<std::mutex> l(existing->lock);
+          if (exproto->state == CLOSED) return;
+          ceph_assert(exproto->state == NONE);
+
+          exproto->state = SESSION_ACCEPTING;
+          // we have called shutdown_socket above
+          ceph_assert(existing->last_tick_id == 0);
+          // restart timer since we are going to re-build connection
+          existing->last_connect_started = ceph::coarse_mono_clock::now();
+          existing->last_tick_id = existing->center->create_time_event(
+            existing->connect_timeout_us, existing->tick_handler);
+          existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED;
+          existing->center->create_file_event(existing->cs.fd(), EVENT_READABLE,
+                                              existing->read_handler);
+          if (!exproto->reconnecting) {
+            exproto->run_continuation(exproto->send_server_ident());
+          } else {
+            exproto->run_continuation(exproto->send_reconnect_ok());
+          }
+        };
+        if (existing->center->in_thread())  // existing->center is new_center by now
+          transfer_existing();
+        else
+          existing->center->submit_to(existing->center->get_id(),
+                                      std::move(transfer_existing), true);
+      },
+      std::move(temp_cs));
+
+  existing->center->submit_to(existing->center->get_id(),
+                              std::move(deactivate_existing), true);
+  return nullptr;
+}
+
+CtPtr ProtocolV2::send_server_ident() {
+  ldout(cct, 20) << __func__ << dendl;
+  /*
+   * Builds a ServerIdentFrame, registers this connection with the messenger
+   * (accept_conn) and, on success, writes the frame with server_ready as
+   * the continuation handed to WRITE.  Returns _fault() if accept_conn fails
+   * or if `state` is no longer SESSION_ACCEPTING once the connection lock
+   * has been re-acquired.
+   */
+  // this is required for the case when this connection is being replaced
+  out_seq = discard_requeued_up_to(out_seq, 0);
+  in_seq = 0;
+
+  if (!connection->policy.lossy) {
+    server_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);  // lower bound 1, so presumably never 0
+  }
+
+  uint64_t flags = 0;
+  if (connection->policy.lossy) {
+    flags = flags | CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  uint64_t gs = messenger->get_global_seq();
+  auto server_ident = ServerIdentFrame::Encode(
+          messenger->get_myaddrs(),
+          messenger->get_myname().num(),
+          gs,
+          connection->policy.features_supported,
+          connection->policy.features_required | msgr2_required,
+          flags,
+          server_cookie);
+
+  ldout(cct, 5) << __func__ << " sending identification:"
+                << " addrs=" << messenger->get_myaddrs()
+                << " gid=" << messenger->get_myname().num()
+                << " global_seq=" << gs << " features_supported=" << std::hex
+                << connection->policy.features_supported
+                << " features_required="
+		            << (connection->policy.features_required | msgr2_required)
+                << " flags=" << flags
+                << " cookie=" << server_cookie << std::dec << dendl;
+
+  connection->lock.unlock();  // accept_conn() below runs without the connection lock
+  // Because "replacing" will prevent other connections preempt this addr,
+  // it's safe that here we don't acquire Connection's lock
+  ssize_t r = messenger->accept_conn(connection);
+
+  connection->inject_delay();
+
+  connection->lock.lock();  // re-acquired before `state` is inspected again
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+                  << connection->peer_addrs->msgr2_addr()
+                  << " just fail later one(this)" << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+  if (state != SESSION_ACCEPTING) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept_conn, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED || state == NONE);
+    messenger->unregister_conn(connection);  // undo the accept_conn() registration
+    connection->inject_delay();
+    return _fault();
+  }
+
+  connection->set_features(connection_features);
+
+  // notify
+  connection->dispatch_queue->queue_accept(connection);
+  messenger->ms_deliver_handle_fast_accept(connection);
+
+  INTERCEPT(12);
+
+  return WRITE(server_ident, "server ident", server_ready);
+}
+
+CtPtr ProtocolV2::server_ready() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // A delayed-delivery queue, when one is attached to the connection, must
+  // report ready() before the session is promoted via ready().
+  const bool has_delay_state = (connection->delay_state != nullptr);
+  if (has_delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+
+  return ready();
+}
+
+CtPtr ProtocolV2::send_reconnect_ok() {
+  ldout(cct, 20) << __func__ << dendl;
+  /*
+   * Reconnect counterpart of send_server_ident(): registers this connection
+   * with the messenger (accept_conn) and, on success, writes a
+   * ReconnectOkFrame carrying our in_seq, with server_ready as the
+   * continuation handed to WRITE.  Returns _fault() if accept_conn fails or
+   * if `state` is no longer SESSION_ACCEPTING after re-locking.
+   */
+  out_seq = discard_requeued_up_to(out_seq, message_seq);  // message_seq is set from reconnect.msg_seq() in handle_reconnect
+
+  uint64_t ms = in_seq;  // value sent to the peer in the reconnect_ok frame
+  auto reconnect_ok = ReconnectOkFrame::Encode(ms);
+
+  ldout(cct, 5) << __func__ << " sending reconnect_ok: msg_seq=" << ms << dendl;
+
+  connection->lock.unlock();  // accept_conn() below runs without the connection lock
+  // Because "replacing" will prevent other connections preempt this addr,
+  // it's safe that here we don't acquire Connection's lock
+  ssize_t r = messenger->accept_conn(connection);
+
+  connection->inject_delay();
+
+  connection->lock.lock();  // re-acquired before `state` is inspected again
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+                  << connection->peer_addrs->msgr2_addr()
+                  << " just fail later one(this)" << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+  if (state != SESSION_ACCEPTING) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept_conn, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED || state == NONE);
+    messenger->unregister_conn(connection);  // undo the accept_conn() registration
+    connection->inject_delay();
+    return _fault();
+  }
+
+  // notify
+  connection->dispatch_queue->queue_accept(connection);
+  messenger->ms_deliver_handle_fast_accept(connection);
+
+  INTERCEPT(14);
+
+  return WRITE(reconnect_ok, "reconnect ok", server_ready);
+}
diff --git a/src/msg/async/ProtocolV2.h b/src/msg/async/ProtocolV2.h
new file mode 100644
index 000000000..087553891
--- /dev/null
+++ b/src/msg/async/ProtocolV2.h
@@ -0,0 +1,258 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_V2_
+#define _MSG_ASYNC_PROTOCOL_V2_
+
+#include "Protocol.h"
+#include "crypto_onwire.h"
+#include "frames_v2.h"
+
+// msgr2 protocol implementation plugged into AsyncConnection through the
+// Protocol interface.  The class holds the per-connection protocol state
+// (session cookies, sequence numbers, frame assemblers, outgoing queues)
+// and declares the continuation-style handlers for the client and the
+// server side of the handshake.
+class ProtocolV2 : public Protocol {
+private:
+  // Protocol state machine.  The enumerator order must match the name
+  // table in get_state_name().
+  enum State {
+    NONE,
+    START_CONNECT,
+    BANNER_CONNECTING,
+    HELLO_CONNECTING,
+    AUTH_CONNECTING,
+    AUTH_CONNECTING_SIGN,
+    SESSION_CONNECTING,
+    SESSION_RECONNECTING,
+    START_ACCEPT,
+    BANNER_ACCEPTING,
+    HELLO_ACCEPTING,
+    AUTH_ACCEPTING,
+    AUTH_ACCEPTING_MORE,
+    AUTH_ACCEPTING_SIGN,
+    SESSION_ACCEPTING,
+    READY,
+    THROTTLE_MESSAGE,
+    THROTTLE_BYTES,
+    THROTTLE_DISPATCH_QUEUE,
+    THROTTLE_DONE,
+    READ_MESSAGE_COMPLETE,
+    STANDBY,
+    WAIT,
+    CLOSED
+  };
+
+  // Returns the printable name of `state`.  Values that do not correspond
+  // to a State enumerator yield "UNKNOWN" instead of reading past the end
+  // of the table.
+  static const char *get_state_name(int state) {
+    // static: built once instead of on every call
+    static const char *const statenames[] = {"NONE",
+                                             "START_CONNECT",
+                                             "BANNER_CONNECTING",
+                                             "HELLO_CONNECTING",
+                                             "AUTH_CONNECTING",
+                                             "AUTH_CONNECTING_SIGN",
+                                             "SESSION_CONNECTING",
+                                             "SESSION_RECONNECTING",
+                                             "START_ACCEPT",
+                                             "BANNER_ACCEPTING",
+                                             "HELLO_ACCEPTING",
+                                             "AUTH_ACCEPTING",
+                                             "AUTH_ACCEPTING_MORE",
+                                             "AUTH_ACCEPTING_SIGN",
+                                             "SESSION_ACCEPTING",
+                                             "READY",
+                                             "THROTTLE_MESSAGE",
+                                             "THROTTLE_BYTES",
+                                             "THROTTLE_DISPATCH_QUEUE",
+                                             "THROTTLE_DONE",
+                                             "READ_MESSAGE_COMPLETE",
+                                             "STANDBY",
+                                             "WAIT",
+                                             "CLOSED"};
+    constexpr int num_states =
+        static_cast<int>(sizeof(statenames) / sizeof(statenames[0]));
+    // keep the name table in lock-step with enum State
+    static_assert(num_states == CLOSED + 1,
+                  "statenames must have exactly one entry per State");
+    if (state < 0 || state >= num_states) {
+      return "UNKNOWN";
+    }
+    return statenames[state];
+  }
+
+  // TODO: move into auth_meta?
+  ceph::crypto::onwire::rxtx_t session_stream_handlers;
+
+  entity_name_t peer_name;
+  State state;
+  uint64_t peer_supported_features;  // CEPH_MSGR2_FEATURE_*
+
+  // session identification and sequencing
+  uint64_t client_cookie;
+  uint64_t server_cookie;
+  uint64_t global_seq;
+  uint64_t connect_seq;
+  uint64_t peer_global_seq;
+  uint64_t message_seq;
+  bool reconnecting;
+  bool replacing;  // set by reuse_connection() on the connection being replaced
+  bool can_write;
+  struct out_queue_entry_t {
+    bool is_prepared {false};
+    Message* m {nullptr};
+  };
+  std::map<int, std::list<out_queue_entry_t>> out_queue;
+  std::list<Message *> sent;
+  std::atomic<uint64_t> out_seq{0};
+  std::atomic<uint64_t> in_seq{0};
+  std::atomic<uint64_t> ack_left{0};
+
+  using ProtFuncPtr = void (ProtocolV2::*)();
+  Ct<ProtocolV2> *bannerExchangeCallback;
+
+  // frame assemblers for the transmit and the receive direction
+  ceph::msgr::v2::FrameAssembler tx_frame_asm;
+  ceph::msgr::v2::FrameAssembler rx_frame_asm;
+
+  ceph::bufferlist rx_preamble;
+  ceph::bufferlist rx_epilogue;
+  ceph::msgr::v2::segment_bls_t rx_segments_data;
+  ceph::msgr::v2::Tag next_tag;
+  utime_t backoff;  // backoff time
+  utime_t recv_stamp;
+  utime_t throttle_stamp;
+
+  struct {
+    ceph::bufferlist rxbuf;
+    ceph::bufferlist txbuf;
+    bool enabled {true};
+  } pre_auth;
+
+  bool keepalive;
+  bool write_in_progress = false;
+
+  std::ostream& _conn_prefix(std::ostream *_dout);
+  void run_continuation(Ct<ProtocolV2> *pcontinuation);
+  void run_continuation(Ct<ProtocolV2> &continuation);
+
+  Ct<ProtocolV2> *read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next,
+                       rx_buffer_t&& buffer);
+  template <class F>
+  Ct<ProtocolV2> *write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        F &frame);
+  Ct<ProtocolV2> *write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        ceph::bufferlist &buffer);
+
+  template <class F>
+  bool append_frame(F& frame);
+
+  void requeue_sent();
+  uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq);
+  void reset_recv_state();
+  void reset_security();
+  void reset_throttle();
+  Ct<ProtocolV2> *_fault();
+  void discard_out_queue();
+  void reset_session();
+  void prepare_send_message(uint64_t features, Message *m);
+  out_queue_entry_t _get_next_outgoing();
+  ssize_t write_message(Message *m, bool more);
+  void handle_message_ack(uint64_t seq);
+
+  // banner exchange
+  CONTINUATION_DECL(ProtocolV2, _wait_for_peer_banner);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner_payload);
+
+  Ct<ProtocolV2> *_banner_exchange(Ct<ProtocolV2> &callback);
+  Ct<ProtocolV2> *_wait_for_peer_banner();
+  Ct<ProtocolV2> *_handle_peer_banner(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *_handle_peer_banner_payload(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *handle_hello(ceph::bufferlist &payload);
+
+  // frame reading, auth completion and throttling
+  CONTINUATION_DECL(ProtocolV2, read_frame);
+  CONTINUATION_DECL(ProtocolV2, finish_auth);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_preamble_main);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_segment);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_epilogue_main);
+  CONTINUATION_DECL(ProtocolV2, throttle_message);
+  CONTINUATION_DECL(ProtocolV2, throttle_bytes);
+  CONTINUATION_DECL(ProtocolV2, throttle_dispatch_queue);
+
+  Ct<ProtocolV2> *read_frame();
+  Ct<ProtocolV2> *finish_auth();
+  Ct<ProtocolV2> *finish_client_auth();
+  Ct<ProtocolV2> *handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *read_frame_segment();
+  Ct<ProtocolV2> *handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r);
+  Ct<ProtocolV2> *_handle_read_frame_segment();
+  Ct<ProtocolV2> *handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *_handle_read_frame_epilogue_main();
+  Ct<ProtocolV2> *handle_read_frame_dispatch();
+  Ct<ProtocolV2> *handle_frame_payload();
+
+  Ct<ProtocolV2> *ready();
+
+  Ct<ProtocolV2> *handle_message();
+  Ct<ProtocolV2> *throttle_message();
+  Ct<ProtocolV2> *throttle_bytes();
+  Ct<ProtocolV2> *throttle_dispatch_queue();
+
+  Ct<ProtocolV2> *handle_keepalive2(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_keepalive2_ack(ceph::bufferlist &payload);
+
+  Ct<ProtocolV2> *handle_message_ack(ceph::bufferlist &payload);
+
+public:
+  uint64_t connection_features;
+
+  ProtocolV2(AsyncConnection *connection);
+  virtual ~ProtocolV2();
+
+  // Protocol interface
+  virtual void connect() override;
+  virtual void accept() override;
+  virtual bool is_connected() override;
+  virtual void stop() override;
+  virtual void fault() override;
+  virtual void send_message(Message *m) override;
+  virtual void send_keepalive() override;
+
+  virtual void read_event() override;
+  virtual void write_event() override;
+  virtual bool is_queued() override;
+
+private:
+  // Client Protocol
+  CONTINUATION_DECL(ProtocolV2, start_client_banner_exchange);
+  CONTINUATION_DECL(ProtocolV2, post_client_banner_exchange);
+
+  Ct<ProtocolV2> *start_client_banner_exchange();
+  Ct<ProtocolV2> *post_client_banner_exchange();
+  // convenience overload: forwards an empty allowed-methods list
+  inline Ct<ProtocolV2> *send_auth_request() {
+    std::vector<uint32_t> empty;
+    return send_auth_request(empty);
+  }
+  Ct<ProtocolV2> *send_auth_request(std::vector<uint32_t> &allowed_methods);
+  Ct<ProtocolV2> *handle_auth_bad_method(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_reply_more(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_done(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_signature(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *send_client_ident();
+  Ct<ProtocolV2> *send_reconnect();
+  Ct<ProtocolV2> *handle_ident_missing_features(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_session_reset(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_session_retry(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_session_retry_global(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_wait(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_reconnect_ok(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_server_ident(ceph::bufferlist &payload);
+
+  // Server Protocol
+  CONTINUATION_DECL(ProtocolV2, start_server_banner_exchange);
+  CONTINUATION_DECL(ProtocolV2, post_server_banner_exchange);
+  CONTINUATION_DECL(ProtocolV2, server_ready);
+
+  Ct<ProtocolV2> *start_server_banner_exchange();
+  Ct<ProtocolV2> *post_server_banner_exchange();
+  Ct<ProtocolV2> *handle_auth_request(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_request_more(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *_handle_auth_request(ceph::bufferlist& auth_payload, bool more);
+  Ct<ProtocolV2> *_auth_bad_method(int r);
+  Ct<ProtocolV2> *handle_client_ident(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_ident_missing_features_write(int r);
+  Ct<ProtocolV2> *handle_reconnect(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_existing_connection(const AsyncConnectionRef& existing);
+  Ct<ProtocolV2> *reuse_connection(const AsyncConnectionRef& existing,
+                                   ProtocolV2 *exproto);
+  Ct<ProtocolV2> *send_server_ident();
+  Ct<ProtocolV2> *send_reconnect_ok();
+  Ct<ProtocolV2> *server_ready();
+
+  size_t get_current_msg_size() const;
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_V2_ */
diff --git a/src/msg/async/Stack.cc b/src/msg/async/Stack.cc
new file mode 100644
index 000000000..37e15634d
--- /dev/null
+++ b/src/msg/async/Stack.cc
@@ -0,0 +1,206 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <mutex>
+
+#include "include/compat.h"
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "PosixStack.h"
+#ifdef HAVE_RDMA
+#include "rdma/RDMAStack.h"
+#endif
+#ifdef HAVE_DPDK
+#include "dpdk/DPDKStack.h"
+#endif
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "stack "
+
+// Builds the thread body for worker `worker_id`.  The returned callable
+// names the thread, takes ownership of the worker's event center,
+// initializes the worker and then processes events until `done` is set,
+// after which the worker is reset and destroyed.
+std::function<void ()> NetworkStack::add_thread(unsigned worker_id)
+{
+  Worker *w = workers[worker_id];
+  return [this, w]() {
+      char tp_name[16];
+      // Bounded formatting: "%u" can expand to up to 10 digits, which would
+      // not fit behind the 12-character prefix.  snprintf truncates instead
+      // of writing past the end of tp_name.
+      snprintf(tp_name, sizeof(tp_name), "msgr-worker-%u", w->id);
+      ceph_pthread_setname(pthread_self(), tp_name);
+      const unsigned EventMaxWaitUs = 30000000;
+      w->center.set_owner();
+      ldout(cct, 10) << __func__ << " starting" << dendl;
+      w->initialize();
+      w->init_done();
+      while (!w->done) {
+        ldout(cct, 30) << __func__ << " calling event process" << dendl;
+
+        ceph::timespan dur;
+        int r = w->center.process_events(EventMaxWaitUs, &dur);
+        if (r < 0) {
+          ldout(cct, 20) << __func__ << " process events failed: "
+                         << cpp_strerror(errno) << dendl;
+          // TODO do something?
+        }
+        w->perf_logger->tinc(l_msgr_running_total_time, dur);
+      }
+      w->reset();
+      w->destroy();
+  };
+}
+
+// Creates the network stack selected by `t` ("posix", plus "rdma"/"dpdk"
+// when compiled in) and populates it with `num_workers` workers whose event
+// centers are initialized.  Aborts on an unsupported type; throws
+// std::system_error when an event center fails to initialize.
+std::shared_ptr<NetworkStack> NetworkStack::create(CephContext *c,
+						   const std::string &t)
+{
+  std::shared_ptr<NetworkStack> stack = nullptr;
+
+  if (t == "posix")
+    stack.reset(new PosixNetworkStack(c));
+#ifdef HAVE_RDMA
+  else if (t == "rdma")
+    stack.reset(new RDMAStack(c));
+#endif
+#ifdef HAVE_DPDK
+  else if (t == "dpdk")
+    stack.reset(new DPDKStack(c));
+#endif
+
+  if (stack == nullptr) {
+    lderr(c) << __func__ << " ms_async_transport_type " << t <<
+    " is not supported! " << dendl;
+    ceph_abort();
+    return nullptr;
+  }
+
+  const int InitEventNumber = 5000;
+  for (unsigned worker_id = 0; worker_id < stack->num_workers; ++worker_id) {
+    // Hold the new worker in a unique_ptr until it is stored in the stack,
+    // so it is released (not leaked) if center.init() fails and we throw.
+    std::unique_ptr<Worker> w(stack->create_worker(c, worker_id));
+    int ret = w->center.init(InitEventNumber, worker_id, t);
+    if (ret)
+      throw std::system_error(-ret, std::generic_category());
+    stack->workers.push_back(w.get());
+    w.release();  // stored in stack->workers from here on
+  }
+
+  return stack;
+}
+
+// Reads the worker count from ms_async_op_threads and clamps it to
+// EventCenter::MAX_EVENTCENTER.
+NetworkStack::NetworkStack(CephContext *c)
+  : cct(c)
+{
+  ceph_assert(cct->_conf->ms_async_op_threads > 0);
+
+  num_workers = cct->_conf->ms_async_op_threads;
+  if (num_workers < EventCenter::MAX_EVENTCENTER) {
+    // below the cap: use the configured value unchanged
+    return;
+  }
+
+  // at or above the cap: report it and fall back to the maximum
+  ldout(cct, 0) << __func__ << " max thread limit is "
+                << EventCenter::MAX_EVENTCENTER << ", switching to this now. "
+                << "Higher thread values are unnecessary and currently unsupported."
+                << dendl;
+  num_workers = EventCenter::MAX_EVENTCENTER;
+}
+
+// Spawns a thread for every worker that is not initialized yet and then
+// waits until all workers have finished initializing.  Does nothing when
+// the stack has already been started.
+void NetworkStack::start()
+{
+  {
+    // pool_spin is held only while threads are spawned, not while waiting
+    std::lock_guard<decltype(pool_spin)> guard(pool_spin);
+
+    if (started) {
+      return;
+    }
+
+    for (unsigned idx = 0; idx != num_workers; ++idx) {
+      if (!workers[idx]->is_init()) {
+        std::function<void ()> body = add_thread(idx);
+        spawn_worker(idx, std::move(body));
+      }
+    }
+    started = true;
+  }
+
+  for (unsigned idx = 0; idx != num_workers; ++idx) {
+    workers[idx]->wait_for_init();
+  }
+}
+
+// Returns the worker with the fewest references (the first one on ties)
+// and takes a reference on it.
+Worker* NetworkStack::get_worker()
+{
+  ldout(cct, 30) << __func__ << dendl;
+
+  Worker* current_best = nullptr;
+  // seed with a reasonably large number so a real load compares lower
+  unsigned min_load = std::numeric_limits<int>::max();
+
+  {
+    // Linear scan under pool_spin.  Returning early on references == 0 is
+    // tempting, but that happens too rarely to deserve a special case.
+    std::lock_guard<decltype(pool_spin)> guard(pool_spin);
+    for (unsigned idx = 0; idx != num_workers; ++idx) {
+      Worker* candidate = workers[idx];
+      const unsigned worker_load = candidate->references.load();
+      if (worker_load >= min_load) {
+        continue;
+      }
+      current_best = candidate;
+      min_load = worker_load;
+    }
+  }
+
+  ceph_assert(current_best);
+  ++current_best->references;
+  return current_best;
+}
+
+// Flags every worker as done, wakes its event center and joins its thread,
+// one worker after another, then marks the stack as not started.
+void NetworkStack::stop()
+{
+  std::lock_guard<decltype(pool_spin)> guard(pool_spin);
+  for (unsigned idx = 0; idx != num_workers; ++idx) {
+    Worker *w = workers[idx];
+    w->done = true;
+    w->center.wakeup();
+    join_worker(idx);
+  }
+  started = false;
+}
+
+// Countdown callback: each do_request() call decrements the counter given
+// to the constructor; wait() blocks until the counter has reached zero.
+class C_drain : public EventCallback {
+  ceph::mutex drain_lock = ceph::make_mutex("C_drain::drain_lock");
+  ceph::condition_variable drain_cond;
+  unsigned drain_count;
+
+ public:
+  explicit C_drain(size_t c) : drain_count(c) {}
+
+  // Counts one completion and wakes all waiters on the last one.
+  void do_request(uint64_t id) override {
+    std::lock_guard guard{drain_lock};
+    if (--drain_count == 0) {
+      drain_cond.notify_all();
+    }
+  }
+
+  // Blocks the caller until every expected do_request() has run.
+  void wait() {
+    std::unique_lock guard{drain_lock};
+    while (drain_count != 0) {
+      drain_cond.wait(guard);
+    }
+  }
+};
+
+// Dispatches one C_drain callback to every worker's event center and
+// blocks until each of them has run it.  Asserts that the caller is not
+// the owner thread of any worker's event center.
+void NetworkStack::drain()
+{
+  ldout(cct, 30) << __func__ << " started." << dendl;
+  pthread_t cur = pthread_self();
+
+  std::unique_lock<decltype(pool_spin)> guard(pool_spin);
+  C_drain drain(num_workers);
+  for (unsigned idx = 0; idx != num_workers; ++idx) {
+    Worker *w = workers[idx];
+    ceph_assert(cur != w->center.get_owner());
+    w->center.dispatch_event_external(EventCallbackRef(&drain));
+  }
+  // do not hold pool_spin while waiting for the workers
+  guard.unlock();
+
+  drain.wait();
+  ldout(cct, 30) << __func__ << " end." << dendl;
+}
diff --git a/src/msg/async/Stack.h b/src/msg/async/Stack.h
new file mode 100644
index 000000000..6a2188b5a
--- /dev/null
+++ b/src/msg/async/Stack.h
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNC_STACK_H
+#define CEPH_MSG_ASYNC_STACK_H
+
+#include "include/spinlock.h"
+#include "common/perf_counters.h"
+#include "msg/msg_types.h"
+#include "msg/async/Event.h"
+
+class Worker;
+/// Backend interface behind a \c ConnectedSocket; every operation of the
+/// wrapper is forwarded to one of these pure virtual functions.
+class ConnectedSocketImpl {
+ public:
+  virtual ~ConnectedSocketImpl() = default;
+
+  virtual int is_connected() = 0;
+  virtual ssize_t read(char* buf, size_t len) = 0;
+  virtual ssize_t send(ceph::buffer::list &bl, bool more) = 0;
+  virtual void shutdown() = 0;
+  virtual void close() = 0;
+  virtual int fd() const = 0;
+};
+
+class ConnectedSocket;
+/// Per-socket settings passed by const reference to the listen(), connect()
+/// and accept() entry points declared in this header. How each field is
+/// applied is up to the backend and is not visible here.
+struct SocketOptions {
+  // presumably selects non-blocking mode; on by default
+  bool nonblock = true;
+  // presumably disables Nagle's algorithm (TCP_NODELAY) — TODO confirm
+  bool nodelay = true;
+  // presumably a receive buffer size, 0 meaning "leave unchanged" — TODO confirm
+  int rcbuf_size = 0;
+  // presumably a socket priority, -1 meaning "not set" — TODO confirm
+  int priority = -1;
+  // NOTE(review): looks like the local address to bind before connecting — confirm
+  entity_addr_t connect_bind_addr;
+};
+
+/// \cond internal
+/// Backend interface behind a \c ServerSocket. Besides the virtual
+/// operations it records which address type and which address slot the
+/// listening socket was created for.
+class ServerSocketImpl {
+ public:
+  unsigned addr_type; ///< entity_addr_t::TYPE_*
+  unsigned addr_slot; ///< position of our addr in myaddrs().v
+
+  ServerSocketImpl(unsigned type, unsigned slot)
+    : addr_type(type),
+      addr_slot(slot) {}
+  virtual ~ServerSocketImpl() = default;
+
+  virtual int accept(ConnectedSocket *sock, const SocketOptions &opt,
+                     entity_addr_t *out, Worker *w) = 0;
+  virtual void abort_accept() = 0;
+  /// Get file descriptor
+  virtual int fd() const = 0;
+};
+/// \endcond
+
+/// \addtogroup networking-module
+/// @{
+
+/// A TCP (or other stream-based protocol) connection.
+///
+/// A \c ConnectedSocket represents a full-duplex stream between
+/// two endpoints, a local endpoint and a remote endpoint.
+class ConnectedSocket {
+  std::unique_ptr<ConnectedSocketImpl> _csi;
+
+ public:
+  /// Constructs a \c ConnectedSocket not corresponding to a connection.
+  /// Such an object converts to \c false; every forwarding member below
+  /// dereferences the implementation without a null check, so they must
+  /// only be called on a socket that holds an implementation.
+  ConnectedSocket() = default;
+  /// \cond internal
+  explicit ConnectedSocket(std::unique_ptr<ConnectedSocketImpl> csi)
+      : _csi(std::move(csi)) {}
+  /// \endcond
+  /// Closes the implementation if one is still held.
+  ~ConnectedSocket() {
+    if (_csi) {
+      _csi->close();
+    }
+  }
+  /// Moves a \c ConnectedSocket object.
+  ConnectedSocket(ConnectedSocket&& cs) = default;
+  /// Move-assigns a \c ConnectedSocket object.
+  ConnectedSocket& operator=(ConnectedSocket&& cs) = default;
+
+  /// Forwards to the implementation's is_connected().
+  int is_connected() {
+    return _csi->is_connected();
+  }
+
+  /// Reads from the connection into a caller-supplied buffer.
+  ///
+  /// \param buf destination buffer
+  /// \param len size passed on to the implementation
+  /// \return whatever the implementation's read() returns
+  ssize_t read(char* buf, size_t len) {
+    return _csi->read(buf, len);
+  }
+
+  /// Sends the content of a bufferlist over the connection.
+  ///
+  /// \param bl data to send (taken by non-const reference)
+  /// \param more flag forwarded unchanged to the implementation
+  /// \return whatever the implementation's send() returns
+  ssize_t send(ceph::buffer::list &bl, bool more) {
+    return _csi->send(bl, more);
+  }
+
+  /// Disables output to the socket.
+  ///
+  /// Current or future writes that have not been successfully flushed
+  /// will immediately fail with an error.  This is useful to abort
+  /// operations on a socket that is not making progress due to a
+  /// peer failure.
+  void shutdown() {
+    _csi->shutdown();
+  }
+
+  /// Closes the implementation and releases it; afterwards this object
+  /// converts to \c false again.
+  void close() {
+    _csi->close();
+    _csi.reset();
+  }
+
+  /// Get file descriptor
+  int fd() const {
+    return _csi->fd();
+  }
+
+  /// True while an implementation is held.
+  explicit operator bool() const {
+    return _csi != nullptr;
+  }
+};
+/// @}
+
+/// \addtogroup networking-module
+/// @{
+
+/// A listening socket, waiting to accept incoming network connections.
+class ServerSocket {
+  std::unique_ptr<ServerSocketImpl> _ssi;
+
+ public:
+  /// Constructs a \c ServerSocket not corresponding to a listening socket.
+  /// Such an object converts to \c false; the forwarding members below
+  /// dereference the implementation without a null check.
+  ServerSocket() = default;
+  /// \cond internal
+  explicit ServerSocket(std::unique_ptr<ServerSocketImpl> ssi)
+      : _ssi(std::move(ssi)) {}
+  /// Aborts accepting on the implementation if one is still held.
+  ~ServerSocket() {
+    if (_ssi) {
+      _ssi->abort_accept();
+    }
+  }
+  /// \endcond
+  /// Moves a \c ServerSocket object.
+  ServerSocket(ServerSocket&& ss) = default;
+  /// Move-assigns a \c ServerSocket object.
+  ServerSocket& operator=(ServerSocket&& cs) = default;
+
+  /// Accepts the next connection on this socket.
+  ///
+  /// \param sock receives the \ref ConnectedSocket for the new connection
+  /// \param opt socket options forwarded to the implementation
+  /// \param out receives the \ref entity_addr_t of the remote endpoint
+  /// \param w worker forwarded to the implementation
+  /// \return whatever the implementation's accept() returns
+  int accept(ConnectedSocket *sock, const SocketOptions &opt,
+             entity_addr_t *out, Worker *w) {
+    return _ssi->accept(sock, opt, out, w);
+  }
+
+  /// Stops any \ref accept() in progress.
+  ///
+  /// Current and future \ref accept() calls will terminate immediately
+  /// with an error. The implementation is released afterwards, so this
+  /// object converts to \c false again.
+  void abort_accept() {
+    _ssi->abort_accept();
+    _ssi.reset();
+  }
+
+  /// Get file descriptor
+  int fd() const {
+    return _ssi->fd();
+  }
+
+  /// Slot of the listen/bind address this socket was created for.
+  unsigned get_addr_slot() {
+    return _ssi->addr_slot;
+  }
+
+  /// True while an implementation is held.
+  explicit operator bool() const {
+    return _ssi != nullptr;
+  }
+};
+/// @}
+
+class NetworkStack;
+
+// Perf counter indices for the per-Worker counter set built in the Worker
+// constructor. l_msgr_first and l_msgr_last are passed to
+// PerfCountersBuilder as the lower and upper bound; every enumerator in
+// between is registered there under the name noted in its string argument.
+enum {
+  l_msgr_first = 94000,
+  // u64 counters
+  l_msgr_recv_messages,
+  l_msgr_send_messages,
+  l_msgr_recv_bytes,
+  l_msgr_send_bytes,
+  l_msgr_created_connections,
+  l_msgr_active_connections,
+
+  // time counters (registered with add_time)
+  l_msgr_running_total_time,
+  l_msgr_running_send_time,
+  l_msgr_running_recv_time,
+  l_msgr_running_fast_dispatch_time,
+
+  // averaged latencies (registered with add_time_avg)
+  l_msgr_send_messages_queue_lat,
+  l_msgr_handle_ack_lat,
+
+  l_msgr_last,
+};
+
+/// Base class of a messenger worker.
+///
+/// A Worker bundles an EventCenter, a set of perf counters registered under
+/// the name "AsyncMessenger::Worker-<id>", a reference counter used by
+/// NetworkStack when picking a worker, and a small "initialised" handshake
+/// (init_done() / wait_for_init()). Backends derive from it and implement
+/// listen() and connect().
+class Worker {
+  std::mutex init_lock;
+  std::condition_variable init_cond;
+  bool init = false;
+
+ public:
+  // NOTE(review): plain bool that NetworkStack::stop() sets from the calling
+  // thread; confirm whether cross-thread readers need an atomic here.
+  bool done = false;
+
+  CephContext *cct;
+  PerfCounters *perf_logger;
+  unsigned id;
+
+  std::atomic_uint references;
+  EventCenter center;
+
+  Worker(const Worker&) = delete;
+  Worker& operator=(const Worker&) = delete;
+
+  /// Creates the worker and registers its perf counters with the
+  /// CephContext's perf counter collection.
+  ///
+  /// \param c context used for the perf counters and the event center
+  /// \param worker_id index of this worker; becomes part of the counter name
+  Worker(CephContext *c, unsigned worker_id)
+    : cct(c), perf_logger(nullptr), id(worker_id), references(0), center(c) {
+    char name[128];
+    // bounded formatting: never writes past the end of `name`
+    snprintf(name, sizeof(name), "AsyncMessenger::Worker-%u", id);
+    // initialize perf_logger
+    PerfCountersBuilder plb(cct, name, l_msgr_first, l_msgr_last);
+
+    plb.add_u64_counter(l_msgr_recv_messages, "msgr_recv_messages", "Network received messages");
+    plb.add_u64_counter(l_msgr_send_messages, "msgr_send_messages", "Network sent messages");
+    plb.add_u64_counter(l_msgr_recv_bytes, "msgr_recv_bytes", "Network received bytes", nullptr, 0, unit_t(UNIT_BYTES));
+    plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network sent bytes", nullptr, 0, unit_t(UNIT_BYTES));
+    plb.add_u64_counter(l_msgr_active_connections, "msgr_active_connections", "Active connection number");
+    plb.add_u64_counter(l_msgr_created_connections, "msgr_created_connections", "Created connection number");
+
+    plb.add_time(l_msgr_running_total_time, "msgr_running_total_time", "The total time of thread running");
+    plb.add_time(l_msgr_running_send_time, "msgr_running_send_time", "The total time of message sending");
+    plb.add_time(l_msgr_running_recv_time, "msgr_running_recv_time", "The total time of message receiving");
+    plb.add_time(l_msgr_running_fast_dispatch_time, "msgr_running_fast_dispatch_time", "The total time of fast dispatch");
+
+    plb.add_time_avg(l_msgr_send_messages_queue_lat, "msgr_send_messages_queue_lat", "Network sent messages lat");
+    plb.add_time_avg(l_msgr_handle_ack_lat, "msgr_handle_ack_lat", "Connection handle ack lat");
+
+    perf_logger = plb.create_perf_counters();
+    cct->get_perfcounters_collection()->add(perf_logger);
+  }
+
+  /// Unregisters and frees the perf counters created by the constructor.
+  virtual ~Worker() {
+    if (perf_logger) {
+      cct->get_perfcounters_collection()->remove(perf_logger);
+      delete perf_logger;
+    }
+  }
+
+  /// Start listening on \p addr; the resulting socket is stored in the
+  /// ServerSocket passed as last argument.
+  virtual int listen(entity_addr_t &addr, unsigned addr_slot,
+                     const SocketOptions &opts, ServerSocket *) = 0;
+  /// Connect to \p addr; the resulting socket is stored in \p socket.
+  virtual int connect(const entity_addr_t &addr,
+                      const SocketOptions &opts, ConnectedSocket *socket) = 0;
+  /// Backend hook; does nothing by default.
+  virtual void destroy() {}
+
+  /// Backend hook; does nothing by default.
+  virtual void initialize() {}
+  PerfCounters *get_perf_counter() { return perf_logger; }
+
+  /// Drop one reference; asserts that at least one was held.
+  void release_worker() {
+    int oldref = references.fetch_sub(1);
+    ceph_assert(oldref > 0);
+  }
+
+  /// Mark the worker as initialised and wake everybody in wait_for_init().
+  void init_done() {
+    std::lock_guard<std::mutex> l(init_lock);
+    init = true;
+    init_cond.notify_all();
+  }
+
+  /// \return true once init_done() was called and reset() has not been since
+  bool is_init() {
+    std::lock_guard<std::mutex> l(init_lock);
+    return init;
+  }
+
+  /// Block until init_done() has been called.
+  void wait_for_init() {
+    std::unique_lock<std::mutex> l(init_lock);
+    init_cond.wait(l, [this] { return init; });
+  }
+
+  /// Return to the "not initialised, not done" state.
+  void reset() {
+    {
+      std::lock_guard<std::mutex> l(init_lock);
+      init = false;
+      init_cond.notify_all();
+    }
+    // cleared outside init_lock, as before
+    done = false;
+  }
+};
+
+/// Owner of a pool of Worker objects for one network backend.
+///
+/// Concrete backends implement create_worker(), spawn_worker() and
+/// join_worker(); start(), stop(), drain() and the default get_worker() are
+/// implemented out of line.
+class NetworkStack {
+  unsigned num_workers = 0;
+  ceph::spinlock pool_spin;
+  bool started = false;
+
+  std::function<void ()> add_thread(unsigned i);
+
+  virtual Worker* create_worker(CephContext *c, unsigned i) = 0;
+
+ protected:
+  CephContext *cct;
+  std::vector<Worker*> workers;
+
+  explicit NetworkStack(CephContext *c);
+
+ public:
+  NetworkStack(const NetworkStack &) = delete;
+  NetworkStack& operator=(const NetworkStack &) = delete;
+
+  /// Deletes every Worker stored in `workers`.
+  virtual ~NetworkStack() {
+    for (Worker *w : workers) {
+      delete w;
+    }
+  }
+
+  static std::shared_ptr<NetworkStack> create(
+    CephContext *c, const std::string &type);
+
+  // A backend needs to override this method if it does not support a
+  // shared listen table.
+  // For example, the posix backend has an in-kernel global listen table: if
+  // one thread binds a port, the other threads are aware of it as well.
+  // For the dpdk backend the listen table is maintained per thread, so each
+  // thread has to bind the port itself.
+  virtual bool support_local_listen_table() const { return false; }
+  virtual bool nonblock_connect_need_writable_event() const { return true; }
+
+  void start();
+  void stop();
+  virtual Worker *get_worker();
+  /// Direct, unchecked access to the worker stored at index \p worker_id.
+  Worker *get_worker(unsigned worker_id) {
+    return workers[worker_id];
+  }
+  void drain();
+  unsigned get_num_worker() const {
+    return num_workers;
+  }
+
+  // direct is used in tests only
+  virtual void spawn_worker(unsigned i, std::function<void ()> &&) = 0;
+  virtual void join_worker(unsigned i) = 0;
+
+  virtual bool is_ready() { return true; }
+  virtual void ready() {}
+};
+
+#endif //CEPH_MSG_ASYNC_STACK_H
diff --git a/src/msg/async/crypto_onwire.cc b/src/msg/async/crypto_onwire.cc
new file mode 100644
index 000000000..615820b35
--- /dev/null
+++ b/src/msg/async/crypto_onwire.cc
@@ -0,0 +1,309 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <array>
+#include <openssl/evp.h>
+
+#include "crypto_onwire.h"
+
+#include "common/debug.h"
+#include "common/ceph_crypto.h"
+#include "include/types.h"
+
+#define dout_subsys ceph_subsys_ms
+
+namespace ceph::crypto::onwire {
+
+// Sizes, in bytes, of the AES-128-GCM key, IV (nonce), authentication tag
+// and cipher block used by the handlers in this file.
+static constexpr std::size_t AESGCM_KEY_LEN = 16;
+static constexpr std::size_t AESGCM_IV_LEN = 12;
+static constexpr std::size_t AESGCM_TAG_LEN = 16;
+static constexpr std::size_t AESGCM_BLOCK_LEN = 16;
+
+// The 12-byte GCM nonce: a 32-bit field followed by a 64-bit field, both
+// little-endian and packed so that the struct can be handed to OpenSSL as
+// the raw IV.
+struct nonce_t {
+  ceph_le32 fixed;
+  ceph_le64 counter;
+
+  // byte-wise comparison of the whole (padding-free) struct
+  bool operator==(const nonce_t& rhs) const {
+    return memcmp(this, &rhs, sizeof(*this)) == 0;
+  }
+} __attribute__((packed));
+static_assert(sizeof(nonce_t) == AESGCM_IV_LEN);
+
+using key_t = std::array<std::uint8_t, AESGCM_KEY_LEN>;
+
+// http://www.mindspring.com/~dmcgrew/gcm-nist-6.pdf
+// https://www.openssl.org/docs/man1.0.2/crypto/EVP_aes_128_gcm.html#GCM-mode
+// https://wiki.openssl.org/index.php/EVP_Authenticated_Encryption_and_Decryption
+// https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
+// Transmit-side AES-128-GCM handler. The cipher and key are installed once
+// in the constructor; the nonce is supplied on every reset_tx_handler().
+class AES128GCM_OnWireTxHandler : public ceph::crypto::onwire::TxHandler {
+  CephContext* const cct;
+  std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx;
+  ceph::bufferlist buffer;
+  nonce_t nonce, initial_nonce;
+  bool used_initial_nonce;
+  bool new_nonce_format;  // 64-bit counter?
+  static_assert(sizeof(nonce) == AESGCM_IV_LEN);
+
+public:
+  // Throws std::runtime_error if OpenSSL rejects the cipher or the key.
+  AES128GCM_OnWireTxHandler(CephContext* const cct,
+                            const key_t& key,
+                            const nonce_t& nonce,
+                            bool new_nonce_format)
+    : cct(cct),
+      ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free),
+      nonce(nonce),
+      initial_nonce(nonce),
+      used_initial_nonce(false),
+      new_nonce_format(new_nonce_format) {
+    ceph_assert_always(ectx);
+    ceph_assert_always(key.size() * CHAR_BIT == 128);
+
+    EVP_CIPHER_CTX* const ctx = ectx.get();
+
+    // step 1: select the cipher only
+    const int cipher_rc =
+      EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), nullptr, nullptr, nullptr);
+    if (cipher_rc != 1) {
+      throw std::runtime_error("EVP_EncryptInit_ex failed");
+    }
+
+    // step 2: install the key, still without an IV
+    const int key_rc =
+      EVP_EncryptInit_ex(ctx, nullptr, nullptr, key.data(), nullptr);
+    if (key_rc != 1) {
+      throw std::runtime_error("EVP_EncryptInit_ex failed");
+    }
+  }
+
+  // Wipes both nonce copies.
+  ~AES128GCM_OnWireTxHandler() override {
+    ::TOPNSPC::crypto::zeroize_for_security(&nonce, sizeof(nonce));
+    ::TOPNSPC::crypto::zeroize_for_security(&initial_nonce, sizeof(initial_nonce));
+  }
+
+  void reset_tx_handler(const uint32_t* first, const uint32_t* last) override;
+
+  void authenticated_encrypt_update(const ceph::bufferlist& plaintext) override;
+  ceph::bufferlist authenticated_encrypt_final() override;
+};
+
+// Prepare for a new update...final round.
+//
+// [first, last) lists the sizes of the upcoming update calls; their sum plus
+// the tag length is reserved in the output buffer. The current nonce is
+// installed as IV and then advanced for the next round. Throws
+// TxHandlerError once the nonce has come back to its initial value after
+// that value was already used, and std::runtime_error if OpenSSL fails.
+void AES128GCM_OnWireTxHandler::reset_tx_handler(const uint32_t* first,
+                                                 const uint32_t* last)
+{
+  const bool at_initial_nonce = (nonce == initial_nonce);
+  if (at_initial_nonce && used_initial_nonce) {
+    throw ceph::crypto::onwire::TxHandlerError("out of nonces");
+  }
+  if (at_initial_nonce) {
+    used_initial_nonce = true;
+  }
+
+  const auto* const iv = reinterpret_cast<const unsigned char*>(&nonce);
+  if (EVP_EncryptInit_ex(ectx.get(), nullptr, nullptr, nullptr, iv) != 1) {
+    throw std::runtime_error("EVP_EncryptInit_ex failed");
+  }
+
+  ceph_assert(buffer.get_append_buffer_unused_tail_length() == 0);
+  const auto wanted = std::accumulate(first, last, AESGCM_TAG_LEN);
+  buffer.reserve(wanted);
+
+  if (new_nonce_format) {
+    nonce.counter = nonce.counter + 1;
+  } else {
+    // msgr2.0: 32-bit counter followed by 64-bit fixed field,
+    // susceptible to overflow!
+    nonce.fixed = nonce.fixed + 1;
+  }
+}
+
+// Encrypt `plaintext` segment by segment into the space reserved by
+// reset_tx_handler(). Asserts that enough reserved space is left and that
+// OpenSSL emits exactly as many bytes as each segment holds; throws
+// std::runtime_error if EVP_EncryptUpdate fails.
+void AES128GCM_OnWireTxHandler::authenticated_encrypt_update(
+  const ceph::bufferlist& plaintext)
+{
+  ceph_assert(buffer.get_append_buffer_unused_tail_length() >=
+              plaintext.length());
+  auto out = buffer.append_hole(plaintext.length());
+
+  for (const auto& segment : plaintext.buffers()) {
+    int produced = 0;
+    auto* const dst = reinterpret_cast<unsigned char*>(out.c_str());
+    const auto* const src =
+      reinterpret_cast<const unsigned char*>(segment.c_str());
+
+    const int rc =
+      EVP_EncryptUpdate(ectx.get(), dst, &produced, src, segment.length());
+    if (rc != 1) {
+      throw std::runtime_error("EVP_EncryptUpdate failed");
+    }
+    ceph_assert_always(produced >= 0);
+    ceph_assert(static_cast<unsigned>(produced) == segment.length());
+    out.advance(produced);
+  }
+
+  ldout(cct, 15) << __func__
+                 << " plaintext.length()=" << plaintext.length()
+                 << " buffer.length()=" << buffer.length()
+                 << dendl;
+}
+
+// Finish the current round: finalise the cipher, write the GCM tag into the
+// last reserved bytes and hand the accumulated ciphertext + tag to the
+// caller. Throws std::runtime_error if an OpenSSL call fails.
+ceph::bufferlist AES128GCM_OnWireTxHandler::authenticated_encrypt_final()
+{
+  // the tag is written into a hole that is sized as one cipher block
+  static_assert(AESGCM_BLOCK_LEN == AESGCM_TAG_LEN);
+
+  ceph_assert(buffer.get_append_buffer_unused_tail_length() ==
+              AESGCM_BLOCK_LEN);
+  auto tag_slot = buffer.append_hole(AESGCM_BLOCK_LEN);
+
+  int final_len = 0;
+  const int final_rc = EVP_EncryptFinal_ex(
+    ectx.get(), reinterpret_cast<unsigned char*>(tag_slot.c_str()), &final_len);
+  if (final_rc != 1) {
+    throw std::runtime_error("EVP_EncryptFinal_ex failed");
+  }
+  ceph_assert_always(final_len == 0);
+
+  const int tag_rc = EVP_CIPHER_CTX_ctrl(
+    ectx.get(), EVP_CTRL_GCM_GET_TAG, AESGCM_TAG_LEN, tag_slot.c_str());
+  if (tag_rc != 1) {
+    throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed");
+  }
+
+  ldout(cct, 15) << __func__
+                 << " buffer.length()=" << buffer.length()
+                 << " final_len=" << final_len
+                 << dendl;
+  // `buffer` is a member, so the move is what empties it for the next round
+  return std::move(buffer);
+}
+
+// RX PART
+// Receive-side AES-128-GCM handler. The cipher and key are installed once
+// in the constructor; the nonce is supplied on every reset_rx_handler().
+class AES128GCM_OnWireRxHandler : public ceph::crypto::onwire::RxHandler {
+  std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx;
+  nonce_t nonce;
+  bool new_nonce_format;  // 64-bit counter?
+  static_assert(sizeof(nonce) == AESGCM_IV_LEN);
+
+public:
+  // Throws std::runtime_error if OpenSSL rejects the cipher or the key.
+  // The `cct` argument is accepted but not used by this class.
+  AES128GCM_OnWireRxHandler(CephContext* const cct,
+                            const key_t& key,
+                            const nonce_t& nonce,
+                            bool new_nonce_format)
+    : ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free),
+      nonce(nonce),
+      new_nonce_format(new_nonce_format) {
+    ceph_assert_always(ectx);
+    ceph_assert_always(key.size() * CHAR_BIT == 128);
+
+    EVP_CIPHER_CTX* const ctx = ectx.get();
+
+    // step 1: select the cipher only
+    const int cipher_rc =
+      EVP_DecryptInit_ex(ctx, EVP_aes_128_gcm(), nullptr, nullptr, nullptr);
+    if (cipher_rc != 1) {
+      throw std::runtime_error("EVP_DecryptInit_ex failed");
+    }
+
+    // step 2: install the key, still without an IV
+    const int key_rc =
+      EVP_DecryptInit_ex(ctx, nullptr, nullptr, key.data(), nullptr);
+    if (key_rc != 1) {
+      throw std::runtime_error("EVP_DecryptInit_ex failed");
+    }
+  }
+
+  // Wipes the nonce.
+  ~AES128GCM_OnWireRxHandler() override {
+    ::TOPNSPC::crypto::zeroize_for_security(&nonce, sizeof(nonce));
+  }
+
+  // The transmitter appends the GCM tag at the final step.
+  std::uint32_t get_extra_size_at_final() override {
+    return AESGCM_TAG_LEN;
+  }
+  void reset_rx_handler() override;
+  void authenticated_decrypt_update(ceph::bufferlist& bl) override;
+  void authenticated_decrypt_update_final(ceph::bufferlist& bl) override;
+};
+
+// Prepare for a new update...final round: install the current nonce as IV
+// and then advance it for the next round. Throws std::runtime_error if
+// OpenSSL fails.
+void AES128GCM_OnWireRxHandler::reset_rx_handler()
+{
+  const auto* const iv = reinterpret_cast<const unsigned char*>(&nonce);
+  if (EVP_DecryptInit_ex(ectx.get(), nullptr, nullptr, nullptr, iv) != 1) {
+    throw std::runtime_error("EVP_DecryptInit_ex failed");
+  }
+
+  if (new_nonce_format) {
+    nonce.counter = nonce.counter + 1;
+  } else {
+    // msgr2.0: 32-bit counter followed by 64-bit fixed field,
+    // susceptible to overflow!
+    nonce.fixed = nonce.fixed + 1;
+  }
+}
+
+// Decrypt `bl` in place, one underlying buffer at a time. Asserts that
+// OpenSSL emits exactly as many bytes as each buffer holds; throws
+// std::runtime_error if EVP_DecryptUpdate fails.
+void AES128GCM_OnWireRxHandler::authenticated_decrypt_update(
+  ceph::bufferlist& bl)
+{
+  // discard cached crcs as we will be writing through c_str()
+  bl.invalidate_crc();
+  for (auto& segment : bl.buffers()) {
+    // input and output are the same memory
+    auto* const data =
+      reinterpret_cast<unsigned char*>(const_cast<char*>(segment.c_str()));
+    int produced = 0;
+
+    const int rc =
+      EVP_DecryptUpdate(ectx.get(), data, &produced, data, segment.length());
+    if (rc != 1) {
+      throw std::runtime_error("EVP_DecryptUpdate failed");
+    }
+    ceph_assert_always(produced >= 0);
+    ceph_assert(static_cast<unsigned>(produced) == segment.length());
+  }
+}
+
+// Final step of a decryption round.
+//
+// `bl` must end with the GCM tag. The tag is split off; any ciphertext
+// in front of it is decrypted in place (so this call combines an optional
+// update with the final step), and then the tag is verified. On return `bl`
+// holds the plaintext without the tag. Throws MsgAuthError when the tag
+// does not verify and std::runtime_error if setting the tag fails.
+void AES128GCM_OnWireRxHandler::authenticated_decrypt_update_final(
+  ceph::bufferlist& bl)
+{
+  const unsigned orig_len = bl.length();
+  ceph_assert(orig_len >= AESGCM_TAG_LEN);
+
+  // The caller is obliged to provide only the signature, but it may supply
+  // trailing ciphertext in front of it as well.
+  ceph::bufferlist auth_tag;
+  bl.splice(orig_len - AESGCM_TAG_LEN, AESGCM_TAG_LEN, &auth_tag);
+  if (bl.length() > 0) {
+    authenticated_decrypt_update(bl);
+  }
+
+  // we need to ensure the tag is stored in continuous memory.
+  const int tag_rc = EVP_CIPHER_CTX_ctrl(
+    ectx.get(), EVP_CTRL_GCM_SET_TAG, AESGCM_TAG_LEN, auth_tag.c_str());
+  if (tag_rc != 1) {
+    throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed");
+  }
+
+  // No bytes are expected to be appended here; the call is made solely to
+  // authenticate the message.
+  int final_len = 0;
+  const int final_rc = EVP_DecryptFinal_ex(ectx.get(), nullptr, &final_len);
+  if (final_rc <= 0) {
+    throw MsgAuthError();
+  }
+  ceph_assert_always(final_len == 0);
+  ceph_assert(bl.length() + AESGCM_TAG_LEN == orig_len);
+}
+
+// Build the receive/transmit handler pair for a connection.
+//
+// In secure mode the connection secret is read as: key, then one nonce,
+// then a second nonce. The first nonce goes to the receiver and the second
+// to the transmitter, or the other way round when `crossed` is set. Outside
+// secure mode both handlers are null.
+ceph::crypto::onwire::rxtx_t ceph::crypto::onwire::rxtx_t::create_handler_pair(
+  CephContext* cct,
+  const AuthConnectionMeta& auth_meta,
+  bool new_nonce_format,
+  bool crossed)
+{
+  if (!auth_meta.is_mode_secure()) {
+    return { nullptr, nullptr };
+  }
+
+  ceph_assert_always(auth_meta.connection_secret.length() >=
+    sizeof(key_t) + 2 * sizeof(nonce_t));
+  const char* const secret = auth_meta.connection_secret.c_str();
+
+  // layout of the secret: [ key | first nonce | second nonce ]
+  key_t key;
+  ::memcpy(key.data(), secret, sizeof(key));
+
+  nonce_t rx_nonce;
+  ::memcpy(&rx_nonce, secret + sizeof(key), sizeof(rx_nonce));
+
+  nonce_t tx_nonce;
+  ::memcpy(&tx_nonce, secret + sizeof(key) + sizeof(rx_nonce),
+           sizeof(tx_nonce));
+
+  const nonce_t& nonce_for_rx = crossed ? tx_nonce : rx_nonce;
+  const nonce_t& nonce_for_tx = crossed ? rx_nonce : tx_nonce;
+  return {
+    std::make_unique<AES128GCM_OnWireRxHandler>(
+      cct, key, nonce_for_rx, new_nonce_format),
+    std::make_unique<AES128GCM_OnWireTxHandler>(
+      cct, key, nonce_for_tx, new_nonce_format)
+  };
+}
+
+} // namespace ceph::crypto::onwire
diff --git a/src/msg/async/crypto_onwire.h b/src/msg/async/crypto_onwire.h
new file mode 100644
index 000000000..55f755086
--- /dev/null
+++ b/src/msg/async/crypto_onwire.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_CRYPTO_ONWIRE_H
+#define CEPH_CRYPTO_ONWIRE_H
+
+#include <cstdint>
+#include <memory>
+
+#include "auth/Auth.h"
+#include "include/buffer.h"
+
+namespace ceph::math {
+
+// TODO
+// Placeholder wrapper around a single value of type T; nothing beyond
+// storing the value is implemented yet.
+// NOTE(review): no access specifier is given, so both `val` and the
+// forwarding constructor are private and the type cannot be constructed
+// from outside as written — confirm this is intended for the TODO stage.
+template <typename T>
+class always_aligned_t {
+  T val;
+
+  // forwards all constructor arguments to T
+  template <class... Args>
+  always_aligned_t(Args&&... args)
+    : val(std::forward<Args>(args)...) {
+  }
+};
+
+} // namespace ceph::math
+
+namespace ceph::crypto::onwire {
+
+// Thrown by the receive handler when a message fails authentication.
+struct MsgAuthError : public std::runtime_error {
+  MsgAuthError() : std::runtime_error("message signature mismatch") {}
+};
+
+struct TxHandlerError : public std::runtime_error {
+  TxHandlerError(const char* what)
+    : std::runtime_error(std::string("tx handler error: ") + what) {}
+};
+
+// Interface of the transmit (encrypting) side of on-wire crypto.
+struct TxHandler {
+  virtual ~TxHandler() = default;
+
+  // An instance of TxHandler must be reset before doing any encrypt-update
+  // step. This also applies when encrypt-final was already called and
+  // another round of update-...-update-final is about to take place.
+  //
+  // The [first, last) range tells the implementation how the -update
+  // sequence is fragmented, allowing it to make a conscious decision about
+  // allocating or reusing memory. One implementation could encrypt in
+  // place while another might prefer one huge output buffer.
+  //
+  // It's undefined what will happen if the client doesn't follow the order.
+  //
+  // TODO: switch to always_aligned_t
+  virtual void reset_tx_handler(const uint32_t* first,
+                                const uint32_t* last) = 0;
+
+  // Convenience overload taking the update sizes as a braced list. An empty
+  // list is passed on as a pair of null pointers.
+  void reset_tx_handler(std::initializer_list<uint32_t> update_size_sequence) {
+    if (update_size_sequence.size() == 0) {
+      reset_tx_handler(nullptr, nullptr);
+      return;
+    }
+    const uint32_t* const first = update_size_sequence.begin();
+    reset_tx_handler(first, first + update_size_sequence.size());
+  }
+
+  // Perform encryption. The client gives full ownership of the provided
+  // bufferlist. The method MUST NOT be called after _final() if there
+  // was no call to _reset().
+  virtual void authenticated_encrypt_update(
+    const ceph::bufferlist& plaintext) = 0;
+
+  // Generates the authentication signature and returns the bufferlist
+  // crafted from the plaintext of the preceding calls to _update().
+  virtual ceph::bufferlist authenticated_encrypt_final() = 0;
+};
+
+// Interface of the receive (decrypting) side of on-wire crypto.
+class RxHandler {
+public:
+  virtual ~RxHandler() = default;
+
+  // The transmitter can append extra bytes of ciphertext at the -final
+  // step. This method returns how much was added, letting the client
+  // translate a plaintext size into the ciphertext size to grab from the
+  // wire.
+  virtual std::uint32_t get_extra_size_at_final() = 0;
+
+  // An instance of RxHandler must be reset before doing any decrypt-update
+  // step. This also applies when decrypt-final was already called and
+  // another round of update-...-update-final is about to take place.
+  virtual void reset_rx_handler() = 0;
+
+  // Perform decryption; the ciphertext must ALWAYS be aligned to 16 bytes.
+  virtual void authenticated_decrypt_update(ceph::bufferlist& bl) = 0;
+
+  // Perform decryption of the last portion of ciphertext and verify the
+  // signature for the overall decryption sequence.
+  // Throws when the integrity/authenticity check fails.
+  virtual void authenticated_decrypt_update_final(ceph::bufferlist& bl) = 0;
+};
+
+// Pair of on-wire crypto handlers for one connection: `rx` for incoming
+// and `tx` for outgoing data. Both members are owning pointers and may be
+// null.
+struct rxtx_t {
+  //rxtx_t(rxtx_t&& r) : rx(std::move(rx)), tx(std::move(tx)) {}
+  // Each peer can use different handlers.
+  // Hmm, isn't that too much flexibility?
+  std::unique_ptr<RxHandler> rx;
+  std::unique_ptr<TxHandler> tx;
+
+  // Factory for the handler pair; defined out of line in the matching
+  // source file.
+  //   ctx              - Ceph context handed to the handlers
+  //   auth_meta        - authentication state of the connection
+  //   new_nonce_format - selects which nonce layout the handlers use
+  //   crossed          - swaps which nonce goes to which direction
+  static rxtx_t create_handler_pair(
+    CephContext* ctx,
+    const class AuthConnectionMeta& auth_meta,
+    bool new_nonce_format,
+    bool crossed);
+};
+
+} // namespace ceph::crypto::onwire
+
+#endif // CEPH_CRYPTO_ONWIRE_H
diff --git a/src/msg/async/dpdk/ARP.cc b/src/msg/async/dpdk/ARP.cc
new file mode 100644
index 000000000..dedc9e3c7
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "ARP.h"
+
+// Remember the owning arp instance and protocol number, then register this
+// handler with the arp instance under that number.
+arp_for_protocol::arp_for_protocol(arp& a, uint16_t proto_num)
+    : _arp(a),
+      _proto_num(proto_num)
+{
+  _arp.add(_proto_num, this);
+}
+
+// Remove the registration made by the constructor.
+arp_for_protocol::~arp_for_protocol()
+{
+  const uint16_t registered_as = _proto_num;
+  _arp.del(registered_as);
+}
+
+// Hook ARP into the interface: `_proto` is set up for the ARP ethertype
+// with get_packet() as its packet source, and `_rx_packets` subscribes
+// process_packet() and forward() as the receive and forward callbacks.
+arp::arp(interface* netif)
+    : _netif(netif),
+      _proto(netif, eth_protocol_num::arp, [this] { return get_packet(); }),
+      _rx_packets(_proto.receive(
+          [this](Packet pkt, ethernet_address src) {
+            return process_packet(std::move(pkt), src);
+          },
+          [this](forward_hash& hash_out, Packet& pkt, size_t offset) {
+            return forward(hash_out, pkt, offset);
+          }))
+{
+}
+
+// Take the oldest queued outgoing packet, if any. The returned Tub is left
+// default-constructed when the queue is empty.
+Tub<l3_protocol::l3packet> arp::get_packet()
+{
+  Tub<l3_protocol::l3packet> next;
+  if (_packetq.empty()) {
+    return next;
+  }
+  next = std::move(_packetq.front());
+  _packetq.pop_front();
+  return next;
+}
+
+// Let the handler registered for the packet's ARP protocol type decide how
+// the packet is forwarded.
+//
+// out_hash_data - filled in by the protocol handler
+// p             - packet under inspection
+// off           - offset of the ARP header inside `p`
+//
+// Returns the handler's verdict, or false when the header cannot be read or
+// no handler is registered for the protocol type.
+bool arp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  auto ah = p.get_header<arp_hdr>(off);
+  // NOTE(review): get_header() is expected to yield a null pointer when the
+  // packet is too short for the header — confirm against Packet::get_header.
+  // A truncated packet must not be dereferenced.
+  if (!ah) {
+    return false;
+  }
+  auto i = _arp_for_protocol.find(ntoh(ah->ptype));
+  if (i == _arp_for_protocol.end()) {
+    return false;
+  }
+  return i->second->forward(out_hash_data, p, off);
+}
+
+// Register `afp` as the handler for `proto_num`, replacing any handler that
+// was registered for that number before.
+void arp::add(uint16_t proto_num, arp_for_protocol* afp)
+{
+  _arp_for_protocol.insert_or_assign(proto_num, afp);
+}
+
+// Drop the handler registered for `proto_num`; does nothing when there is
+// none.
+void arp::del(uint16_t proto_num)
+{
+  const auto pos = _arp_for_protocol.find(proto_num);
+  if (pos != _arp_for_protocol.end()) {
+    _arp_for_protocol.erase(pos);
+  }
+}
+
+// Dispatch a received ARP packet to the handler registered for its protocol
+// type. Packets whose header cannot be read, or for which no handler is
+// registered, are dropped.
+//
+// p    - the received packet (consumed)
+// from - sender's ethernet address; not used here
+//
+// Always returns 0; the handler's own return value is not propagated.
+int arp::process_packet(Packet p, ethernet_address from)
+{
+  auto raw = p.get_header<arp_hdr>();
+  // NOTE(review): get_header() is expected to yield a null pointer when the
+  // packet is too short for the header — confirm against Packet::get_header.
+  // A truncated packet must not be dereferenced.
+  if (!raw) {
+    return 0;
+  }
+  auto ah = raw->ntoh();
+  auto i = _arp_for_protocol.find(ah.ptype);
+  if (i != _arp_for_protocol.end()) {
+    i->second->received(std::move(p));
+  }
+  return 0;
+}
diff --git a/src/msg/async/dpdk/ARP.h b/src/msg/async/dpdk/ARP.h
new file mode 100644
index 000000000..545695648
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.h
@@ -0,0 +1,301 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#ifndef CEPH_MSG_ARP_H_
+#define CEPH_MSG_ARP_H_
+
+#include <errno.h>
+
+#include <unordered_map>
+#include <functional>
+
+#include "msg/async/Event.h"
+
+#include "ethernet.h"
+#include "circular_buffer.h"
+#include "ip_types.h"
+#include "net.h"
+#include "Packet.h"
+
+class arp;
+template <typename L3>
+class arp_for;
+
+// Base class of the per-protocol ARP handlers that an arp instance
+// dispatches to.  The constructor and destructor are declared here and
+// defined out of line.
+class arp_for_protocol {
+ public:
+  arp_for_protocol(arp& a, uint16_t proto_num);
+  virtual ~arp_for_protocol();
+  // Handles one ARP packet; must be provided by the derived class.
+  virtual int received(Packet p) = 0;
+  // Forwarding decision for a packet; this default implementation always
+  // answers false.
+  virtual bool forward(forward_hash&, Packet&, size_t) { return false; }
+ protected:
+  arp& _arp;
+  uint16_t _proto_num;
+};
+
+class interface;
+
+// Protocol-independent part of ARP for one interface: a table of
+// per-protocol handlers keyed by protocol number, plus a queue of outgoing
+// packets.
+class arp {
+ public:
+  explicit arp(interface* netif);
+  // Register / unregister the handler for a protocol number.
+  void add(uint16_t proto_num, arp_for_protocol* afp);
+  void del(uint16_t proto_num);
+
+ private:
+  // The two leading header fields, which are all this class reads from a
+  // packet; the helpers return a copy with both fields byte-swapped.
+  struct arp_hdr {
+    uint16_t htype;
+    uint16_t ptype;
+    arp_hdr ntoh() {
+      arp_hdr out = *this;
+      out.htype = ::ntoh(htype);
+      out.ptype = ::ntoh(ptype);
+      return out;
+    }
+    arp_hdr hton() {
+      arp_hdr out = *this;
+      out.htype = ::hton(htype);
+      out.ptype = ::hton(ptype);
+      return out;
+    }
+  };
+
+  // Keep this declaration order: the constructor initialises the members in
+  // this sequence and _rx_packets is obtained from _proto.
+  interface* _netif;
+  l3_protocol _proto;
+  subscription<Packet, ethernet_address> _rx_packets;
+  std::unordered_map<uint16_t, arp_for_protocol*> _arp_for_protocol;
+  circular_buffer<l3_protocol::l3packet> _packetq;
+
+  ethernet_address l2self() { return _netif->hw_address(); }
+  int process_packet(Packet p, ethernet_address from);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  Tub<l3_protocol::l3packet> get_packet();
+
+  template <class l3_proto>
+  friend class arp_for;
+};
+
+// ARP resolver for one L3 protocol.  From L3 this class uses address_type,
+// broadcast_address() and arp_protocol_type().
+//
+// _table holds the known L3 -> L2 mappings.  _in_progress holds, for every
+// address still being resolved, the parked (callback, packet) pairs and the
+// id of the time event that drives the retry.
+template <typename L3>
+class arp_for : public arp_for_protocol {
+ public:
+  using l2addr = ethernet_address;
+  using l3addr = typename L3::address_type;
+ private:
+  // Cap on the number of waiters parked per unresolved address; wait()
+  // rejects further ones with -EBUSY.
+  static constexpr auto max_waiters = 512;
+  enum oper {
+    op_request = 1,
+    op_reply = 2,
+  };
+  // ARP header for this L2/L3 address pair.  ntoh()/hton() return a
+  // converted copy and leave the object itself untouched.
+  struct arp_hdr {
+    uint16_t htype;
+    uint16_t ptype;
+    uint8_t hlen;
+    uint8_t plen;
+    uint16_t oper;
+    l2addr sender_hwaddr;
+    l3addr sender_paddr;
+    l2addr target_hwaddr;
+    l3addr target_paddr;
+
+    arp_hdr ntoh() {
+      arp_hdr hdr = *this;
+      hdr.htype = ::ntoh(htype);
+      hdr.ptype = ::ntoh(ptype);
+      hdr.oper = ::ntoh(oper);
+      hdr.sender_hwaddr = sender_hwaddr.ntoh();
+      hdr.sender_paddr = sender_paddr.ntoh();
+      hdr.target_hwaddr = target_hwaddr.ntoh();
+      hdr.target_paddr = target_paddr.ntoh();
+      return hdr;
+    }
+
+    arp_hdr hton() {
+      arp_hdr hdr = *this;
+      hdr.htype = ::hton(htype);
+      hdr.ptype = ::hton(ptype);
+      hdr.oper = ::hton(oper);
+      hdr.sender_hwaddr = sender_hwaddr.hton();
+      hdr.sender_paddr = sender_paddr.hton();
+      hdr.target_hwaddr = target_hwaddr.hton();
+      hdr.target_paddr = target_paddr.hton();
+      return hdr;
+    }
+  };
+  // Bookkeeping for one address whose resolution is pending.
+  struct resolution {
+    std::vector<std::pair<resolution_cb, Packet>> _waiters;
+    uint64_t timeout_fd;  // id returned by EventCenter::create_time_event()
+  };
+  // Time-event callback created by wait() for an unresolved address.
+  class C_handle_arp_timeout : public EventCallback {
+    arp_for *arp;
+    l3addr paddr;
+    bool first_request;  // stored by the constructor, not read in this class
+
+   public:
+    C_handle_arp_timeout(arp_for *a, l3addr addr, bool first):
+        arp(a), paddr(addr), first_request(first) {}
+    // Fires when the timer expires: sends the query again, completes every
+    // parked waiter with -ETIMEDOUT and schedules this same callback object
+    // once more.
+    void do_request(uint64_t r) override {
+      arp->send_query(paddr);
+      auto &res = arp->_in_progress[paddr];
+
+      for (auto& p : res._waiters) {
+        p.first(ethernet_address(), std::move(p.second), -ETIMEDOUT);
+      }
+      res._waiters.clear();
+      res.timeout_fd = arp->center->create_time_event(
+          1*1000*1000, this);
+    }
+  };
+  friend class C_handle_arp_timeout;
+
+ private:
+  CephContext *cct;
+  EventCenter *center;
+  // Own L3 address; stays at the broadcast address until set_self_addr().
+  l3addr _l3self = L3::broadcast_address();
+  std::unordered_map<l3addr, l2addr> _table;
+  std::unordered_map<l3addr, resolution> _in_progress;
+ private:
+  Packet make_query_packet(l3addr paddr);
+  int received(Packet p) override;
+  int handle_request(arp_hdr* ah);
+  l2addr l2self() { return _arp.l2self(); }
+  void send(l2addr to, Packet &&p);
+ public:
+  void send_query(const l3addr& paddr);
+  // Registers with `a` for L3::arp_protocol_type() (through the base class)
+  // and seeds the table with the broadcast mapping.
+  explicit arp_for(CephContext *c, arp& a, EventCenter *cen)
+      : arp_for_protocol(a, L3::arp_protocol_type()), cct(c), center(cen) {
+    _table[L3::broadcast_address()] = ethernet::broadcast_address();
+  }
+  // Cancels the time event of every resolution that is still pending.
+  ~arp_for() {
+    for (auto && p : _in_progress)
+      center->delete_time_event(p.second.timeout_fd);
+  }
+  void wait(const l3addr& addr, Packet p, resolution_cb cb);
+  void learn(l2addr l2, l3addr l3);
+  void run();
+  // Replaces the table entry for the previous own address with one that maps
+  // `addr` to this host's L2 address.
+  void set_self_addr(l3addr addr) {
+    _table.erase(_l3self);
+    _table[addr] = l2self();
+    _l3self = addr;
+  }
+  friend class arp;
+};
+
+// Queues `p` on the owning arp instance as an ARP-protocol packet addressed
+// to `to`.
+template <typename L3>
+void arp_for<L3>::send(l2addr to, Packet &&p) {
+  l3_protocol::l3packet outgoing{eth_protocol_num::arp, to, std::move(p)};
+  _arp._packetq.push_back(std::move(outgoing));
+}
+
+// Builds an ARP request for `paddr`: the sender fields carry this host's
+// addresses, the target hardware address is the ethernet broadcast address.
+// The header is converted with hton() before it is wrapped in a Packet.
+template <typename L3>
+Packet arp_for<L3>::make_query_packet(l3addr paddr) {
+  arp_hdr query;
+  query.htype = ethernet::arp_hardware_type();
+  query.ptype = L3::arp_protocol_type();
+  query.hlen = sizeof(l2addr);
+  query.plen = sizeof(l3addr);
+  query.oper = op_request;
+
+  query.sender_hwaddr = l2self();
+  query.sender_paddr = _l3self;
+
+  query.target_hwaddr = ethernet::broadcast_address();
+  query.target_paddr = paddr;
+
+  arp_hdr wire = query.hton();
+  return Packet(reinterpret_cast<char*>(&wire), sizeof(wire));
+}
+
+// Sends an ARP request for `paddr` to the ethernet broadcast address.
+template <typename L3>
+void arp_for<L3>::send_query(const l3addr& paddr) {
+  Packet query = make_query_packet(paddr);
+  send(ethernet::broadcast_address(), std::move(query));
+}
+
+// Records the mapping paddr -> hwaddr.  If a resolution for `paddr` was
+// pending, its time event is cancelled, every parked waiter is completed
+// with status 0 and the pending entry is removed.
+template <typename L3>
+void arp_for<L3>::learn(l2addr hwaddr, l3addr paddr) {
+  _table[paddr] = hwaddr;
+
+  auto pending = _in_progress.find(paddr);
+  if (pending == _in_progress.end()) {
+    return;
+  }
+
+  auto& res = pending->second;
+  center->delete_time_event(res.timeout_fd);
+  for (auto& waiter : res._waiters) {
+    waiter.first(hwaddr, std::move(waiter.second), 0);
+  }
+  _in_progress.erase(pending);
+}
+
+// Resolves `paddr` for packet `p` and reports the outcome through `cb`.
+//  - Known address: `cb` is invoked right away with status 0.
+//  - Otherwise the packet is parked until learn() or the timeout callback
+//    completes it.  The first request for an address also arms the time
+//    event and sends the query.
+//  - When max_waiters packets are already parked for the address, `cb` is
+//    invoked with -EBUSY.
+template <typename L3>
+void arp_for<L3>::wait(const l3addr& paddr, Packet p, resolution_cb cb) {
+  auto known = _table.find(paddr);
+  if (known != _table.end()) {
+    cb(known->second, std::move(p), 0);
+    return;
+  }
+
+  auto pending = _in_progress.find(paddr);
+  const bool first_request = (pending == _in_progress.end());
+  auto* res = first_request ? &_in_progress[paddr] : &pending->second;
+
+  if (first_request) {
+    res->timeout_fd = center->create_time_event(
+        1*1000*1000, new C_handle_arp_timeout(this, paddr, first_request));
+    send_query(paddr);
+  }
+
+  if (res->_waiters.size() < max_waiters) {
+    res->_waiters.emplace_back(cb, std::move(p));
+  } else {
+    cb(ethernet_address(), std::move(p), -EBUSY);
+  }
+}
+
+// Handles one ARP packet for this protocol.  Packets without a readable
+// header, or whose address lengths do not match l2addr/l3addr, are ignored.
+// Requests go to handle_request(); replies are passed to the interface's
+// arp_learn().  Returns handle_request()'s result for requests, 0 otherwise.
+template <typename L3>
+int arp_for<L3>::received(Packet p) {
+  auto raw = p.get_header<arp_hdr>();
+  if (!raw) {
+    return 0;
+  }
+
+  auto h = raw->ntoh();
+  const bool lengths_match =
+      (h.hlen == sizeof(l2addr)) && (h.plen == sizeof(l3addr));
+  if (!lengths_match) {
+    return 0;
+  }
+
+  if (h.oper == op_request) {
+    return handle_request(&h);
+  }
+  if (h.oper == op_reply) {
+    _arp._netif->arp_learn(h.sender_hwaddr, h.sender_paddr);
+  }
+  return 0;
+}
+
+// Answers an ARP request that targets this host's own address.  Nothing is
+// sent while the own address is still the broadcast address.  The header
+// behind `ah` is rewritten in place into the reply.  Always returns 0.
+template <typename L3>
+int arp_for<L3>::handle_request(arp_hdr* ah) {
+  if (!(ah->target_paddr == _l3self)
+      || !(_l3self != L3::broadcast_address())) {
+    return 0;
+  }
+
+  arp_hdr& reply = *ah;
+  reply.oper = op_reply;
+  // The requester becomes the target of the reply ...
+  reply.target_hwaddr = reply.sender_hwaddr;
+  reply.target_paddr = reply.sender_paddr;
+  // ... and this host becomes the sender.
+  reply.sender_hwaddr = l2self();
+  reply.sender_paddr = _l3self;
+  reply = reply.hton();
+  send(reply.target_hwaddr,
+       Packet(reinterpret_cast<char*>(&reply), sizeof(reply)));
+  return 0;
+}
+
+#endif /* CEPH_MSG_ARP_H_ */
diff --git a/src/msg/async/dpdk/DPDK.cc b/src/msg/async/dpdk/DPDK.cc
new file mode 100644
index 000000000..ff4967888
--- /dev/null
+++ b/src/msg/async/dpdk/DPDK.cc
@@ -0,0 +1,1263 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include <atomic>
+#include <vector>
+#include <queue>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_eal.h>
+#include <rte_pci.h>
+#include <rte_ethdev.h>
+#include <rte_cycles.h>
+#include <rte_memzone.h>
+
+#include "include/page.h"
+#include "align.h"
+#include "IP.h"
+#include "const.h"
+#include "dpdk_rte.h"
+#include "DPDK.h"
+#include "toeplitz.h"
+
+#include "common/Cycles.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+
+// Returns the address of `p` as an untyped pointer, in the form expected for
+// the opaque argument of the mempool init callback.
+void* as_cookie(struct rte_pktmbuf_pool_private& p) {
+  return &p;
+}
+
+#ifndef MARKER
+typedef void    *MARKER[0];   /**< generic marker for a point in a structure */
+#endif
+
+/******************* Net device related constants *****************************/
+static constexpr uint16_t default_ring_size      = 512;  // ring size passed to rte_eth_rx_queue_setup()
+
+//
+// We need 2 times the ring size of buffers because of the way PMDs
+// refill the ring.
+//
+static constexpr uint16_t mbufs_per_queue_rx     = 2 * default_ring_size;
+static constexpr uint16_t rx_gc_thresh           = 64;
+
+//
+// No need to keep more descriptors in the air than can be sent in a single
+// rte_eth_tx_burst() call.
+//
+static constexpr uint16_t mbufs_per_queue_tx     = 2 * default_ring_size;
+
+static constexpr uint16_t mbuf_cache_size        = 512;  // cache size argument of rte_mempool_create() for the Rx pool
+//
+// Size of the data buffer in the non-inline case.
+//
+// We may want to change (increase) this value in the future, while the
+// inline_mbuf_data_size value is unlikely to change for the reasons described
+// below, next to its definition.
+//
+static constexpr size_t mbuf_data_size = 4096;
+
+static constexpr uint16_t mbuf_overhead          =
+                          sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM;
+//
+// We'll allocate 2K data buffers for an inline case because this would require
+// a single page per mbuf. If we used 4K data buffers here it would require 2
+// pages for a single buffer (due to "mbuf_overhead") and this is a much more
+// demanding memory constraint.
+//
+static constexpr size_t inline_mbuf_data_size = 2048;
+
+
+// (INLINE_MBUF_DATA_SIZE(2K)*32 = 64K = Max TSO/LRO size) + 1 mbuf for headers
+static constexpr uint8_t max_frags = 32 + 1;
+
+//
+// Intel's 40G NIC HW limit for a number of fragments in an xmit segment.
+//
+// See Chapter 8.4.1 "Transmit Packet in System Memory" of the xl710 devices
+// spec. for more details.
+//
+static constexpr uint8_t i40e_max_xmit_segment_frags = 8;
+
+//
+// VMWare's virtual NIC limit for a number of fragments in an xmit segment.
+//
+// see drivers/net/vmxnet3/base/vmxnet3_defs.h VMXNET3_MAX_TXD_PER_PKT
+//
+static constexpr uint8_t vmxnet3_max_xmit_segment_frags = 16;
+
+static constexpr uint16_t inline_mbuf_size = inline_mbuf_data_size + mbuf_overhead;  // data area plus mbuf header and headroom
+
+static size_t huge_page_size = 512 * CEPH_PAGE_SIZE;  // alignment unit used by qp_mempool_obj_size()
+
+//
+// Sums two mempool size figures, one computed from mbuf_overhead (Rx) and
+// one from inline_mbuf_size (Tx).  Each figure is the value returned by
+// rte_mempool_calc_obj_size() plus the size of the pktmbuf pool private
+// area, rounded up to huge_page_size.
+//
+// We align each size to the huge page size because DPDK allocates a
+// physically contiguous memory region for each pool object.
+//
+uint32_t qp_mempool_obj_size()
+{
+  auto aligned_pool_size = [](uint32_t elt_size) {
+    struct rte_mempool_objsz obj_sz = {};
+    return align_up(rte_mempool_calc_obj_size(elt_size, 0, &obj_sz) +
+                    sizeof(struct rte_pktmbuf_pool_private),
+                    huge_page_size);
+  };
+
+  uint32_t total = 0;
+  total += aligned_pool_size(mbuf_overhead);     // Rx
+  total += aligned_pool_size(inline_mbuf_size);  // Tx
+  return total;
+}
+
+static constexpr const char* pktmbuf_pool_name   = "dpdk_net_pktmbuf_pool";  // prefix of the per-queue Rx mempool name
+
+/*
+ * When doing reads from the NIC queues, use this batch size
+ * (NOTE(review): the code that reads the queues is outside this part of the
+ * file - confirm where the constant is consumed).
+ */
+static constexpr uint8_t packet_read_size        = 32;
+/******************************************************************************/
+
+//
+// First stage of bringing up the port: reads the device capabilities,
+// applies the per-driver workarounds, selects the Tx/Rx offloads and the RSS
+// configuration, records the supported features in _hw_features and finally
+// calls rte_eth_dev_configure().  Starting the port is done by
+// init_port_fini().
+//
+// Returns 0 on success, or the non-zero value returned by
+// rte_eth_dev_configure() when the configuration is rejected.
+//
+int DPDKDevice::init_port_start()
+{
+  ceph_assert(_port_idx < rte_eth_dev_count_avail());
+
+  rte_eth_dev_info_get(_port_idx, &_dev_info);
+
+  // All driver specific decisions below compare against this name.
+  // NOTE(review): the literals are the PMD names this code expects; confirm
+  // that they match what the DPDK release in use reports.
+  const std::string driver_name(_dev_info.driver_name);
+
+  //
+  // This is a workaround for a missing handling of a HW limitation in the
+  // DPDK i40e driver. This and all related to _is_i40e_device code should be
+  // removed once this handling is added.
+  //
+  if (driver_name == "rte_i40evf_pmd" || driver_name == "rte_i40e_pmd") {
+    ldout(cct, 1) << __func__ << " Device is an Intel's 40G NIC. Enabling 8 fragments hack!" << dendl;
+    _is_i40e_device = true;
+  }
+
+  if (driver_name == "rte_vmxnet3_pmd") {
+    ldout(cct, 1) << __func__ << " Device is a VMWare Virtual NIC. Enabling 16 fragments hack!" << dendl;
+    _is_vmxnet3_device = true;
+  }
+
+  //
+  // Another workaround: this time for a lack of number of RSS bits.
+  // ixgbe PF NICs support up to 16 RSS queues.
+  // ixgbe VF NICs support up to 4 RSS queues.
+  // i40e PF NICs support up to 64 RSS queues.
+  // i40e VF NICs support up to 16 RSS queues.
+  //
+  if (driver_name == "rte_ixgbe_pmd") {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, static_cast<uint16_t>(16));
+  } else if (driver_name == "rte_ixgbevf_pmd") {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, static_cast<uint16_t>(4));
+  } else if (driver_name == "rte_i40e_pmd") {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, static_cast<uint16_t>(64));
+  } else if (driver_name == "rte_i40evf_pmd") {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, static_cast<uint16_t>(16));
+  }
+
+  // Hardware offload capabilities
+  // https://github.com/DPDK/dpdk/blob/v19.05/lib/librte_ethdev/rte_ethdev.h#L993-L1074
+  // We want to support all available offload features
+  // TODO: below features are implemented in 17.05, should support new ones
+  const uint64_t tx_offloads_wanted =
+    DEV_TX_OFFLOAD_VLAN_INSERT      |
+    DEV_TX_OFFLOAD_IPV4_CKSUM       |
+    DEV_TX_OFFLOAD_UDP_CKSUM        |
+    DEV_TX_OFFLOAD_TCP_CKSUM        |
+    DEV_TX_OFFLOAD_SCTP_CKSUM       |
+    DEV_TX_OFFLOAD_TCP_TSO          |
+    DEV_TX_OFFLOAD_UDP_TSO          |
+    DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
+    DEV_TX_OFFLOAD_QINQ_INSERT      |
+    DEV_TX_OFFLOAD_VXLAN_TNL_TSO    |
+    DEV_TX_OFFLOAD_GRE_TNL_TSO      |
+    DEV_TX_OFFLOAD_IPIP_TNL_TSO     |
+    DEV_TX_OFFLOAD_GENEVE_TNL_TSO   |
+    DEV_TX_OFFLOAD_MACSEC_INSERT;
+
+  // Request only those wanted offloads that the device actually offers.
+  _dev_info.default_txconf.offloads =
+    _dev_info.tx_offload_capa & tx_offloads_wanted;
+
+  /* for port configuration all features are off by default */
+  rte_eth_conf port_conf = {};
+
+  /* setting tx offloads for port */
+  port_conf.txmode.offloads = _dev_info.default_txconf.offloads;
+
+  ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": max_rx_queues "
+                << _dev_info.max_rx_queues << "  max_tx_queues "
+                << _dev_info.max_tx_queues << dendl;
+
+  _num_queues = std::min({_num_queues, _dev_info.max_rx_queues, _dev_info.max_tx_queues});
+
+  ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": using "
+                << _num_queues << " queues" << dendl;
+
+  // Set RSS mode: RSS is enabled only when more than one queue is in use;
+  // with a single queue the port is configured without RSS.
+  if (_num_queues > 1) {
+    if (_dev_info.hash_key_size == 40) {
+      _rss_key = default_rsskey_40bytes;
+    } else if (_dev_info.hash_key_size == 52) {
+      _rss_key = default_rsskey_52bytes;
+    } else if (_dev_info.hash_key_size != 0) {
+      rte_exit(EXIT_FAILURE,
+               "Port %d: We support only 40 or 52 bytes RSS hash keys, %d bytes key requested",
+               _port_idx, _dev_info.hash_key_size);
+    } else {
+      // The device did not report a key size: fall back to the 40 byte key.
+      _rss_key = default_rsskey_40bytes;
+      _dev_info.hash_key_size = 40;
+    }
+
+    port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+    /* enable all supported rss offloads */
+    port_conf.rx_adv_conf.rss_conf.rss_hf = _dev_info.flow_type_rss_offloads;
+    if (_dev_info.hash_key_size) {
+      port_conf.rx_adv_conf.rss_conf.rss_key = const_cast<uint8_t *>(_rss_key.data());
+      port_conf.rx_adv_conf.rss_conf.rss_key_len = _dev_info.hash_key_size;
+    }
+  } else {
+    port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
+  }
+
+  if (_num_queues > 1) {
+    if (_dev_info.reta_size) {
+      // RETA size should be a power of 2
+      ceph_assert((_dev_info.reta_size & (_dev_info.reta_size - 1)) == 0);
+
+      // Set the RSS table to the correct size
+      _redir_table.resize(_dev_info.reta_size);
+      _rss_table_bits = std::lround(std::log2(_dev_info.reta_size));
+      ldout(cct, 5) << __func__ << " Port " << int(_port_idx)
+                    << ": RSS table size is " << _dev_info.reta_size << dendl;
+    } else {
+      // FIXME: same with sw_reta
+      _redir_table.resize(128);
+      _rss_table_bits = std::lround(std::log2(128));
+    }
+  } else {
+    _redir_table.push_back(0);
+  }
+
+  // Set Rx VLAN stripping
+  if (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
+    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
+  }
+
+  // LRO can only be switched on when the build has LRO support; in every
+  // other case the "off" message is logged.
+  bool lro_on = false;
+#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT
+  // Enable LRO
+  if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) {
+    ldout(cct, 1) << __func__ << " LRO is on" << dendl;
+    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
+    _hw_features.rx_lro = true;
+    lro_on = true;
+  }
+#endif
+  if (!lro_on) {
+    ldout(cct, 1) << __func__ << " LRO is off" << dendl;
+  }
+
+  // Check that all CSUM features are either all set all together or not set
+  // all together. If this assumption breaks we need to rework the below logic
+  // by splitting the csum offload feature bit into separate bits for IPv4,
+  // TCP.
+  ceph_assert(((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+          (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) ||
+         (!(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+          !(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)));
+
+  // Set Rx checksum checking
+  if ((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+      (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
+    ldout(cct, 1) << __func__ << " RX checksum offload supported" << dendl;
+    port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
+    _hw_features.rx_csum_offload = 1;
+  }
+
+  if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
+    ldout(cct, 1) << __func__ << " TX ip checksum offload supported" << dendl;
+    _hw_features.tx_csum_ip_offload = 1;
+  }
+
+  // TSO is supported starting from DPDK v1.8
+  if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
+    ldout(cct, 1) << __func__ << " TSO is supported" << dendl;
+    _hw_features.tx_tso = 1;
+  }
+
+  // Tx L4 checksum offload depends on the TCP checksum capability alone, so
+  // there is no group of Tx checksum bits whose consistency needs to be
+  // asserted here.
+  if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) {
+    ldout(cct, 1) << __func__ << " TX TCP checksum offload supported" << dendl;
+    _hw_features.tx_csum_l4_offload = 1;
+  }
+
+  int retval;
+
+  ldout(cct, 1) << __func__ << " Port " << int(_port_idx) << " init ... " << dendl;
+
+  /*
+   * Standard DPDK port initialisation - config port, then set up
+   * rx and tx rings.
+   */
+  if ((retval = rte_eth_dev_configure(_port_idx, _num_queues, _num_queues,
+                                      &port_conf)) != 0) {
+    lderr(cct) << __func__ << " failed to configure port " << (int)_port_idx
+               << " rx/tx queues " << _num_queues << " error " << cpp_strerror(retval) << dendl;
+    return retval;
+  }
+
+  //rte_eth_promiscuous_enable(port_num);
+  ldout(cct, 1) << __func__ << " done." << dendl;
+
+  return 0;
+}
+
+// Applies the configured hardware flow control mode (_enable_fc) to the
+// port.  The current settings are read first, only the mode is changed and
+// the result is written back.
+//
+// A device that answers -ENOTSUP to either call is tolerated: the condition
+// is logged and the function returns.  Any other negative result aborts.
+void DPDKDevice::set_hw_flow_control()
+{
+  struct rte_eth_fc_conf fc_conf;
+  bool fc_supported = true;
+
+  // Read the port's current/default flow control settings
+  auto rc = rte_eth_dev_flow_ctrl_get(_port_idx, &fc_conf);
+  if (rc == -ENOTSUP) {
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx)
+                  << ": not support to get hardware flow control settings: " << rc << dendl;
+    fc_supported = false;
+  } else if (rc < 0) {
+    lderr(cct) << __func__ << " port " << int(_port_idx)
+               << ": failed to get hardware flow control settings: " << rc << dendl;
+    ceph_abort();
+  }
+
+  if (fc_supported) {
+    fc_conf.mode = _enable_fc ? RTE_FC_FULL : RTE_FC_NONE;
+
+    rc = rte_eth_dev_flow_ctrl_set(_port_idx, &fc_conf);
+    if (rc == -ENOTSUP) {
+      ldout(cct, 1) << __func__ << " port " << int(_port_idx)
+                    << ": not support to set hardware flow control settings: " << rc << dendl;
+      fc_supported = false;
+    } else if (rc < 0) {
+      lderr(cct) << __func__ << " port " << int(_port_idx)
+                 << ": failed to set hardware flow control settings: " << rc << dendl;
+      ceph_abort();
+    }
+  }
+
+  if (!fc_supported) {
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl;
+    return;
+  }
+
+  ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ":  HW FC " << _enable_fc << dendl;
+}
+
+// Final stage of bringing up the port: applies the flow control setting,
+// starts the port, installs the RSS flow rule when more than one queue is
+// used and waits for the link.
+//
+// Returns 0 on success and -1 when the port cannot be started or the link
+// does not come up.
+int DPDKDevice::init_port_fini()
+{
+  // Changing FC requires HW reset, so set it before the port is started.
+  set_hw_flow_control();
+
+  if (rte_eth_dev_start(_port_idx) != 0) {
+    // The port index is printed through int() like in the other log
+    // statements of this file, so it always shows up as a number.
+    lderr(cct) << __func__ << " can't start port " << int(_port_idx) << dendl;
+    return -1;
+  }
+
+  if (_num_queues > 1)
+    set_rss_table();
+
+  // Wait for a link
+  if (check_port_link_status() < 0) {
+    lderr(cct) << __func__ << " port link up failed " << int(_port_idx) << dendl;
+    return -1;
+  }
+
+  ldout(cct, 5) << __func__ << " created DPDK device" << dendl;
+  return 0;
+}
+
+// Fills the redirection table round-robin over the queues in use and tries
+// to install an ingress flow rule with a single RSS action (Toeplitz hash,
+// key _rss_key).  When the device rejects the rule at validation this is
+// only logged; a failure to create a validated rule is logged as an error.
+void DPDKDevice::set_rss_table()
+{
+  // Everything handed to the rte_flow calls starts out zeroed so that no
+  // field the code does not set explicitly carries stack garbage.
+  struct rte_flow_attr attr = {};
+  struct rte_flow_item pattern[1] = {};
+  struct rte_flow_action action[2] = {};
+  struct rte_flow_action_rss rss_conf = {};
+
+  /*
+   * set the rule attribute.
+   * in this case only ingress packets will be checked.
+   */
+  attr.ingress = 1;
+
+  /* the final level must be always type end */
+  pattern[0].type = RTE_FLOW_ITEM_TYPE_END;
+
+  /*
+   * create the action sequence.
+   * one action only,  set rss hash func to toeplitz.
+   */
+  uint16_t i = 0;
+  for (auto& r : _redir_table) {
+    r = i++ % _num_queues;
+  }
+  rss_conf.func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+  rss_conf.types = ETH_RSS_FRAG_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP;
+  rss_conf.queue_num = _num_queues;
+  rss_conf.queue = const_cast<uint16_t *>(_redir_table.data());
+  rss_conf.key_len = _dev_info.hash_key_size;
+  rss_conf.key = const_cast<uint8_t *>(_rss_key.data());
+  rss_conf.level = 0;
+  action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
+  action[0].conf = &rss_conf;
+  action[1].type = RTE_FLOW_ACTION_TYPE_END;
+
+  if (rte_flow_validate(_port_idx, &attr, pattern, action, nullptr) != 0) {
+    ldout(cct, 0) << __func__ << " Port " << int(_port_idx)
+                  << ": flow rss func configuration is unsupported"
+                  << dendl;
+    return;
+  }
+
+  _flow = rte_flow_create(_port_idx, &attr, pattern, action, nullptr);
+  if (!_flow) {
+    lderr(cct) << __func__ << " Port " << int(_port_idx)
+               << ": failed to create the rss flow rule" << dendl;
+  }
+}
+
+// Prepares this queue pair for proxying packets to other CPUs according to
+// `cpu_weights` (must not be empty).  Nothing is set up when the only entry
+// is this queue itself.  Otherwise a packet provider that drains
+// _proxy_packetq is registered and the software redirection table is built.
+void DPDKQueuePair::configure_proxies(const std::map<unsigned, float>& cpu_weights) {
+  ceph_assert(!cpu_weights.empty());
+
+  // special case queue sending to self only, to avoid requiring a hash value
+  const bool self_only =
+      cpu_weights.size() == 1 && cpu_weights.begin()->first == _qid;
+  if (self_only) {
+    return;
+  }
+
+  register_packet_provider([this] {
+    Tub<Packet> next;
+    if (_proxy_packetq.empty()) {
+      return next;
+    }
+    next = std::move(_proxy_packetq.front());
+    _proxy_packetq.pop_front();
+    return next;
+  });
+  build_sw_reta(cpu_weights);
+}
+
+// Builds the 128-entry software redirection table from `cpu_weights`
+// (cpu id -> weight): each cpu receives a contiguous run of entries whose
+// length is proportional to its share of the total weight.  The result is
+// stored in _sw_reta.
+void DPDKQueuePair::build_sw_reta(const std::map<unsigned, float>& cpu_weights) {
+  float total_weight = 0;
+  for (auto&& x : cpu_weights) {
+    total_weight += x.second;
+  }
+  float accum = 0;
+  unsigned idx = 0;
+  // Zero-initialised so that every entry has a defined value even when the
+  // loop below assigns nothing (e.g. the weights sum to zero).
+  std::array<uint8_t, 128> reta{};
+  for (auto&& entry : cpu_weights) {
+    auto cpu = entry.first;
+    auto weight = entry.second;
+    accum += weight;
+    // The bound on idx keeps the write inside the table whatever the
+    // weights are; with well-formed weights it never triggers.
+    while (idx < reta.size() &&
+           idx < (accum / total_weight * reta.size() - 0.5)) {
+      reta[idx++] = cpu;
+    }
+  }
+  _sw_reta = reta;
+}
+
+
+// Makes sure the Rx mbuf pool of this queue exists.  The pool is looked up
+// by name first; when it is not found the function creates it, reserves a
+// memzone with additional data buffers (recorded in _alloc_bufs) and sets up
+// the Rx queue of the port.
+//
+// Returns false when the pool or the Rx queue cannot be created, true
+// otherwise.
+bool DPDKQueuePair::init_rx_mbuf_pool()
+{
+  std::string name = std::string(pktmbuf_pool_name) + std::to_string(_qid) + "_rx";
+
+  // reserve the memory for Rx buffers containers
+  _rx_free_pkts.reserve(mbufs_per_queue_rx);
+  _rx_free_bufs.reserve(mbufs_per_queue_rx);
+
+  _pktmbuf_pool_rx = rte_mempool_lookup(name.c_str());
+  if (!_pktmbuf_pool_rx) {
+    ldout(cct, 1) << __func__ << " Creating Rx mbuf pool '" << name.c_str()
+                  << "' [" << mbufs_per_queue_rx << " mbufs] ..."<< dendl;
+
+    //
+    // Don't pass single-producer/single-consumer flags to mbuf create as it
+    // seems faster to use a cache instead.
+    //
+    struct rte_pktmbuf_pool_private roomsz = {};
+    roomsz.mbuf_data_room_size = mbuf_data_size + RTE_PKTMBUF_HEADROOM;
+    _pktmbuf_pool_rx = rte_mempool_create(
+        name.c_str(),
+        mbufs_per_queue_rx, mbuf_overhead + mbuf_data_size,
+        mbuf_cache_size,
+        sizeof(struct rte_pktmbuf_pool_private),
+        rte_pktmbuf_pool_init, as_cookie(roomsz),
+        rte_pktmbuf_init, nullptr,
+        rte_socket_id(), 0);
+    if (!_pktmbuf_pool_rx) {
+      lderr(cct) << __func__ << " Failed to create mempool for rx" << dendl;
+      return false;
+    }
+
+    //
+    // allocate more data buffer
+    int bufs_count =  cct->_conf->ms_dpdk_rx_buffer_count_per_core - mbufs_per_queue_rx;
+    int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
+    std::string mz_name = "rx_buffer_data" + std::to_string(_qid);
+    const struct rte_memzone *mz = rte_memzone_reserve_aligned(mz_name.c_str(),
+          mbuf_data_size*bufs_count, _pktmbuf_pool_rx->socket_id, mz_flags, mbuf_data_size);
+    ceph_assert(mz);
+    // Split the zone into mbuf_data_size sized buffers.  The address is
+    // computed through char* because arithmetic on void* is not standard
+    // C++.
+    char* const base = static_cast<char*>(mz->addr);
+    for (int i = 0; i < bufs_count; i++) {
+      void* m = base + static_cast<size_t>(i) * mbuf_data_size;
+      ceph_assert(m);
+      _alloc_bufs.push_back(m);
+    }
+
+    if (rte_eth_rx_queue_setup(_dev_port_idx, _qid, default_ring_size,
+                               rte_eth_dev_socket_id(_dev_port_idx),
+                               _dev->def_rx_conf(), _pktmbuf_pool_rx) < 0) {
+      lderr(cct) << __func__ << " cannot initialize rx queue" << dendl;
+      return false;
+    }
+  }
+
+  return _pktmbuf_pool_rx != nullptr;
+}
+
+// Polls the link state of the port until it is up.  The state is queried
+// without blocking; between attempts the function sleeps 100ms and gives up
+// after 90 sleeps (9s in total).
+//
+// Returns 0 once the link is up and -1 when it is still down at the end of
+// the waiting period.
+int DPDKDevice::check_port_link_status()
+{
+  int count = 0;
+
+  ldout(cct, 20) << __func__ << dendl;
+  const int sleep_time = 100 * 1000;
+  const int max_check_time = 90;  /* 9s (90 * 100ms) in total */
+  while (true) {
+    struct rte_eth_link link = {};
+    rte_eth_link_get_nowait(_port_idx, &link);
+
+    if (link.link_status) {
+      ldout(cct, 5) << __func__ << " done port "
+                    << static_cast<unsigned>(_port_idx)
+                    << " link Up - speed " << link.link_speed
+                    << " Mbps - "
+                    << ((link.link_duplex == ETH_LINK_FULL_DUPLEX) ? ("full-duplex") : ("half-duplex"))
+                    << dendl;
+      break;
+    }
+
+    if (count++ < max_check_time) {
+      ldout(cct, 20) << __func__ << " not ready, continue to wait." << dendl;
+      usleep(sleep_time);
+    } else {
+      lderr(cct) << __func__ << " done port " << int(_port_idx) << " link down" << dendl;
+      return -1;
+    }
+  }
+  return 0;
+}
+
+// Time-event callback that forwards to DPDKQueuePair::handle_stats() of the
+// queue pair it was created for.
+class C_handle_dev_stats : public EventCallback {
+  DPDKQueuePair *_qp;
+ public:
+  explicit C_handle_dev_stats(DPDKQueuePair *qp): _qp(qp) { }
+  // The event id is not needed.
+  void do_request(uint64_t) override {
+    _qp->handle_stats();
+  }
+};
+
+// Creates the queue pair `qid` of device `dev`.
+//  - Initialises the Rx mbuf pool and aborts when that fails.
+//  - Registers the per-queue perf counters under the name "queue<qid>".
+//  - Queue 0 additionally arms a time event that runs handle_stats().
+DPDKQueuePair::DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid)
+  : cct(c), _dev(dev), _dev_port_idx(dev->port_idx()), center(cen), _qid(qid),
+    _tx_poller(this), _rx_gc_poller(this), _tx_buf_factory(c, dev, qid),
+    _tx_gc_poller(this)
+{
+  if (!init_rx_mbuf_pool()) {
+    lderr(cct) << __func__ << " cannot initialize mbuf pools" << dendl;
+    ceph_abort();
+  }
+
+  // Layout assumptions about tx_buf that the Tx path relies on.
+  static_assert(offsetof(tx_buf, private_end) -
+                offsetof(tx_buf, private_start) <= RTE_PKTMBUF_HEADROOM,
+                "RTE_PKTMBUF_HEADROOM is less than DPDKQueuePair::tx_buf size! "
+                "Increase the headroom size in the DPDK configuration");
+  static_assert(offsetof(tx_buf, _mbuf) == 0,
+                "There is a pad at the beginning of the tx_buf before _mbuf "
+                "field!");
+  static_assert((inline_mbuf_data_size & (inline_mbuf_data_size - 1)) == 0,
+                "inline_mbuf_data_size has to be a power of two!");
+
+  std::string name(std::string("queue") + std::to_string(qid));
+  PerfCountersBuilder plb(cct, name, l_dpdk_qp_first, l_dpdk_qp_last);
+
+  plb.add_u64_counter(l_dpdk_qp_rx_packets, "dpdk_receive_packets", "DPDK received packets");
+  plb.add_u64_counter(l_dpdk_qp_tx_packets, "dpdk_send_packets", "DPDK sent packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_bad_checksum_errors, "dpdk_receive_bad_checksum_errors", "DPDK received bad checksum packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_no_memory_errors, "dpdk_receive_no_memory_errors", "DPDK received no memory packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_bytes, "dpdk_receive_bytes", "DPDK received bytes", nullptr, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_tx_bytes, "dpdk_send_bytes", "DPDK sent bytes", nullptr, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_rx_last_bunch, "dpdk_receive_last_bunch", "DPDK last received bunch");
+  plb.add_u64_counter(l_dpdk_qp_tx_last_bunch, "dpdk_send_last_bunch", "DPDK last send bunch");
+  plb.add_u64_counter(l_dpdk_qp_rx_fragments, "dpdk_receive_fragments", "DPDK received total fragments");
+  plb.add_u64_counter(l_dpdk_qp_tx_fragments, "dpdk_send_fragments", "DPDK sent total fragments");
+  plb.add_u64_counter(l_dpdk_qp_rx_copy_ops, "dpdk_receive_copy_ops", "DPDK received copy operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_copy_ops, "dpdk_send_copy_ops", "DPDK sent copy operations");
+  plb.add_u64_counter(l_dpdk_qp_rx_copy_bytes, "dpdk_receive_copy_bytes", "DPDK received copy bytes", nullptr, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_tx_copy_bytes, "dpdk_send_copy_bytes", "DPDK send copy bytes", nullptr, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_rx_linearize_ops, "dpdk_receive_linearize_ops", "DPDK received linearize operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_linearize_ops, "dpdk_send_linearize_ops", "DPDK send linearize operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_queue_length, "dpdk_send_queue_length", "DPDK send queue length");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+
+  // Only the first queue collects the device statistics.
+  if (!_qid)
+    device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+}
+
+// Refreshes the device-wide perf counters from the port statistics reported
+// by rte_eth_stats_get() and then creates the time event for the next refresh.
+void DPDKQueuePair::handle_stats()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+
+  rte_eth_stats port_stats = {};
+  const int err = rte_eth_stats_get(_dev_port_idx, &port_stats);
+  if (err) {
+    // This early exit is taken before the time event is re-created below.
+    ldout(cct, 0) << __func__ << " failed to get port statistics: " << cpp_strerror(err) << dendl;
+    return;
+  }
+
+#if RTE_VERSION < RTE_VERSION_NUM(16,7,0,0)
+  // These two fields are only read when RTE_VERSION is below 16.7.0.0.
+  _dev->perf_logger->set(l_dpdk_dev_rx_mcast, port_stats.imcasts);
+  _dev->perf_logger->set(l_dpdk_dev_rx_badcrc_errors, port_stats.ibadcrc);
+#endif
+  _dev->perf_logger->set(l_dpdk_dev_rx_dropped_errors, port_stats.imissed);
+  _dev->perf_logger->set(l_dpdk_dev_rx_nombuf_errors, port_stats.rx_nombuf);
+
+  _dev->perf_logger->set(l_dpdk_dev_rx_total_errors, port_stats.ierrors);
+  _dev->perf_logger->set(l_dpdk_dev_tx_total_errors, port_stats.oerrors);
+
+  // Schedule another C_handle_dev_stats event with the same 1000*1000 delay.
+  device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+}
+
+// Pulls packets from the registered packet providers into _tx_packetq and
+// hands the queue to send().
+//
+// The queue is only refilled when it holds fewer than 16 packets. Providers
+// are polled pass after pass until a whole pass yields nothing, at least 256
+// packets have been pulled during this call, or the queue holds 128 packets.
+//
+// @return true when the queue was non-empty and send() has been called,
+//         false otherwise.
+bool DPDKQueuePair::poll_tx() {
+  bool nonloopback = !cct->_conf->ms_dpdk_debug_allow_loopback;
+#ifdef CEPH_PERF_DEV
+  uint64_t start = Cycles::rdtsc();
+#endif
+  uint32_t total_work = 0;
+  if (_tx_packetq.size() < 16) {
+    // refill send queue from upper layers
+    uint32_t work;
+    do {
+      // "work" counts the packets obtained during this pass over the providers
+      work = 0;
+      for (auto&& pr : _pkt_providers) {
+        auto p = pr();
+        if (p) {
+          work++;
+          if (likely(nonloopback)) {
+            // ldout(cct, 0) << __func__ << " len: " << p->len() << " frags: " << p->nr_frags() << dendl;
+            _tx_packetq.push_back(std::move(*p));
+          } else {
+            // Loopback debugging is allowed: a frame whose destination MAC
+            // equals its source MAC is fed straight back into the receive
+            // path instead of being queued for transmission.
+            auto th = p->get_header<eth_hdr>(0);
+            if (th->dst_mac == th->src_mac) {
+              _dev->l2receive(_qid, std::move(*p));
+            } else {
+              _tx_packetq.push_back(std::move(*p));
+            }
+          }
+          // This break only leaves the provider loop; the enclosing
+          // do/while re-checks the queue size itself.
+          if (_tx_packetq.size() == 128) {
+            break;
+          }
+        }
+      }
+      total_work += work;
+    } while (work && total_work < 256 && _tx_packetq.size() < 128);
+  }
+  if (!_tx_packetq.empty()) {
+    // "c" is the value returned by send(); it feeds both the running packet
+    // counter and the "last bunch" gauge.
+    uint64_t c = send(_tx_packetq);
+    perf_logger->inc(l_dpdk_qp_tx_packets, c);
+    perf_logger->set(l_dpdk_qp_tx_last_bunch, c);
+#ifdef CEPH_PERF_DEV
+    tx_count += total_work;
+    tx_cycles += Cycles::rdtsc() - start;
+#endif
+    return true;
+  }
+
+  return false;
+}
+
+// Builds a Packet out of an mbuf chain: one fragment per segment of the
+// chain. The packet's deleter pushes the data pointer of every segment onto
+// _alloc_bufs.
+inline Tub<Packet> DPDKQueuePair::from_mbuf_lro(rte_mbuf* m)
+{
+  _frags.clear();
+  _bufs.clear();
+
+  rte_mbuf* seg = m;
+  while (seg != nullptr) {
+    char* seg_data = rte_pktmbuf_mtod(seg, char*);
+
+    _frags.emplace_back(fragment{seg_data, rte_pktmbuf_data_len(seg)});
+    _bufs.push_back(seg_data);
+    seg = seg->next;
+  }
+
+  // The collected pointers are moved out of _bufs and bound into the deleter,
+  // so the deleter owns its own list of buffers.
+  auto release_bufs = std::bind(
+          [this](std::vector<char*> &bufs) {
+            for (auto&& b : bufs) { _alloc_bufs.push_back(b); }
+          }, std::move(_bufs));
+
+  return Packet(
+      _frags.begin(), _frags.end(), make_deleter(std::move(release_bufs)));
+}
+
+// Wraps a received mbuf into a Packet. The mbuf itself is queued on
+// _rx_free_pkts and its segment count is added to _num_rx_free_segs; the
+// packet's deleter pushes the data pointer onto _alloc_bufs.
+inline Tub<Packet> DPDKQueuePair::from_mbuf(rte_mbuf* m)
+{
+  _rx_free_pkts.push_back(m);
+  _num_rx_free_segs += m->nb_segs;
+
+  // Only a non-contiguous mbuf on an LRO-enabled device takes the
+  // multi-fragment path.
+  if (_dev->hw_features_ref().rx_lro && !rte_pktmbuf_is_contiguous(m)) {
+    return from_mbuf_lro(m);
+  }
+
+  char* payload = rte_pktmbuf_mtod(m, char*);
+  return Packet(fragment{payload, rte_pktmbuf_data_len(m)},
+                make_deleter([this, payload] { _alloc_bufs.push_back(payload); }));
+}
+
+// Walks the mbuf chain starting at "head" and calls refill_rx_mbuf() on each
+// segment. Refilled segments are collected in _rx_free_bufs.
+//
+// @return true if every segment has been refilled; false if a refill failed,
+//         in which case the failing segment (and thus the rest of the chain
+//         linked behind it) is pushed back onto _rx_free_pkts for a later
+//         retry.
+inline bool DPDKQueuePair::refill_one_cluster(rte_mbuf* head)
+{
+  rte_mbuf* seg = head;
+
+  while (seg != nullptr) {
+    if (!refill_rx_mbuf(seg, mbuf_data_size, _alloc_bufs)) {
+      _rx_free_pkts.push_back(seg);
+      return false;
+    }
+
+    _rx_free_bufs.push_back(seg);
+    seg = seg->next;
+  }
+
+  return true;
+}
+
+// Refills the mbufs of already consumed Rx packets and returns them to the
+// Rx mempool. Runs only when the number of pending free segments has reached
+// rx_gc_thresh or when "force" is set.
+//
+// @return true if the number of pending free segments is (still) at or above
+//         rx_gc_thresh when the function finishes.
+bool DPDKQueuePair::rx_gc(bool force)
+{
+  if (!(_num_rx_free_segs >= rx_gc_thresh || force)) {
+    // Nothing to do; the threshold comparison is known to be false here.
+    return false;
+  }
+
+  ldout(cct, 10) << __func__ << " free segs " << _num_rx_free_segs
+                 << " thresh " << rx_gc_thresh
+                 << " free pkts " << _rx_free_pkts.size()
+                 << dendl;
+
+  //
+  // Consume _rx_free_pkts from the back (back() + pop_back()) so that no
+  // trailing _rx_free_pkts.clear() is needed - clear() has a linear
+  // complexity.
+  //
+  while (!_rx_free_pkts.empty()) {
+    auto pkt = _rx_free_pkts.back();
+    _rx_free_pkts.pop_back();
+
+    if (!refill_one_cluster(pkt)) {
+      ldout(cct, 1) << __func__ << " get new mbuf failed " << dendl;
+      break;
+    }
+  }
+
+  for (auto&& seg : _rx_free_bufs) {
+    rte_pktmbuf_prefree_seg(seg);
+  }
+
+  if (_rx_free_bufs.size()) {
+    rte_mempool_put_bulk(_pktmbuf_pool_rx,
+                         (void **)_rx_free_bufs.data(),
+                         _rx_free_bufs.size());
+
+    // TODO: ceph_assert() in a fast path! Remove me ASAP!
+    ceph_assert(_num_rx_free_segs >= _rx_free_bufs.size());
+
+    _num_rx_free_segs -= _rx_free_bufs.size();
+    _rx_free_bufs.clear();
+
+    // TODO: ceph_assert() in a fast path! Remove me ASAP!
+    // Either both the packet list and the segment counter are drained, or
+    // neither of them is.
+    ceph_assert((_rx_free_pkts.empty() && !_num_rx_free_segs) ||
+           (!_rx_free_pkts.empty() && _num_rx_free_segs));
+  }
+
+  return _num_rx_free_segs >= rx_gc_thresh;
+}
+
+
+// Converts each of the "count" received mbufs in "bufs" into a Packet,
+// attaches the offload information and passes it to the device's l2receive().
+// Packets that fail the conversion or carry a bad checksum flag are dropped.
+//
+// Note that the rx_packets / rx_last_bunch counters are updated with "count",
+// i.e. they include the dropped packets.
+void DPDKQueuePair::process_packets(
+    struct rte_mbuf **bufs, uint16_t count)
+{
+  uint64_t total_frags = 0;
+  uint64_t total_bytes = 0;
+
+  for (uint16_t idx = 0; idx < count; ++idx) {
+    struct rte_mbuf *mbuf = bufs[idx];
+    offload_info oi;
+
+    Tub<Packet> pkt = from_mbuf(mbuf);
+
+    // Drop the packet if translation above has failed
+    if (!pkt) {
+      perf_logger->inc(l_dpdk_qp_rx_no_memory_errors);
+      continue;
+    }
+    // ldout(cct, 0) << __func__ << " len " << pkt->len() << " " << dendl;
+
+    total_frags += mbuf->nb_segs;
+    total_bytes += mbuf->pkt_len;
+
+    // Set stripped VLAN value if available
+    const bool vlan_stripped =
+        (_dev->_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) &&
+        (mbuf->ol_flags & PKT_RX_VLAN_STRIPPED);
+    if (vlan_stripped) {
+      oi.vlan_tci = mbuf->vlan_tci;
+    }
+
+    // Note that when _hw_features.rx_csum_offload is on, the receive
+    // code for ip, tcp and udp will assume they don't need to check
+    // the checksum again, because we did this here.
+    if (_dev->get_hw_features().rx_csum_offload &&
+        (mbuf->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD))) {
+      // Packet with bad checksum, just drop it.
+      perf_logger->inc(l_dpdk_qp_rx_bad_checksum_errors);
+      continue;
+    }
+
+    pkt->set_offload_info(oi);
+    if (mbuf->ol_flags & PKT_RX_RSS_HASH) {
+      pkt->set_rss_hash(mbuf->hash.rss);
+    }
+
+    _dev->l2receive(_qid, std::move(*pkt));
+  }
+
+  perf_logger->inc(l_dpdk_qp_rx_packets, count);
+  perf_logger->set(l_dpdk_qp_rx_last_bunch, count);
+  perf_logger->inc(l_dpdk_qp_rx_fragments, total_frags);
+  perf_logger->inc(l_dpdk_qp_rx_bytes, total_bytes);
+}
+
+// Reads one burst of up to packet_read_size packets from this queue of the
+// port and processes them.
+//
+// With CEPH_PERF_DEV defined the cycles spent on non-empty bursts are
+// accumulated, and on an empty burst the average rx/tx cost is logged once
+// more than 10000 packets have been received (and tx_count is non-zero),
+// after which the accumulators are reset.
+//
+// @return true if at least one packet has been received.
+bool DPDKQueuePair::poll_rx_once()
+{
+  struct rte_mbuf *buf[packet_read_size];
+
+  /* read a port */
+#ifdef CEPH_PERF_DEV
+  uint64_t start = Cycles::rdtsc();
+#endif
+  uint16_t count = rte_eth_rx_burst(_dev_port_idx, _qid,
+                                       buf, packet_read_size);
+
+  /* Now process the NIC packets read */
+  if (likely(count > 0)) {
+    process_packets(buf, count);
+#ifdef CEPH_PERF_DEV
+    // Accumulate (not overwrite): the average below divides rx_cycles by the
+    // accumulated rx_count, exactly like tx_cycles / tx_count in poll_tx().
+    rx_cycles += Cycles::rdtsc() - start;
+    rx_count += count;
+#endif
+  }
+#ifdef CEPH_PERF_DEV
+  else {
+    if (rx_count > 10000 && tx_count) {
+      ldout(cct, 0) << __func__ << " rx count=" << rx_count << " avg rx=" << Cycles::to_nanoseconds(rx_cycles)/rx_count << "ns "
+                    << " tx count=" << tx_count << " avg tx=" << Cycles::to_nanoseconds(tx_cycles)/tx_count << "ns"
+                    << dendl;
+      rx_count = rx_cycles = tx_count = tx_cycles = 0;
+    }
+  }
+#endif
+
+  return count;
+}
+
+// Looks up the Tx mbuf mempool of queue "qid" by name. If it does not exist
+// yet, the pool is created and the Tx queue of the port is set up; a failure
+// of either step aborts. Finally the factory is filled via init_factory().
+DPDKQueuePair::tx_buf_factory::tx_buf_factory(CephContext *c,
+        DPDKDevice *dev, uint8_t qid): cct(c)
+{
+  const std::string pool_name =
+      std::string(pktmbuf_pool_name) + std::to_string(qid) + "_tx";
+
+  _pool = rte_mempool_lookup(pool_name.c_str());
+  if (!_pool) {
+    ldout(cct, 0) << __func__ << " Creating Tx mbuf pool '" << pool_name.c_str()
+                  << "' [" << mbufs_per_queue_tx << " mbufs] ..." << dendl;
+    //
+    // We are going to push the buffers from the mempool into
+    // the circular_buffer and then poll them from there anyway, so
+    // we prefer to make a mempool non-atomic in this case.
+    //
+    _pool = rte_mempool_create(pool_name.c_str(),
+                               mbufs_per_queue_tx, inline_mbuf_size,
+                               mbuf_cache_size,
+                               sizeof(struct rte_pktmbuf_pool_private),
+                               rte_pktmbuf_pool_init, nullptr,
+                               rte_pktmbuf_init, nullptr,
+                               rte_socket_id(), 0);
+    if (!_pool) {
+      lderr(cct) << __func__ << " Failed to create mempool for Tx" << dendl;
+      ceph_abort();
+    }
+
+    // The Tx queue is only set up on this path, i.e. together with a freshly
+    // created pool.
+    const int setup_rc = rte_eth_tx_queue_setup(
+        dev->port_idx(), qid, default_ring_size,
+        rte_eth_dev_socket_id(dev->port_idx()),
+        dev->def_tx_conf());
+    if (setup_rc < 0) {
+      lderr(cct) << __func__ << " cannot initialize tx queue" << dendl;
+      ceph_abort();
+    }
+  }
+
+  //
+  // Fill the factory with the buffers from the mempool allocated
+  // above.
+  //
+  init_factory();
+}
+
+// Decides whether the mbuf cluster starting at "head" has too many segments
+// for the i40e per-packet (non-TSO) or per-MSS-window (TSO) fragment limit,
+// i40e_max_xmit_segment_frags.
+//
+// @param head head of the cluster; ol_flags, nb_segs, l2_len/l3_len/l4_len
+//             and tso_segsz are read from it
+//
+// @return true if the original packet should be linearized
+bool DPDKQueuePair::tx_buf::i40e_should_linearize(rte_mbuf *head)
+{
+  bool is_tso = head->ol_flags & PKT_TX_TCP_SEG;
+
+  // For a non-TSO case: number of fragments should not exceed 8
+  if (!is_tso){
+    return head->nb_segs > i40e_max_xmit_segment_frags;
+  }
+
+  //
+  // For a TSO case each MSS window should not include more than 8
+  // fragments including headers.
+  //
+
+  // Calculate the number of frags containing headers.
+  //
+  // Note: we support neither VLAN nor tunneling thus headers size
+  // accounting is super simple.
+  //
+  size_t headers_size = head->l2_len + head->l3_len + head->l4_len;
+  unsigned hdr_frags = 0;
+  size_t cur_payload_len = 0;
+  rte_mbuf *cur_seg = head;
+
+  // Consume segments until they cover all header bytes. Afterwards
+  // cur_payload_len may exceed headers_size if the last header segment also
+  // carries payload.
+  while (cur_seg && cur_payload_len < headers_size) {
+    cur_payload_len += cur_seg->data_len;
+    cur_seg = cur_seg->next;
+    hdr_frags++;
+  }
+
+  //
+  // Header fragments will be used for each TSO segment, thus the
+  // maximum number of data segments will be 8 minus the number of
+  // header fragments.
+  //
+  // It's unclear from the spec how the first TSO segment is treated
+  // if the last fragment with headers contains some data bytes:
+  // whether this fragment will be accounted as a single fragment or
+  // as two separate fragments. We prefer to play it safe and assume
+  // that this fragment will be accounted as two separate fragments:
+  // it is counted once in hdr_frags above and once more in the first
+  // window below (via prev_frag_data).
+  //
+  // NOTE(review): if hdr_frags could ever exceed
+  // i40e_max_xmit_segment_frags this unsigned subtraction would wrap -
+  // confirm that headers always fit into fewer fragments.
+  //
+  size_t max_win_size = i40e_max_xmit_segment_frags - hdr_frags;
+
+  // The whole cluster fits into a single window - no need to look closer.
+  if (head->nb_segs <= max_win_size) {
+    return false;
+  }
+
+  // Get the data (without headers) part of the first data fragment
+  size_t prev_frag_data = cur_payload_len - headers_size;
+  auto mss = head->tso_segsz;
+
+  // Walk the remaining segments one MSS-sized window at a time and count
+  // how many fragments each window touches.
+  while (cur_seg) {
+    unsigned frags_in_seg = 0;
+    size_t cur_seg_size = 0;
+
+    // Bytes left over from the fragment that ended the previous window
+    // (or from the last header fragment) open the current window and
+    // count as one fragment.
+    if (prev_frag_data) {
+      cur_seg_size = prev_frag_data;
+      frags_in_seg++;
+      prev_frag_data = 0;
+    }
+
+    while (cur_seg_size < mss && cur_seg) {
+      cur_seg_size += cur_seg->data_len;
+      cur_seg = cur_seg->next;
+      frags_in_seg++;
+
+      if (frags_in_seg > max_win_size) {
+        return true;
+      }
+    }
+
+    // The last fragment of this window spills over into the next one.
+    if (cur_seg_size > mss) {
+      prev_frag_data = cur_seg_size - mss;
+    }
+  }
+
+  return false;
+}
+
+// Translates the offload info of packet "p" into the ol_flags and header
+// length fields of the cluster's head mbuf: IP checksum, and - only when the
+// port reports tx_csum_l4_offload and the protocol is TCP - TCP checksum and
+// TSO.
+void DPDKQueuePair::tx_buf::set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head)
+{
+  auto oi = p.offload_info();
+
+  if (oi.needs_ip_csum) {
+    head->ol_flags |= PKT_TX_IP_CKSUM;
+    // TODO: Take a VLAN header into an account here
+    head->l2_len = sizeof(struct ether_hdr);
+    head->l3_len = oi.ip_hdr_len;
+  }
+
+  // Everything below concerns L4 (TCP) offloads only.
+  if (!qp.port().get_hw_features().tx_csum_l4_offload) {
+    return;
+  }
+  if (!(oi.protocol == ip_protocol_num::tcp)) {
+    return;
+  }
+
+  // Handle TCP checksum offload
+  head->ol_flags |= PKT_TX_TCP_CKSUM;
+  // TODO: Take a VLAN header into an account here
+  head->l2_len = sizeof(struct ether_hdr);
+  head->l3_len = oi.ip_hdr_len;
+
+  if (!oi.tso_seg_size) {
+    return;
+  }
+
+  // TSO is only requested together with the IP checksum offload.
+  ceph_assert(oi.needs_ip_csum);
+  head->ol_flags |= PKT_TX_TCP_SEG;
+  head->l4_len = oi.tcp_hdr_len;
+  head->tso_segsz = oi.tso_seg_size;
+}
+
+// Builds an mbuf cluster for packet "p", one sub-chain per packet fragment,
+// and stores the packet itself in the tx_buf of the last segment.
+//
+// If the resulting cluster violates one of the segment-count limits checked
+// at the end, the cluster is recycled, the packet is linearized and the
+// cluster is built again from the "build_mbuf_cluster" label.
+//
+// @param cct context used for logging
+// @param p   packet to translate; moved into the cluster on success
+// @param qp  DPDKQueuePair the tx buffers are taken from
+//
+// @return the head tx_buf of the cluster, or nullptr if a tx buffer could
+//         not be obtained
+DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_zc(
+        CephContext *cct, Packet&& p, DPDKQueuePair& qp)
+{
+  // Too fragmented - linearize
+  if (p.nr_frags() > max_frags) {
+    p.linearize();
+    qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops);
+  }
+
+ build_mbuf_cluster:
+  rte_mbuf *head = nullptr, *last_seg = nullptr;
+  unsigned nsegs = 0;
+
+  //
+  // Create a HEAD of the fragmented packet: check if frag0 has to be
+  // copied and if yes - send it in a copy way
+  //
+  if (!check_frag0(p)) {
+    if (!copy_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
+      ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl;
+      return nullptr;
+    }
+  } else if (!translate_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
+    ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl;
+    return nullptr;
+  }
+
+  unsigned total_nsegs = nsegs;
+
+  // Translate the remaining fragments and append each resulting sub-chain
+  // to the cluster.
+  for (unsigned i = 1; i < p.nr_frags(); i++) {
+    rte_mbuf *h = nullptr, *new_last_seg = nullptr;
+    if (!translate_one_frag(qp, p.frag(i), h, new_last_seg, nsegs)) {
+      ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(i).size << dendl;
+      // Give back what has been chained to the head so far
+      me(head)->recycle();
+      return nullptr;
+    }
+
+    total_nsegs += nsegs;
+
+    // Attach a new buffers' chain to the packet chain
+    last_seg->next = h;
+    last_seg = new_last_seg;
+  }
+
+  // Update the HEAD buffer with the packet info
+  head->pkt_len = p.len();
+  head->nb_segs = total_nsegs;
+  // tx_pkt_burst loops until the next pointer is null, so last_seg->next must
+  // be null.
+  last_seg->next = nullptr;
+
+  set_cluster_offload_info(p, qp, head);
+
+  //
+  // If a packet hasn't been linearized already and the resulting
+  // cluster requires the linearisation due to HW limitation:
+  //
+  //    - Recycle the cluster.
+  //    - Linearize the packet.
+  //    - Build the cluster once again
+  //
+  // The i40e and vmxnet3 conditions require more than one packet fragment,
+  // so they cannot trigger again once the packet has been linearized.
+  //
+  if (head->nb_segs > max_frags ||
+      (p.nr_frags() > 1 && qp.port().is_i40e_device() && i40e_should_linearize(head)) ||
+      (p.nr_frags() > vmxnet3_max_xmit_segment_frags && qp.port().is_vmxnet3_device())) {
+    me(head)->recycle();
+    p.linearize();
+    qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops);
+
+    goto build_mbuf_cluster;
+  }
+
+  // The packet travels with the last segment of the cluster; reset_zc() of
+  // that tx_buf destroys it later.
+  me(last_seg)->set_packet(std::move(p));
+
+  return me(head);
+}
+
+// Copies the bytes of all fragments of "p", in order, into the mbuf chain
+// starting at "head". Every mbuf receives up to inline_mbuf_data_size bytes
+// and gets its data_len set; the chain has to be long enough for the packet.
+void DPDKQueuePair::tx_buf::copy_packet_to_cluster(const Packet& p, rte_mbuf* head)
+{
+  rte_mbuf* seg = head;
+  size_t seg_used = 0;      // bytes already written into "seg"
+  unsigned frag_idx = 0;    // fragment currently being read
+  size_t frag_done = 0;     // bytes already consumed from that fragment
+
+  for (;;) {
+    // Copy as much as both the current fragment and the current mbuf allow
+    const size_t frag_left = p.frag(frag_idx).size - frag_done;
+    const size_t seg_room = inline_mbuf_data_size - seg_used;
+    const size_t chunk = std::min(frag_left, seg_room);
+
+    memcpy(rte_pktmbuf_mtod_offset(seg, void*, seg_used),
+           p.frag(frag_idx).base + frag_done, chunk);
+
+    frag_done += chunk;
+    seg_used += chunk;
+
+    // Fragment exhausted: advance to the next one, or finish
+    if (frag_done >= p.frag(frag_idx).size) {
+      ++frag_idx;
+      if (frag_idx >= p.nr_frags()) {
+        //
+        // We are done - set the data size of the last segment
+        // of the cluster.
+        //
+        seg->data_len = seg_used;
+        break;
+      }
+
+      frag_done = 0;
+    }
+
+    // mbuf full: close it and continue with the next one in the chain
+    if (seg_used >= inline_mbuf_data_size) {
+      seg->data_len = inline_mbuf_data_size;
+      seg = seg->next;
+      seg_used = 0;
+
+      // FIXME: assert in a fast-path - remove!!!
+      ceph_assert(seg);
+    }
+  }
+}
+
+// Builds an mbuf cluster that holds a copy of packet "p".
+//
+// The whole cluster is allocated first; the data is only copied once all
+// tx buffers have been obtained.
+//
+// @return the head tx_buf of the cluster, or nullptr if the packet is empty
+//         or a tx buffer could not be obtained
+DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_copy(Packet&& p, DPDKQueuePair& qp)
+{
+  // sanity
+  if (!p.len()) {
+    return nullptr;
+  }
+
+  //
+  // The number of mbufs needed is the packet length rounded up to a multiple
+  // of inline_mbuf_data_size (a power of two), divided by that size.
+  //
+  auto aligned_len = align_up((size_t)p.len(), inline_mbuf_data_size);
+  unsigned nsegs = aligned_len / inline_mbuf_data_size;
+
+  tx_buf* first = qp.get_tx_buf();
+  if (!first) {
+    return nullptr;
+  }
+
+  rte_mbuf* head = first->rte_mbuf_p();
+  rte_mbuf* last_seg = head;
+
+  // Chain the remaining nsegs - 1 buffers behind the head
+  for (unsigned allocated = 1; allocated < nsegs; ++allocated) {
+    tx_buf* extra = qp.get_tx_buf();
+    if (!extra) {
+      // Give back everything chained so far
+      me(head)->recycle();
+      return nullptr;
+    }
+
+    last_seg->next = extra->rte_mbuf_p();
+    last_seg = last_seg->next;
+  }
+
+  //
+  // All buffers are in place: fill in the head buffer, copy the data and
+  // set the offload info.
+  //
+  head->pkt_len = p.len();
+  head->nb_segs = nsegs;
+  // tx_pkt_burst loops until the next pointer is null, so last_seg->next must
+  // be null.
+  last_seg->next = nullptr;
+
+  copy_packet_to_cluster(p, head);
+  set_cluster_offload_info(p, qp, head);
+
+  return me(head);
+}
+
+// Takes one tx buffer from "qp" and copies up to inline_mbuf_data_size bytes
+// of "data" into it. "m" is set to the buffer's rte_mbuf.
+//
+// @return the number of bytes copied, or 0 if no tx buffer was available
+//         (in which case "m" is left untouched)
+size_t DPDKQueuePair::tx_buf::copy_one_data_buf(
+    DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len)
+{
+  tx_buf* txb = qp.get_tx_buf();
+  if (!txb) {
+    return 0;
+  }
+
+  const size_t copied = std::min(buf_len, inline_mbuf_data_size);
+  m = txb->rte_mbuf_p();
+
+  // mbuf_put()
+  m->data_len = copied;
+  m->pkt_len  = copied;
+
+  qp.perf_logger->inc(l_dpdk_qp_tx_copy_ops);
+  qp.perf_logger->inc(l_dpdk_qp_tx_copy_bytes, copied);
+
+  memcpy(rte_pktmbuf_mtod(m, void*), data, copied);
+  return copied;
+}
+
+/******************************** Interface functions *************************/
+
+// Creates the DPDKDevice for port "port_idx". Terminates the process through
+// rte_exit() if DPDK reports no available Ethernet port.
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+    CephContext *cct,
+    unsigned cores,
+    uint8_t port_idx,
+    bool use_lro,
+    bool enable_fc)
+{
+  // Check that we have at least one DPDK-able port
+  if (rte_eth_dev_count_avail() == 0) {
+    rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
+  }
+  // rte_exit() does not return, so this point is only reached with ports.
+  ldout(cct, 10) << __func__ << " ports number: " << int(rte_eth_dev_count_avail()) << dendl;
+
+  std::unique_ptr<DPDKDevice> device(
+      new DPDKDevice(cct, port_idx, cores, use_lro, enable_fc));
+  return device;
+}
diff --git a/src/msg/async/dpdk/DPDK.h b/src/msg/async/dpdk/DPDK.h
new file mode 100644
index 000000000..78a1a0769
--- /dev/null
+++ b/src/msg/async/dpdk/DPDK.h
@@ -0,0 +1,921 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_DPDK_DEV_H
+#define CEPH_DPDK_DEV_H
+
+#include <memory>
+#include <functional>
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_version.h>
+
+#include "include/page.h"
+#include "common/Tub.h"
+#include "common/perf_counters.h"
+#include "msg/async/Event.h"
+#include "const.h"
+#include "circular_buffer.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "net.h"
+#include "toeplitz.h"
+
+
+// Deleter functor that releases the given pointer with ::free().
+struct free_deleter {
+  void operator()(void* p) {
+    ::free(p);
+  }
+};
+
+
+// Indices of the device-wide perf counters (the ones set in
+// DPDKQueuePair::handle_stats()). l_dpdk_dev_first / l_dpdk_dev_last are
+// presumably the range bounds passed to the PerfCountersBuilder - confirm
+// against the DPDKDevice constructor.
+enum {
+  l_dpdk_dev_first = 58800,
+  l_dpdk_dev_rx_mcast,
+  l_dpdk_dev_rx_total_errors,
+  l_dpdk_dev_tx_total_errors,
+  l_dpdk_dev_rx_badcrc_errors,
+  l_dpdk_dev_rx_dropped_errors,
+  l_dpdk_dev_rx_nombuf_errors,
+  l_dpdk_dev_last
+};
+
+// Indices of the per-queue-pair perf counters. l_dpdk_qp_first and
+// l_dpdk_qp_last are the range bounds handed to the PerfCountersBuilder in
+// the DPDKQueuePair constructor; the entries in between are the counters
+// registered there.
+enum {
+  l_dpdk_qp_first = 58900,
+  l_dpdk_qp_rx_packets,
+  l_dpdk_qp_tx_packets,
+  l_dpdk_qp_rx_bad_checksum_errors,
+  l_dpdk_qp_rx_no_memory_errors,
+  l_dpdk_qp_rx_bytes,
+  l_dpdk_qp_tx_bytes,
+  l_dpdk_qp_rx_last_bunch,
+  l_dpdk_qp_tx_last_bunch,
+  l_dpdk_qp_rx_fragments,
+  l_dpdk_qp_tx_fragments,
+  l_dpdk_qp_rx_copy_ops,
+  l_dpdk_qp_tx_copy_ops,
+  l_dpdk_qp_rx_copy_bytes,
+  l_dpdk_qp_tx_copy_bytes,
+  l_dpdk_qp_rx_linearize_ops,
+  l_dpdk_qp_tx_linearize_ops,
+  l_dpdk_qp_tx_queue_length,
+  l_dpdk_qp_last
+};
+
+class DPDKDevice;
+class DPDKWorker;
+
+class DPDKQueuePair {
+  using packet_provider_type = std::function<Tub<Packet> ()>;
+ public:
+  void configure_proxies(const std::map<unsigned, float>& cpu_weights);
+  // build REdirection TAble for cpu_weights map: target cpu -> weight
+  void build_sw_reta(const std::map<unsigned, float>& cpu_weights);
+  // Appends packet "p" to _proxy_packetq (the packet is moved in).
+  void proxy_send(Packet p) {
+    _proxy_packetq.push_back(std::move(p));
+  }
+  // Registers a callable that poll_tx() invokes to obtain packets to send;
+  // the callable returns an empty Tub when it has nothing to offer.
+  void register_packet_provider(packet_provider_type func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  bool poll_tx();
+  friend class DPDKDevice;
+
+  class tx_buf_factory;
+
+  class tx_buf {
+    friend class DPDKQueuePair;
+   public:
+    // Reinterprets an rte_mbuf pointer as the tx_buf that embeds it. This
+    // relies on _mbuf being located at offset 0 of tx_buf.
+    static tx_buf* me(rte_mbuf* mbuf) {
+      return reinterpret_cast<tx_buf*>(mbuf);
+    }
+
+   private:
+    /**
+     * Checks if the original packet of a given cluster should be linearized
+     * due to HW limitations.
+     *
+     * @param head head of a cluster to check
+     *
+     * @return TRUE if a packet should be linearized.
+     */
+    static bool i40e_should_linearize(rte_mbuf *head);
+
+    /**
+     * Sets the offload info in the head buffer of an rte_mbufs cluster.
+     *
+     * @param p an original packet the cluster is built for
+     * @param qp QP handle
+     * @param head a head of an rte_mbufs cluster
+     */
+    static void set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head);
+
+    /**
+     * Creates a tx_buf cluster representing a given packet in a "zero-copy"
+     * way.
+     *
+     * @param p packet to translate
+     * @param qp DPDKQueuePair handle
+     *
+     * @return the HEAD tx_buf of the cluster or nullptr in case of a
+     *         failure
+     */
+    static tx_buf* from_packet_zc(
+            CephContext *cct, Packet&& p, DPDKQueuePair& qp);
+
+    /**
+     * Copy the contents of the "packet" into the given cluster of
+     * rte_mbuf's.
+     *
+     * @note Size of the cluster has to be big enough to accommodate all the
+     *       contents of the given packet.
+     *
+     * @param p packet to copy
+     * @param head head of the rte_mbuf's cluster
+     */
+    static void copy_packet_to_cluster(const Packet& p, rte_mbuf* head);
+
+    /**
+     * Creates a tx_buf cluster representing a given packet in a "copy" way.
+     *
+     * @param p packet to translate
+     * @param qp DPDKQueuePair handle
+     *
+     * @return the HEAD tx_buf of the cluster or nullptr in case of a
+     *         failure
+     */
+    static tx_buf* from_packet_copy(Packet&& p, DPDKQueuePair& qp);
+
+    /**
+     * Generic handling of a single fragment: splits the fragment into as
+     * many rte_mbuf's as "do_one_buf" needs and chains them into a cluster.
+     * Whether the data is copied or referenced is up to "do_one_buf".
+     *
+     * @param do_one_buf Functor responsible for a single rte_mbuf
+     *                   handling; returns the number of bytes it has
+     *                   consumed, 0 on failure
+     * @param qp DPDKQueuePair handle (in)
+     * @param frag Fragment to handle, must not be empty (in)
+     * @param head Head of the cluster (out)
+     * @param last_seg Last segment of the cluster (out)
+     * @param nsegs Number of segments in the cluster (out)
+     *
+     * @return TRUE in case of success. On failure any mbufs already chained
+     *         to the head are recycled and "last_seg" is not set.
+     */
+    template <class DoOneBufFunc>
+    static bool do_one_frag(DoOneBufFunc do_one_buf, DPDKQueuePair& qp,
+                            fragment& frag, rte_mbuf*& head,
+                            rte_mbuf*& last_seg, unsigned& nsegs) {
+      size_t len, left_to_set = frag.size;
+      char* base = frag.base;
+
+      // Only read after do_one_buf() has reported success for it
+      rte_mbuf* m;
+
+      // TODO: ceph_assert() in a fast path! Remove me ASAP!
+      ceph_assert(frag.size);
+
+      // Create a HEAD of mbufs' cluster and set the first bytes into it
+      len = do_one_buf(qp, head, base, left_to_set);
+      if (!len) {
+        return false;
+      }
+
+      left_to_set -= len;
+      base += len;
+      nsegs = 1;
+
+      //
+      // Set the rest of the data into the new mbufs and chain them to
+      // the cluster.
+      //
+      rte_mbuf* prev_seg = head;
+      while (left_to_set) {
+        len = do_one_buf(qp, m, base, left_to_set);
+        if (!len) {
+          me(head)->recycle();
+          return false;
+        }
+
+        left_to_set -= len;
+        base += len;
+        nsegs++;
+
+        prev_seg->next = m;
+        prev_seg = m;
+      }
+
+      // Return the last mbuf in the cluster
+      last_seg = prev_seg;
+
+      return true;
+    }
+
+    /**
+     * Handling of a single fragment through set_one_data_buf(), i.e. the
+     * path intended for zero-copy.
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param frag Fragment to translate (in)
+     * @param head Head of the cluster (out)
+     * @param last_seg Last segment of the cluster (out)
+     * @param nsegs Number of segments in the cluster (out)
+     *
+     * @return TRUE in case of success
+     */
+    static bool translate_one_frag(DPDKQueuePair& qp, fragment& frag,
+                                   rte_mbuf*& head, rte_mbuf*& last_seg,
+                                   unsigned& nsegs) {
+      return do_one_frag(set_one_data_buf, qp, frag, head,
+                         last_seg, nsegs);
+    }
+
+    /**
+     * Copies one fragment into a cluster of rte_mbuf's, using
+     * copy_one_data_buf() for every single buffer.
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param frag Fragment to copy (in)
+     * @param head Head of the cluster (out)
+     * @param last_seg Last segment of the cluster (out)
+     * @param nsegs Number of segments in the cluster (out)
+     *
+     * We return the "last_seg" to avoid traversing the cluster in order to get
+     * it.
+     *
+     * @return TRUE in case of success
+     */
+    static bool copy_one_frag(DPDKQueuePair& qp, fragment& frag,
+                              rte_mbuf*& head, rte_mbuf*& last_seg,
+                              unsigned& nsegs) {
+      return do_one_frag(copy_one_data_buf, qp, frag, head,
+                         last_seg, nsegs);
+    }
+
+    /**
+     * Intended to allocate a single rte_mbuf and set it to point to a given
+     * data buffer (zero-copy).
+     *
+     * @note As written, the function unconditionally returns the result of
+     *       copy_one_data_buf() in its first statement, so the data is
+     *       always copied and the zero-copy code below that return is
+     *       unreachable.
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param m New allocated rte_mbuf (out)
+     * @param va virtual address of a data buffer (in)
+     * @param buf_len length of the data to set (in)
+     *
+     * @return The actual number of bytes that has been set in the mbuf
+     */
+    static size_t set_one_data_buf(
+        DPDKQueuePair& qp, rte_mbuf*& m, char* va, size_t buf_len) {
+      static constexpr size_t max_frag_len = 15 * 1024; // 15K
+
+      // FIXME: current all tx buf is allocated without rte_malloc
+      return copy_one_data_buf(qp, m, va, buf_len);
+
+      // ---- Everything below is currently dead code (see the FIXME) ----
+
+      //
+      // Currently we break a buffer on a 15K boundary because 82599
+      // devices have a 15.5K limitation on a maximum single fragment
+      // size.
+      //
+      rte_iova_t pa = rte_malloc_virt2iova(va);
+      // No IO address for this buffer - fall back to copying
+      if (!pa)
+        return copy_one_data_buf(qp, m, va, buf_len);
+
+      ceph_assert(buf_len);
+      tx_buf* buf = qp.get_tx_buf();
+      if (!buf) {
+        return 0;
+      }
+
+      size_t len = std::min(buf_len, max_frag_len);
+
+      buf->set_zc_info(va, pa, len);
+      m = buf->rte_mbuf_p();
+
+      return len;
+    }
+
+    /**
+     *  Allocates a single rte_mbuf and copies a given data into it.
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param m New allocated rte_mbuf (out)
+     * @param data Data to copy from (in)
+     * @param buf_len length of the data to copy (in)
+     *
+     * @return The actual number of bytes that has been copied
+     */
+    static size_t copy_one_data_buf(
+        DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len);
+
+    /**
+     * Checks if the first fragment of the given packet qualifies for the
+     * zero-copy flow. The first fragment carries the packet headers, which
+     * should not be split.
+     *
+     * @note The only condition actually checked here is that the first
+     *       fragment is at least 128 bytes long; no page-boundary or
+     *       physical-contiguity check is performed.
+     *
+     * @param p packet to check
+     *
+     * @return TRUE if packet is ok and FALSE otherwise.
+     */
+    static bool check_frag0(Packet& p)
+    {
+      //
+      // First frag is special - it has headers that should not be split.
+      // A first fragment shorter than 128 bytes is sent in a (non-zero)
+      // copy flow by the caller; anything else is considered good to go.
+      //
+      if (p.frag(0).size < 128)
+        return false;
+
+      return true;
+    }
+
+   public:
+    // Binds the buffer to its factory and remembers the mbuf's current
+    // buf_physaddr and data_off, which reset_zc() writes back after the
+    // mbuf has been used by set_zc_info().
+    tx_buf(tx_buf_factory& fc) : _fc(fc) {
+
+      _buf_physaddr = _mbuf.buf_physaddr;
+      _data_off     = _mbuf.data_off;
+    }
+
+    // Returns a pointer to the rte_mbuf embedded in this tx_buf.
+    rte_mbuf* rte_mbuf_p() { return &_mbuf; }
+
+    // Points the embedded mbuf at an external data buffer of "len" bytes
+    // located at virtual address "va" / physical address "pa", and marks the
+    // buffer as used in the zero-copy flow. reset_zc() undoes this.
+    void set_zc_info(void* va, phys_addr_t pa, size_t len) {
+      // mbuf_put()
+      _mbuf.data_len           = len;
+      _mbuf.pkt_len            = len;
+
+      // Set the mbuf to point to our data
+      _mbuf.buf_addr           = va;
+      _mbuf.buf_physaddr       = pa;
+      _mbuf.data_off           = 0;
+      _is_zc                   = true;
+    }
+
+    // Releases the packet attached to this buffer (if any) and restores the
+    // mbuf fields changed by set_zc_info(). Does nothing if the buffer holds
+    // no packet and was not used in the zero-copy flow.
+    void reset_zc() {
+
+      //
+      // If this mbuf was the last in a cluster and contains an
+      // original packet object then call the destructor of the
+      // original packet object.
+      //
+      if (_p) {
+        //
+        // Destroy the content of the Tub. This in particular is going
+        // to call the "packet"'s destructor and leave the Tub empty.
+        //
+        _p.destroy();
+
+      } else if (!_is_zc) {
+        return;
+      }
+
+      // Restore the rte_mbuf fields we trashed in set_zc_info()
+      _mbuf.buf_physaddr = _buf_physaddr;
+      _mbuf.buf_addr     = rte_mbuf_to_baddr(&_mbuf);
+      _mbuf.data_off     = _data_off;
+
+      _is_zc             = false;
+    }
+
+    // Resets every mbuf of the chain that starts at this buffer and hands
+    // each of them back to the factory.
+    void recycle() {
+      for (struct rte_mbuf *seg = &_mbuf; seg != nullptr; ) {
+        // Remember the successor first: rte_pktmbuf_reset() is applied to
+        // "seg" before we move on.
+        struct rte_mbuf *following = seg->next;
+
+        rte_pktmbuf_reset(seg);
+        _fc.put(me(seg));
+
+        seg = following;
+      }
+    }
+
+    // Stores (moves) the original packet into this buffer; reset_zc()
+    // destroys it again.
+    void set_packet(Packet&& p) {
+      _p = std::move(p);
+    }
+
+   private:
+    // Has to stay the first member: me() reinterprets an rte_mbuf* as a
+    // tx_buf*, and DPDK.cc static_asserts that offsetof(tx_buf, _mbuf) == 0.
+    struct rte_mbuf _mbuf;
+    // private_start / private_end bracket the tx_buf specific fields;
+    // DPDK.cc static_asserts that the span between them does not exceed
+    // RTE_PKTMBUF_HEADROOM.
+    MARKER private_start;
+    // Original packet; set via set_packet() on the last buffer of a cluster
+    Tub<Packet> _p;
+    // Values of _mbuf.buf_physaddr / _mbuf.data_off saved by the constructor
+    // and restored by reset_zc()
+    phys_addr_t _buf_physaddr;
+    uint16_t _data_off;
+    // TRUE if underlying mbuf has been used in the zero-copy flow
+    bool _is_zc = false;
+    // buffers' factory the buffer came from
+    tx_buf_factory& _fc;
+    MARKER private_end;
+  };
+
+  class tx_buf_factory {
+    //
+    // Number of buffers to free in each GC iteration:
+    // We want the buffers to be allocated from the mempool as many as
+    // possible.
+    //
+    // On the other hand if there is no Tx for some time we want the
+    // completions to be eventually handled. Thus we choose the smallest
+    // possible packets count number here.
+    //
+    static constexpr int gc_count = 1;
+   public:
+    tx_buf_factory(CephContext *c, DPDKDevice *dev, uint8_t qid);
+    /**
+     * Collect every buffer that can still be taken from _pool into _ring,
+     * then return the whole content of _ring to the mempool in one bulk put.
+     */
+    ~tx_buf_factory() {
+      // put all mbuf back into mempool in order to make the next factory work
+      // gc() returns false as soon as get_one_completed() yields nothing
+      while (gc());
+      rte_mempool_put_bulk(_pool, (void**)_ring.data(),
+                           _ring.size());
+    }
+
+
+    /**
+     * Hand out a tx_buf, preferring one that has already been completed
+     * over one from the factory's own cache.
+     *
+     * @note Should not be called if there are no free tx_buf's
+     *
+     * @return a free tx_buf object, or nullptr when neither source has one
+     */
+    tx_buf* get() {
+      // Completed buffers come first; they must be cleaned before reuse
+      if (tx_buf *completed = get_one_completed()) {
+        completed->reset_zc();
+        return completed;
+      }
+
+      // Nothing completed right now: fall back to the factory's cache
+      if (_ring.empty()) {
+        return nullptr;
+      }
+
+      tx_buf *cached = _ring.back();
+      _ring.pop_back();
+      return cached;
+    }
+
+    /**
+     * Return a buffer to the factory's cache after resetting its zero-copy
+     * state.
+     *
+     * @param buf buffer appended to _ring
+     */
+    void put(tx_buf* buf) {
+      buf->reset_zc();
+      _ring.push_back(buf);
+    }
+
+    /**
+     * Move up to gc_count completed buffers into the factory's cache.
+     *
+     * @return true when gc_count buffers were collected, false as soon as
+     *         no completed buffer is available
+     */
+    bool gc() {
+      int reclaimed = 0;
+      while (reclaimed < gc_count) {
+        tx_buf *done = get_one_completed();
+        if (!done) {
+          return false;
+        }
+
+        put(done);
+        ++reclaimed;
+      }
+
+      return true;
+    }
+   private:
+    /**
+     * Fill the mbufs circular buffer: after this the _pool will become
+     * empty. We will use it to catch the completed buffers:
+     *
+     * - Underlying PMD drivers will "free" the mbufs once they are
+     *   completed.
+     * - We will poll the _pktmbuf_pool_tx till it's empty and release
+     *   all the buffers from the freed mbufs.
+     */
+    void init_factory() {
+      // Loop ends when rte_pktmbuf_alloc() returns a null mbuf
+      while (rte_mbuf* mbuf = rte_pktmbuf_alloc(_pool)) {
+        // Placement-new: the tx_buf is constructed in the storage that
+        // tx_buf::me() associates with this mbuf, no extra allocation
+        _ring.push_back(new(tx_buf::me(mbuf)) tx_buf{*this});
+      }
+    }
+
+    /**
+     * PMD puts the completed buffers back into the mempool they have
+     * originally come from.
+     *
+     * @note rte_pktmbuf_alloc() resets the mbuf so there is no need to call
+     *       rte_pktmbuf_reset() here again.
+     *
+     * @return a single tx_buf that has been completed by HW.
+     */
+    tx_buf* get_one_completed() {
+      // NOTE(review): callers compare the result against nullptr, so
+      // tx_buf::me() presumably maps a null mbuf to a null tx_buf - confirm.
+      return tx_buf::me(rte_pktmbuf_alloc(_pool));
+    }
+
+   private:
+    CephContext *cct;
+    std::vector<tx_buf*> _ring;
+    rte_mempool* _pool = nullptr;
+  };
+
+ public:
+  explicit DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid);
+  /**
+   * Cancel the statistics time event if one was registered, then run a
+   * forced Rx garbage collection.
+   */
+  ~DPDKQueuePair() {
+    // device_stat_time_fd stays 0 until a time event id is stored in it
+    if (device_stat_time_fd) {
+      center->delete_time_event(device_stat_time_fd);
+    }
+    rx_gc(true);
+  }
+
+  /**
+   * Construct the Rx poller for this queue pair; it is held in a Tub and
+   * therefore does not exist until this call.
+   */
+  void rx_start() {
+    _rx_poller.construct(this);
+  }
+
+  /**
+   * Transmit packets from pb, converting each one into a tx_buf through
+   * tx_buf::from_packet_zc().
+   *
+   * @param pb queue of packets to transmit; sent packets are popped by _send()
+   *
+   * @return the value returned by _send()
+   */
+  uint32_t send(circular_buffer<Packet>& pb) {
+    // Zero-copy send
+    return _send(pb, [&] (Packet&& p) {
+      return tx_buf::from_packet_zc(cct, std::move(p), *this);
+    });
+  }
+
+  /// @return the device this queue pair was created for (dereferences _dev)
+  DPDKDevice& port() const { return *_dev; }
+  /// @return a buffer from _tx_buf_factory.get(), which may be nullptr
+  tx_buf* get_tx_buf() { return _tx_buf_factory.get(); }
+
+  void handle_stats();
+
+ private:
+  /**
+   * Convert queued packets into mbufs and push them to the device.
+   *
+   * A new burst is only assembled when the previous one has been sent
+   * completely; otherwise the call retries the unsent remainder of the
+   * pending burst.
+   *
+   * @param pb                 packets to transmit; one entry is popped for
+   *                           every mbuf the device accepted
+   * @param packet_to_tx_buf_p callable turning a Packet&& into a tx_buf*,
+   *                           or nullptr when no buffer could be obtained
+   *
+   * @return number of mbufs accepted by rte_eth_tx_burst() in this call
+   */
+  template <class Func>
+  uint32_t _send(circular_buffer<Packet>& pb, Func &&packet_to_tx_buf_p) {
+    if (_tx_burst.size() == 0) {
+      for (auto&& p : pb) {
+        // TODO: ceph_assert() in a fast path! Remove me ASAP!
+        ceph_assert(p.len());
+
+        tx_buf* buf = packet_to_tx_buf_p(std::move(p));
+        // Stop assembling the burst once no tx_buf is available
+        if (!buf) {
+          break;
+        }
+
+        _tx_burst.push_back(buf->rte_mbuf_p());
+      }
+    }
+
+    // Offer everything from the current burst position up to its end
+    uint16_t sent = rte_eth_tx_burst(_dev_port_idx, _qid,
+                                     _tx_burst.data() + _tx_burst_idx,
+                                     _tx_burst.size() - _tx_burst_idx);
+
+    uint64_t nr_frags = 0, bytes = 0;
+
+    // Account for the accepted mbufs and drop their packets from pb
+    for (int i = 0; i < sent; i++) {
+      rte_mbuf* m = _tx_burst[_tx_burst_idx + i];
+      bytes    += m->pkt_len;
+      nr_frags += m->nb_segs;
+      pb.pop_front();
+    }
+
+    perf_logger->inc(l_dpdk_qp_tx_fragments, nr_frags);
+    perf_logger->inc(l_dpdk_qp_tx_bytes, bytes);
+
+    _tx_burst_idx += sent;
+
+    // Whole burst went out: start from an empty burst next time
+    if (_tx_burst_idx == _tx_burst.size()) {
+      _tx_burst_idx = 0;
+      _tx_burst.clear();
+    }
+
+    return sent;
+  }
+
+  /**
+   * Take one data buffer from the back of datas and make the mbuf use it.
+   *
+   * DPDK hack needed by the PMD: buf_addr is expected to point
+   * RTE_PKTMBUF_HEADROOM bytes before the actual data buffer, so both the
+   * virtual and the physical address are shifted back by that amount.
+   *
+   * @param m     mbuf to update
+   * @param size  not used by this function
+   * @param datas pool of spare data buffers; the last entry is consumed
+   *
+   * @return false when datas is empty (m is left untouched), true otherwise
+   */
+  static bool refill_rx_mbuf(rte_mbuf* m, size_t size,
+                             std::vector<void*> &datas) {
+    if (datas.empty()) {
+      return false;
+    }
+
+    void *payload = datas.back();
+    datas.pop_back();
+
+    // Both addresses refer to the headroom that precedes the payload
+    m->buf_addr      = (char*)payload - RTE_PKTMBUF_HEADROOM;
+    m->buf_physaddr  = rte_mem_virt2phy(payload) - RTE_PKTMBUF_HEADROOM;
+    return true;
+  }
+
+  bool init_rx_mbuf_pool();
+  bool rx_gc(bool force=false);
+  bool refill_one_cluster(rte_mbuf* head);
+
+  /**
+   * Polls for a burst of incoming packets. This function will not block and
+   * will immediately return after processing all available packets.
+   *
+   */
+  bool poll_rx_once();
+
+  /**
+   * Translates an rte_mbuf's into packet and feeds them to _rx_stream.
+   *
+   * @param bufs An array of received rte_mbuf's
+   * @param count Number of buffers in the bufs[]
+   */
+  void process_packets(struct rte_mbuf **bufs, uint16_t count);
+
+  /**
+   * Translate rte_mbuf into the "packet".
+   * @param m mbuf to translate
+   *
+   * @return a "optional" object representing the newly received data if in an
+   *         "engaged" state or an error if in a "disengaged" state.
+   */
+  Tub<Packet> from_mbuf(rte_mbuf* m);
+
+  /**
+   * Transform an LRO rte_mbuf cluster into the "packet" object.
+   * @param m HEAD of the mbufs' cluster to transform
+   *
+   * @return a "optional" object representing the newly received LRO packet if
+   *         in an "engaged" state or an error if in a "disengaged" state.
+   */
+  Tub<Packet> from_mbuf_lro(rte_mbuf* m);
+
+ private:
+  CephContext *cct;
+  std::vector<packet_provider_type> _pkt_providers;
+  Tub<std::array<uint8_t, 128>> _sw_reta;
+  circular_buffer<Packet> _proxy_packetq;
+  stream<Packet> _rx_stream;
+  circular_buffer<Packet> _tx_packetq;
+  std::vector<void*> _alloc_bufs;
+
+  PerfCounters *perf_logger;
+  DPDKDevice* _dev;
+  uint8_t _dev_port_idx;
+  EventCenter *center;
+  uint8_t _qid;
+  rte_mempool *_pktmbuf_pool_rx;
+  std::vector<rte_mbuf*> _rx_free_pkts;
+  std::vector<rte_mbuf*> _rx_free_bufs;
+  std::vector<fragment> _frags;
+  std::vector<char*> _bufs;
+  size_t _num_rx_free_segs = 0;
+  uint64_t device_stat_time_fd = 0;
+
+#ifdef CEPH_PERF_DEV
+  uint64_t rx_cycles = 0;
+  uint64_t rx_count = 0;
+  uint64_t tx_cycles = 0;
+  uint64_t tx_count = 0;
+#endif
+
+  /**
+   * Poller registered with the queue pair's EventCenter under the name
+   * "DPDK::DPDKTXPoller"; each poll() forwards to DPDKQueuePair::poll_tx().
+   */
+  class DPDKTXPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKTXPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKTXPoller"), qp(qp) {}
+
+    /// @return the result of qp->poll_tx()
+    virtual int poll() {
+      return qp->poll_tx();
+    }
+  } _tx_poller;
+
+  /**
+   * Poller registered with the queue pair's EventCenter under the name
+   * "DPDK::DPDKRXGCPoller"; each poll() forwards to DPDKQueuePair::rx_gc()
+   * with its default (non-forced) argument.
+   */
+  class DPDKRXGCPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKRXGCPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKRXGCPoller"), qp(qp) {}
+
+    /// @return the bool result of qp->rx_gc(), converted to int
+    virtual int poll() {
+      return qp->rx_gc();
+    }
+  } _rx_gc_poller;
+  tx_buf_factory _tx_buf_factory;
+  /**
+   * Poller registered with the queue pair's EventCenter under the name
+   * "DPDK::DPDKRXPoller"; each poll() forwards to
+   * DPDKQueuePair::poll_rx_once(). Instances are created by rx_start().
+   */
+  class DPDKRXPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKRXPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKRXPoller"), qp(qp) {}
+
+    /// @return the bool result of qp->poll_rx_once(), converted to int
+    virtual int poll() {
+      return qp->poll_rx_once();
+    }
+  };
+  Tub<DPDKRXPoller> _rx_poller;
+  /**
+   * Poller registered with the queue pair's EventCenter under the name
+   * "DPDK::DPDKTXGCPoller"; each poll() runs one gc() pass of the queue
+   * pair's tx_buf_factory.
+   */
+  class DPDKTXGCPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKTXGCPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKTXGCPoller"), qp(qp) {}
+
+    /// @return the bool result of qp->_tx_buf_factory.gc(), converted to int
+    virtual int poll() {
+      return qp->_tx_buf_factory.gc();
+    }
+  } _tx_gc_poller;
+  std::vector<rte_mbuf*> _tx_burst;
+  uint16_t _tx_burst_idx = 0;
+};
+
+class DPDKDevice {
+ public:
+  CephContext *cct;
+  PerfCounters *perf_logger;
+  std::vector<std::unique_ptr<DPDKQueuePair>> _queues;
+  std::vector<DPDKWorker*> workers;
+  size_t _rss_table_bits = 0;
+  uint8_t _port_idx;
+  uint16_t _num_queues;
+  unsigned cores;
+  hw_features _hw_features;
+  uint8_t _queues_ready = 0;
+  unsigned _home_cpu;
+  bool _use_lro;
+  bool _enable_fc;
+  std::vector<uint16_t> _redir_table;
+  rss_key_type _rss_key;
+  struct rte_flow *_flow = nullptr;
+  bool _is_i40e_device = false;
+  bool _is_vmxnet3_device = false;
+
+ public:
+  rte_eth_dev_info _dev_info = {};
+
+  /**
+   * The final stage of a port initialization.
+   * @note Must be called *after* all queues from stage (2) have been
+   *       initialized.
+   */
+  int init_port_fini();
+
+ private:
+  /**
+   * Port initialization consists of 3 main stages:
+   * 1) General port initialization which ends with a call to
+   *    rte_eth_dev_configure() where we request the needed number of Rx and
+   *    Tx queues.
+   * 2) Individual queues initialization. This is done in the constructor of
+   *    DPDKQueuePair class. In particular the memory pools for queues are allocated
+   *    in this stage.
+   * 3) The final stage of the initialization which starts with the call of
+   *    rte_eth_dev_start() after which the port becomes fully functional. We
+   *    will also wait for a link to get up in this stage.
+   */
+
+
+  /**
+   * First stage of the port initialization.
+   *
+   * @return 0 in case of success and an appropriate error code in case of an
+   *         error.
+   */
+  int init_port_start();
+
+  /**
+   * Check the link status of our port for up to 9s, and finally print it.
+   */
+  int check_port_link_status();
+
+  /**
+   * Configures the HW Flow Control
+   */
+  void set_hw_flow_control();
+
+ public:
+  /**
+   * Run the first stage of the port initialization and register the
+   * per-port perf counters.
+   *
+   * The process exits through rte_exit() when init_port_start() fails.
+   *
+   * @param c          Ceph context used for the perf counters
+   * @param port_idx   index of the port to drive
+   * @param num_queues number of (initially empty) queue slots to create
+   * @param use_lro    stored in _use_lro
+   * @param enable_fc  stored in _enable_fc
+   */
+  DPDKDevice(CephContext *c, uint8_t port_idx, uint16_t num_queues, bool use_lro, bool enable_fc):
+      cct(c), _port_idx(port_idx), _num_queues(num_queues),
+      _home_cpu(0), _use_lro(use_lro),
+      _enable_fc(enable_fc) {
+    // One null slot per queue; filled later through set_local_queue()
+    _queues = std::vector<std::unique_ptr<DPDKQueuePair>>(_num_queues);
+    /* now initialise the port we will use */
+    int ret = init_port_start();
+    if (ret != 0) {
+      rte_exit(EXIT_FAILURE, "Cannot initialise port %u\n", _port_idx);
+    }
+    // Counters are published under the name "port<port_idx>"
+    string name(std::string("port") + std::to_string(port_idx));
+    PerfCountersBuilder plb(cct, name, l_dpdk_dev_first, l_dpdk_dev_last);
+
+    plb.add_u64_counter(l_dpdk_dev_rx_mcast, "dpdk_device_receive_multicast_packets", "DPDK received multicast packets");
+    plb.add_u64_counter(l_dpdk_dev_rx_badcrc_errors, "dpdk_device_receive_badcrc_errors", "DPDK received bad crc errors");
+
+    plb.add_u64_counter(l_dpdk_dev_rx_total_errors, "dpdk_device_receive_total_errors", "DPDK received total_errors");
+    plb.add_u64_counter(l_dpdk_dev_tx_total_errors, "dpdk_device_send_total_errors", "DPDK sendd total_errors");
+    plb.add_u64_counter(l_dpdk_dev_rx_dropped_errors, "dpdk_device_receive_dropped_errors", "DPDK received dropped errors");
+    plb.add_u64_counter(l_dpdk_dev_rx_nombuf_errors, "dpdk_device_receive_nombuf_errors", "DPDK received RX mbuf allocation errors");
+
+    perf_logger = plb.create_perf_counters();
+    cct->get_perfcounters_collection()->add(perf_logger);
+  }
+
+  /**
+   * Destroy the flow rule, if one was installed, and stop the port.
+   *
+   * NOTE(review): perf_logger, which the constructor adds to the perf
+   * counters collection, is neither removed nor deleted here - confirm
+   * whether that is released elsewhere.
+   */
+  ~DPDKDevice() {
+    if (_flow)
+       rte_flow_destroy(_port_idx, _flow, nullptr);
+    rte_eth_dev_stop(_port_idx);
+  }
+
+  /// @return the queue pair stored in slot cpu; the slot is dereferenced
+  ///         without a check, so it must have been set
+  DPDKQueuePair& queue_for_cpu(unsigned cpu) { return *_queues[cpu]; }
+  /// Feed packet p into the Rx stream of queue qid.
+  void l2receive(int qid, Packet p) {
+    _queues[qid]->_rx_stream.produce(std::move(p));
+  }
+  /**
+   * Register next_packet as listener on the Rx stream of queue cpuid and
+   * start that queue's Rx poller.
+   *
+   * @return the subscription returned by the stream's listen()
+   */
+  subscription<Packet> receive(unsigned cpuid, std::function<int (Packet)> next_packet) {
+    auto sub = _queues[cpuid]->_rx_stream.listen(std::move(next_packet));
+    _queues[cpuid]->rx_start();
+    return sub;
+  }
+  /// @return the MAC address that rte_eth_macaddr_get() reports for _port_idx
+  ethernet_address hw_address() {
+    struct ether_addr mac;
+    rte_eth_macaddr_get(_port_idx, &mac);
+
+    // ethernet_address is built implicitly from the raw address bytes
+    return mac.addr_bytes;
+  }
+  /// @return a copy of _hw_features
+  hw_features get_hw_features() {
+    return _hw_features;
+  }
+  /// @return read-only reference to _rss_key
+  const rss_key_type& rss_key() const { return _rss_key; }
+  /// @return the number of queues passed to the constructor (_num_queues)
+  uint16_t hw_queues_count() { return _num_queues; }
+  /**
+   * Create the queue pair with id qid for this device.
+   *
+   * @param c         Ceph context handed to the queue pair
+   * @param center    event center handed to the queue pair
+   * @param hugepages not used by this function
+   * @param qid       queue id handed to the queue pair
+   *
+   * @return owning pointer to the new DPDKQueuePair
+   */
+  std::unique_ptr<DPDKQueuePair> init_local_queue(CephContext *c, EventCenter *center, string hugepages, uint16_t qid) {
+    return std::unique_ptr<DPDKQueuePair>(new DPDKQueuePair(c, center, this, qid));
+  }
+  /**
+   * Look up the queue id for an RSS hash in _redir_table.
+   *
+   * NOTE(review): masking with size() - 1 only addresses the whole table
+   * when its size is a power of two, and an empty table would be indexed
+   * out of range - confirm set_rss_table() guarantees both.
+   */
+  unsigned hash2qid(uint32_t hash) {
+    // return hash % hw_queues_count();
+    return _redir_table[hash & (_redir_table.size() - 1)];
+  }
+  /// Store qp in slot i; asserts that the slot is still empty.
+  void set_local_queue(unsigned i, std::unique_ptr<DPDKQueuePair> qp) {
+    ceph_assert(!_queues[i]);
+    _queues[i] = std::move(qp);
+  }
+  /// Destroy the queue pair in slot i; asserts that the slot is occupied.
+  void unset_local_queue(unsigned i) {
+    ceph_assert(_queues[i]);
+    _queues[i].reset();
+  }
+  /**
+   * Pick the CPU a packet received on src_cpuid's queue is forwarded to.
+   *
+   * When the source queue has no software redirection table the packet
+   * stays on src_cpuid and hashfn is not called. Otherwise the hash is
+   * shifted right by _rss_table_bits and used to index that table.
+   *
+   * @param src_cpuid CPU whose queue pair received the packet
+   * @param hashfn    callable returning the packet's hash; only invoked
+   *                  when a redirection table exists
+   *
+   * @return destination CPU id
+   */
+  template <typename Func>
+  unsigned forward_dst(unsigned src_cpuid, Func&& hashfn) {
+    auto& qp = queue_for_cpu(src_cpuid);
+    if (!qp._sw_reta)
+      return src_cpuid;
+
+    // _sw_reta is engaged from here on. The original code asserted the
+    // opposite at this point, which aborted on every redirected packet.
+    auto hash = hashfn() >> _rss_table_bits;
+    auto& reta = *qp._sw_reta;
+    return reta[hash % reta.size()];
+  }
+  /// Map an RSS hash to a CPU: hash2qid() picks the queue, forward_dst()
+  /// then applies that queue's software redirection, if any.
+  unsigned hash2cpu(uint32_t hash) {
+    // there is an assumption here that qid == get_id() which will
+    // not necessary be true in the future
+    return forward_dst(hash2qid(hash), [hash] { return hash; });
+  }
+
+  /// @return mutable reference to _hw_features
+  hw_features& hw_features_ref() { return _hw_features; }
+
+  /// @return pointer to the default_rxconf member of _dev_info
+  const rte_eth_rxconf* def_rx_conf() const {
+    return &_dev_info.default_rxconf;
+  }
+
+  /// @return pointer to the default_txconf member of _dev_info
+  const rte_eth_txconf* def_tx_conf() const {
+    return &_dev_info.default_txconf;
+  }
+
+  /**
+   *  Set the RSS table in the device and store it in the internal vector.
+   */
+  void set_rss_table();
+
+  /// @return the port index passed to the constructor
+  uint8_t port_idx() { return _port_idx; }
+  /// @return the _is_i40e_device flag (false unless set elsewhere)
+  bool is_i40e_device() const {
+    return _is_i40e_device;
+  }
+  /// @return the _is_vmxnet3_device flag (false unless set elsewhere)
+  bool is_vmxnet3_device() const {
+    return _is_vmxnet3_device;
+  }
+};
+
+
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+    CephContext *c, unsigned cores, uint8_t port_idx = 0,
+    bool use_lro = true, bool enable_fc = true);
+
+
+/**
+ * @return Number of bytes needed for mempool objects of each QP.
+ */
+uint32_t qp_mempool_obj_size();
+
+#endif // CEPH_DPDK_DEV_H
diff --git a/src/msg/async/dpdk/DPDKStack.cc b/src/msg/async/dpdk/DPDKStack.cc
new file mode 100644
index 000000000..9a73dac5d
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.cc
@@ -0,0 +1,280 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <memory>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <tuple>
+
+#include "common/ceph_argparse.h"
+#include "dpdk_rte.h"
+#include "DPDKStack.h"
+#include "DPDK.h"
+#include "IP.h"
+#include "TCP-Stack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+#include "common/Cond.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdkstack "
+
+/**
+ * Trampoline with the int(void*) shape handed to rte_eal_remote_launch():
+ * treats f as a pointer to std::function<void ()> and invokes it.
+ *
+ * @param f pointer to the std::function<void ()> to run
+ * @return always 0
+ */
+static int dpdk_thread_adaptor(void* f)
+{
+  auto *task = static_cast<std::function<void ()>*>(f);
+  (*task)();
+  return 0;
+}
+
+/**
+ * Bring up the shared DPDK device and this worker's queue.
+ *
+ * All workers run this function and coordinate through function-local
+ * static state guarded by `lock`:
+ *  1. worker 0 creates the device, the others wait for it;
+ *  2. every worker creates and registers its own queue pair;
+ *  3. worker 0 waits for all queues, finishes the port initialization and
+ *     releases the others;
+ *  4. every worker creates its Impl; the last one resets the shared state
+ *     so that the sequence can run again.
+ *
+ * Aborts when the worker id is not below the number of HW queues or when
+ * init_port_fini() fails.
+ */
+void DPDKWorker::initialize()
+{
+  static enum {
+    WAIT_DEVICE_STAGE,
+    WAIT_PORT_FIN_STAGE,
+    DONE
+  } create_stage = WAIT_DEVICE_STAGE;
+  static ceph::mutex lock = ceph::make_mutex("DPDKStack::lock");
+  static ceph::condition_variable cond;
+  static unsigned queue_init_done = 0;
+  static unsigned cores = 0;
+  static std::shared_ptr<DPDKDevice> sdev;
+
+  unsigned i = center.get_id();
+  if (i == 0) {
+    // Hardcoded port index 0.
+    // TODO: Inherit it from the opts
+    cores = cct->_conf->ms_async_op_threads;
+    std::unique_ptr<DPDKDevice> dev = create_dpdk_net_device(
+        cct, cores, cct->_conf->ms_dpdk_port_id,
+        cct->_conf->ms_dpdk_lro,
+        cct->_conf->ms_dpdk_hw_flow_control);
+    sdev = std::shared_ptr<DPDKDevice>(dev.release());
+    sdev->workers.resize(cores);
+    ldout(cct, 1) << __func__ << " using " << cores << " cores " << dendl;
+
+    std::lock_guard l{lock};
+    create_stage = WAIT_PORT_FIN_STAGE;
+    cond.notify_all();
+  } else {
+    // Wait until worker 0 has published the device
+    std::unique_lock l{lock};
+    cond.wait(l, [] { return create_stage > WAIT_DEVICE_STAGE; });
+  }
+  ceph_assert(sdev);
+  if (i < sdev->hw_queues_count()) {
+    auto qp = sdev->init_local_queue(cct, &center, cct->_conf->ms_dpdk_hugepages, i);
+    std::map<unsigned, float> cpu_weights;
+    // Cores beyond the HW queue count that map onto this queue get weight 1.
+    // The entry must be keyed by the proxied core j; keying it by i (as the
+    // code used to) only rewrote this queue's own entry.
+    for (unsigned j = sdev->hw_queues_count() + i % sdev->hw_queues_count();
+         j < cores; j+= sdev->hw_queues_count())
+      cpu_weights[j] = 1;
+    cpu_weights[i] = cct->_conf->ms_dpdk_hw_queue_weight;
+    qp->configure_proxies(cpu_weights);
+    sdev->set_local_queue(i, std::move(qp));
+    std::lock_guard l{lock};
+    ++queue_init_done;
+    cond.notify_all();
+  } else {
+    // auto master = qid % sdev->hw_queues_count();
+    // sdev->set_local_queue(create_proxy_net_device(master, sdev.get()));
+    ceph_abort();
+  }
+  if (i == 0) {
+    {
+      // All queues must exist before the port can be started
+      std::unique_lock l{lock};
+      cond.wait(l, [] { return queue_init_done >= cores; });
+    }
+
+    if (sdev->init_port_fini() < 0) {
+      lderr(cct) << __func__ << " init_port_fini failed " << dendl;
+      ceph_abort();
+    }
+    std::lock_guard l{lock};
+    create_stage = DONE;
+    cond.notify_all();
+  } else {
+    std::unique_lock  l{lock};
+    cond.wait(l, [&] { return create_stage > WAIT_PORT_FIN_STAGE; });
+  }
+
+  sdev->workers[i] = this;
+  _impl = std::unique_ptr<DPDKWorker::Impl>(
+          new DPDKWorker::Impl(cct, i, &center, sdev));
+  {
+    std::lock_guard l{lock};
+    // The last worker to get here rewinds the shared state and drops the
+    // static reference to the device (each Impl holds its own)
+    if (!--queue_init_done) {
+      create_stage = WAIT_DEVICE_STAGE;
+      sdev.reset();
+    }
+  }
+}
+
+using AvailableIPAddress = std::tuple<string, string, string>;
+/**
+ * Split three address lists with string_to_vec() and zip them into tuples
+ * of (ip, gateway, netmask).
+ *
+ * @param ips   list of host addresses
+ * @param gates list of gateway addresses
+ * @param masks list of netmasks
+ * @param res   receives one tuple per position, appended at the end
+ *
+ * @return false (res untouched) when the ip list is empty or the three
+ *         lists differ in length, true otherwise
+ */
+static bool parse_available_address(
+        const string &ips, const string &gates, const string &masks, vector<AvailableIPAddress> &res)
+{
+  vector<string> ip_vec, gate_vec, mask_vec;
+  string_to_vec(ip_vec, ips);
+  string_to_vec(gate_vec, gates);
+  string_to_vec(mask_vec, masks);
+
+  const size_t count = ip_vec.size();
+  const bool same_length = gate_vec.size() == count && mask_vec.size() == count;
+  if (count == 0 || !same_length) {
+    return false;
+  }
+
+  for (size_t pos = 0; pos != count; ++pos) {
+    res.emplace_back(ip_vec[pos], gate_vec[pos], mask_vec[pos]);
+  }
+  return true;
+}
+
+/**
+ * Find the first configured address that is on the same host as ip.
+ *
+ * Entries whose ip string cannot be parsed are skipped.
+ *
+ * @param avails configured (ip, gateway, netmask) tuples
+ * @param ip     address to look for
+ * @param res    receives the index of the match; untouched without a match
+ *
+ * @return true when a match was found
+ */
+static bool match_available_address(const vector<AvailableIPAddress> &avails,
+                                    const entity_addr_t &ip, int &res)
+{
+  size_t pos = 0;
+  for (const auto &entry : avails) {
+    entity_addr_t candidate;
+    if (candidate.parse(std::get<0>(entry).c_str()) &&
+        candidate.is_same_host(ip)) {
+      res = pos;
+      return true;
+    }
+    ++pos;
+  }
+  return false;
+}
+
+/**
+ * Build the per-worker network state and configure the IPv4 layer from the
+ * ms_dpdk_*_ipv4_addr options.
+ *
+ * Aborts when the three options cannot be parsed into matching lists. Only
+ * the first (ip, gateway, netmask) tuple is applied.
+ *
+ * @param cct Ceph context providing the configuration
+ * @param i   worker id, kept in `id`
+ * @param c   event center handed to the interface and the IPv4 layer
+ * @param dev device shared between the workers
+ */
+DPDKWorker::Impl::Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev)
+    : id(i), _netif(cct, dev, c), _dev(dev), _inet(cct, c, &_netif)
+{
+  vector<AvailableIPAddress> tuples;
+  bool parsed = parse_available_address(cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr"),
+                                        cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr"),
+                                        cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr"), tuples);
+  if (!parsed) {
+    lderr(cct) << __func__ << " no available address "
+               << cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr") << ", "
+               << cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr") << ", "
+               << cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr") << ", "
+               << dendl;
+    ceph_abort();
+  }
+  // A successful parse guarantees at least one tuple
+  _inet.set_host_address(ipv4_address(std::get<0>(tuples[0])));
+  _inet.set_gw_address(ipv4_address(std::get<1>(tuples[0])));
+  _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[0])));
+}
+
+/// Release this worker's queue pair slot on the shared device.
+DPDKWorker::Impl::~Impl()
+{
+  _dev->unset_local_queue(id);
+}
+
+/**
+ * Start listening on the port of sa through the worker's TCP stack.
+ *
+ * Asserts that sa is an AF_INET address and that sock is not null. Only
+ * the port and type of sa are passed on; the commented-out code below
+ * used to select the host address matching sa.
+ *
+ * @return the result of tcpv4_listen()
+ */
+int DPDKWorker::listen(entity_addr_t &sa,
+		       unsigned addr_slot,
+		       const SocketOptions &opt,
+                       ServerSocket *sock)
+{
+  ceph_assert(sa.get_family() == AF_INET);
+  ceph_assert(sock);
+
+  ldout(cct, 10) << __func__ << " addr " << sa << dendl;
+  // vector<AvailableIPAddress> tuples;
+  // bool parsed = parse_available_address(cct->_conf->ms_dpdk_host_ipv4_addr,
+  //                                       cct->_conf->ms_dpdk_gateway_ipv4_addr,
+  //                                       cct->_conf->ms_dpdk_netmask_ipv4_addr, tuples);
+  // if (!parsed) {
+  //   lderr(cct) << __func__ << " no available address "
+  //              << cct->_conf->ms_dpdk_host_ipv4_addr << ", "
+  //              << cct->_conf->ms_dpdk_gateway_ipv4_addr << ", "
+  //              << cct->_conf->ms_dpdk_netmask_ipv4_addr << ", "
+  //              << dendl;
+  //   return -EINVAL;
+  // }
+  // int idx;
+  // parsed = match_available_address(tuples, sa, idx);
+  // if (!parsed) {
+  //   lderr(cct) << __func__ << " no matched address for " << sa << dendl;
+  //   return -EINVAL;
+  // }
+  // _inet.set_host_address(ipv4_address(std::get<0>(tuples[idx])));
+  // _inet.set_gw_address(ipv4_address(std::get<1>(tuples[idx])));
+  // _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[idx])));
+  return tcpv4_listen(_impl->_inet.get_tcp(), sa.get_port(), opt, sa.get_type(),
+		      addr_slot, sock);
+}
+
+/**
+ * Open a TCP connection to addr through the worker's TCP stack.
+ *
+ * @param opts not used by this function
+ * @return the result of tcpv4_connect()
+ */
+int DPDKWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+  // ceph_assert(addr.get_family() == AF_INET);
+  int r =  tcpv4_connect(_impl->_inet.get_tcp(), addr, socket);
+  // Logged after the attempt, whatever its result
+  ldout(cct, 10) << __func__ << " addr " << addr << dendl;
+  return r;
+}
+
+/**
+ * Launch func on the i-th slave lcore.
+ *
+ * The callable is stored in funcs[i] so that it outlives this call, the
+ * DPDK EAL is initialised, and the launch itself is executed through
+ * dpdk::eal::execute_on_master(). Aborts when EAL initialisation or the
+ * remote launch fails.
+ *
+ * @param i    worker index; must be below the size of funcs
+ * @param func body of the worker
+ */
+void DPDKStack::spawn_worker(unsigned i, std::function<void ()> &&func)
+{
+  // create a extra master thread
+  //
+  funcs[i] = std::move(func);
+  int r = 0;
+  r = dpdk::eal::init(cct);
+  if (r < 0) {
+    lderr(cct) << __func__ << " init dpdk rte failed, r=" << r << dendl;
+    ceph_abort();
+  }
+  // if dpdk::eal::init already called by NVMEDevice, we will select 1..n
+  // cores
+  ceph_assert(rte_lcore_count() >= i + 1);
+  unsigned core_id;
+  // i is consumed as a countdown below; j keeps the original index
+  int j = i;
+  // Stop at the i-th slave lcore; core_id then holds its id
+  RTE_LCORE_FOREACH_SLAVE(core_id) {
+    if (i-- == 0) {
+      break;
+    }
+  }
+  // NOTE(review): the lambda captures locals by reference, so this relies
+  // on execute_on_master() finishing before it returns - confirm.
+  dpdk::eal::execute_on_master([&]() {
+    r = rte_eal_remote_launch(dpdk_thread_adaptor, static_cast<void*>(&funcs[j]), core_id);
+    if (r < 0) {
+      lderr(cct) << __func__ << " remote launch failed, r=" << r << dendl;
+      ceph_abort();
+    }
+  });
+}
+
+/**
+ * Wait for the worker with index i to finish.
+ *
+ * NOTE(review): this waits on lcore id i+1, whereas spawn_worker() launches
+ * on the i-th slave lcore found by RTE_LCORE_FOREACH_SLAVE; the two only
+ * agree when slave lcore ids are 1..n - confirm.
+ */
+void DPDKStack::join_worker(unsigned i)
+{
+  dpdk::eal::execute_on_master([&]() {
+    rte_eal_wait_lcore(i+1);
+  });
+}
diff --git a/src/msg/async/dpdk/DPDKStack.h b/src/msg/async/dpdk/DPDKStack.h
new file mode 100644
index 000000000..926adaffc
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.h
@@ -0,0 +1,266 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_MSG_DPDKSTACK_H
+#define CEPH_MSG_DPDKSTACK_H
+
+#include <functional>
+
+#include "common/ceph_context.h"
+#include "common/Tub.h"
+
+#include "msg/async/Stack.h"
+#include "net.h"
+#include "const.h"
+#include "IP.h"
+#include "Packet.h"
+
+class interface;
+
+template <typename Protocol>
+class NativeConnectedSocketImpl;
+
+// DPDKServerSocketImpl
+//
+// ServerSocketImpl backed by a listener of the user-space protocol stack
+// `Protocol` (its nested `listener` type).
+template <typename Protocol>
+class DPDKServerSocketImpl : public ServerSocketImpl {
+  typename Protocol::listener _listener;
+ public:
+  // Creates the listener through proto.listen(port); see the definition
+  // after the class.
+  DPDKServerSocketImpl(Protocol& proto, uint16_t port, const SocketOptions &opt,
+		       int type, unsigned addr_slot);
+  // Starts listening; returns the listener's result.
+  int listen() {
+    return _listener.listen();
+  }
+  virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+  virtual void abort_accept() override;
+  // Returns the descriptor reported by the listener.
+  virtual int fd() const override {
+    return _listener.fd();
+  }
+};
+
+// NativeConnectedSocketImpl
+template <typename Protocol>
+class NativeConnectedSocketImpl : public ConnectedSocketImpl {
+  typename Protocol::connection _conn;
+  uint32_t _cur_frag = 0;
+  uint32_t _cur_off = 0;
+  Tub<Packet> _buf;
+  Tub<bufferptr> _cache_ptr;
+
+ public:
+  /// Wrap an established protocol connection; conn is moved into _conn.
+  explicit NativeConnectedSocketImpl(typename Protocol::connection conn)
+          : _conn(std::move(conn)) {}
+  /// Move constructor: takes over the connection, the partially consumed
+  /// packet and the read cursors that index into that packet. The packet
+  /// member is named _buf (the code used to reference a non-existent
+  /// `rhs.buf`).
+  NativeConnectedSocketImpl(NativeConnectedSocketImpl &&rhs)
+      : _conn(std::move(rhs._conn)), _cur_frag(rhs._cur_frag),
+        _cur_off(rhs._cur_off), _buf(std::move(rhs._buf))  {}
+  /// @return the connection state reported by _conn.is_connected()
+  virtual int is_connected() override {
+    return _conn.is_connected();
+  }
+
+  /**
+   * Copy up to len bytes of received data into buf.
+   *
+   * Data is pulled fragment by fragment through zero_copy_read(); a
+   * fragment that does not fit completely stays in _cache_ptr for the
+   * next call.
+   *
+   * @param buf destination buffer
+   * @param len capacity of buf
+   *
+   * @return the number of bytes copied when at least one byte was copied;
+   *         -EAGAIN when nothing could be copied; otherwise the
+   *         non-positive value returned by zero_copy_read()
+   */
+  virtual ssize_t read(char *buf, size_t len) override {
+    size_t left = len;
+    size_t off = 0;
+    while (left > 0) {
+      if (!_cache_ptr) {
+        _cache_ptr.construct();
+        ssize_t r = zero_copy_read(*_cache_ptr);
+        if (r <= 0) {
+          _cache_ptr.destroy();
+          // Bytes already copied into buf must be reported to the caller
+          // and not dropped; zero_copy_read() checks _conn.get_errno()
+          // again on the next call, so the condition is seen then.
+          if (r == -EAGAIN || off > 0)
+            break;
+          return r;
+        }
+      }
+      if (_cache_ptr->length() <= left) {
+        // The whole cached fragment fits
+        _cache_ptr->copy_out(0, _cache_ptr->length(), buf+off);
+        left -= _cache_ptr->length();
+        off += _cache_ptr->length();
+        _cache_ptr.destroy();
+      } else {
+        // Partial copy: keep the unread tail cached
+        _cache_ptr->copy_out(0, left, buf+off);
+        _cache_ptr->set_offset(_cache_ptr->offset() + left);
+        _cache_ptr->set_length(_cache_ptr->length() - left);
+        off += left;
+        left = 0;
+        break;
+      }
+    }
+    if (off == 0)
+      return -EAGAIN;
+    return static_cast<ssize_t>(off);
+  }
+
+private:
+  /**
+   * Expose the next fragment of the current packet as a bufferptr without
+   * copying its bytes.
+   *
+   * @param data receives a buffer that refers to the fragment's memory
+   *
+   * @return the fragment length; the connection's errno value when that is
+   *         <= 0; -EAGAIN when no packet is available
+   */
+  ssize_t zero_copy_read(bufferptr &data) {
+    auto err = _conn.get_errno();
+    if (err <= 0)
+      return err;
+
+    // Fetch a new packet once the previous one has been fully consumed
+    if (!_buf) {
+      _buf = std::move(_conn.read());
+      if (!_buf)
+        return -EAGAIN;
+    }
+
+    fragment &f = _buf->frag(_cur_frag);
+    // The shared packet is moved into the deleter below, so it is kept
+    // for as long as the claimed buffer holds that deleter
+    Packet p = _buf->share(_cur_off, f.size);
+    auto del = std::bind(
+            [](Packet &p) {}, std::move(p));
+    data = buffer::claim_buffer(
+            f.size, f.base, make_deleter(std::move(del)));
+    // Advance the cursors; drop the packet after its last fragment
+    if (++_cur_frag == _buf->nr_frags()) {
+      _cur_frag = 0;
+      _cur_off = 0;
+      _buf.destroy();
+    } else {
+      _cur_off += f.size;
+    }
+    ceph_assert(data.length());
+    return data.length();
+  }
+  /**
+   * Queue as much of bl as the connection currently accepts.
+   *
+   * The buffers of bl are referenced as packet fragments, not copied. The
+   * part that is sent is removed from bl and kept in the packet's deleter.
+   *
+   * @param bl   data to send; the sent prefix is removed from it
+   * @param more not used by this function
+   *
+   * @return the connection's negative errno value if one is set, 0 when
+   *         no send space is available, otherwise the result of
+   *         _conn.send()
+   */
+  virtual ssize_t send(bufferlist &bl, bool more) override {
+    auto err = _conn.get_errno();
+    if (err < 0)
+      return (ssize_t)err;
+
+    size_t available = _conn.peek_sent_available();
+    if (available == 0) {
+      return 0;
+    }
+
+    std::vector<fragment> frags;
+    auto pb = bl.buffers().begin();
+    uint64_t len = 0;
+    uint64_t seglen = 0;
+    while (len < available && pb != bl.buffers().end()) {
+      seglen = pb->length();
+      // Buffer length is zero, no need to send, so skip it
+      if (seglen == 0) {
+        ++pb;
+        continue;
+      }
+      if (len + seglen > available) {
+        // Stop here if at least one fragment is already queued: there is
+        // no room left for the next ptr. Only a first fragment that is
+        // too large on its own gets truncated to the available space.
+        if (len > 0)
+          break;
+        seglen = std::min(seglen, available);
+      }
+      len += seglen;
+      frags.push_back(fragment{(char*)pb->c_str(), seglen});
+      ++pb;
+    }
+
+    if (len != bl.length()) {
+      // Partial send: move the sent prefix out of bl and bind it to the
+      // packet's deleter
+      bufferlist swapped;
+      bl.splice(0, len, &swapped);
+      auto del = std::bind(
+              [](bufferlist &bl) {}, std::move(swapped));
+      return _conn.send(Packet(std::move(frags), make_deleter(std::move(del))));
+    } else {
+      // Everything fits: the whole list is moved into the deleter
+      auto del = std::bind(
+              [](bufferlist &bl) {}, std::move(bl));
+
+      return _conn.send(Packet(std::move(frags), make_deleter(std::move(del))));
+    }
+  }
+
+public:
+  /// Close the write side of the connection.
+  virtual void shutdown() override {
+    _conn.close_write();
+  }
+  // FIXME need to impl close
+  // For now this does the same as shutdown(): only the write side is closed.
+  virtual void close() override {
+    _conn.close_write();
+  }
+  /// @return the descriptor reported by the connection
+  virtual int fd() const override {
+    return _conn.fd();
+  }
+};
+
+/**
+ * Create a server socket whose listener is obtained from proto.listen(port).
+ *
+ * @param proto     protocol stack that provides the listener
+ * @param port      port to listen on
+ * @param opt       not used by this constructor
+ * @param type      passed to the ServerSocketImpl base
+ * @param addr_slot passed to the ServerSocketImpl base
+ */
+template <typename Protocol>
+DPDKServerSocketImpl<Protocol>::DPDKServerSocketImpl(
+  Protocol& proto, uint16_t port, const SocketOptions &opt,
+  int type, unsigned addr_slot)
+  : ServerSocketImpl(type, addr_slot), _listener(proto.listen(port)) {}
+
+/**
+ * Accept one pending connection from the listener.
+ *
+ * @param s       receives the socket wrapping the accepted connection
+ * @param options not used by this function
+ * @param out     when non-null, receives the peer address with its type
+ *                set to addr_type
+ * @param w       not used by this function
+ *
+ * @return 0 on success, the listener's negative errno value if one is set,
+ *         or -EAGAIN when no connection is pending
+ */
+template <typename Protocol>
+int DPDKServerSocketImpl<Protocol>::accept(ConnectedSocket *s, const SocketOptions &options, entity_addr_t *out, Worker *w) {
+  if (_listener.get_errno() < 0) {
+    return _listener.get_errno();
+  }
+
+  auto pending = _listener.accept();
+  if (!pending) {
+    return -EAGAIN;
+  }
+
+  if (out) {
+    *out = pending->remote_addr();
+    out->set_type(addr_type);
+  }
+
+  // The connection is moved into a heap-allocated socket implementation
+  std::unique_ptr<NativeConnectedSocketImpl<Protocol>> impl(
+          new NativeConnectedSocketImpl<Protocol>(std::move(*pending)));
+  *s = ConnectedSocket(std::move(impl));
+  return 0;
+}
+
+/// Forward the abort request to the listener.
+template <typename Protocol>
+void DPDKServerSocketImpl<Protocol>::abort_accept() {
+  _listener.abort_accept();
+}
+
+/**
+ * Worker that serves connections through the user-space DPDK network
+ * stack. The per-worker network state lives in Impl, which is created by
+ * initialize() and released by destroy().
+ */
+class DPDKWorker : public Worker {
+  // Per-worker network state: interface, shared device and IPv4 layer.
+  // _netif is declared before _inet because _inet is built from &_netif.
+  struct Impl {
+    unsigned id;
+    interface _netif;
+    std::shared_ptr<DPDKDevice> _dev;
+    ipv4 _inet;
+    Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev);
+    ~Impl();
+  };
+  std::unique_ptr<Impl> _impl;
+
+  virtual void initialize() override;
+  // Installs filter on the IPv4 layer; requires _impl to exist.
+  void set_ipv4_packet_filter(ip_packet_filter* filter) {
+    _impl->_inet.set_packet_filter(filter);
+  }
+  using tcp4 = tcp<ipv4_traits>;
+
+ public:
+  explicit DPDKWorker(CephContext *c, unsigned i): Worker(c, i) {}
+  virtual int listen(entity_addr_t &addr, unsigned addr_slot,
+		     const SocketOptions &opts, ServerSocket *) override;
+  virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+  // Passes an (L2, L3) address pair to the IPv4 layer's learn().
+  void arp_learn(ethernet_address l2, ipv4_address l3) {
+    _impl->_inet.learn(l2, l3);
+  }
+  // Drops the network state; ~Impl releases the worker's queue slot.
+  virtual void destroy() override {
+    _impl.reset();
+  }
+
+  friend class DPDKServerSocketImpl<tcp4>;
+};
+
+/**
+ * NetworkStack implementation whose workers are DPDKWorker objects.
+ */
+class DPDKStack : public NetworkStack {
+  // Worker bodies, indexed by worker id; spawn_worker() passes pointers to
+  // these elements on, so the vector is sized once in the constructor.
+  vector<std::function<void()> > funcs;
+
+  virtual Worker* create_worker(CephContext *c, unsigned worker_id) override {
+    return new DPDKWorker(c, worker_id);
+  }
+
+ public:
+  // Reserves one slot per possible worker (ms_async_max_op_threads).
+  explicit DPDKStack(CephContext *cct): NetworkStack(cct) {
+    funcs.resize(cct->_conf->ms_async_max_op_threads);
+  }
+  virtual bool support_local_listen_table() const override { return true; }
+
+  virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override;
+  virtual void join_worker(unsigned i) override;
+};
+
+#endif
diff --git a/src/msg/async/dpdk/EventDPDK.cc b/src/msg/async/dpdk/EventDPDK.cc
new file mode 100644
index 000000000..5d291716c
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "DPDKStack.h"
+#include "EventDPDK.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "DPDKDriver."
+
+int DPDKDriver::init(EventCenter *c, int nevent)
+{
+	return 0;  // nothing to set up here; both arguments are ignored
+}
+
+int DPDKDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+	ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask
+								 << " add_mask=" << add_mask << dendl;
+	// Register add_mask for fd with the userspace event manager; 0 on success.
+	int r = manager.listen(fd, add_mask);
+	if (r < 0) {
+		lderr(cct) << __func__ << " add fd=" << fd << " failed. "
+		           << cpp_strerror(-r) << dendl;
+		return r;  // propagate the code just logged; errno may be stale or 0 here
+	}
+
+	return 0;
+}
+
+int DPDKDriver::del_event(int fd, int cur_mask, int delmask)
+{
+	ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
+								 << " delmask=" << delmask << dendl;
+	if (delmask == EVENT_NONE) {  // empty mask: nothing to unregister
+		return 0;
+	}
+	const int rc = manager.unlisten(fd, delmask);
+	if (rc < 0) {
+		lderr(cct) << __func__ << " delete fd=" << fd << " delmask=" << delmask
+							 << " failed." << cpp_strerror(-rc) << dendl;
+		return rc;  // negative code from the event manager
+	}
+	return 0;
+}
+
+int DPDKDriver::resize_events(int newsize)
+{
+	return 0;  // no-op: newsize is ignored and success is always reported
+}
+
+int DPDKDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+	// Compile-time bound: sizing the arrays with a plain int would make them VLAs.
+	constexpr int num_events = 512;
+	int events[num_events];
+	int masks[num_events];
+	int retval = manager.poll(events, masks, num_events, tvp);
+	if (retval > 0) {
+		fired_events.resize(retval);  // one entry per event reported by poll()
+		for (int i = 0; i < retval; i++) {
+			fired_events[i].fd = events[i];
+			fired_events[i].mask = masks[i];
+		}
+	}
+	return retval;  // poll()'s result is handed back unchanged
+}
diff --git a/src/msg/async/dpdk/EventDPDK.h b/src/msg/async/dpdk/EventDPDK.h
new file mode 100644
index 000000000..541c2210e
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_EVENTDPDK_H
+#define CEPH_EVENTDPDK_H
+
+#include "msg/async/Event.h"
+#include "msg/async/Stack.h"
+#include "UserspaceEvent.h"
+
+class DPDKDriver : public EventDriver {  // EventDriver that delegates to a UserspaceEventManager
+  CephContext *cct;  // context used by the logging macros in EventDPDK.cc
+
+ public:
+  UserspaceEventManager manager;  // listen()/unlisten()/poll() are called on it by the methods below
+
+  explicit DPDKDriver(CephContext *c): cct(c), manager(c) {}
+  virtual ~DPDKDriver() { }
+
+  int init(EventCenter *c, int nevent) override;  // always returns 0
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;  // always returns 0
+  int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) override;
+  bool need_wakeup() override { return false; }
+};
+
+#endif //CEPH_EVENTDPDK_H
diff --git a/src/msg/async/dpdk/IP.cc b/src/msg/async/dpdk/IP.cc
new file mode 100644
index 000000000..fab534bb2
--- /dev/null
+++ b/src/msg/async/dpdk/IP.cc
@@ -0,0 +1,481 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/perf_counters.h"
+
+#include "capture.h"
+#include "IP.h"
+#include "toeplitz.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a) {
+  const auto addr = a.ip;  // printed as a dotted quad, most significant octet first
+  os << ((addr >> 24) & 0xff) << "." << ((addr >> 16) & 0xff) << ".";
+  return os << ((addr >> 8) & 0xff) << "." << ((addr >> 0) & 0xff);
+}
+
+utime_t ipv4::_frag_timeout = utime_t(30, 0);  // age limit used by frag_timeout(); presumably 30 s -- confirm utime_t(sec, nsec)
+constexpr uint32_t ipv4::_frag_low_thresh;  // out-of-class definitions of the static constexpr members from IP.h
+constexpr uint32_t ipv4::_frag_high_thresh;
+
+class C_handle_frag_timeout : public EventCallback {  // event callback that runs ipv4::frag_timeout()
+  ipv4 *_ipv4;  // non-owning; the ipv4 object creates and deletes this callback
+
+ public:
+  explicit C_handle_frag_timeout(ipv4 *i): _ipv4(i) {}
+  void do_request(uint64_t fd_or_id) override {  // the event id is not needed
+    _ipv4->frag_timeout();
+  }
+};
+
+enum {
+  l_dpdk_qp_first = 99000,  // lower bound passed to PerfCountersBuilder in the ipv4 constructor
+  l_dpdk_total_linearize_operations,  // counter updated after a datagram has been reassembled
+  l_dpdk_qp_last  // upper bound passed to PerfCountersBuilder
+};
+
+struct icmp_hdr {  // packed ICMP header as read from / written to packet memory
+  enum class msg_type : uint8_t {
+    echo_reply = 0,
+    echo_request = 8,  // the only type icmp::received() reacts to
+  };
+  msg_type type;
+  uint8_t code;
+  uint16_t csum;  // zeroed, then recomputed over the whole packet when a reply is built
+  uint32_t rest;  // remaining header bytes; not interpreted in this file
+} __attribute__((packed));
+
+ipv4::ipv4(CephContext *c, EventCenter *cen, interface* netif)
+  : cct(c), center(cen), _netif(netif), _global_arp(netif),
+    _arp(c, _global_arp, cen),
+    _host_address(0), _gw_address(0), _netmask(0),  // addresses start at 0 until the setters are called
+    _l3(netif, eth_protocol_num::ipv4, [this] { return get_packet(); }),  // outgoing packets are pulled via get_packet()
+    _rx_packets(
+      _l3.receive(
+        [this] (Packet p, ethernet_address ea) {  // handler for received packets
+          return handle_received_packet(std::move(p), ea);
+        },
+        [this] (forward_hash& out_hash_data, Packet& p, size_t off) {  // fills the hash input used for forwarding
+          return forward(out_hash_data, p, off);
+        }
+      )
+    ),
+    _tcp(*this, cen), _icmp(c, *this),
+    _l4({{ uint8_t(ip_protocol_num::tcp), &_tcp },  // protocol number -> handler table
+         { uint8_t(ip_protocol_num::icmp), &_icmp }}),
+    _packet_filter(nullptr)
+{
+  PerfCountersBuilder plb(cct, "ipv4", l_dpdk_qp_first, l_dpdk_qp_last);
+  plb.add_u64_counter(l_dpdk_total_linearize_operations, "dpdk_ip_linearize_operations", "DPDK IP Packet linearization operations");
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);  // NOTE(review): no matching remove() is visible in ~ipv4() -- confirm
+  frag_handler = new C_handle_frag_timeout(this);  // deleted in ~ipv4()
+}
+
+bool ipv4::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  auto iph = p.get_header<ip_hdr>(off);
+  if (!iph) {
+    // No complete IP header at off: contribute nothing to the hash input
+    return true;
+  }
+  out_hash_data.push_back(iph->src_ip.ip);
+  out_hash_data.push_back(iph->dst_ip.ip);
+  auto h = iph->ntoh();
+  auto l4 = _l4[h.ip_proto];
+  if (l4 && !h.mf() && h.offset() == 0) {
+    // This IP datagram is atomic, forward according to tcp connection hash
+    l4->forward(out_hash_data, p, off + sizeof(ip_hdr));
+  }
+  // otherwise (fragment or unhandled protocol) forward according to ip fields only
+  return true;
+}
+
+int ipv4::handle_received_packet(Packet p, ethernet_address from)
+{
+  // Validates a received IPv4 packet and passes it to reassembly or to the
+  // registered L4 handler. Always returns 0; bad packets are just dropped.
+  auto iph = p.get_header<ip_hdr>(0);
+  if (!iph) {
+    return 0;
+  }
+  // Skip checking csum of reassembled IP datagram
+  if (!get_hw_features().rx_csum_offload && !p.offload_info_ref().reassembled) {
+    checksummer csum;
+    csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+    if (csum.get() != 0) {
+      return 0;
+    }
+  }
+
+  auto h = iph->ntoh();
+  unsigned ip_len = h.len;
+  unsigned ip_hdr_len = h.ihl * 4;
+  unsigned pkt_len = p.len();
+  auto offset = h.offset();
+  ldout(cct, 10) << __func__ << " get " << std::hex << int(h.ip_proto)
+                 << std::dec << " packet from " << h.src_ip << " -> " << h.dst_ip
+                 << " id=" << h.id << " ip_len=" << ip_len << " ip_hdr_len=" << ip_hdr_len
+                 << " pkt_len=" << pkt_len << " offset=" << offset << dendl;
+  // Drop if the header length is below the fixed header or beyond the datagram
+  if (ip_hdr_len < sizeof(ip_hdr) || ip_hdr_len > ip_len) {
+    return 0;
+  }
+  if (pkt_len > ip_len) {
+    // Trim extra data in the packet beyond IP total length
+    p.trim_back(pkt_len - ip_len);
+  } else if (pkt_len < ip_len) {
+    // Drop if it contains less than IP total length
+    return 0;
+  }
+  // Drop if the reassembled datagram will be larger than maximum IP size
+  if (offset + p.len() > ip_packet_len_max) {
+    return 0;
+  }
+
+  // FIXME: process options
+  if (in_my_netmask(h.src_ip) && h.src_ip != _host_address) {
+    ldout(cct, 20) << __func__ << " learn mac " << from << " with " << h.src_ip << dendl;
+    _arp.learn(from, h.src_ip);
+  }
+
+  if (_packet_filter) {
+    bool handled = false;
+    _packet_filter->handle(p, &h, from, handled);
+    if (handled) {
+      return 0;
+    }
+  }
+
+  if (h.dst_ip != _host_address) {
+    // FIXME: forward
+    return 0;
+  }
+
+  // Does this IP datagram need reassembly
+  auto mf = h.mf();
+  if (mf == true || offset != 0) {
+    frag_limit_mem();
+    auto frag_id = ipv4_frag_id{h.src_ip, h.dst_ip, h.id, h.ip_proto};
+    auto& frag = _frags[frag_id];
+    if (mf == false) {
+      frag.last_frag_received = true;
+    }
+    // This is a newly created frag_id
+    if (frag.mem_size == 0) {
+      _frags_age.push_back(frag_id);
+      frag.rx_time = ceph_clock_now();
+    }
+    auto added_size = frag.merge(h, offset, std::move(p));
+    _frag_mem += added_size;
+    if (frag.is_complete()) {
+      // All the fragments are received
+      auto dropped_size = frag.mem_size;
+      auto& ip_data = frag.data.map.begin()->second;
+      // Choose a cpu to forward this packet
+      auto cpu_id = center->get_id();
+      auto l4 = _l4[h.ip_proto];
+      if (l4) {
+        size_t l4_offset = 0;
+        forward_hash hash_data;
+        hash_data.push_back(hton(h.src_ip.ip));
+        hash_data.push_back(hton(h.dst_ip.ip));
+        l4->forward(hash_data, ip_data, l4_offset);
+        cpu_id = _netif->hash2cpu(toeplitz_hash(_netif->rss_key(), hash_data));
+      }
+      // No need to forward if the dst cpu is the current cpu. A datagram whose
+      // protocol has no handler (l4 == nullptr) is dropped, not dereferenced.
+      if (l4 && cpu_id == center->get_id()) {
+        l4->received(std::move(ip_data), h.src_ip, h.dst_ip);
+      } else if (l4) {
+        auto to = _netif->hw_address();
+        auto pkt = frag.get_assembled_packet(from, to);
+        _netif->forward(center, cpu_id, std::move(pkt));
+      }
+      // Delete this frag from _frags and _frags_age
+      frag_drop(frag_id, dropped_size);
+      _frags_age.remove(frag_id);
+      perf_logger->set(l_dpdk_total_linearize_operations, ipv4_packet_merger::linearizations());
+    } else {
+      // Some of the fragments are missing
+      if (frag_timefd) {  // NOTE(review): arms only once a timer id exists; looks inverted -- confirm
+        frag_arm();
+      }
+    }
+    return 0;
+  }
+
+  auto l4 = _l4[h.ip_proto];
+  if (l4) {
+    // Trim IP header and pass to upper layer
+    p.trim_front(ip_hdr_len);
+    l4->received(std::move(p), h.src_ip, h.dst_ip);
+  }
+  return 0;
+}
+
+void ipv4::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) {
+  // Decide whose L2 address is needed before the packet can go out: a
+  // destination inside our own netmask is reached directly, anything
+  // else is sent through the default gateway.
+  ipv4_address next_hop = _gw_address;
+  if (in_my_netmask(to)) {
+    next_hop = to;
+  }
+
+  // Packet and callback are moved into the ARP layer for resolution.
+  _arp.wait(std::move(next_hop), std::move(p), std::move(cb));
+}
+
+const hw_features& ipv4::get_hw_features() const
+{
+  return _netif->get_hw_features();  // the features are owned by the interface; this only forwards the reference
+}
+
+void ipv4::send(ipv4_address to, ip_protocol_num proto_num,
+        Packet p, ethernet_address e_dst) {
+  // Wraps p into one IPv4 packet, or into several fragments when it does not
+  // fit the MTU, and appends the result to _packetq.
+  auto needs_frag = this->needs_frag(p, proto_num, get_hw_features());
+  // One Identification per datagram: all of its fragments must carry the same value
+  static uint16_t next_id = 0;  // FIXME: a proper id
+  const uint16_t datagram_id = next_id++;
+  auto send_pkt = [this, to, proto_num, needs_frag, e_dst, datagram_id] (Packet& pkt, uint16_t remaining, uint16_t offset) mutable  {
+    auto iph = pkt.prepend_header<ip_hdr>();
+    iph->ihl = sizeof(*iph) / 4;
+    iph->ver = 4;
+    iph->dscp = 0;
+    iph->ecn = 0;
+    iph->len = pkt.len();
+    iph->id = datagram_id;
+    if (needs_frag) {
+      uint16_t mf = remaining > 0;
+      // The fragment offset is measured in units of 8 octets (64 bits)
+      iph->frag = (mf << uint8_t(ip_hdr::frag_bits::mf)) | (offset / 8);
+    } else {
+      iph->frag = 0;
+    }
+    iph->ttl = 64;
+    iph->ip_proto = (uint8_t)proto_num;
+    iph->csum = 0;
+    iph->src_ip = _host_address;
+    iph->dst_ip = to;
+    ldout(cct, 20) << " ipv4::send " << " id=" << iph->id << " " << _host_address << " -> " << to
+                   << " len " << pkt.len() << dendl;
+    *iph = iph->hton();
+    if (get_hw_features().tx_csum_ip_offload) {
+      iph->csum = 0;
+      pkt.offload_info_ref().needs_ip_csum = true;
+    } else {
+      checksummer csum;
+      csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+      iph->csum = csum.get();
+    }
+    _packetq.push_back(
+            l3_protocol::l3packet{eth_protocol_num::ipv4, e_dst, std::move(pkt)});
+  };
+
+  if (needs_frag) {
+    uint16_t offset = 0;
+    uint16_t remaining = p.len();
+    auto mtu = get_hw_features().mtu;
+    // Offsets count 8-octet units, so non-final fragments need a multiple of 8 bytes
+    const uint16_t max_payload = uint16_t((mtu - ipv4_hdr_len_min) & ~7);
+    while (remaining) {
+      auto can_send = std::min(max_payload, remaining);
+      remaining -= can_send;
+      auto pkt = p.share(offset, can_send);
+      send_pkt(pkt, remaining, offset);
+      offset += can_send;
+    }
+  } else {
+    // The whole packet can be send in one shot
+    send_pkt(p, 0, 0);
+  }
+}
+
+Tub<l3_protocol::l3packet> ipv4::get_packet() {
+  // _packetq is normally empty here; it only holds leftovers of an earlier
+  // fragmented datagram. When empty, poll the providers round-robin.
+  if (_packetq.empty()) {
+    for (size_t tried = 0; tried < _pkt_providers.size(); ++tried) {
+      const auto slot = _pkt_provider_idx++;
+      auto l4p = _pkt_providers[slot]();
+      _pkt_provider_idx %= _pkt_providers.size();  // wrap around after the last provider
+      if (!l4p) {
+        continue;
+      }
+      ldout(cct, 20) << " ipv4::get_packet len " << l4p->p.len() << dendl;
+      send(l4p->to, l4p->proto_num, std::move(l4p->p), l4p->e_dst);
+      break;
+    }
+  }
+
+  Tub<l3_protocol::l3packet> next;  // stays empty when there is nothing to send
+  if (!_packetq.empty()) {
+    next = std::move(_packetq.front());
+    _packetq.pop_front();
+  }
+  return next;
+}
+
+void ipv4::frag_limit_mem() {
+  // Nothing to do until the reassembly buffers exceed the high threshold.
+  if (_frag_mem <= _frag_high_thresh) {
+    return;
+  }
+  // Evict oldest-first until the excess over the low threshold has been
+  // released or there is nothing left to evict.
+  for (auto to_free = _frag_mem - _frag_low_thresh; to_free != 0; ) {
+    if (_frags_age.empty()) {
+      return;
+    }
+    // _frags_age is appended to on arrival, so its head is the oldest entry.
+    auto victim = _frags_age.front();
+    _frags_age.pop_front();
+
+    // Release the matching reassembly state and its accounted memory.
+    auto freed = _frags[victim].mem_size;
+    frag_drop(victim, freed);
+    to_free -= std::min(to_free, freed);
+  }
+}
+
+void ipv4::frag_timeout() {
+  // Expires reassembly state that is older than _frag_timeout.
+  if (_frags.empty()) {
+    return;
+  }
+  const auto now = ceph_clock_now();
+  // _frags_age is kept oldest-first, so expired entries form a prefix.
+  while (!_frags_age.empty()) {
+    auto oldest = _frags_age.front();
+    auto& entry = _frags[oldest];
+    if (!(now > entry.rx_time + _frag_timeout)) {
+      // The further items can only be younger
+      break;
+    }
+    // Forget it in both _frags and _frags_age.
+    frag_drop(oldest, entry.mem_size);
+    _frags_age.pop_front();
+  }
+  // Re-arm while fragments are pending; otherwise reset the accounting.
+  if (_frags.empty()) {
+    _frag_mem = 0;
+  } else {
+    frag_arm(now);
+  }
+}
+
+int32_t ipv4::frag::merge(ip_hdr &h, uint16_t offset, Packet p) {
+  const uint32_t size_before = mem_size;
+  const unsigned hdr_bytes = h.ihl * 4;
+  // The fragment at offset 0 supplies the IP header that is kept
+  if (offset == 0) {
+    header = p.share(0, hdr_bytes);
+  }
+  // Store IP payload
+  p.trim_front(hdr_bytes);
+  data.merge(offset, std::move(p));
+  // Recompute the memory held by the header and every buffered piece
+  uint32_t total = header.memory();
+  for (const auto& piece : data.map) {
+    total += piece.second.memory();
+  }
+  mem_size = total;
+  return mem_size - size_before;  // growth caused by this fragment
+}
+
+bool ipv4::frag::is_complete() {
+  // Once every fragment has arrived, ipv4::frag::merge() has coalesced them
+  // into one packet, so a single map entry starting at offset 0 remains.
+  const auto first_offset = data.map.begin()->first;
+  const bool single_piece = data.map.size() == 1;
+  return last_frag_received && single_piece && first_offset == 0;
+}
+
+Packet ipv4::frag::get_assembled_packet(ethernet_address from, ethernet_address to) {
+  auto& ip_header = header;  // stored by merge() from the fragment at offset 0
+  auto& ip_data = data.map.begin()->second;  // first (and, once complete, only) merged payload
+  // Prepend an ethernet header, needed for forwarding
+  auto eh = ip_header.prepend_header<eth_hdr>();
+  eh->src_mac = from;
+  eh->dst_mac = to;
+  eh->eth_proto = uint16_t(eth_protocol_num::ipv4);
+  *eh = eh->hton();
+  // Build one packet holding the ethernet header, the ip header and the ip data
+  ip_header.append(std::move(ip_data));
+  auto pkt = std::move(ip_header);
+  auto iph = pkt.get_header<ip_hdr>(sizeof(eth_hdr));
+  // len covers the ip header plus the payload of all fragments
+  iph->len = hton(uint16_t(pkt.len() - sizeof(eth_hdr)));
+  // No fragmentation for the assembled datagram
+  iph->frag = 0;
+  // Since each fragment's csum is checked, no need to csum
+  // again for the assembled datagram
+  offload_info oi;
+  oi.reassembled = true;  // handle_received_packet() skips the checksum when this is set
+  pkt.set_offload_info(oi);
+  return pkt;
+}
+
+void icmp::received(Packet p, ipaddr from, ipaddr to) {
+  auto hdr = p.get_header<icmp_hdr>(0);
+  if (!hdr || hdr->type != icmp_hdr::msg_type::echo_request) {
+    return;  // only echo requests are answered
+  }
+  hdr->type = icmp_hdr::msg_type::echo_reply;  // the request is rewritten into the reply in place
+  hdr->code = 0;
+  hdr->csum = 0;
+  checksummer csum;
+  csum.sum(reinterpret_cast<char*>(hdr), p.len());
+  hdr->csum = csum.get();
+  if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue
+    auto cb = [this, from] (const ethernet_address e_dst, Packet p, int r) mutable {
+        if (r == 0)
+          _packetq.emplace_back(ipv4_traits::l4packet{from, std::move(p), e_dst, ip_protocol_num::icmp});
+        else  // resolution failed: the reply is dropped, so hand its queue space back
+          _queue_space.put(p.len());
+    };
+    _inet.wait_l2_dst_address(from, std::move(p), cb);
+  }
+}
diff --git a/src/msg/async/dpdk/IP.h b/src/msg/async/dpdk/IP.h
new file mode 100644
index 000000000..1fc606582
--- /dev/null
+++ b/src/msg/async/dpdk/IP.h
@@ -0,0 +1,403 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#ifndef CEPH_MSG_IP_H_
+#define CEPH_MSG_IP_H_
+
+#include <arpa/inet.h>
+#include <unordered_map>
+#include <cstdint>
+#include <array>
+#include <map>
+#include <list>
+#include <chrono>
+
+#include "msg/async/Event.h"
+#include "common/Throttle.h"
+
+#include "array_map.h"
+#include "ARP.h"
+#include "IPChecksum.h"
+#include "ip_types.h"
+#include "const.h"
+#include "net.h"
+#include "PacketUtil.h"
+#include "toeplitz.h"
+
+class ipv4;
+template <ip_protocol_num ProtoNum>
+class ipv4_l4;
+
+template <typename InetTraits>
+class tcp;
+
+struct ipv4_traits {  // IPv4 flavour of the traits consumed by tcp<InetTraits> and l4connid<InetTraits>
+  using address_type = ipv4_address;
+  using inet_type = ipv4_l4<ip_protocol_num::tcp>;
+  struct l4packet {  // one L4 payload together with the addressing needed to send it
+    ipv4_address to;
+    Packet p;
+    ethernet_address e_dst;
+    ip_protocol_num proto_num;
+  };
+  using packet_provider_type = std::function<Tub<l4packet> ()>;  // an empty Tub means nothing to send
+  static void tcp_pseudo_header_checksum(checksummer& csum, ipv4_address src, ipv4_address dst, uint16_t len) {
+    csum.sum_many(src.ip, dst.ip, uint8_t(0), uint8_t(ip_protocol_num::tcp), len);  // src, dst, zero, protocol, length
+  }
+  static constexpr uint8_t ip_hdr_len_min = ipv4_hdr_len_min;
+};
+
+template <ip_protocol_num ProtoNum>
+class ipv4_l4 {  // thin per-protocol view of an ipv4 instance
+ public:
+  ipv4& _inet;  // non-owning reference to the ipv4 instance
+ public:
+  ipv4_l4(ipv4& inet) : _inet(inet) {}
+  void register_packet_provider(ipv4_traits::packet_provider_type func);  // stamps ProtoNum on produced packets
+  void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb);  // forwards to ipv4::wait_l2_dst_address
+};
+
+class ip_protocol {  // interface of the L4 handlers stored in ipv4::_l4
+ public:
+  virtual ~ip_protocol() {}
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) = 0;  // called with the IP header already removed
+  virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return true; }  // default adds no hash data
+};
+
+template <typename InetTraits>
+struct l4connid {  // connection key: local/foreign address and port
+  using ipaddr = typename InetTraits::address_type;
+  using inet_type = typename InetTraits::inet_type;
+  struct connid_hash;  // defined near the end of this header
+
+  ipaddr local_ip;
+  ipaddr foreign_ip;
+  uint16_t local_port;
+  uint16_t foreign_port;
+
+  bool operator==(const l4connid& x) const {
+    return local_ip == x.local_ip
+           && foreign_ip == x.foreign_ip
+           && local_port == x.local_port
+           && foreign_port == x.foreign_port;
+  }
+
+  uint32_t hash(const rss_key_type& rss_key) {  // Toeplitz hash over the four fields below
+    forward_hash hash_data;
+    hash_data.push_back(hton(foreign_ip.ip));  // foreign side first, then local
+    hash_data.push_back(hton(local_ip.ip));
+    hash_data.push_back(hton(foreign_port));
+    hash_data.push_back(hton(local_port));
+    return toeplitz_hash(rss_key, hash_data);
+  }
+};
+
+class ipv4_tcp final : public ip_protocol {  // ip_protocol handler registered for TCP in ipv4::_l4
+  ipv4_l4<ip_protocol_num::tcp> _inet_l4;
+  std::unique_ptr<tcp<ipv4_traits>> _tcp;  // exposed through ipv4::get_tcp()
+ public:
+  ipv4_tcp(ipv4& inet, EventCenter *c);
+  ~ipv4_tcp();
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) override;
+  virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) override;
+  friend class ipv4;  // ipv4::get_tcp() reads _tcp
+};
+
+
+class icmp {  // answers ICMP echo requests; replies wait in _packetq until the IP layer pulls them
+ public:
+  using ipaddr = ipv4_address;
+  using inet_type = ipv4_l4<ip_protocol_num::icmp>;
+  explicit icmp(CephContext *c, inet_type& inet)
+      : cct(c), _inet(inet), _queue_space(c, "DPDK::icmp::_queue_space", 212992) {
+    _inet.register_packet_provider([this] {  // provider: hands out one queued reply per call
+      Tub<ipv4_traits::l4packet> l4p;
+      if (!_packetq.empty()) {
+        l4p = std::move(_packetq.front());
+        _packetq.pop_front();
+        _queue_space.put(l4p->p.len());  // give back what received() reserved for this reply
+      }
+      return l4p;  // empty when nothing is queued
+    });
+  }
+  void received(Packet p, ipaddr from, ipaddr to);
+
+ private:
+  CephContext *cct;
+  // ipv4_l4<ip_protocol_num::icmp>
+  inet_type& _inet;
+  circular_buffer<ipv4_traits::l4packet> _packetq;  // replies waiting to be sent
+  Throttle _queue_space;  // budget for queued replies, accounted in units of p.len()
+};
+
+class ipv4_icmp final : public ip_protocol {  // ip_protocol handler registered for ICMP in ipv4::_l4
+  CephContext *cct;
+  ipv4_l4<ip_protocol_num::icmp> _inet_l4;  // declared before _icmp, which keeps a reference to it
+  icmp _icmp;
+ public:
+  ipv4_icmp(CephContext *c, ipv4& inet) : cct(c), _inet_l4(inet), _icmp(c, _inet_l4) {}
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) override {
+    _icmp.received(std::move(p), from, to);  // plain delegation
+  }
+  friend class ipv4;
+};
+
+struct ip_hdr;
+
+struct ip_packet_filter {  // hook consulted by ipv4::handle_received_packet() before normal processing
+  virtual ~ip_packet_filter() {};
+  virtual void handle(Packet& p, ip_hdr* iph, ethernet_address from, bool & handled) = 0;  // set handled to consume the packet
+};
+
+struct ipv4_frag_id {  // key of the reassembly table ipv4::_frags
+  struct hash;  // defined right below this struct
+  ipv4_address src_ip;
+  ipv4_address dst_ip;
+  uint16_t identification;  // filled from the id field of the IP header
+  uint8_t protocol;
+  bool operator==(const ipv4_frag_id& x) const {
+    return src_ip == x.src_ip &&
+           dst_ip == x.dst_ip &&
+           identification == x.identification &&
+           protocol == x.protocol;
+  }
+};
+
+struct ipv4_frag_id::hash : private std::hash<ipv4_address>,
+                            private std::hash<uint16_t>, private std::hash<uint8_t> {
+  size_t operator()(const ipv4_frag_id& id) const noexcept {
+    using h1 = std::hash<ipv4_address>;
+    using h2 = std::hash<uint16_t>;
+    using h3 = std::hash<uint8_t>;
+    return h1::operator()(id.src_ip) ^  // XOR of the per-field hashes
+           h1::operator()(id.dst_ip) ^
+           h2::operator()(id.identification) ^
+           h3::operator()(id.protocol);
+  }
+};
+
+struct ipv4_tag {};  // empty tag type used only as a template argument below
+using ipv4_packet_merger = packet_merger<uint32_t, ipv4_tag>;  // merger type held by ipv4::frag::data
+
+class interface;
+
+class ipv4 {  // IPv4 layer: receive path, fragment reassembly and the send queue
+ public:
+  using address_type = ipv4_address;
+  using proto_type = uint16_t;
+  static address_type broadcast_address() { return ipv4_address(0xffffffff); }
+  static proto_type arp_protocol_type() { return proto_type(eth_protocol_num::ipv4); }
+  CephContext *cct;
+  EventCenter *center;
+
+ private:
+  interface* _netif;
+  std::vector<ipv4_traits::packet_provider_type> _pkt_providers;  // polled round-robin by get_packet()
+  Tub<uint64_t> frag_timefd;  // id returned by create_time_event(); empty until frag_arm() has run
+  EventCallbackRef frag_handler;  // allocated in the constructor, deleted in the destructor
+  arp _global_arp;
+  arp_for<ipv4> _arp;
+  ipv4_address _host_address;
+  ipv4_address _gw_address;
+  ipv4_address _netmask;
+  l3_protocol _l3;
+  subscription<Packet, ethernet_address> _rx_packets;
+  ipv4_tcp _tcp;
+  ipv4_icmp _icmp;
+  array_map<ip_protocol*, 256> _l4;  // handler per IP protocol number; null when none is registered
+  ip_packet_filter *_packet_filter;  // optional, non-owning
+  struct frag {  // reassembly state of one fragmented datagram
+    Packet header;
+    ipv4_packet_merger data;
+    utime_t rx_time;  // set when the first fragment of the datagram is seen
+    uint32_t mem_size = 0;
+    // fragment with MF == 0 indicates it is the last fragment
+    bool last_frag_received = false;
+
+    Packet get_assembled_packet(ethernet_address from, ethernet_address to);
+    int32_t merge(ip_hdr &h, uint16_t offset, Packet p);  // returns the change in mem_size
+    bool is_complete();
+  };
+  std::unordered_map<ipv4_frag_id, frag, ipv4_frag_id::hash> _frags;
+  std::list<ipv4_frag_id> _frags_age;  // same keys as _frags, oldest first
+  static utime_t _frag_timeout;
+  static constexpr uint32_t _frag_low_thresh{3 * 1024 * 1024};  // frag_limit_mem() evicts down towards this
+  static constexpr uint32_t _frag_high_thresh{4 * 1024 * 1024};  // frag_limit_mem() starts evicting above this
+  uint32_t _frag_mem = 0;  // sum of mem_size over _frags
+  circular_buffer<l3_protocol::l3packet> _packetq;  // finished IP packets waiting for get_packet()
+  unsigned _pkt_provider_idx = 0;  // next provider to poll
+  PerfCounters *perf_logger;
+
+ private:
+  int handle_received_packet(Packet p, ethernet_address from);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  Tub<l3_protocol::l3packet> get_packet();
+  bool in_my_netmask(ipv4_address a) const {
+    return !((a.ip ^ _host_address.ip) & _netmask.ip);  // true when a and the host agree on all masked bits
+  }
+  void frag_limit_mem();
+  void frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size) {
+    _frags.erase(frag_id);  // callers remove the matching _frags_age entry themselves
+    _frag_mem -= dropped_size;
+  }
+  void frag_arm(utime_t now) {
+    auto tp = now + _frag_timeout;
+    frag_timefd.construct(center->create_time_event(tp.to_nsec() / 1000, frag_handler));
+  }
+  void frag_arm() {  // NOTE(review): passes the current time, without _frag_timeout added -- confirm intent
+    auto now = ceph_clock_now();
+    frag_timefd.construct(center->create_time_event(now.to_nsec() / 1000, frag_handler));
+  }
+
+ public:
+  void frag_timeout();  // public so that the timer callback in IP.cc can call it
+
+ public:
+  explicit ipv4(CephContext *c, EventCenter *cen, interface* netif);
+  ~ipv4() {  // NOTE(review): perf_logger, registered in the constructor, is not removed or freed here -- confirm
+    delete frag_handler;
+  }
+  void set_host_address(ipv4_address ip) {
+    _host_address = ip;
+    _arp.set_self_addr(ip);  // keep the ARP layer's own address in sync
+  }
+  ipv4_address host_address() {
+    return _host_address;
+  }
+  void set_gw_address(ipv4_address ip) {
+    _gw_address = ip;
+  }
+  ipv4_address gw_address() const {
+    return _gw_address;
+  }
+  void set_netmask_address(ipv4_address ip) {
+    _netmask = ip;
+  }
+  ipv4_address netmask_address() const {
+    return _netmask;
+  }
+  interface *netif() const {
+    return _netif;
+  }
+  // TODO or something. Should perhaps truly be a list
+  // of filters. With ordering. And blackjack. Etc.
+  // But for now, a simple single raw pointer suffices
+  void set_packet_filter(ip_packet_filter *f) {
+    _packet_filter = f;
+  }
+  ip_packet_filter * packet_filter() const {
+    return _packet_filter;
+  }
+  void send(ipv4_address to, ip_protocol_num proto_num, Packet p, ethernet_address e_dst);
+  tcp<ipv4_traits>& get_tcp() { return *_tcp._tcp; }
+  void register_l4(proto_type id, ip_protocol* handler);
+  const hw_features& get_hw_features() const;
+  static bool needs_frag(Packet& p, ip_protocol_num proto_num, hw_features hw_features) {
+    if (p.len() + ipv4_hdr_len_min <= hw_features.mtu)  // fits into a single packet
+      return false;
+
+    if ((proto_num == ip_protocol_num::tcp && hw_features.tx_tso))  // oversized TCP is left to TSO
+      return false;
+
+    return true;
+  }
+  void learn(ethernet_address l2, ipv4_address l3) {
+    _arp.learn(l2, l3);
+  }
+  void register_packet_provider(ipv4_traits::packet_provider_type&& func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb);
+};
+
+template <ip_protocol_num ProtoNum>
+inline void ipv4_l4<ProtoNum>::register_packet_provider(
+    ipv4_traits::packet_provider_type func) {
+  auto stamping_provider = [func] {  // wraps func: every produced packet is tagged with ProtoNum
+    auto produced = func();
+    if (produced)
+      produced->proto_num = ProtoNum;
+    return produced;
+  };
+  _inet.register_packet_provider(std::move(stamping_provider));
+}
+
+template <ip_protocol_num ProtoNum>
+inline void ipv4_l4<ProtoNum>::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) {
+  _inet.wait_l2_dst_address(to, std::move(p), std::move(cb));  // plain forwarding to the ipv4 instance
+}
+
+struct ip_hdr {  // packed IPv4 header overlaid on packet memory
+  uint8_t ihl : 4;  // header length in 32-bit words (callers multiply by 4)
+  uint8_t ver : 4;
+  uint8_t dscp : 6;
+  uint8_t ecn : 2;
+  uint16_t len;
+  uint16_t id;
+  uint16_t frag;  // flag bits plus fragment offset, see frag_bits
+  enum class frag_bits : uint8_t { mf = 13, df = 14, reserved = 15, offset_shift = 3 };  // bit positions within frag
+  uint8_t ttl;
+  uint8_t ip_proto;
+  uint16_t csum;
+  ipv4_address src_ip;
+  ipv4_address dst_ip;
+  uint8_t options[0];  // zero-length array marking where options would start
+  ip_hdr hton() {  // returns a copy with the multi-byte fields in network byte order
+    ip_hdr hdr = *this;
+    hdr.len = ::hton(len);
+    hdr.id = ::hton(id);
+    hdr.frag = ::hton(frag);
+    hdr.csum = ::hton(csum);
+    hdr.src_ip.ip = ::hton(src_ip.ip);
+    hdr.dst_ip.ip = ::hton(dst_ip.ip);
+    return hdr;
+  }
+  ip_hdr ntoh() {  // returns a copy with the multi-byte fields in host byte order
+    ip_hdr hdr = *this;
+    hdr.len = ::ntoh(len);
+    hdr.id = ::ntoh(id);
+    hdr.frag = ::ntoh(frag);
+    hdr.csum = ::ntoh(csum);
+    hdr.src_ip = src_ip.ntoh();
+    hdr.dst_ip = dst_ip.ntoh();
+    return hdr;
+  }
+
+  bool mf() { return frag & (1 << uint8_t(frag_bits::mf)); }  // expects frag in host byte order
+  bool df() { return frag & (1 << uint8_t(frag_bits::df)); }
+  uint16_t offset() { return frag << uint8_t(frag_bits::offset_shift); }  // offset in bytes; the uint16_t conversion drops the flag bits
+} __attribute__((packed));
+
+template <typename InetTraits>
+struct l4connid<InetTraits>::connid_hash : private std::hash<ipaddr>, private std::hash<uint16_t> {
+  size_t operator()(const l4connid<InetTraits>& id) const noexcept {
+    using h1 = std::hash<ipaddr>;
+    using h2 = std::hash<uint16_t>;
+    return h1::operator()(id.local_ip)  // XOR of the per-field hashes
+           ^ h1::operator()(id.foreign_ip)
+           ^ h2::operator()(id.local_port)
+           ^ h2::operator()(id.foreign_port);
+  }
+};
+
+#endif /* CEPH_MSG_IP_H_ */
diff --git a/src/msg/async/dpdk/IPChecksum.cc b/src/msg/async/dpdk/IPChecksum.cc
new file mode 100644
index 000000000..7a3253c1e
--- /dev/null
+++ b/src/msg/async/dpdk/IPChecksum.cc
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include <arpa/inet.h>
+#include "net.h"
+#include "IPChecksum.h"
+
+// Adds `len` bytes starting at `data` to the running sum.
+//
+// The bytes are consumed as big-endian words: 64 bits at a time while at
+// least eight bytes remain, then 16 bits at a time, then a final single byte
+// placed in the high half of a 16-bit word.  When a previous call left the
+// stream at an odd byte position (`odd` set), the first byte of this call is
+// added unshifted so that it completes the pending 16-bit word.
+//
+// An empty range is a no-op.  Without the early return, a call with
+// `odd == true` and `len == 0` would read `*data` and decrement `len` below
+// zero, making the loops below walk far past the buffer.
+void checksummer::sum(const char* data, size_t len) {
+  if (len == 0) {
+    return;
+  }
+  auto orig_len = len;
+  if (odd) {
+    // Low byte of the 16-bit word started by the previous call.
+    csum += uint8_t(*data++);
+    --len;
+  }
+  auto p64 = reinterpret_cast<const uint64_t*>(data);
+  while (len >= 8) {
+    csum += ntohq(*p64++);
+    len -= 8;
+  }
+  auto p16 = reinterpret_cast<const uint16_t*>(p64);
+  while (len >= 2) {
+    csum += ntohs(*p16++);
+    len -= 2;
+  }
+  if (len) {
+    // One byte left over: it is the high byte of a new 16-bit word.
+    auto p8 = reinterpret_cast<const uint8_t*>(p16);
+    csum += *p8 << 8;
+  }
+  // Parity of the stream position flips only when an odd number of bytes
+  // was consumed by this call.
+  odd ^= orig_len & 1;
+}
+
+// Folds the 128-bit running sum down to 16 bits, complements it and returns
+// the result after htons().
+uint16_t checksummer::get() const {
+  // 128 -> 64 bits: add the upper half onto the lower half twice so the
+  // carry of the first addition is absorbed as well.
+  __int128 wide = (csum & 0xffffffffffffffff) + (csum >> 64);
+  uint64_t acc = (wide & 0xffffffffffffffff) + (wide >> 64);
+  // 64 -> 16 bits: sum the four 16-bit lanes ...
+  acc = (acc & 0xffff) + ((acc >> 16) & 0xffff) + ((acc >> 32) & 0xffff) + (acc >> 48);
+  // ... and then fold the carries back in, twice.
+  for (int pass = 0; pass < 2; ++pass) {
+    acc = (acc & 0xffff) + (acc >> 16);
+  }
+  return htons(~acc);
+}
+
+// Adds every fragment of `p`, in order, to the running sum.
+void checksummer::sum(const Packet& p) {
+  auto frags = p.fragments();
+  for (fragment* cur = frags.begin(); cur != frags.end(); ++cur) {
+    sum(cur->base, cur->size);
+  }
+}
+
+// One-shot helper: runs a fresh checksummer over `len` bytes at `data` and
+// returns its final value.
+uint16_t ip_checksum(const void* data, size_t len) {
+  checksummer state;
+  state.sum(static_cast<const char*>(data), len);
+  return state.get();
+}
diff --git a/src/msg/async/dpdk/IPChecksum.h b/src/msg/async/dpdk/IPChecksum.h
new file mode 100644
index 000000000..9af4a86b9
--- /dev/null
+++ b/src/msg/async/dpdk/IPChecksum.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_CHECKSUM_H_
+#define CEPH_MSG_CHECKSUM_H_
+
+#include <cstdint>
+#include <cstddef>
+#include <arpa/inet.h>
+
+#include "Packet.h"
+
+uint16_t ip_checksum(const void* data, size_t len);
+
+// Incremental checksum accumulator.  `csum` holds the running total and
+// `odd` flips each time a single byte is added, so input can be supplied in
+// pieces of any size.
+struct checksummer {
+  __int128 csum = 0;
+  bool odd = false;
+  // Add a raw byte range (defined out of line).
+  void sum(const char* data, size_t len);
+  // Add a Packet (defined out of line).
+  void sum(const Packet& p);
+  // Add one byte: shifted into bits 8..15 when `odd` is clear, added
+  // unshifted when `odd` is set; then flip `odd`.
+  void sum(uint8_t data) {
+    if (!odd) {
+      csum += data << 8;
+    } else {
+      csum += data;
+    }
+    odd = !odd;
+  }
+  // Add a 16-bit value.  When `odd` is set it is fed byte by byte (high
+  // byte first) through the uint8_t overload; otherwise it is added as is.
+  void sum(uint16_t data) {
+    if (odd) {
+      sum(uint8_t(data >> 8));
+      sum(uint8_t(data));
+    } else {
+      csum += data;
+    }
+  }
+  // Add a 32-bit value.  When `odd` is set it is fed as two 16-bit halves
+  // (low half, then high half) through the uint16_t overload; otherwise it
+  // is added as is.
+  void sum(uint32_t data) {
+    if (odd) {
+      sum(uint16_t(data));
+      sum(uint16_t(data >> 16));
+    } else {
+      csum += data;
+    }
+  }
+  // Recursion terminator for the variadic overload below.
+  void sum_many() {}
+  // Add each argument in turn using the sum() overload selected by its type.
+  template <typename T0, typename... T>
+  void sum_many(T0 data, T... rest) {
+    sum(data);
+    sum_many(rest...);
+  }
+  // Final checksum value (defined out of line).
+  uint16_t get() const;
+};
+
+#endif /* CEPH_MSG_CHECKSUM_H_ */
diff --git a/src/msg/async/dpdk/Packet.cc b/src/msg/async/dpdk/Packet.cc
new file mode 100644
index 000000000..6c2320a01
--- /dev/null
+++ b/src/msg/async/dpdk/Packet.cc
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <algorithm>
+#include <cctype>
+
+#include "capture.h"
+#include "Packet.h"
+
+constexpr size_t Packet::internal_data_size;
+constexpr size_t Packet::default_nr_frags;
+
+// Replaces the run of fragments that starts at index `at_frag` and together
+// holds at least `desired_size` bytes with a single, newly allocated
+// contiguous fragment containing the same bytes.
+//
+// The counting loop below does not compare against _nr_frags, so the caller
+// must make sure that at least `desired_size` bytes exist from `at_frag`
+// onward.
+void Packet::linearize(size_t at_frag, size_t desired_size) {
+  // Make sure frags[0] no longer points into the impl's inline buffer.
+  _impl->unuse_internal_data();
+  size_t nr_frags = 0;
+  size_t accum_size = 0;
+  // Count how many fragments are needed and how many bytes they hold.
+  while (accum_size < desired_size) {
+    accum_size += _impl->frags[at_frag + nr_frags].size;
+    ++nr_frags;
+  }
+  char *new_frag = new char[accum_size];
+  auto p = new_frag;
+  for (size_t i = 0; i < nr_frags; ++i) {
+    auto& f = _impl->frags[at_frag + i];
+    p = std::copy(f.base, f.base + f.size, p);
+  }
+  // collapse nr_frags into one fragment
+  std::copy(_impl->frags + at_frag + nr_frags, _impl->frags + _impl->_nr_frags,
+            _impl->frags + at_frag + 1);
+  _impl->_nr_frags -= nr_frags - 1;
+  _impl->frags[at_frag] = fragment{new_frag, accum_size};
+  if (at_frag == 0 && desired_size == len()) {
+    // We can drop the old buffer safely
+    // (the whole packet was copied; `x` takes the old deleter and is
+    // destroyed at the end of this scope)
+    auto x = std::move(_impl->_deleter);
+    _impl->_deleter = make_deleter([new_frag] { delete []new_frag; });
+  } else {
+    // Other fragments still refer to the old storage: move the old deleter
+    // into the bound functor so it stays alive for as long as the new
+    // deleter does.
+    auto del = std::bind(
+            [new_frag](deleter &d) { delete []new_frag; }, std::move(_impl->_deleter));
+    _impl->_deleter = make_deleter(std::move(del));
+  }
+}
+
+// Self-deleting event callback that carries a deleter and a completion
+// function to whichever EventCenter runs it.
+class C_free_on_cpu : public EventCallback {
+  deleter del;
+  std::function<void()> cb;
+ public:
+  C_free_on_cpu(deleter &&d, std::function<void()> &&c)
+      : del(std::move(d)),
+        cb(std::move(c)) {
+  }
+  void do_request(uint64_t fd) {
+    // Pull the deleter out of the member into a local so that it is
+    // destroyed by this call, on the cpu that runs the callback, rather
+    // than on the cpu that created the external event when the work item
+    // itself is destroyed.
+    deleter released(std::move(del));
+    cb();
+    delete this;
+  }
+};
+
+// Returns a Packet whose deleter, when it finally runs, does not release the
+// data directly but posts a C_free_on_cpu event to `center`; that event
+// destroys the original deleter and then invokes `cb`.
+//
+// The returned Packet is built with impl::copy(), which moves the deleter
+// out of this packet's impl into the new one.
+Packet Packet::free_on_cpu(EventCenter *center, std::function<void()> cb)
+{
+  // The current deleter is moved into the bound functor; invoking the
+  // functor hands it (and cb) to a heap-allocated C_free_on_cpu.
+  auto del = std::bind(
+      [center, cb] (deleter &del) mutable {
+        center->dispatch_event_external(new C_free_on_cpu(std::move(del), std::move(cb)));
+      }, std::move(_impl->_deleter));
+  // make new deleter that runs old deleter on an origin cpu
+  _impl->_deleter = make_deleter(deleter(), std::move(del));
+
+  return Packet(impl::copy(_impl.get()));
+}
+
+// Debug printer for a Packet.
+//
+// Output has the form `Packet{<frag>, <frag>, ...}`.  A fragment whose bytes
+// all lie in [9, 0x7f] is printed as a double-quoted string in which \r, \n
+// and \t are spelled out and any other non-printable byte is written as
+// \xHH.  Any other fragment is printed as `{hh hh ...}`, two lowercase hex
+// digits per byte.
+//
+// Both hex forms go through a digit table: streaming `b / 16` and `b % 16`
+// as integers printed nibble values 10..15 as two decimal digits, and
+// streaming a uint8_t emitted the raw byte instead of a number.
+std::ostream& operator<<(std::ostream& os, const Packet& p) {
+  static const char hex_digits[] = "0123456789abcdef";
+  os << "Packet{";
+  bool first = true;
+  for (auto&& frag : p.fragments()) {
+    if (!first) {
+      os << ", ";
+    }
+    first = false;
+    const char* const frag_end = frag.base + frag.size;
+    if (std::all_of(frag.base, frag_end, [] (int c) { return c >= 9 && c <= 0x7f; })) {
+      // Text-like fragment.
+      os << '"';
+      for (const char* it = frag.base; it != frag_end; ++it) {
+        const char c = *it;
+        // c is within [9, 0x7f] here, so the cast only documents that
+        // isprint() expects an unsigned char value.
+        if (isprint(static_cast<unsigned char>(c))) {
+          os << c;
+        } else if (c == '\r') {
+          os << "\\r";
+        } else if (c == '\n') {
+          os << "\\n";
+        } else if (c == '\t') {
+          os << "\\t";
+        } else {
+          const uint8_t b = c;
+          os << "\\x" << hex_digits[b >> 4] << hex_digits[b & 0xf];
+        }
+      }
+      os << '"';
+    } else {
+      // Binary fragment: space-separated hex dump.
+      os << "{";
+      bool nfirst = true;
+      for (const char* it = frag.base; it != frag_end; ++it) {
+        if (!nfirst) {
+          os << " ";
+        }
+        nfirst = false;
+        const uint8_t b = *it;
+        os << hex_digits[b >> 4] << hex_digits[b & 0xf];
+      }
+      os << "}";
+    }
+  }
+  os << "}";
+  return os;
+}
diff --git a/src/msg/async/dpdk/Packet.h b/src/msg/async/dpdk/Packet.h
new file mode 100644
index 000000000..f929da317
--- /dev/null
+++ b/src/msg/async/dpdk/Packet.h
@@ -0,0 +1,550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_H_
+#define CEPH_MSG_PACKET_H_
+
+#include <vector>
+#include <algorithm>
+#include <iosfwd>
+
+#include "include/types.h"
+#include "common/Tub.h"
+#include "common/deleter.h"
+#include "msg/async/Event.h"
+
+#include "const.h"
+
+// A contiguous byte range: start pointer plus length in bytes.  The struct
+// itself carries no ownership information.
+struct fragment {
+    char* base;
+    size_t size;
+};
+
+// Per-packet offload metadata carried alongside the fragments.
+// NOTE(review): the flags and header lengths are presumably consumed by the
+// NIC driver / L3-L4 code when setting up checksum and TSO offload -- the
+// users are not visible here.
+struct offload_info {
+  ip_protocol_num protocol = ip_protocol_num::unused;
+  bool needs_csum = false;
+  // Default header lengths in bytes.
+  uint8_t ip_hdr_len = 20;
+  uint8_t tcp_hdr_len = 20;
+  uint8_t udp_hdr_len = 8;
+  bool needs_ip_csum = false;
+  bool reassembled = false;
+  uint16_t tso_seg_size = 0;
+  // HW stripped VLAN header (CPU order)
+  Tub<uint16_t> vlan_tci;
+};
+
+// Zero-copy friendly packet class
+//
+// For implementing zero-copy, we need a flexible destructor that can
+// destroy packet data in different ways: decrementing a reference count,
+// or calling a free()-like function.
+//
+// Moreover, we need different destructors for each set of fragments within
+// a single packet. For example, a header and trailer might need delete[]
+// to be called, while the internal data needs a reference count to be
+// released.  Matters are complicated in that fragments can be split
+// (due to virtual/physical translation).
+//
+// To implement this, we associate each packet with a single destructor,
+// but allow composing a packet from another packet plus a fragment to
+// be added, with its own destructor, causing the destructors to be chained.
+//
+// The downside is that the data needed for the destructor is duplicated,
+// if it is already available in the fragment itself.
+//
+// As an optimization, when we allocate small fragments, we allocate some
+// extra space, so prepending to the packet does not require extra
+// allocations.  This is useful when adding headers.
+//
+// Move-only handle to a heap-allocated `impl` that stores the fragment
+// array, the total length, offload metadata and the deleter that releases
+// the fragment storage.
+class Packet {
+  // enough for lots of headers, not quite two cache lines:
+  static constexpr size_t internal_data_size = 128 - 16;
+  static constexpr size_t default_nr_frags = 4;
+
+  // Minimal begin()/end()/operator[] view over a contiguous fragment array.
+  struct pseudo_vector {
+    fragment* _start;
+    fragment* _finish;
+    pseudo_vector(fragment* start, size_t nr)
+        : _start(start), _finish(_start + nr) {}
+    fragment* begin() { return _start; }
+    fragment* end() { return _finish; }
+    fragment& operator[](size_t idx) { return _start[idx]; }
+  };
+
+  // Packet state.  Allocated through the class-specific operator new below,
+  // which reserves room for `nr_frags` fragment descriptors directly after
+  // the object (the trailing `frags[]` array).
+  struct impl {
+    // when destroyed, virtual destructor will reclaim resources
+    deleter _deleter;
+    unsigned _len = 0;
+    uint16_t _nr_frags = 0;
+    uint16_t _allocated_frags;
+    offload_info _offload_info;
+    Tub<uint32_t> rss_hash;
+    char data[internal_data_size]; // only frags[0] may use
+    unsigned headroom = internal_data_size; // in data
+    // FIXME: share data/frags space
+
+    fragment frags[];
+
+    explicit impl(size_t nr_frags = default_nr_frags);
+    impl(const impl&) = delete;
+    impl(fragment frag, size_t nr_frags = default_nr_frags);
+
+    pseudo_vector fragments() { return { frags, _nr_frags }; }
+
+    // Allocates an empty impl with room for at least default_nr_frags
+    // fragment descriptors.
+    static std::unique_ptr<impl> allocate(size_t nr_frags) {
+      nr_frags = std::max(nr_frags, default_nr_frags);
+      return std::unique_ptr<impl>(new (nr_frags) impl(nr_frags));
+    }
+
+    // Builds a new impl with capacity `nr` that takes over the contents of
+    // `old`.  The deleter is moved out of `old`; everything else is copied,
+    // including the inline bytes when frags[0] lives in `old->data`.
+    static std::unique_ptr<impl> copy(impl* old, size_t nr) {
+      auto n = allocate(nr);
+      n->_deleter = std::move(old->_deleter);
+      n->_len = old->_len;
+      n->_nr_frags = old->_nr_frags;
+      n->headroom = old->headroom;
+      n->_offload_info = old->_offload_info;
+      // Assign the Tub as a whole so that an unset hash stays unset and a
+      // set hash keeps its value.  construct(old->rss_hash) would always
+      // leave the copy engaged and would build the uint32_t from the Tub
+      // object itself rather than from the hash stored inside it.
+      n->rss_hash = old->rss_hash;
+      std::copy(old->frags, old->frags + old->_nr_frags, n->frags);
+      old->copy_internal_fragment_to(n.get());
+      return n;
+    }
+
+    static std::unique_ptr<impl> copy(impl* old) {
+      return copy(old, old->_nr_frags);
+    }
+
+    // Returns `old` unchanged when it already has room for `extra_frags`
+    // more descriptors; otherwise returns a larger copy (at least double the
+    // current fragment count).
+    static std::unique_ptr<impl> allocate_if_needed(std::unique_ptr<impl> old, size_t extra_frags) {
+      if (old->_allocated_frags >= old->_nr_frags + extra_frags) {
+        return old;
+      }
+      return copy(old.get(), std::max<size_t>(old->_nr_frags + extra_frags, 2 * old->_nr_frags));
+    }
+    // Allocates the object plus `nr_frags` trailing fragment descriptors.
+    // The count must fit in the 16-bit _allocated_frags field.
+    void* operator new(size_t size, size_t nr_frags = default_nr_frags) {
+      ceph_assert(nr_frags == uint16_t(nr_frags));
+      return ::operator new(size + nr_frags * sizeof(fragment));
+    }
+    // Matching the operator new above
+    void operator delete(void* ptr, size_t nr_frags) {
+      return ::operator delete(ptr);
+    }
+    // Since the above "placement delete" hides the global one, expose it
+    void operator delete(void* ptr) {
+      return ::operator delete(ptr);
+    }
+
+    // True when there is at least one fragment and frags[0] points into the
+    // inline `data` buffer.
+    bool using_internal_data() const {
+      return _nr_frags
+              && frags[0].base >= data
+              && frags[0].base < data + internal_data_size;
+    }
+
+    // If frags[0] lives in the inline buffer, moves its bytes to a malloc'ed
+    // buffer (freed through _deleter) and resets headroom to the full
+    // inline size.
+    void unuse_internal_data() {
+      if (!using_internal_data()) {
+        return;
+      }
+      auto buf = static_cast<char*>(::malloc(frags[0].size));
+      if (!buf) {
+        throw std::bad_alloc();
+      }
+      deleter d = make_free_deleter(buf);
+      std::copy(frags[0].base, frags[0].base + frags[0].size, buf);
+      frags[0].base = buf;
+      _deleter.append(std::move(d));
+      headroom = internal_data_size;
+    }
+    // If frags[0] lives in the inline buffer, copies those bytes to the same
+    // headroom offset inside `to->data` and repoints to->frags[0] there.
+    void copy_internal_fragment_to(impl* to) {
+      if (!using_internal_data()) {
+        return;
+      }
+      to->frags[0].base = to->data + headroom;
+      std::copy(frags[0].base, frags[0].base + frags[0].size,
+              to->frags[0].base);
+    }
+  };
+  explicit Packet(std::unique_ptr<impl>&& impl) : _impl(std::move(impl)) {}
+  std::unique_ptr<impl> _impl;
+public:
+  // Wraps `data` without copying and with an empty deleter.
+  static Packet from_static_data(const char* data, size_t len) {
+    return {fragment{const_cast<char*>(data), len}, deleter()};
+  }
+
+  // build empty Packet
+  Packet();
+  // build empty Packet with nr_frags allocated
+  explicit Packet(size_t nr_frags);
+  // move existing Packet
+  Packet(Packet&& x) noexcept;
+  // copy data into Packet
+  Packet(const char* data, size_t len);
+  // copy data into Packet
+  explicit Packet(fragment frag);
+  // zero-copy single fragment
+  Packet(fragment frag, deleter del);
+  // zero-copy multiple fragments
+  Packet(std::vector<fragment> frag, deleter del);
+  // build Packet with iterator
+  template <typename Iterator>
+  Packet(Iterator begin, Iterator end, deleter del);
+  // append fragment (copying new fragment)
+  Packet(Packet&& x, fragment frag);
+  // prepend fragment (copying new fragment, with header optimization)
+  Packet(fragment frag, Packet&& x);
+  // prepend fragment (zero-copy)
+  Packet(fragment frag, deleter del, Packet&& x);
+  // append fragment (zero-copy)
+  Packet(Packet&& x, fragment frag, deleter d);
+  // append deleter
+  Packet(Packet&& x, deleter d);
+
+  // Move assignment implemented as destroy + placement move-construct.
+  Packet& operator=(Packet&& x) {
+    if (this != &x) {
+      this->~Packet();
+      new (this) Packet(std::move(x));
+    }
+    return *this;
+  }
+
+  // Total number of payload bytes across all fragments.
+  unsigned len() const { return _impl->_len; }
+  // Payload length plus sizeof(impl).
+  unsigned memory() const { return len() +  sizeof(Packet::impl); }
+
+  fragment frag(unsigned idx) const { return _impl->frags[idx]; }
+  fragment& frag(unsigned idx) { return _impl->frags[idx]; }
+
+  unsigned nr_frags() const { return _impl->_nr_frags; }
+  pseudo_vector fragments() const { return { _impl->frags, _impl->_nr_frags }; }
+  fragment* fragment_array() const { return _impl->frags; }
+
+  // share Packet data (reference counted, non COW)
+  Packet share();
+  Packet share(size_t offset, size_t len);
+
+  void append(Packet&& p);
+
+  void trim_front(size_t how_much);
+  void trim_back(size_t how_much);
+
+  // get a header pointer, linearizing if necessary
+  template <typename Header>
+  Header* get_header(size_t offset = 0);
+
+  // get a header pointer, linearizing if necessary
+  char* get_header(size_t offset, size_t size);
+
+  // prepend a header (default-initializing it)
+  template <typename Header>
+  Header* prepend_header(size_t extra_size = 0);
+
+  // prepend a header (uninitialized!)
+  char* prepend_uninitialized_header(size_t size);
+
+  Packet free_on_cpu(EventCenter *c, std::function<void()> cb = []{});
+
+  // Collapse the whole packet into a single contiguous fragment.
+  void linearize() { return linearize(0, len()); }
+
+  // Drops the impl; the Packet must not be dereferenced afterwards.
+  void reset() { _impl.reset(); }
+
+  // Ensure room for at least n_frags fragment descriptors.
+  void reserve(int n_frags) {
+    if (n_frags > _impl->_nr_frags) {
+      auto extra = n_frags - _impl->_nr_frags;
+      _impl = impl::allocate_if_needed(std::move(_impl), extra);
+    }
+  }
+  Tub<uint32_t> rss_hash() {
+    return _impl->rss_hash;
+  }
+  void set_rss_hash(uint32_t hash) {
+    _impl->rss_hash.construct(hash);
+  }
+private:
+  void linearize(size_t at_frag, size_t desired_size);
+  bool allocate_headroom(size_t size);
+public:
+  // The elaborated `class offload_info` is required because the member
+  // function of the same name hides the type inside this scope.
+  class offload_info offload_info() const { return _impl->_offload_info; }
+  class offload_info& offload_info_ref() { return _impl->_offload_info; }
+  void set_offload_info(class offload_info oi) { _impl->_offload_info = oi; }
+};
+
+std::ostream& operator<<(std::ostream& os, const Packet& p);
+
+// Move constructor: takes over x's impl pointer, leaving x._impl null.
+inline Packet::Packet(Packet&& x) noexcept
+    : _impl(std::move(x._impl)) {
+}
+
+// Empty impl: zero length, with `nr_frags` recorded as the descriptor
+// capacity.  The storage for those descriptors must already have been
+// reserved by the allocation.
+inline Packet::impl::impl(size_t nr_frags)
+    : _len(0), _allocated_frags(nr_frags) {
+}
+
+// Copying constructor: stores frag.size bytes from frag.base as fragment 0,
+// either at the tail of the inline `data` buffer (when they fit) or in a
+// malloc'ed buffer that is released through _deleter.
+inline Packet::impl::impl(fragment frag, size_t nr_frags)
+    : _len(frag.size), _allocated_frags(nr_frags) {
+  ceph_assert(_allocated_frags > _nr_frags);
+  const bool fits_inline = frag.size <= internal_data_size;
+  if (!fits_inline) {
+    char* heap_buf = static_cast<char*>(::malloc(frag.size));
+    if (heap_buf == nullptr) {
+      throw std::bad_alloc();
+    }
+    deleter heap_owner = make_free_deleter(heap_buf);
+    frags[0] = { heap_buf, frag.size };
+    _deleter.append(std::move(heap_owner));
+  } else {
+    // Carve the bytes out of the end of the inline buffer so the space in
+    // front of them remains available as headroom.
+    headroom -= frag.size;
+    frags[0] = { data + headroom, frag.size };
+  }
+  std::copy(frag.base, frag.base + frag.size, frags[0].base);
+  ++_nr_frags;
+}
+
+// Empty packet.  impl::allocate() rounds the requested capacity of one
+// descriptor up to default_nr_frags.
+inline Packet::Packet(): _impl(impl::allocate(1)) {
+}
+
+// Empty packet with room for at least `nr_frags` fragment descriptors.
+inline Packet::Packet(size_t nr_frags): _impl(impl::allocate(nr_frags)) {
+}
+
+// Packet holding a private copy of `frag`.  Uses impl's operator new with
+// its default descriptor count and impl's copying constructor.
+inline Packet::Packet(fragment frag): _impl(new impl(frag)) {
+}
+
+// Packet holding a private copy of `size` bytes at `data`; delegates to the
+// copying fragment constructor (the const_cast is only needed to form the
+// fragment).
+inline Packet::Packet(const char* data, size_t size):
+    Packet(fragment{const_cast<char*>(data), size}) {
+}
+
+// Zero-copy packet over a single caller-supplied fragment; `d` becomes the
+// packet's deleter.
+inline Packet::Packet(fragment frag, deleter d)
+    : _impl(impl::allocate(1)) {
+  impl& state = *_impl;
+  state._deleter = std::move(d);
+  state.frags[state._nr_frags] = frag;
+  ++state._nr_frags;
+  state._len = frag.size;
+}
+
+// Zero-copy packet over several caller-supplied fragments; `d` becomes the
+// packet's deleter and the length is the sum of the fragment sizes.
+inline Packet::Packet(std::vector<fragment> frag, deleter d)
+    : _impl(impl::allocate(frag.size())) {
+  _impl->_deleter = std::move(d);
+  _impl->_nr_frags = frag.size();
+  _impl->_len = 0;
+  // Record each descriptor and accumulate the total length in one pass.
+  fragment* slot = _impl->frags;
+  for (const fragment& piece : frag) {
+    *slot++ = piece;
+    _impl->_len += piece.size;
+  }
+}
+
+// Zero-copy packet over the fragments in [begin, end); `del` becomes the
+// packet's deleter.  The for_each lambda takes `fragment&`, so the iterators
+// must dereference to a mutable fragment.
+template <typename Iterator>
+inline Packet::Packet(Iterator begin, Iterator end, deleter del) {
+  unsigned nr_frags = 0, len = 0;
+  nr_frags = std::distance(begin, end);
+  // Total payload length.
+  std::for_each(begin, end, [&] (fragment& frag) { len += frag.size; });
+  _impl = impl::allocate(nr_frags);
+  _impl->_deleter = std::move(del);
+  _impl->_len = len;
+  _impl->_nr_frags = nr_frags;
+  std::copy(begin, end, _impl->frags);
+}
+
+// Appends a private copy of `frag` to the packet taken over from `x`.  The
+// copy lives in a new[] buffer that is released together with the packet's
+// previous deleter.
+inline Packet::Packet(Packet&& x, fragment frag)
+    : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+  _impl->_len += frag.size;
+  char* owned_copy = new char[frag.size];
+  std::copy(frag.base, frag.base + frag.size, owned_copy);
+  _impl->frags[_impl->_nr_frags] = {owned_copy, frag.size};
+  ++_impl->_nr_frags;
+  auto release_copy = [owned_copy] {
+    delete[] owned_copy;
+  };
+  _impl->_deleter = make_deleter(std::move(_impl->_deleter), release_copy);
+}
+
+// Tries to grow fragment 0 by `size` bytes at the front using the unused
+// part of the inline buffer.  Returns false, changing nothing, when the
+// remaining headroom is too small.  When the packet is not currently using
+// the inline buffer, an empty fragment 0 positioned at the buffer's end is
+// inserted first.
+inline bool Packet::allocate_headroom(size_t size) {
+  if (_impl->headroom < size) {
+    return false;
+  }
+  _impl->_len += size;
+  if (!_impl->using_internal_data()) {
+    _impl = impl::allocate_if_needed(std::move(_impl), 1);
+    // Shift every descriptor up by one to free slot 0.
+    fragment* const first = _impl->frags;
+    fragment* const last = first + _impl->_nr_frags;
+    std::copy_backward(first, last, last + 1);
+    _impl->frags[0] = { _impl->data + internal_data_size, 0 };
+    ++_impl->_nr_frags;
+  }
+  _impl->headroom -= size;
+  fragment& head = _impl->frags[0];
+  head.base -= size;
+  head.size += size;
+  return true;
+}
+
+
+// Prepends a private copy of `frag` to the packet taken over from `x`.
+// The bytes go into the inline headroom when they fit; otherwise they are
+// copied into a new[] buffer that becomes fragment 0.
+inline Packet::Packet(fragment frag, Packet&& x)
+    : _impl(std::move(x._impl)) {
+  // Fast path: extend the existing inline fragment towards the front.
+  if (allocate_headroom(frag.size)) {
+    std::copy(frag.base, frag.base + frag.size, _impl->frags[0].base);
+    return;
+  }
+
+  // Slow path: allocate and copy.
+  _impl->unuse_internal_data();
+  _impl = impl::allocate_if_needed(std::move(_impl), 1);
+  _impl->_len += frag.size;
+  char* owned_copy = new char[frag.size];
+  std::copy(frag.base, frag.base + frag.size, owned_copy);
+  // Shift every descriptor up by one to free slot 0.
+  fragment* const first = _impl->frags;
+  fragment* const last = first + _impl->_nr_frags;
+  std::copy_backward(first, last, last + 1);
+  ++_impl->_nr_frags;
+  _impl->frags[0] = {owned_copy, frag.size};
+  auto release_copy = [owned_copy] { delete []owned_copy; };
+  _impl->_deleter = make_deleter(std::move(_impl->_deleter), release_copy);
+}
+
+// Appends `frag` without copying to the packet taken over from `x`.  The
+// packet's previous deleter is appended to `d`, and the combined deleter
+// becomes the packet's deleter.
+inline Packet::Packet(Packet&& x, fragment frag, deleter d)
+    : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+  _impl->_len += frag.size;
+  _impl->frags[_impl->_nr_frags++] = frag;
+  d.append(std::move(_impl->_deleter));
+  _impl->_deleter = std::move(d);
+}
+
+// Takes over `x` and appends `d` to its deleter; the fragments are unchanged.
+inline Packet::Packet(Packet&& x, deleter d): _impl(std::move(x._impl)) {
+  _impl->_deleter.append(std::move(d));
+}
+
+// Moves all fragments of `p` onto the end of this packet and merges the two
+// deleters.  If this packet currently holds no bytes it simply becomes `p`.
+inline void Packet::append(Packet&& p) {
+  if (!_impl->_len) {
+    *this = std::move(p);
+    return;
+  }
+  _impl = impl::allocate_if_needed(std::move(_impl), p._impl->_nr_frags);
+  _impl->_len += p._impl->_len;
+  // p's inline bytes live inside p's impl; move them to heap storage before
+  // the descriptors are copied so they do not point into p's impl.
+  p._impl->unuse_internal_data();
+  std::copy(p._impl->frags, p._impl->frags + p._impl->_nr_frags,
+            _impl->frags + _impl->_nr_frags);
+  _impl->_nr_frags += p._impl->_nr_frags;
+  // Chain: this packet's deleter is appended to p's, and the combined
+  // deleter is then adopted by this packet.
+  p._impl->_deleter.append(std::move(_impl->_deleter));
+  _impl->_deleter = std::move(p._impl->_deleter);
+}
+
+// Returns a pointer to `size` contiguous bytes located `offset` bytes into
+// the packet, or nullptr when the packet is too short.  If the requested
+// range straddles fragments, the affected fragments are linearized first.
+inline char* Packet::get_header(size_t offset, size_t size) {
+  if (offset + size > _impl->_len) {
+    return nullptr;
+  }
+  // Locate the fragment that contains `offset`, turning `offset` into an
+  // offset relative to that fragment.
+  size_t idx = 0;
+  for (; idx != _impl->_nr_frags; ++idx) {
+    const size_t frag_size = _impl->frags[idx].size;
+    if (offset < frag_size) {
+      break;
+    }
+    offset -= frag_size;
+  }
+  if (idx == _impl->_nr_frags) {
+    return nullptr;
+  }
+  const size_t needed = offset + size;
+  if (needed > _impl->frags[idx].size) {
+    linearize(idx, needed);
+  }
+  return _impl->frags[idx].base + offset;
+}
+
+// Typed wrapper around get_header(offset, sizeof(Header)); the result is
+// null when the byte-level overload returns nullptr.
+template <typename Header>
+inline Header* Packet::get_header(size_t offset) {
+  return reinterpret_cast<Header*>(get_header(offset, sizeof(Header)));
+}
+
+// Removes `how_much` bytes from the front of the packet.  Fragments that are
+// consumed entirely are dropped from the array; a partially consumed first
+// fragment has its base advanced and its size reduced.  The underlying
+// storage is not released here.
+inline void Packet::trim_front(size_t how_much) {
+  ceph_assert(how_much <= _impl->_len);
+  _impl->_len -= how_much;
+  size_t i = 0;
+  // Skip fragments that are trimmed away completely.
+  while (how_much && how_much >= _impl->frags[i].size) {
+    how_much -= _impl->frags[i++].size;
+  }
+  std::copy(_impl->frags + i, _impl->frags + _impl->_nr_frags, _impl->frags);
+  _impl->_nr_frags -= i;
+  // If the new first fragment is not in the inline buffer, the whole inline
+  // buffer is available as headroom again.
+  if (!_impl->using_internal_data()) {
+    _impl->headroom = internal_data_size;
+  }
+  if (how_much) {
+    // Partial trim of the (new) first fragment.
+    if (_impl->using_internal_data()) {
+      _impl->headroom += how_much;
+    }
+    _impl->frags[0].base += how_much;
+    _impl->frags[0].size -= how_much;
+  }
+}
+
+// Removes `how_much` bytes from the end of the packet.  Fragments that are
+// consumed entirely are dropped by lowering _nr_frags; a partially consumed
+// last fragment only has its size reduced.
+inline void Packet::trim_back(size_t how_much) {
+  ceph_assert(how_much <= _impl->_len);
+  _impl->_len -= how_much;
+  // `i` is unsigned: when every fragment is consumed the post-decrement
+  // wraps past zero and `i + 1` below yields a fragment count of 0.
+  size_t i = _impl->_nr_frags - 1;
+  while (how_much && how_much >= _impl->frags[i].size) {
+    how_much -= _impl->frags[i--].size;
+  }
+  _impl->_nr_frags = i + 1;
+  if (how_much) {
+    _impl->frags[i].size -= how_much;
+    if (i == 0 && _impl->using_internal_data()) {
+        _impl->headroom += how_much;
+    }
+  }
+}
+
+// Prepends sizeof(Header) + extra_size bytes to the packet and
+// value-initializes a Header at the start of that space.  The extra bytes
+// after the Header are left uninitialized.
+template <typename Header>
+Header* Packet::prepend_header(size_t extra_size) {
+  auto h = prepend_uninitialized_header(sizeof(Header) + extra_size);
+  return new (h) Header{};
+}
+
+// Prepends `size` uninitialized bytes to the packet and returns a pointer
+// to them (the start of fragment 0).
+inline char* Packet::prepend_uninitialized_header(size_t size) {
+  // First choice: the headroom that is available right now.
+  if (allocate_headroom(size)) {
+    return _impl->frags[0].base;
+  }
+
+  // Move any inline payload to the heap; that resets the headroom to the
+  // full inline buffer, so the request may fit after all.
+  _impl->unuse_internal_data();
+  if (allocate_headroom(size)) {
+    return _impl->frags[0].base;
+  }
+
+  // Still too large for the inline buffer: use a dedicated new[] buffer as
+  // the new fragment 0.
+  _impl->_len += size;
+  _impl = impl::allocate_if_needed(std::move(_impl), 1);
+  char* header_buf = new char[size];
+  fragment* const first = _impl->frags;
+  fragment* const last = first + _impl->_nr_frags;
+  std::copy_backward(first, last, last + 1);
+  ++_impl->_nr_frags;
+  _impl->frags[0] = {header_buf, size};
+  auto release_header = [header_buf] { delete []header_buf; };
+  _impl->_deleter = make_deleter(std::move(_impl->_deleter), release_header);
+  return _impl->frags[0].base;
+}
+
+// Shares the whole packet: equivalent to share(0, len()).
+inline Packet Packet::share() {
+    return share(0, _impl->_len);
+}
+
+// Returns a new Packet whose fragments refer to the `len` bytes of this
+// packet starting at `offset`.  No payload is copied; the new packet gets
+// `_deleter.share()` as its deleter and a copy of the offload info.
+//
+// No bounds check is performed here: the caller must make sure that
+// offset + len does not exceed this packet's length.
+inline Packet Packet::share(size_t offset, size_t len) {
+  // Inline bytes live inside this impl; move them to heap storage first so
+  // the shared fragments do not point into this impl.
+  _impl->unuse_internal_data(); // FIXME: eliminate?
+  Packet n;
+  n._impl = impl::allocate_if_needed(std::move(n._impl), _impl->_nr_frags);
+  size_t idx = 0;
+  // Skip whole fragments that lie before `offset`.
+  while (offset > 0 && offset >= _impl->frags[idx].size) {
+    offset -= _impl->frags[idx++].size;
+  }
+  // Collect (sub)fragments until `len` bytes are covered; only the first
+  // one can start at a non-zero offset.
+  while (n._impl->_len < len) {
+    auto& f = _impl->frags[idx++];
+    auto fsize = std::min(len - n._impl->_len, f.size - offset);
+    n._impl->frags[n._impl->_nr_frags++] = { f.base + offset, fsize };
+    n._impl->_len += fsize;
+    offset = 0;
+  }
+  n._impl->_offload_info = _impl->_offload_info;
+  ceph_assert(!n._impl->_deleter);
+  n._impl->_deleter = _impl->_deleter.share();
+  return n;
+}
+
+#endif /* CEPH_MSG_PACKET_H_ */
diff --git a/src/msg/async/dpdk/PacketUtil.h b/src/msg/async/dpdk/PacketUtil.h
new file mode 100644
index 000000000..118218e66
--- /dev/null
+++ b/src/msg/async/dpdk/PacketUtil.h
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_UTIL_H_
+#define CEPH_MSG_PACKET_UTIL_H_
+
+#include <map>
+#include <iostream>
+
+#include "Packet.h"
+
+// Keeps a set of non-contiguous data segments, keyed by their start offset,
+// and merges newly arriving packets into it.  `Tag` is not used inside the
+// class body; it only makes distinct instantiations (and therefore distinct
+// linearization counters).
+template <typename Offset, typename Tag>
+class packet_merger {
+ private:
+  // Per-thread, per-instantiation counter of linearize() calls made by merge().
+  static uint64_t& linearizations_ref() {
+    static thread_local uint64_t linearization_count;
+    return linearization_count;
+  }
+ public:
+  // Segment start offset -> segment data.
+  std::map<Offset, Packet> map;
+
+  static uint64_t linearizations() {
+    return linearizations_ref();
+  }
+
+  // Merges packet `p`, which covers [offset, offset + p.len()), into the map.
+  void merge(Offset offset, Packet p) {
+    bool insert = true;
+    auto beg = offset;
+    auto end = beg + p.len();
+    // First, try to merge the packet with existing segment
+    for (auto it = map.begin(); it != map.end();) {
+      auto& seg_pkt = it->second;
+      auto seg_beg = it->first;
+      auto seg_end = seg_beg + seg_pkt.len();
+      // There are 6 cases:
+      if (seg_beg <= beg && end <= seg_end) {
+        // 1) seg_beg beg end seg_end
+        // We already have data in this packet
+        return;
+      } else if (beg <= seg_beg && seg_end <= end) {
+        // 2) beg seg_beg seg_end end
+        // The new segment contains more data than this old segment
+        // Delete the old one, insert the new one
+        it = map.erase(it);
+        insert = true;
+        break;
+      } else if (beg < seg_beg && seg_beg <= end && end <= seg_end) {
+        // 3) beg seg_beg end seg_end
+        // Merge two segments, trim front of old segment
+        auto trim = end - seg_beg;
+        seg_pkt.trim_front(trim);
+        p.append(std::move(seg_pkt));
+        // Delete the old one, insert the new one
+        it = map.erase(it);
+        insert = true;
+        break;
+      } else if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+        // 4) seg_beg beg seg_end end
+        // Merge two segments, trim front of new segment
+        auto trim = seg_end - beg;
+        p.trim_front(trim);
+        // Append new data to the old segment, keep the old segment
+        seg_pkt.append(std::move(p));
+        seg_pkt.linearize();
+        ++linearizations_ref();
+        insert = false;
+        break;
+      } else {
+        // 5) beg end < seg_beg seg_end
+        //   or
+        // 6) seg_beg seg_end < beg end
+        // Can not merge with this segment, keep looking
+        it++;
+        insert = true;
+      }
+    }
+
+    if (insert) {
+      p.linearize();
+      ++linearizations_ref();
+      map.emplace(beg, std::move(p));
+    }
+
+    // Second, merge adjacent segments after this packet has been merged,
+    // because this packet might fill a "hole" and make two adjacent
+    // segments mergeable
+    for (auto it = map.begin(); it != map.end();) {
+      // The first segment
+      auto& seg_pkt = it->second;
+      auto seg_beg = it->first;
+      auto seg_end = seg_beg + seg_pkt.len();
+
+      // The second segment
+      auto it_next = it;
+      it_next++;
+      if (it_next == map.end()) {
+        break;
+      }
+      auto& p = it_next->second;
+      auto beg = it_next->first;
+      auto end = beg + p.len();
+
+      // Merge the second segment into the first segment if possible
+      if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+        // Merge two segments, trim front of second segment
+        auto trim = seg_end - beg;
+        p.trim_front(trim);
+        // Append new data to the first segment, keep the first segment
+        seg_pkt.append(std::move(p));
+
+        // Delete the second segment
+        map.erase(it_next);
+
+        // Keep merging this first segment with its new next packet
+        // So we do not update the iterator: it
+        continue;
+      } else if (end <= seg_end) {
+        // The first segment has all the data in the second segment
+        // Delete the second segment
+        map.erase(it_next);
+        continue;
+      } else if (seg_end < beg) {
+        // Can not merge first segment with second segment
+        it = it_next;
+        continue;
+      } else {
+        // If we reach here, we have a bug with merge.
+        std::cout << "packet_merger: merge error\n";
+        abort();
+      }
+    }
+  }
+};
+
+#endif
diff --git a/src/msg/async/dpdk/TCP-Stack.h b/src/msg/async/dpdk/TCP-Stack.h
new file mode 100644
index 000000000..edcf4d803
--- /dev/null
+++ b/src/msg/async/dpdk/TCP-Stack.h
@@ -0,0 +1,40 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+// tcp/network-stack integration
+
+#ifndef CEPH_MSG_DPDK_TCP_STACK_H
+#define CEPH_MSG_DPDK_TCP_STACK_H
+
+class ServerSocket;
+class ConnectedSocket;
+
+class ipv4_traits;
+template <typename InetTraits>
+class tcp;
+
+// Start listening for IPv4 TCP connections on `port` through `tcpv4`.
+// `opts`, `type` and `addr_slot` are passed on to the server socket
+// implementation.  On success the new listening socket is stored in `*sa`
+// and 0 is returned; if the implementation's listen() call fails, its
+// negative result is returned and `*sa` is not assigned.
+// (Defined in TCP.cc.)
+int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
+                 int type, unsigned addr_slot, ServerSocket *sa);
+
+// Call tcpv4.connect(addr) and store a ConnectedSocket wrapping the
+// resulting connection in `*sa`.  The definition in TCP.cc always returns 0.
+int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
+                  ConnectedSocket *sa);
+
+#endif
diff --git a/src/msg/async/dpdk/TCP.cc b/src/msg/async/dpdk/TCP.cc
new file mode 100644
index 000000000..26f29e10f
--- /dev/null
+++ b/src/msg/async/dpdk/TCP.cc
@@ -0,0 +1,841 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "align.h"
+#include "TCP.h"
+#include "IP.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "tcp "
+
+// Parse the TCP options stored in the byte range [beg, end) and record the
+// ones we understand (MSS, window scale, SACK) in this object.
+//
+// The option bytes come straight from the peer, so every length is checked
+// before it is used: parsing stops at the first truncated or malformed
+// option instead of reading past `end`.  Options that were parsed before the
+// malformed one stay recorded.
+void tcp_option::parse(uint8_t* beg, uint8_t* end)
+{
+  while (beg < end) {
+    auto kind = option_kind(*beg);
+
+    // EOL and NOP are single-byte options without a length field.
+    if (kind == option_kind::eol) {
+      return;
+    }
+    if (kind == option_kind::nop) {
+      beg += option_len::nop;
+      continue;
+    }
+
+    // Every other option is laid out as <kind><len><data...>, where len
+    // also counts the kind and len bytes.  Make sure the len byte itself is
+    // inside the buffer before reading it.
+    if (end - beg < 2) {
+      return;
+    }
+    const uint8_t len = *(beg + 1);
+    // len < 2 is illegal (and len == 0 would never advance); len reaching
+    // past `end` means the option is truncated.
+    if (len < 2 || len > end - beg) {
+      return;
+    }
+
+    switch (kind) {
+      case option_kind::mss:
+        // The struct overlay below reads option_len::mss bytes, so the
+        // on-wire length has to match exactly.
+        if (len != uint8_t(option_len::mss)) {
+          return;
+        }
+        _mss_received = true;
+        _remote_mss = ntoh(reinterpret_cast<mss*>(beg)->mss);
+        beg += option_len::mss;
+        break;
+      case option_kind::win_scale:
+        // Same reasoning as for MSS: the overlay needs the full option.
+        if (len != uint8_t(option_len::win_scale)) {
+          return;
+        }
+        _win_scale_received = true;
+        _remote_win_scale = reinterpret_cast<win_scale*>(beg)->shift;
+        // We can turn on win_scale option, 7 is Linux's default win scale size
+        _local_win_scale = 7;
+        beg += option_len::win_scale;
+        break;
+      case option_kind::sack:
+        // No payload is read for this option; skip it by its validated
+        // on-wire length.
+        _sack_received = true;
+        beg += len;
+        break;
+      default:
+        // Ignore options we do not understand
+        beg += len;
+        break;
+    }
+  }
+}
+
+// Write the TCP options for the outgoing header `th` into the bytes that
+// directly follow the fixed header and return how many option bytes were
+// written.  Options are only emitted on SYN segments; the result is padded
+// with NOPs and terminated by EOL.  `options_size` is the size the caller
+// reserved; it must equal the number of bytes written (asserted).
+uint8_t tcp_option::fill(tcp_hdr* th, uint8_t options_size)
+{
+  // First byte after the fixed TCP header: this is where options start.
+  auto cursor = reinterpret_cast<uint8_t*>(th) + sizeof(tcp_hdr);
+  uint8_t written = 0;
+
+  const bool syn_on = th->f_syn;
+  const bool ack_on = th->f_ack;
+  // On a plain SYN we always offer the option; on a SYN+ACK we only echo
+  // options that the peer sent.
+  const bool want_mss = syn_on && (_mss_received || !ack_on);
+  const bool want_win_scale = syn_on && (_win_scale_received || !ack_on);
+
+  if (want_mss) {
+    auto mss_opt = new (cursor) tcp_option::mss;
+    mss_opt->mss = _local_mss;
+    cursor += mss_opt->len;
+    written += mss_opt->len;
+    // Convert to network byte order only after len has been consumed.
+    *mss_opt = mss_opt->hton();
+  }
+  if (want_win_scale) {
+    auto ws_opt = new (cursor) tcp_option::win_scale;
+    ws_opt->shift = _local_win_scale;
+    cursor += ws_opt->len;
+    written += ws_opt->len;
+  }
+
+  if (written > 0) {
+    // Pad with NOPs so that, together with the trailing EOL, the option
+    // area ends on an alignment boundary.
+    auto padded_size = align_up(uint8_t(written + 1), tcp_option::align);
+    const auto nop_limit = padded_size - uint8_t(option_len::eol);
+    while (written < nop_limit) {
+      new (cursor) tcp_option::nop;
+      cursor += option_len::nop;
+      written += option_len::nop;
+    }
+    new (cursor) tcp_option::eol;
+    written += option_len::eol;
+  }
+  ceph_assert(written == options_size);
+
+  return written;
+}
+
+// Return the number of option bytes needed for a segment with the given
+// SYN/ACK flags: 0 when no option applies, otherwise the option lengths plus
+// EOL, rounded up to tcp_option::align.
+uint8_t tcp_option::get_size(bool syn_on, bool ack_on)
+{
+  uint8_t total = 0;
+  // Options are only carried on SYN segments.
+  if (!syn_on) {
+    return total;
+  }
+  if (_mss_received || !ack_on) {
+    total += option_len::mss;
+  }
+  if (_win_scale_received || !ack_on) {
+    total += option_len::win_scale;
+  }
+  if (total == 0) {
+    return total;
+  }
+  // Account for the terminating EOL, then pad (with NOPs) to the alignment.
+  total += option_len::eol;
+  return align_up(total, tcp_option::align);
+}
+
+// Set up the L4 hook on `inet` and create the tcp<ipv4_traits> instance that
+// sits on top of it.  _inet_l4 is initialized first because the tcp object
+// is constructed with a reference to it.
+ipv4_tcp::ipv4_tcp(ipv4& inet, EventCenter *c)
+    : _inet_l4(inet),
+      _tcp(std::make_unique<tcp<ipv4_traits>>(inet.cct, _inet_l4, c))
+{
+}
+
+// Nothing to release by hand; members clean up after themselves.
+ipv4_tcp::~ipv4_tcp() = default;
+
+// Hand an incoming packet, together with its source and destination
+// addresses, to the wrapped tcp instance.
+void ipv4_tcp::received(Packet p, ipv4_address from, ipv4_address to)
+{
+  // Ownership of the packet moves on to the tcp layer.
+  _tcp->received(
+      std::move(p),
+      from,
+      to);
+}
+
+// Delegate the forwarding decision for packet `p` (TCP data at offset `off`)
+// to the wrapped tcp instance and return its answer.
+bool ipv4_tcp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  const bool handled = _tcp->forward(out_hash_data, p, off);
+  return handled;
+}
+
+// Create a DPDK server socket for `port` on `tcpv4` and start listening.
+//
+// Returns 0 and stores the listening socket in `*sock` on success.  If
+// listen() fails, its negative result is returned, the implementation object
+// is destroyed and `*sock` is left untouched.
+int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
+                 int type, unsigned addr_slot, ServerSocket *sock)
+{
+  // Own the implementation from the moment it exists, so every exit path
+  // releases it without a manual delete.
+  auto impl = std::make_unique<DPDKServerSocketImpl<tcp<ipv4_traits>>>(
+      tcpv4, port, opts, type, addr_slot);
+  const int r = impl->listen();
+  if (r < 0) {
+    return r;
+  }
+  *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(std::move(impl)));
+  return 0;
+}
+
+// Open a connection to `addr` via tcpv4.connect() and store a
+// ConnectedSocket wrapping it in `*sock`.  Always returns 0.
+int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
+                  ConnectedSocket *sock)
+{
+  using socket_impl = NativeConnectedSocketImpl<tcp<ipv4_traits>>;
+
+  auto conn = tcpv4.connect(addr);
+  std::unique_ptr<ConnectedSocketImpl> impl(new socket_impl(std::move(conn)));
+  *sock = ConnectedSocket(std::move(impl));
+  return 0;
+}
+
+// Answer the segment described by `rth` with a RST sent from local_ip to
+// foreign_ip, without going through a TCB.  Nothing is sent when `rth` is
+// itself a RST.
+template <typename InetTraits>
+void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip)
+{
+  ldout(cct, 20) << __func__ << " tcp header rst=" << bool(rth->f_rst) << " fin=" << bool(rth->f_fin)
+                 << " syn=" << bool(rth->f_syn) << dendl;
+  // Never answer a reset with another reset.
+  if (rth->f_rst) {
+    return;
+  }
+
+  Packet rst_pkt;
+  auto out = rst_pkt.prepend_header<tcp_hdr>();
+
+  // Mirror the ports of the offending segment.
+  out->src_port = rth->dst_port;
+  out->dst_port = rth->src_port;
+  out->f_rst = true;
+  out->data_offset = sizeof(*out) / 4;
+  out->checksum = 0;
+  // Use the peer's ACK number as our sequence number when it sent one.
+  if (rth->f_ack) {
+    out->seq = rth->ack;
+  }
+  // If this RST is in response to a SYN, ACK the peer's ISN.
+  if (rth->f_syn) {
+    out->ack = rth->seq + 1;
+    out->f_ack = true;
+  }
+  // All fields are set: switch the header to network byte order.
+  *out = out->hton();
+
+  checksummer sum;
+  offload_info offload;
+  InetTraits::tcp_pseudo_header_checksum(sum, local_ip, foreign_ip, sizeof(*out));
+  const bool hw_csum = get_hw_features().tx_csum_l4_offload;
+  if (hw_csum) {
+    // Store the complemented pseudo-header sum and flag the packet as still
+    // needing its checksum.
+    out->checksum = ~sum.get();
+  } else {
+    // Software path: fold the packet itself into the sum.
+    sum.sum(rst_pkt);
+    out->checksum = sum.get();
+  }
+  offload.needs_csum = hw_csum;
+  offload.protocol = ip_protocol_num::tcp;
+  offload.tcp_hdr_len = sizeof(tcp_hdr);
+  rst_pkt.set_offload_info(offload);
+
+  send_packet_without_tcb(local_ip, foreign_ip, std::move(rst_pkt));
+}
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+// Write the log prefix for this TCB (both endpoints, the TCB address, its fd
+// and its state) to *_dout and return that stream.
+template<typename InetTraits>
+ostream& tcp<InetTraits>::tcb::_prefix(std::ostream *_dout) {
+  std::ostream& os = *_dout;
+  os << "tcp " << _local_ip << ":" << _local_port << " -> " << _foreign_ip << ":" << _foreign_port;
+  os << " tcb(" << this << " fd=" << fd << " s=" << _state << ").";
+  return os;
+}
+
+// Handle a segment received for a TCB in the LISTEN state: record the peer's
+// initial sequence number, pick our own, parse the peer's options and move
+// on via do_syn_received().
+template<typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, Packet p)
+{
+  // Size of the TCP header including options, as announced by the peer.
+  const auto hdr_len = th->data_offset * 4;
+  // NOTE(review): the result of get_header() is used without a null check;
+  // confirm the caller guarantees the packet holds hdr_len bytes.
+  auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, hdr_len)) + sizeof(tcp_hdr);
+  auto opt_end = opt_start + (hdr_len - sizeof(tcp_hdr));
+  p.trim_front(hdr_len);
+
+  // IRS is set to SEG.SEQ, RCV.NXT to SEG.SEQ+1
+  tcp_sequence seg_seq = th->seq;
+  _rcv.initial = seg_seq;
+  _rcv.next = seg_seq + 1;
+
+  // An ISS is selected and a SYN segment of the form
+  // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> is sent; SND.NXT becomes ISS+1 and
+  // SND.UNA becomes ISS.
+  // NOTE: earlier code set _snd.next to ISS + 1 only once the SYN had been
+  // ACKed.  It is now set here, and output_one() uses
+  //     th->seq = syn_on ? _snd.initial : _snd.next
+  // so that a retransmitted SYN still carries the correct SEQ number.
+  do_setup_isn();
+
+  _rcv.urgent = _rcv.next;
+
+  ldout(_tcp.cct, 10) << __func__ << " listen: LISTEN -> SYN_RECEIVED" << dendl;
+  init_from_options(th, opt_start, opt_end);
+  do_syn_received();
+}
+
+// Handle a segment received while this TCB is in SYN_SENT, following the
+// numbered RFC 793 checks (ACK, RST, SYN) referenced in the comments below.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, Packet p)
+{
+  // Size of the TCP header including options, as announced by the peer.
+  const auto hdr_len = th->data_offset * 4;
+  // NOTE(review): the result of get_header() is used without a null check;
+  // confirm the caller guarantees the packet holds hdr_len bytes.
+  auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, hdr_len)) + sizeof(tcp_hdr);
+  auto opt_end = opt_start + (hdr_len - sizeof(tcp_hdr));
+  p.trim_front(hdr_len);
+  tcp_sequence seg_seq = th->seq;
+  auto seg_ack = th->ack;
+
+  ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+                      << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+  // 3.1 first check the ACK bit
+  bool ack_ok = false;
+  if (th->f_ack) {
+    // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the RST
+    // bit is set, in which case the segment is dropped).
+    const bool ack_out_of_range = seg_ack <= _snd.initial || seg_ack > _snd.next;
+    if (ack_out_of_range) {
+      respond_with_reset(th);
+      return;
+    }
+    // The ACK is acceptable if SND.UNA =< SEG.ACK =< SND.NXT.
+    ack_ok = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next;
+  }
+
+  // 3.2 second check the RST bit
+  if (th->f_rst) {
+    // With an acceptable ACK: signal "connection reset" to the user and
+    // tear the connection down.  Without one: just drop the segment.
+    if (ack_ok) {
+      do_reset();
+    }
+    return;
+  }
+
+  // 3.3 third check the security and precedence
+  // NOTE: Ignored for now
+
+  // 3.5 if neither the SYN nor the RST bit is set, drop the segment.
+  if (!th->f_syn) {
+    return;
+  }
+
+  // 3.4 fourth check the SYN bit
+  // IRS is set to SEG.SEQ and RCV.NXT to SEG.SEQ+1.  SND.UNA is advanced to
+  // SEG.ACK (if there is an ACK); segments on the retransmission queue that
+  // are thereby acknowledged should be removed.
+  _rcv.initial = seg_seq;
+  _rcv.next = seg_seq + 1;
+  if (th->f_ack) {
+    // TODO: clean retransmission queue
+    _snd.unacknowledged = seg_ack;
+  }
+
+  if (_snd.unacknowledged > _snd.initial) {
+    // SND.UNA > ISS: our SYN has been ACKed.  Enter ESTABLISHED and form an
+    // ACK segment <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+    ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> ESTABLISHED" << dendl;
+    init_from_options(th, opt_start, opt_end);
+    do_established();
+    output();
+  } else {
+    // Otherwise enter SYN_RECEIVED and form a SYN,ACK segment
+    // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
+    ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> SYN_RECEIVED" << dendl;
+    do_syn_received();
+  }
+}
+
+// Process a segment that arrives while this TCB is in one of the
+// synchronized or closing states (SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1,
+// FIN_WAIT_2, CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT).
+//
+// The body follows the numbered steps quoted from the RFCs in the comments
+// below: sequence number check, RST, SYN, ACK (including the fast
+// retransmit / fast recovery bookkeeping), URG, segment text and finally
+// FIN.
+//
+// th: header of the received segment.
+// p:  the received packet; the first data_offset * 4 bytes (TCP header and
+//     options) are trimmed off before the payload is looked at.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, Packet p)
+{
+  p.trim_front(th->data_offset * 4);
+  // do_output: an ACK has to go out; do_output_data: try to send more data.
+  bool do_output = false;
+  bool do_output_data = false;
+  tcp_sequence seg_seq = th->seq;
+  auto seg_ack = th->ack;
+  auto seg_len = p.len();
+  ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+                      << " snd next " << _snd.next.raw << " unack " << _snd.unacknowledged.raw
+                      << " rcv next " << _rcv.next.raw << " len " << seg_len
+                      << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+  // 4.1 first check sequence number
+  if (!segment_acceptable(seg_seq, seg_len)) {
+    //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+    return output();
+  }
+
+  // In the following it is assumed that the segment is the idealized
+  // segment that begins at RCV.NXT and does not exceed the window.
+  if (seg_seq < _rcv.next) {
+    // ignore already acknowledged data
+    auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len);
+    ldout(_tcp.cct, 10) << __func__ << " dup segment len " << dup << dendl;
+    p.trim_front(dup);
+    seg_len -= dup;
+    seg_seq += dup;
+  }
+  // FIXME: We should trim data outside the right edge of the receive window as well
+
+  if (seg_seq != _rcv.next) {
+    ldout(_tcp.cct, 10) << __func__ << " out of order, expect " << _rcv.next.raw
+                        << " actual " << seg_seq.raw
+                        << " out of order size " << _rcv.out_of_order.map.size()
+                        << dendl;
+    insert_out_of_order(seg_seq, std::move(p));
+    // A TCP receiver SHOULD send an immediate duplicate ACK
+    // when an out-of-order segment arrives.
+    return output();
+  }
+
+  // 4.2 second check the RST bit
+  if (th->f_rst) {
+    if (in_state(SYN_RECEIVED)) {
+      // If this connection was initiated with a passive OPEN (i.e.,
+      // came from the LISTEN state), then return this connection to
+      // LISTEN state and return.  The user need not be informed.  If
+      // this connection was initiated with an active OPEN (i.e., came
+      // from SYN_SENT state) then the connection was refused, signal
+      // the user "connection refused".  In either case, all segments
+      // on the retransmission queue should be removed.  And in the
+      // active OPEN case, enter the CLOSED state and delete the TCB,
+      // and return.
+      //
+      // Record the refusal in the TCB's own error slot, like the other
+      // error paths in this file, rather than in the global errno.
+      // NOTE(review): do_reset() is not visible here and may overwrite
+      // _errno; confirm which value the user ends up seeing.
+      _errno = -ECONNREFUSED;
+      return do_reset();
+    }
+    if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) {
+      // If the RST bit is set then, any outstanding RECEIVEs and SEND
+      // should receive "reset" responses.  All segment queues should be
+      // flushed.  Users should also receive an unsolicited general
+      // "connection reset" signal.  Enter the CLOSED state, delete the
+      // TCB, and return.
+      return do_reset();
+    }
+    if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) {
+      // If the RST bit is set then, enter the CLOSED state, delete the
+      // TCB, and return.
+      return do_closed();
+    }
+  }
+
+  // 4.3 third check security and precedence
+  // NOTE: Ignored for now
+
+  // 4.4 fourth, check the SYN bit
+  if (th->f_syn) {
+    // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+    // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+
+    // If the SYN is in the window it is an error, send a reset, any
+    // outstanding RECEIVEs and SEND should receive "reset" responses,
+    // all segment queues should be flushed, the user should also
+    // receive an unsolicited general "connection reset" signal, enter
+    // the CLOSED state, delete the TCB, and return.
+    respond_with_reset(th);
+    return do_reset();
+
+    // If the SYN is not in the window this step would not be reached
+    // and an ack would have been sent in the first step (sequence
+    // number check).
+  }
+
+  // 4.5 fifth check the ACK field
+  if (!th->f_ack) {
+    // if the ACK bit is off drop the segment and return
+    return;
+  } else {
+    // SYN_RECEIVED STATE
+    if (in_state(SYN_RECEIVED)) {
+      // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state
+      // and continue processing.
+      if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) {
+        ldout(_tcp.cct, 20) << __func__ << " SYN_RECEIVED -> ESTABLISHED" << dendl;
+        do_established();
+        if (_tcp.push_listen_queue(_local_port, this)) {
+          ldout(_tcp.cct, 20) << __func__ << " successfully accepting socket" << dendl;
+        } else {
+          ldout(_tcp.cct, 5) << __func__ << " not exist listener or full queue, reset" << dendl;
+          return respond_with_reset(th);
+        }
+      } else {
+        // <SEQ=SEG.ACK><CTL=RST>
+        return respond_with_reset(th);
+      }
+    }
+    // Adopt the peer's advertised window and (re)arm or stop the persist
+    // timer depending on whether that window is zero.
+    auto update_window = [this, th, seg_seq, seg_ack] {
+      ldout(_tcp.cct, 20) << __func__ << " window update seg_seq=" << seg_seq
+                          << " seg_ack=" << seg_ack << " old window=" << th->window
+                          << " new window=" << int(_snd.window_scale) << dendl;
+      _snd.window = th->window << _snd.window_scale;
+      _snd.wl1 = seg_seq;
+      _snd.wl2 = seg_ack;
+      if (_snd.window == 0) {
+        _persist_time_out = _rto;
+        start_persist_timer();
+      } else {
+        stop_persist_timer();
+      }
+    };
+    // ESTABLISHED STATE or
+    // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state.
+    if (in_state(ESTABLISHED | CLOSE_WAIT)) {
+      // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK.
+      if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) {
+        // Remote ACKed data we sent
+        auto acked_bytes = data_segment_acked(seg_ack);
+
+        // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated.
+        if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) {
+          update_window();
+        }
+
+        // some data is acked, try send more data
+        do_output_data = true;
+
+        auto set_retransmit_timer = [this] {
+          if (_snd.data.empty()) {
+            // All outstanding segments are acked, turn off the timer.
+            stop_retransmit_timer();
+            // Signal the waiter of this event
+            signal_all_data_acked();
+          } else {
+            // Restart the timer because new data is acked.
+            start_retransmit_timer();
+          }
+        };
+
+        if (_snd.dupacks >= 3) {
+          // We are in fast retransmit / fast recovery phase
+          uint32_t smss = _snd.mss;
+          if (seg_ack > _snd.recover) {
+            ldout(_tcp.cct, 20) << __func__ << " ack: full_ack" << dendl;
+            // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS)
+            _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss);
+            // Exit the fast recovery procedure
+            exit_fast_recovery();
+            set_retransmit_timer();
+          } else {
+            ldout(_tcp.cct, 20) << __func__ << " ack: partial_ack" << dendl;
+            // Retransmit the first unacknowledged segment
+            fast_retransmit();
+            // Deflate the congestion window by the amount of new data
+            // acknowledged by the Cumulative Acknowledgment field
+            _snd.cwnd -= acked_bytes;
+            // If the partial ACK acknowledges at least one SMSS of new
+            // data, then add back SMSS bytes to the congestion window
+            if (acked_bytes >= smss) {
+              _snd.cwnd += smss;
+            }
+            // Send a new segment if permitted by the new value of
+            // cwnd.  Do not exit the fast recovery procedure For
+            // the first partial ACK that arrives during fast
+            // recovery, also reset the retransmit timer.
+            if (++_snd.partial_ack == 1) {
+              start_retransmit_timer();
+            }
+          }
+        } else {
+          // RFC5681: The fast retransmit algorithm uses the arrival
+          // of 3 duplicate ACKs (as defined in section 2, without
+          // any intervening ACKs which move SND.UNA) as an
+          // indication that a segment has been lost.
+          //
+          // So, here we reset dupacks to zero because this ACK moves
+          // SND.UNA.
+          exit_fast_recovery();
+          set_retransmit_timer();
+        }
+      } else if (!_snd.data.empty() && seg_len == 0 &&
+                 th->f_fin == 0 && th->f_syn == 0 &&
+                 th->ack == _snd.unacknowledged &&
+                 uint32_t(th->window << _snd.window_scale) == _snd.window) {
+        // Note:
+        // RFC793 states:
+        // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored
+        // RFC5681 states:
+        // The TCP sender SHOULD use the "fast retransmit" algorithm to detect
+        // and repair loss, based on incoming duplicate ACKs.
+        // Here, We follow RFC5681.
+        _snd.dupacks++;
+        uint32_t smss = _snd.mss;
+        // 3 duplicated ACKs trigger a fast retransmit
+        if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+          // RFC5681 Step 3.1
+          // Send cwnd + 2 * smss per RFC3042
+          do_output_data = true;
+        } else if (_snd.dupacks == 3) {
+          // RFC6582 Step 3.2
+          if (seg_ack - 1 > _snd.recover) {
+            _snd.recover = _snd.next - 1;
+            // RFC5681 Step 3.2
+            _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss);
+            fast_retransmit();
+          } else {
+            // Do not enter fast retransmit and do not reset ssthresh
+          }
+          // RFC5681 Step 3.3
+          _snd.cwnd = _snd.ssthresh + 3 * smss;
+        } else if (_snd.dupacks > 3) {
+          // RFC5681 Step 3.4
+          _snd.cwnd += smss;
+          // RFC5681 Step 3.5
+          do_output_data = true;
+        }
+      } else if (seg_ack > _snd.next) {
+        // If the ACK acks something not yet sent (SEG.ACK > SND.NXT)
+        // then send an ACK, drop the segment, and return
+        return output();
+      } else if (_snd.window == 0 && th->window > 0) {
+        update_window();
+        do_output_data = true;
+      }
+    }
+    // FIN_WAIT_1 STATE
+    if (in_state(FIN_WAIT_1)) {
+      // In addition to the processing for the ESTABLISHED state, if
+      // our FIN is now acknowledged then enter FIN-WAIT-2 and continue
+      // processing in that state.
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl;
+        _state = FIN_WAIT_2;
+        do_local_fin_acked();
+      }
+    }
+    // FIN_WAIT_2 STATE
+    if (in_state(FIN_WAIT_2)) {
+      // In addition to the processing for the ESTABLISHED state, if
+      // the retransmission queue is empty, the user's CLOSE can be
+      // acknowledged ("ok") but do not delete the TCB.
+      // TODO
+    }
+    // CLOSING STATE
+    if (in_state(CLOSING)) {
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: CLOSING -> TIME_WAIT" << dendl;
+        do_local_fin_acked();
+        return do_time_wait();
+      } else {
+        return;
+      }
+    }
+    // LAST_ACK STATE
+    if (in_state(LAST_ACK)) {
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: LAST_ACK -> CLOSED" << dendl;
+        do_local_fin_acked();
+        return do_closed();
+      }
+    }
+    // TIME_WAIT STATE
+    if (in_state(TIME_WAIT)) {
+      // The only thing that can arrive in this state is a
+      // retransmission of the remote FIN. Acknowledge it, and restart
+      // the 2 MSL timeout.
+      // TODO
+    }
+  }
+
+  // 4.6 sixth, check the URG bit
+  if (th->f_urg) {
+    // TODO
+  }
+
+  // 4.7 seventh, process the segment text
+  if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) {
+    if (p.len()) {
+      // Once the TCP takes responsibility for the data it advances
+      // RCV.NXT over the data accepted, and adjusts RCV.WND as
+      // appropriate to the current buffer availability.  The total of
+      // RCV.NXT and RCV.WND should not be reduced.
+      _rcv.data.push_back(std::move(p));
+      _rcv.next += seg_len;
+      auto merged = merge_out_of_order();
+      signal_data_received();
+      // Send an acknowledgment of the form:
+      // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+      // This acknowledgment should be piggybacked on a segment being
+      // transmitted if possible without incurring undue delay.
+      if (merged) {
+        // TCP receiver SHOULD send an immediate ACK when the
+        // incoming segment fills in all or part of a gap in the
+        // sequence space.
+        do_output = true;
+      } else {
+        do_output = should_send_ack(seg_len);
+      }
+      ldout(_tcp.cct, 20) << __func__ << " merged=" << merged << " do_output=" << do_output << dendl;
+    }
+  } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) {
+    // This should not occur, since a FIN has been received from the
+    // remote side. Ignore the segment text.
+    return;
+  }
+
+  // 4.8 eighth, check the FIN bit
+  if (th->f_fin) {
+    if (in_state(CLOSED | LISTEN | SYN_SENT)) {
+      // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT
+      // since the SEG.SEQ cannot be validated; drop the segment and return.
+      return;
+    }
+    auto fin_seq = seg_seq + seg_len;
+    if (fin_seq == _rcv.next) {
+      _rcv.next = fin_seq + 1;
+
+      // If this <FIN> packet contains data as well, we can ACK both data
+      // and <FIN> in a single packet, so cancel the previous ACK.
+      clear_delayed_ack();
+      do_output = false;
+      // Send ACK for the FIN!
+      output();
+      signal_data_received();
+      _errno = 0;
+
+      if (in_state(SYN_RECEIVED | ESTABLISHED)) {
+        ldout(_tcp.cct, 20) << __func__ << " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl;
+        _state = CLOSE_WAIT;
+        // EOF
+      }
+      if (in_state(FIN_WAIT_1)) {
+        // If our FIN has been ACKed (perhaps in this segment), then
+        // enter TIME-WAIT, start the time-wait timer, turn off the other
+        // timers; otherwise enter the CLOSING state.
+        // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2
+        // not FIN_WAIT_1 if we reach here.
+        ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_1 -> CLOSING" << dendl;
+        _state = CLOSING;
+      }
+      if (in_state(FIN_WAIT_2)) {
+        ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl;
+        return do_time_wait();
+      }
+    }
+  }
+  if (do_output || (do_output_data && can_send())) {
+    // Since we will do output, we can cancel the scheduled delayed ACK.
+    clear_delayed_ack();
+    output();
+  }
+}
+
+// Active open: choose the initial send sequence number, set up the local
+// receive parameters that go into the SYN, and enter SYN-SENT through
+// do_syn_sent().
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::connect()
+{
+  ldout(_tcp.cct, 20) << __func__ << dendl;
+  // An initial send sequence number (ISS) is selected and a SYN segment of
+  // the form <SEQ=ISS><CTL=SYN> is sent; SND.UNA becomes ISS and SND.NXT
+  // becomes ISS+1.
+  do_setup_isn();
+
+  // Window scale factor applied to our receive window
+  _option._local_win_scale = 7;
+  _rcv.window_scale = _option._local_win_scale;
+  // Largest segment we are able to receive
+  _option._local_mss = local_mss();
+  _rcv.mss = _option._local_mss;
+  // Linux's default window size
+  _rcv.window = 29200 << _rcv.window_scale;
+
+  do_syn_sent();
+}
+
+// Final step of closing the local side: release the "all data acked" event
+// fd, mark the send side closed, advance the state machine and make sure a
+// FIN goes out, then unregister this TCB's fd from the event center.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::close_final_cleanup()
+{
+  auto& acked_fd = _snd._all_data_acked_fd;
+  if (acked_fd >= 0) {
+    center->delete_file_event(acked_fd, EVENT_READABLE);
+    _tcp.manager.close(acked_fd);
+    acked_fd = -1;
+  }
+
+  _snd.closed = true;
+  signal_data_received();
+  ldout(_tcp.cct, 20) << __func__ << " unsent_len=" << _snd.unsent_len << dendl;
+
+  // Only these two states change here; any other state is left as it is.
+  if (in_state(ESTABLISHED)) {
+    ldout(_tcp.cct, 20) << __func__ << " ESTABLISHED -> FIN_WAIT_1" << dendl;
+    _state = FIN_WAIT_1;
+  } else if (in_state(CLOSE_WAIT)) {
+    ldout(_tcp.cct, 20) << __func__ << " CLOSE_WAIT -> LAST_ACK" << dendl;
+    _state = LAST_ACK;
+  }
+
+  // Send <FIN> to remote.
+  // output_one() is called first so that a packet carrying the FIN is
+  // really generated: with output() alone and a non-empty _packetq,
+  // tcp::tcb::get_packet() would not produce it.
+  output_one();
+  output();
+  center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);
+}
+
+// Retransmission handler: resend a pending SYN or FIN and/or the oldest
+// unacknowledged data segment, backing the RTO off exponentially each time.
+// Once a SYN, FIN or data segment has been retransmitted _max_nr_retransmit
+// times, the connection is cleaned up with _errno set to -ETIMEDOUT.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::retransmit()
+{
+  // Send, double the RTO (capped at _rto_max) and re-arm the timer.
+  // NOTE(review): if a FIN is pending while unacked data remains, this
+  // helper runs twice in one pass and the RTO is doubled twice; confirm
+  // that this is intended.
+  auto output_update_rto = [this] {
+    output();
+    // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off
+    this->_rto = std::min(this->_rto * 2, this->_rto_max);
+    start_retransmit_timer();
+  };
+
+  // Retransmit SYN
+  if (syn_needs_on()) {
+    if (_snd.syn_retransmit++ < _max_nr_retransmit) {
+      output_update_rto();
+    } else {
+      ldout(_tcp.cct, 5) << __func__ << " syn retransmit exceed max "
+                         << _max_nr_retransmit << dendl;
+      // Same error as the FIN and data paths below report.
+      _errno = -ETIMEDOUT;
+      cleanup();
+      return;
+    }
+  }
+
+  // Retransmit FIN
+  if (fin_needs_on()) {
+    if (_snd.fin_retransmit++ < _max_nr_retransmit) {
+      output_update_rto();
+    } else {
+      ldout(_tcp.cct, 5) << __func__ << " fin retransmit exceed max "
+                         << _max_nr_retransmit << dendl;
+      _errno = -ETIMEDOUT;
+      cleanup();
+      return;
+    }
+  }
+
+  // Retransmit Data
+  if (_snd.data.empty()) {
+    return;
+  }
+
+  // If there are unacked data, retransmit the earliest segment
+  auto& unacked_seg = _snd.data.front();
+
+  // According to RFC5681
+  // Update ssthresh only for the first retransmit
+  uint32_t smss = _snd.mss;
+  if (unacked_seg.nr_transmits == 0) {
+    _snd.ssthresh = std::max(flight_size() / 2, 2 * smss);
+  }
+  // RFC6582 Step 4
+  _snd.recover = _snd.next - 1;
+  // Start the slow start process
+  _snd.cwnd = smss;
+  // End fast recovery
+  exit_fast_recovery();
+
+  ldout(_tcp.cct, 20) << __func__ << " unack data size " << _snd.data.size()
+                      << " nr=" << unacked_seg.nr_transmits << dendl;
+  if (unacked_seg.nr_transmits < _max_nr_retransmit) {
+    unacked_seg.nr_transmits++;
+  } else {
+    // Delete connection when max num of retransmission is reached
+    ldout(_tcp.cct, 5) << __func__ << " seg retransmit exceed max "
+                       << _max_nr_retransmit << dendl;
+    _errno = -ETIMEDOUT;
+    cleanup();
+    return;
+  }
+  retransmit_one();
+
+  output_update_rto();
+}
+
+// Persist timer handler: probe the peer's window, then back the persist
+// timeout off exponentially and re-arm the timer.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::persist()
+{
+  ldout(_tcp.cct, 20) << __func__ << " persist timer fired" << dendl;
+
+  // Send 1 byte packet to probe peer's window size; the flag is only raised
+  // for the duration of this one output_one() call.
+  _snd.window_probe = true;
+  output_one();
+  _snd.window_probe = false;
+
+  output();
+
+  // Binary exponential back-off per RFC1122, capped at _rto_max
+  const auto doubled = _persist_time_out * 2;
+  _persist_time_out = std::min(doubled, _rto_max);
+  start_persist_timer();
+}
diff --git a/src/msg/async/dpdk/TCP.h b/src/msg/async/dpdk/TCP.h
new file mode 100644
index 000000000..a0104fb44
--- /dev/null
+++ b/src/msg/async/dpdk/TCP.h
@@ -0,0 +1,1506 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_DPDK_TCP_H_
+#define CEPH_DPDK_TCP_H_
+
+#include <unordered_map>
+#include <map>
+#include <queue>
+#include <functional>
+#include <deque>
+#include <chrono>
+#include <stdexcept>
+#include <system_error>
+
+#include "msg/async/dpdk/EventDPDK.h"
+
+#include "include/utime.h"
+#include "common/Throttle.h"
+#include "common/ceph_time.h"
+#include "common/ceph_crypto.h"
+#include "msg/async/Event.h"
+#include "IPChecksum.h"
+#include "IP.h"
+#include "const.h"
+#include "byteorder.h"
+#include "shared_ptr.h"
+#include "PacketUtil.h"
+
+#include "include/random.h"
+
+struct tcp_hdr;
+
+// TCP connection states. Every state occupies its own bit so that several
+// states can be OR-ed into one mask and tested in a single operation
+// (see operator| below and tcb::in_state()).
+enum class tcp_state : uint16_t {
+  CLOSED          = (1 << 0),
+  LISTEN          = (1 << 1),
+  SYN_SENT        = (1 << 2),
+  SYN_RECEIVED    = (1 << 3),
+  ESTABLISHED     = (1 << 4),
+  FIN_WAIT_1      = (1 << 5),
+  FIN_WAIT_2      = (1 << 6),
+  CLOSE_WAIT      = (1 << 7),
+  CLOSING         = (1 << 8),
+  LAST_ACK        = (1 << 9),
+  TIME_WAIT       = (1 << 10)
+};
+
+// Combine two state values into a single bit mask.
+inline tcp_state operator|(tcp_state s1, tcp_state s2) {
+  const auto lhs_bits = static_cast<uint16_t>(s1);
+  const auto rhs_bits = static_cast<uint16_t>(s2);
+  return static_cast<tcp_state>(lhs_bits | rhs_bits);
+}
+
+// Stream the symbolic name of a single TCP state; any value that is not
+// exactly one of the enumerators (e.g. a mask) prints as "UNKNOWN".
+inline std::ostream & operator<<(std::ostream & str, const tcp_state& s) {
+  const char* label = "UNKNOWN";
+  switch (s) {
+    case tcp_state::CLOSED:       label = "CLOSED"; break;
+    case tcp_state::LISTEN:       label = "LISTEN"; break;
+    case tcp_state::SYN_SENT:     label = "SYN_SENT"; break;
+    case tcp_state::SYN_RECEIVED: label = "SYN_RECEIVED"; break;
+    case tcp_state::ESTABLISHED:  label = "ESTABLISHED"; break;
+    case tcp_state::FIN_WAIT_1:   label = "FIN_WAIT_1"; break;
+    case tcp_state::FIN_WAIT_2:   label = "FIN_WAIT_2"; break;
+    case tcp_state::CLOSE_WAIT:   label = "CLOSE_WAIT"; break;
+    case tcp_state::CLOSING:      label = "CLOSING"; break;
+    case tcp_state::LAST_ACK:     label = "LAST_ACK"; break;
+    case tcp_state::TIME_WAIT:    label = "TIME_WAIT"; break;
+    default: break;
+  }
+  return str << label;
+}
+
+// TCP option wire formats together with the option state negotiated for one
+// connection.
+struct tcp_option {
+  // The kind and len field are fixed and defined in TCP protocol
+  enum class option_kind: uint8_t { mss = 2, win_scale = 3, sack = 4, timestamps = 8,  nop = 1, eol = 0 };
+  enum class option_len:  uint8_t { mss = 4, win_scale = 3, sack = 2, timestamps = 10, nop = 1, eol = 1 };
+  // Maximum-segment-size option.
+  struct mss {
+    option_kind kind = option_kind::mss;
+    option_len len = option_len::mss;
+    uint16_t mss;
+    // Return a copy whose mss value has been passed through ::hton().
+    struct mss hton() {
+      struct mss m = *this;
+      m.mss = ::hton(m.mss);
+      return m;
+    }
+  } __attribute__((packed));
+  // Window-scale option carrying the shift count.
+  struct win_scale {
+    option_kind kind = option_kind::win_scale;
+    option_len len = option_len::win_scale;
+    uint8_t shift;
+  } __attribute__((packed));
+  // SACK option header (kind and length only).
+  struct sack {
+    option_kind kind = option_kind::sack;
+    option_len len = option_len::sack;
+  } __attribute__((packed));
+  // Timestamps option with its two 32-bit values.
+  struct timestamps {
+    option_kind kind = option_kind::timestamps;
+    option_len len = option_len::timestamps;
+    uint32_t t1;
+    uint32_t t2;
+  } __attribute__((packed));
+  // Single-byte no-operation option.
+  struct nop {
+    option_kind kind = option_kind::nop;
+  } __attribute__((packed));
+  // Single-byte end-of-option-list marker.
+  struct eol {
+    option_kind kind = option_kind::eol;
+  } __attribute__((packed));
+  // NOTE(review): presumably the byte alignment the option area is padded to
+  // -- confirm against fill()/get_size() in TCP.cc.
+  static const uint8_t align = 4;
+
+  // Implemented out of line (not visible in this header).
+  void parse(uint8_t* beg, uint8_t* end);
+  uint8_t fill(tcp_hdr* th, uint8_t option_size);
+  uint8_t get_size(bool syn_on, bool ack_on);
+
+  // For option negotiation
+  bool _mss_received = false;
+  bool _win_scale_received = false;
+  bool _timestamps_received = false;
+  bool _sack_received = false;
+
+  // Option data
+  uint16_t _remote_mss = 536;
+  // No default here; tcb::init_from_options() assigns it from local_mss().
+  uint16_t _local_mss;
+  uint8_t _remote_win_scale = 0;
+  uint8_t _local_win_scale = 0;
+};
+// Advance a byte pointer by the numeric value of an option_len.
+inline uint8_t*& operator+=(uint8_t*& x, tcp_option::option_len len) {
+  x += static_cast<uint8_t>(len);
+  return x;
+}
+// Add the numeric value of an option_len to a running byte count.
+inline uint8_t& operator+=(uint8_t& x, tcp_option::option_len len) {
+  x += static_cast<uint8_t>(len);
+  return x;
+}
+
+// Strongly typed wrapper around a raw 32-bit TCP sequence number, so that
+// sequence arithmetic and comparisons go through the dedicated operators
+// defined for this type rather than plain integer rules.
+struct tcp_sequence {
+  uint32_t raw;
+};
+
+// Convert a sequence number with ::ntoh() applied to its raw value.
+// Declared inline because it is defined in a header: without it every
+// translation unit including this file would emit its own external definition
+// and the link would fail with duplicate symbols.
+inline tcp_sequence ntoh(tcp_sequence ts) {
+  return tcp_sequence { ::ntoh(ts.raw) };
+}
+
+// Convert a sequence number with ::hton() applied to its raw value.
+// Inline for the same one-definition-rule reason as ntoh() above.
+inline tcp_sequence hton(tcp_sequence ts) {
+  return tcp_sequence { ::hton(ts.raw) };
+}
+
+// Print the raw 32-bit value of a sequence number.
+inline std::ostream& operator<<(std::ostream& os, const tcp_sequence& s) {
+  os << s.raw;
+  return os;
+}
+
+// Sequence-number helpers. Offsets are applied to the unsigned raw value, so
+// they wrap modulo 2^32. The difference of two sequence numbers is returned as
+// a signed 32-bit value, and all ordering operators are derived from the sign
+// of that difference, which keeps comparisons meaningful across wrap-around.
+inline tcp_sequence make_seq(uint32_t raw) { return tcp_sequence{raw}; }
+inline tcp_sequence& operator+=(tcp_sequence& s, int32_t n) { s.raw += n; return s; }
+inline tcp_sequence& operator-=(tcp_sequence& s, int32_t n) { s.raw -= n; return s; }
+// s is taken by value, so these return a modified copy.
+inline tcp_sequence operator+(tcp_sequence s, int32_t n) { return s += n; }
+inline tcp_sequence operator-(tcp_sequence s, int32_t n) { return s -= n; }
+// Signed distance from q to s.
+inline int32_t operator-(tcp_sequence s, tcp_sequence q) { return s.raw - q.raw; }
+inline bool operator==(tcp_sequence s, tcp_sequence q)  { return s.raw == q.raw; }
+inline bool operator!=(tcp_sequence s, tcp_sequence q) { return !(s == q); }
+// s precedes q when the signed distance s - q is negative.
+inline bool operator<(tcp_sequence s, tcp_sequence q) { return s - q < 0; }
+inline bool operator>(tcp_sequence s, tcp_sequence q) { return q < s; }
+inline bool operator<=(tcp_sequence s, tcp_sequence q) { return !(s > q); }
+inline bool operator>=(tcp_sequence s, tcp_sequence q) { return !(s < q); }
+
+// Fixed part of a TCP header, declared packed so it can be overlaid on packet
+// bytes. The single-byte bit-fields are not touched by hton()/ntoh().
+// NOTE(review): the bit-field declaration order (rsvd1 before data_offset,
+// f_fin first among the flags) presumably relies on a little-endian bit-field
+// ABI -- confirm for the supported targets.
+struct tcp_hdr {
+  uint16_t src_port;
+  uint16_t dst_port;
+  tcp_sequence seq;
+  tcp_sequence ack;
+  uint8_t rsvd1 : 4;
+  // Header length in 32-bit words (tcp::received() multiplies it by 4).
+  uint8_t data_offset : 4;
+  uint8_t f_fin : 1;
+  uint8_t f_syn : 1;
+  uint8_t f_rst : 1;
+  uint8_t f_psh : 1;
+  uint8_t f_ack : 1;
+  uint8_t f_urg : 1;
+  uint8_t rsvd2 : 2;
+  uint16_t window;
+  uint16_t checksum;
+  uint16_t urgent;
+
+  // Return a copy with every multi-byte field passed through ::hton().
+  tcp_hdr hton() {
+    tcp_hdr hdr = *this;
+    hdr.src_port = ::hton(src_port);
+    hdr.dst_port = ::hton(dst_port);
+    hdr.seq = ::hton(seq);
+    hdr.ack = ::hton(ack);
+    hdr.window = ::hton(window);
+    hdr.checksum = ::hton(checksum);
+    hdr.urgent = ::hton(urgent);
+    return hdr;
+  }
+
+  // Return a copy with every multi-byte field passed through ::ntoh().
+  tcp_hdr ntoh() {
+    tcp_hdr hdr = *this;
+    hdr.src_port = ::ntoh(src_port);
+    hdr.dst_port = ::ntoh(dst_port);
+    hdr.seq = ::ntoh(seq);
+    hdr.ack = ::ntoh(ack);
+    hdr.window = ::ntoh(window);
+    hdr.checksum = ::ntoh(checksum);
+    hdr.urgent = ::ntoh(urgent);
+    return hdr;
+  }
+} __attribute__((packed));
+
+// Empty tag type used as the second template argument of packet_merger.
+struct tcp_tag {};
+// packet_merger instantiated for TCP sequence numbers; it is the type of
+// tcb::receive::out_of_order.
+using tcp_packet_merger = packet_merger<tcp_sequence, tcp_tag>;
+
+template <typename InetTraits>
+class tcp {
+ public:
+  using ipaddr = typename InetTraits::address_type;
+  using inet_type = typename InetTraits::inet_type;
+  using connid = l4connid<InetTraits>;
+  using connid_hash = typename connid::connid_hash;
+  class connection;
+  class listener;
+ private:
+  class tcb;
+
+  // Event callback for the delayed-ACK timer: forgets the timer id, resets the
+  // full-segment counter and runs the connection's output path.
+  class C_handle_delayed_ack : public EventCallback {
+   public:
+    C_handle_delayed_ack(tcb *t) : _owner(t) {}
+
+    void do_request(uint64_t /* id */) override {
+      tcb &conn = *_owner;
+      conn._delayed_ack_fd.destroy();
+      conn._nr_full_seg_received = 0;
+      conn.output();
+    }
+
+   private:
+    tcb *_owner;
+  };
+
+  // Event callback for the retransmission timer: forgets the timer id and
+  // runs tcb::retransmit().
+  class C_handle_retransmit : public EventCallback {
+   public:
+    C_handle_retransmit(tcb *t) : _owner(t) {}
+
+    void do_request(uint64_t /* id */) override {
+      tcb &conn = *_owner;
+      conn.retransmit_fd.destroy();
+      conn.retransmit();
+    }
+
+   private:
+    tcb *_owner;
+  };
+
+  // Event callback for the persist timer: forgets the timer id and runs
+  // tcb::persist().
+  class C_handle_persist : public EventCallback {
+   public:
+    C_handle_persist(tcb *t) : _owner(t) {}
+
+    void do_request(uint64_t /* id */) override {
+      tcb &conn = *_owner;
+      conn.persist_fd.destroy();
+      conn.persist();
+    }
+
+   private:
+    tcb *_owner;
+  };
+
+  // Event callback that finishes closing a connection by calling
+  // tcb::close_final_cleanup().
+  class C_all_data_acked : public EventCallback {
+   public:
+    C_all_data_acked(tcb *t) : _owner(t) {}
+
+    void do_request(uint64_t /* fd_or_id */) override {
+      _owner->close_final_cleanup();
+    }
+
+   private:
+    tcb *_owner;
+  };
+
+  // Self-deleting event callback that holds a shared reference to a tcb.
+  // The reference is dropped when the callback deletes itself in do_request().
+  class C_actual_remove_tcb : public EventCallback {
+   public:
+    C_actual_remove_tcb(tcb *t) : _keepalive(t->shared_from_this()) {}
+
+    void do_request(uint64_t /* id */) override {
+      delete this;
+    }
+
+   private:
+    lw_shared_ptr<tcb> _keepalive;
+  };
+
+  class tcb : public enable_lw_shared_from_this<tcb> {
+    using clock_type = ceph::coarse_real_clock;
+    static constexpr tcp_state CLOSED         = tcp_state::CLOSED;
+    static constexpr tcp_state LISTEN         = tcp_state::LISTEN;
+    static constexpr tcp_state SYN_SENT       = tcp_state::SYN_SENT;
+    static constexpr tcp_state SYN_RECEIVED   = tcp_state::SYN_RECEIVED;
+    static constexpr tcp_state ESTABLISHED    = tcp_state::ESTABLISHED;
+    static constexpr tcp_state FIN_WAIT_1     = tcp_state::FIN_WAIT_1;
+    static constexpr tcp_state FIN_WAIT_2     = tcp_state::FIN_WAIT_2;
+    static constexpr tcp_state CLOSE_WAIT     = tcp_state::CLOSE_WAIT;
+    static constexpr tcp_state CLOSING        = tcp_state::CLOSING;
+    static constexpr tcp_state LAST_ACK       = tcp_state::LAST_ACK;
+    static constexpr tcp_state TIME_WAIT      = tcp_state::TIME_WAIT;
+    tcp_state _state = CLOSED;
+    tcp& _tcp;
+    UserspaceEventManager &manager;
+    connection* _conn = nullptr;
+    bool _connect_done = false;
+    ipaddr _local_ip;
+    ipaddr _foreign_ip;
+    uint16_t _local_port;
+    uint16_t _foreign_port;
+    // One entry of the _snd.data queue, i.e. a segment that has not yet been
+    // fully acknowledged.
+    struct unacked_segment {
+      // Packet bytes; trimmed from the front on a partial ACK.
+      Packet p;
+      // Amount returned to user_queue_space once the segment is fully acked.
+      uint16_t data_len;
+      // Retransmission count; 0 means the segment was never retransmitted.
+      unsigned nr_transmits;
+      // Passed to update_rto() when a never-retransmitted segment is acked.
+      clock_type::time_point tx_time;
+    };
+    // Send-side state of the connection.
+    struct send {
+      // Advanced by data_segment_acked() as ACKs arrive.
+      tcp_sequence unacknowledged;
+      tcp_sequence next;
+      // Peer's advertised window, already shifted by window_scale.
+      uint32_t window;
+      uint8_t window_scale;
+      // Maximum segment size the remote can receive.
+      uint16_t mss;
+      tcp_sequence urgent;
+      tcp_sequence wl1;
+      tcp_sequence wl2;
+      // Initial send sequence number (set in do_setup_isn()).
+      tcp_sequence initial;
+      // Segments not yet fully acknowledged, oldest first.
+      std::deque<unacked_segment> data;
+      std::deque<Packet> unsent;
+      uint32_t unsent_len = 0;
+      uint32_t queued_len = 0;
+      bool closed = false;
+      // Event fd notified once all data has been acked (-1 when unused)
+      int _all_data_acked_fd = -1;
+      // Limits the amount of data queued into the send queue
+      Throttle user_queue_space;
+      // Round-trip time variation
+      std::chrono::microseconds rttvar;
+      // Smoothed round-trip time
+      std::chrono::microseconds srtt;
+      bool first_rto_sample = true;
+      // Recorded when entering SYN_SENT / SYN_RECEIVED.
+      clock_type::time_point syn_tx_time;
+      // Congestion window
+      uint32_t cwnd;
+      // Slow start threshold
+      uint32_t ssthresh;
+      // Duplicated ACKs
+      uint16_t dupacks = 0;
+      unsigned syn_retransmit = 0;
+      unsigned fin_retransmit = 0;
+      // Accumulated by can_send() while dupacks is 1 or 2.
+      uint32_t limited_transfer = 0;
+      uint32_t partial_ack = 0;
+      tcp_sequence recover;
+      // While set, can_send() reports exactly one byte.
+      bool window_probe = false;
+      send(CephContext *c): user_queue_space(c, "DPDK::tcp::tcb::user_queue_space", 81920) {}
+    } _snd;
+    // Receive-side state of the connection.
+    struct receive {
+      // Next expected sequence number (see segment_acceptable()).
+      tcp_sequence next;
+      // Local receive window, already shifted by window_scale.
+      uint32_t window;
+      uint8_t window_scale;
+      // Maximum segment size the local side can receive.
+      uint16_t mss;
+      tcp_sequence urgent;
+      tcp_sequence initial;
+      std::deque<Packet> data;
+      tcp_packet_merger out_of_order;
+    } _rcv;
+    EventCenter *center;
+    int fd;
+    // positive means no errno, 0 means eof, negative means error
+    int16_t _errno = 1;
+    tcp_option _option;
+    EventCallbackRef delayed_ack_event;
+    Tub<uint64_t> _delayed_ack_fd;
+    // Retransmission timeout
+    std::chrono::microseconds _rto{1000*1000};
+    std::chrono::microseconds _persist_time_out{1000*1000};
+    static constexpr std::chrono::microseconds _rto_min{1000*1000};
+    static constexpr std::chrono::microseconds _rto_max{60000*1000};
+    // Clock granularity
+    static constexpr std::chrono::microseconds _rto_clk_granularity{1000};
+    static constexpr uint16_t _max_nr_retransmit{5};
+    EventCallbackRef retransmit_event;
+    Tub<uint64_t> retransmit_fd;
+    EventCallbackRef persist_event;
+    EventCallbackRef all_data_ack_event;
+    Tub<uint64_t> persist_fd;
+    uint16_t _nr_full_seg_received = 0;
+    // Random key material used when generating initial sequence numbers.
+    struct isn_secret {
+      // 512-bit secret key (16 x 32 bits)
+      uint32_t key[16];
+      isn_secret () {
+        constexpr uint32_t upper = std::numeric_limits<uint32_t>::max();
+        for (size_t i = 0; i < sizeof(key) / sizeof(key[0]); ++i) {
+          key[i] = ceph::util::generate_random_number<uint32_t>(0, upper);
+        }
+      }
+    };
+    static isn_secret _isn_secret;
+    tcp_sequence get_isn();
+    circular_buffer<typename InetTraits::l4packet> _packetq;
+    bool _poll_active = false;
+   public:
+    // callback
+    void close_final_cleanup();
+    ostream& _prefix(std::ostream *_dout);
+
+   public:
+    tcb(tcp& t, connid id);
+    ~tcb();
+    void input_handle_listen_state(tcp_hdr* th, Packet p);
+    void input_handle_syn_sent_state(tcp_hdr* th, Packet p);
+    void input_handle_other_state(tcp_hdr* th, Packet p);
+    void output_one(bool data_retransmit = false);
+    bool is_all_data_acked();
+    int send(Packet p);
+    void connect();
+    Tub<Packet> read();
+    void close();
+    // Erase this connection's entry from the owning tcp's connection table,
+    // keyed by the local/foreign address and port four-tuple.
+    void remove_from_tcbs() {
+      _tcp._tcbs.erase(connid{_local_ip, _foreign_ip, _local_port, _foreign_port});
+    }
+    Tub<typename InetTraits::l4packet> get_packet();
+    // Ask the stack to poll this tcb for packets to transmit.
+    // Does nothing while a previous request is still pending (_poll_active).
+    // Otherwise it requests resolution of the peer's L2 address and, from the
+    // completion callback:
+    //   r == 0          -> enqueue this tcb for polling via tcp::poll_tcb()
+    //   r == -ETIMEDOUT -> fail the connection, but only while in SYN_SENT
+    //   r == -EBUSY     -> clear _poll_active and retry from the retransmit timer
+    // The callback holds a shared reference, so the tcb stays alive until it runs.
+    void output() {
+      if (!_poll_active) {
+        _poll_active = true;
+
+        auto tcb = this->shared_from_this();
+        _tcp._inet.wait_l2_dst_address(_foreign_ip, Packet(), [tcb] (const ethernet_address &dst, Packet p, int r) {
+          if (r == 0) {
+            tcb->_tcp.poll_tcb(dst, std::move(tcb));
+          } else if (r == -ETIMEDOUT) {
+            // in other states connection should time out
+            if (tcb->in_state(SYN_SENT)) {
+              tcb->_errno = -ETIMEDOUT;
+              tcb->cleanup();
+            }
+          } else if (r == -EBUSY) {
+            // retry later
+            tcb->_poll_active = false;
+            tcb->start_retransmit_timer();
+          }
+        });
+      }
+    }
+
+    // Current error indicator: positive means no error, 0 means EOF, and a
+    // negative value is an error code (e.g. -ETIMEDOUT, -ECONNRESET).
+    int16_t get_errno() const {
+      return _errno;
+    }
+
+    // Mutable reference to the connection's current TCP state.
+    tcp_state& state() {
+      return _state;
+    }
+
+    // Remaining send-queue budget (maximum minus current usage of
+    // user_queue_space); always 0 unless the connection is ESTABLISHED.
+    uint64_t peek_sent_available() {
+      if (in_state(ESTABLISHED)) {
+        auto& budget = _snd.user_queue_space;
+        uint64_t remaining = budget.get_max() - budget.get_current();
+        return remaining;
+      }
+      return 0;
+    }
+
+    int is_connected() const {
+      if (_errno <= 0)
+        return _errno;
+      return _connect_done;
+    }
+
+   private:
+    void respond_with_reset(tcp_hdr* th);
+    bool merge_out_of_order();
+    void insert_out_of_order(tcp_sequence seq, Packet p);
+    void trim_receive_data_after_window();
+    bool should_send_ack(uint16_t seg_len);
+    void clear_delayed_ack();
+    Packet get_transmit_packet();
+    // Emit one segment through output_one() with the data_retransmit flag set.
+    void retransmit_one() {
+      output_one(/* data_retransmit = */ true);
+    }
+    // (Re)arm the retransmission timer with the current _rto; a timer that is
+    // already pending is cancelled first.
+    void start_retransmit_timer() {
+      if (retransmit_fd) {
+        center->delete_time_event(*retransmit_fd);
+      }
+      const auto timer_id = center->create_time_event(_rto.count(), retransmit_event);
+      retransmit_fd.construct(timer_id);
+    }
+    // Cancel the retransmission timer if one is pending.
+    void stop_retransmit_timer() {
+      if (!retransmit_fd) {
+        return;
+      }
+      center->delete_time_event(*retransmit_fd);
+      retransmit_fd.destroy();
+    }
+    // (Re)arm the persist timer with the current _persist_time_out; a timer
+    // that is already pending is cancelled first.
+    void start_persist_timer() {
+      if (persist_fd) {
+        center->delete_time_event(*persist_fd);
+      }
+      const auto timer_id = center->create_time_event(_persist_time_out.count(), persist_event);
+      persist_fd.construct(timer_id);
+    }
+    // Cancel the persist timer if one is pending.
+    void stop_persist_timer() {
+      if (!persist_fd) {
+        return;
+      }
+      center->delete_time_event(*persist_fd);
+      persist_fd.destroy();
+    }
+    void persist();
+    void retransmit();
+    void fast_retransmit();
+    void update_rto(clock_type::time_point tx_time);
+    void update_cwnd(uint32_t acked_bytes);
+    void cleanup();
+    // Number of bytes that may be transmitted right now.
+    // Returns 1 while a window probe is in progress. Otherwise the result is
+    // bounded by the peer's advertised window, the amount of unsent data and
+    // the congestion window, with extra limits depending on the duplicate-ACK
+    // count. Note the side effect: with 1 or 2 duplicate ACKs the returned
+    // amount is added to _snd.limited_transfer.
+    uint32_t can_send() {
+      if (_snd.window_probe) {
+        return 1;
+      }
+      // Can not send more than advertised window allows
+      auto x = std::min(uint32_t(_snd.unacknowledged + _snd.window - _snd.next), _snd.unsent_len);
+      // Can not send more than congestion window allows
+      x = std::min(_snd.cwnd, x);
+      if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+        // RFC5681 Step 3.1
+        // Send cwnd + 2 * smss per RFC3042
+        auto flight = flight_size();
+        auto max = _snd.cwnd + 2 * _snd.mss;
+        // Nothing may be sent once the data in flight exceeds that bound.
+        x = flight <= max ? std::min(x, max - flight) : 0;
+        _snd.limited_transfer += x;
+      } else if (_snd.dupacks >= 3) {
+        // RFC5681 Step 3.5
+        // Sent 1 full-sized segment at most
+        x = std::min(uint32_t(_snd.mss), x);
+      }
+      return x;
+    }
+    // Total packet length of all segments in _snd.data, i.e. data that has
+    // been sent but not yet fully acknowledged.
+    uint32_t flight_size() {
+      uint32_t total = 0;
+      for (unacked_segment& seg : _snd.data) {
+        total += seg.p.len();
+      }
+      return total;
+    }
+    // Largest segment the local side can receive: the interface MTU minus the
+    // minimum TCP and IP header lengths.
+    uint16_t local_mss() {
+      const auto& hw = _tcp.get_hw_features();
+      return hw.mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+    }
+    // Wrap the packet in an l4packet addressed to the peer and append it to
+    // this connection's outgoing packet queue.
+    void queue_packet(Packet p) {
+      typename InetTraits::l4packet wrapped{_foreign_ip, std::move(p)};
+      _packetq.emplace_back(std::move(wrapped));
+    }
+    // Raise EVENT_READABLE on this connection's event fd.
+    void signal_data_received() {
+      manager.notify(fd, EVENT_READABLE);
+    }
+    // Notify the "all data acked" event fd, but only when such an fd is
+    // registered and nothing is left unsent or queued.
+    void signal_all_data_acked() {
+      if (_snd._all_data_acked_fd < 0) {
+        return;
+      }
+      if (_snd.unsent_len != 0 || _snd.queued_len != 0) {
+        return;
+      }
+      manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+    }
+    // Enter SYN_SENT, remember when the SYN was sent and trigger output.
+    void do_syn_sent() {
+      _state = SYN_SENT;
+      _snd.syn_tx_time = clock_type::now();
+      // Send <SYN> to remote
+      output();
+    }
+    // Enter SYN_RECEIVED, remember when the SYN,ACK was sent and trigger output.
+    void do_syn_received() {
+      _state = SYN_RECEIVED;
+      _snd.syn_tx_time = clock_type::now();
+      // Send <SYN,ACK> to remote
+      output();
+    }
+    // Enter ESTABLISHED: feed the handshake timestamp to update_rto(), mark the
+    // connect as done and notify the fd as readable and writable.
+    void do_established() {
+      _state = ESTABLISHED;
+      update_rto(_snd.syn_tx_time);
+      _connect_done = true;
+      manager.notify(fd, EVENT_READABLE|EVENT_WRITABLE);
+    }
+    // Handle a connection reset: move to CLOSED, clean up, report -ECONNRESET
+    // and notify both the connection fd and, if registered, the all-data-acked
+    // fd. _errno is assigned after cleanup() has run.
+    void do_reset() {
+      _state = CLOSED;
+      // Free packets to be sent which are waiting for user_queue_space
+      _snd.user_queue_space.reset();
+      cleanup();
+      _errno = -ECONNRESET;
+      manager.notify(fd, EVENT_READABLE);
+
+      if (_snd._all_data_acked_fd >= 0)
+        manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+    }
+    // Enter TIME_WAIT and clean up immediately (no TIME_WAIT timer yet).
+    void do_time_wait() {
+      // FIXME: Implement TIME_WAIT state timer
+      _state = TIME_WAIT;
+      cleanup();
+    }
+    // Enter CLOSED and clean up.
+    void do_closed() {
+      _state = CLOSED;
+      cleanup();
+    }
+    // Pick an initial sequence number and initialise the send-side sequence
+    // variables from it; next is one past the initial value.
+    void do_setup_isn() {
+      _snd.initial = get_isn();
+      _snd.unacknowledged = _snd.initial;
+      _snd.next = _snd.initial + 1;
+      _snd.recover = _snd.initial;
+    }
+    // Advance both send sequence variables by one once our FIN has been acked.
+    void do_local_fin_acked() {
+      _snd.unacknowledged += 1;
+      _snd.next += 1;
+    }
+    // True while the handshake is in progress (SYN_SENT or SYN_RECEIVED).
+    bool syn_needs_on() {
+      const tcp_state handshake_states = SYN_SENT | SYN_RECEIVED;
+      return in_state(handshake_states);
+    }
+    // True when the local side has closed, is in a FIN-sending state and has
+    // no unsent or queued data left.
+    bool fin_needs_on() {
+      if (!in_state(FIN_WAIT_1 | CLOSING | LAST_ACK)) {
+        return false;
+      }
+      if (!_snd.closed) {
+        return false;
+      }
+      return _snd.unsent_len == 0 && _snd.queued_len == 0;
+    }
+    // True in every state except CLOSED, LISTEN and SYN_SENT.
+    bool ack_needs_on() {
+      const tcp_state no_ack_states = CLOSED | LISTEN | SYN_SENT;
+      return !in_state(no_ack_states);
+    }
+    // True in the states CLOSING, TIME_WAIT, CLOSE_WAIT, LAST_ACK and CLOSED.
+    bool foreign_will_not_send() {
+      const tcp_state peer_done_states = CLOSING | TIME_WAIT | CLOSE_WAIT | LAST_ACK | CLOSED;
+      return in_state(peer_done_states);
+    }
+    // Test whether the current state is one of the states in the given mask.
+    bool in_state(tcp_state state) {
+      const uint16_t current = uint16_t(_state);
+      const uint16_t mask = uint16_t(state);
+      return (current & mask) != 0;
+    }
+    // Reset the duplicate-ACK, limited-transfer and partial-ACK counters.
+    void exit_fast_recovery() {
+      _snd.dupacks = 0;
+      _snd.limited_transfer = 0;
+      _snd.partial_ack = 0;
+    }
+    uint32_t data_segment_acked(tcp_sequence seg_ack);
+    bool segment_acceptable(tcp_sequence seg_seq, unsigned seg_len);
+    void init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end);
+    friend class connection;
+
+    friend class C_handle_delayed_ack;
+    friend class C_handle_retransmit;
+    friend class C_handle_persist;
+    friend class C_all_data_acked;
+  };
+
+  CephContext *cct;
+  // ipv4_l4<ip_protocol_num::tcp>
+  inet_type& _inet;
+  EventCenter *center;
+  UserspaceEventManager &manager;
+  std::unordered_map<connid, lw_shared_ptr<tcb>, connid_hash> _tcbs;
+  std::unordered_map<uint16_t, listener*> _listening;
+  std::random_device _rd;
+  std::default_random_engine _e;
+  std::uniform_int_distribution<uint16_t> _port_dist{41952, 65535};
+  circular_buffer<std::pair<lw_shared_ptr<tcb>, ethernet_address>> _poll_tcbs;
+  // queue for packets that do not belong to any tcb
+  circular_buffer<ipv4_traits::l4packet> _packetq;
+  Throttle _queue_space;
+  // Limit number of data queued into send queue
+ public:
+  // Move-only user-facing handle to a tcb. The tcb keeps a back pointer
+  // (_conn) to the handle that currently owns it; constructors keep that
+  // pointer up to date and the destructor clears it.
+  class connection {
+    lw_shared_ptr<tcb> _tcb;
+   public:
+    explicit connection(lw_shared_ptr<tcb> tcbp) : _tcb(std::move(tcbp)) { _tcb->_conn = this; }
+    connection(const connection&) = delete;
+    // Takes over x's tcb and re-points the tcb's back pointer at this object.
+    connection(connection&& x) noexcept : _tcb(std::move(x._tcb)) {
+      _tcb->_conn = this;
+    }
+    ~connection();
+    void operator=(const connection&) = delete;
+    // Move assignment implemented as destroy + placement-new move construction.
+    connection& operator=(connection&& x) {
+      if (this != &x) {
+        this->~connection();
+        new (this) connection(std::move(x));
+      }
+      return *this;
+    }
+    // Event fd of the underlying tcb.
+    int fd() const {
+      return _tcb->fd;
+    }
+    int send(Packet p) {
+      return _tcb->send(std::move(p));
+    }
+    Tub<Packet> read() {
+      return _tcb->read();
+    }
+    int16_t get_errno() const {
+      return _tcb->get_errno();
+    }
+    void close_read();
+    void close_write();
+    // Peer address as an AF_INET entity_addr_t. Only the IPv4 address and the
+    // family are filled in here; the port is not set.
+    entity_addr_t remote_addr() const {
+      entity_addr_t addr;
+      auto net_ip = _tcb->_foreign_ip.hton();
+      memcpy((void*)&addr.in4_addr().sin_addr.s_addr,
+             &net_ip, sizeof(addr.in4_addr().sin_addr.s_addr));
+      addr.set_family(AF_INET);
+      return addr;
+    }
+    uint64_t peek_sent_available() {
+      return _tcb->peek_sent_available();
+    }
+    int is_connected() const { return _tcb->is_connected(); }
+  };
+  // Listening endpoint for one local port. Created through tcp::listen();
+  // listen() registers it in the owning tcp's _listening table and allocates
+  // an event fd, and accepted connections are queued in _q until accept()
+  // hands them out.
+  class listener {
+    tcp& _tcp;
+    uint16_t _port;
+    // Event fd; -1 while not listening.
+    int _fd = -1;
+    int16_t _errno;
+    std::queue<connection> _q;
+    size_t _q_max_length;
+
+   private:
+    listener(tcp& t, uint16_t port, size_t queue_length)
+        : _tcp(t), _port(port), _errno(0), _q(), _q_max_length(queue_length) {
+    }
+   public:
+    listener(const listener&) = delete;
+    void operator=(const listener&) = delete;
+    // Move constructor. Carries over the queue limit as well, and strips the
+    // fd from the source so that its destructor does not unregister the port
+    // or close the fd now owned by this object.
+    listener(listener&& x)
+        : _tcp(x._tcp), _port(x._port), _fd(x._fd), _errno(x._errno),
+          _q(std::move(x._q)), _q_max_length(x._q_max_length) {
+      x._fd = -1;
+      if (_fd >= 0)
+        _tcp._listening[_port] = this;
+    }
+    ~listener() {
+      abort_accept();
+    }
+    // Start listening. Returns -EADDRINUSE if the port already has a
+    // listener registered, 0 on success.
+    int listen() {
+      if (_tcp._listening.find(_port) != _tcp._listening.end())
+        return -EADDRINUSE;
+      _tcp._listening.emplace(_port, this);
+      _fd = _tcp.manager.get_eventfd();
+      return 0;
+    }
+    // Pop the oldest queued connection; the result is empty when none is
+    // pending.
+    Tub<connection> accept() {
+      Tub<connection> c;
+      if (!_q.empty()) {
+        c = std::move(_q.front());
+        _q.pop();
+      }
+      return c;
+    }
+    // Drop all queued connections and, if listening, unregister the port and
+    // close the event fd.
+    void abort_accept() {
+      while (!_q.empty())
+        _q.pop();
+      if (_fd >= 0) {
+        _tcp._listening.erase(_port);
+        _tcp.manager.close(_fd);
+        _fd = -1;
+      }
+    }
+    int16_t get_errno() const {
+      return _errno;
+    }
+    // True once the accept queue has reached its configured limit.
+    bool full() const {
+      return _q.size() >= _q_max_length;
+    }
+    int fd() const {
+      return _fd;
+    }
+    friend class tcp;
+  };
+ public:
+  explicit tcp(CephContext *c, inet_type& inet, EventCenter *cen);
+  void received(Packet p, ipaddr from, ipaddr to);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  listener listen(uint16_t port, size_t queue_length = 100);
+  connection connect(const entity_addr_t &addr);
+  // Hardware feature set of the underlying network interface.
+  const hw_features& get_hw_features() const { return _inet._inet.get_hw_features(); }
+  // Queue a tcb, together with the resolved destination L2 address, so that
+  // the packet provider registered in the constructor polls it for packets.
+  void poll_tcb(const ethernet_address &dst, lw_shared_ptr<tcb> tcb) {
+    _poll_tcbs.emplace_back(std::move(tcb), dst);
+  }
+  // Hand a tcb to the listener on the given port as a new pending connection
+  // and signal the listener's fd. Returns false when no listener is
+  // registered for the port or its accept queue is full.
+  bool push_listen_queue(uint16_t port, tcb *t) {
+    auto it = _listening.find(port);
+    if (it == _listening.end()) {
+      return false;
+    }
+    auto* target = it->second;
+    if (target->full()) {
+      return false;
+    }
+    target->_q.push(connection(t->shared_from_this()));
+    manager.notify(target->_fd, EVENT_READABLE);
+    return true;
+  }
+
+ private:
+  void send_packet_without_tcb(ipaddr from, ipaddr to, Packet p);
+  void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip);
+  friend class listener;
+};
+
+// Construct the TCP layer on top of the given inet layer and register the
+// packet provider the inet layer pulls outgoing packets from.
+//
+// The provider serves two sources: _packetq (packets that belong to no tcb)
+// and the tcbs queued in _poll_tcbs. The tcb-less queue is served when no tcb
+// is waiting, or whenever the running count of polled tcbs is a multiple of
+// 128; otherwise queued tcbs are polled in order until one yields a packet.
+template <typename InetTraits>
+tcp<InetTraits>::tcp(CephContext *c, inet_type& inet, EventCenter *cen)
+    : cct(c), _inet(inet), center(cen),
+      manager(static_cast<DPDKDriver*>(cen->get_driver())->manager),
+      _e(_rd()), _queue_space(cct, "DPDK::tcp::queue_space", 81920) {
+  // Free-running counter: unsigned so that wrap-around is well defined
+  // (a signed counter would eventually overflow, which is undefined).
+  unsigned tcb_polled = 0u;
+  _inet.register_packet_provider([this, tcb_polled] () mutable {
+    Tub<typename InetTraits::l4packet> l4p;
+    auto c = _poll_tcbs.size();
+    if (!_packetq.empty() && (!(tcb_polled % 128) || c == 0)) {
+      l4p = std::move(_packetq.front());
+      _packetq.pop_front();
+      // Return the budget taken in send_packet_without_tcb().
+      _queue_space.put(l4p->p.len());
+    } else {
+      while (c--) {
+        tcb_polled++;
+        lw_shared_ptr<tcb> tcb;
+        ethernet_address dst;
+        std::tie(tcb, dst) = std::move(_poll_tcbs.front());
+        _poll_tcbs.pop_front();
+        l4p = tcb->get_packet();
+        if (l4p) {
+          l4p->e_dst = dst;
+          break;
+        }
+      }
+    }
+    return l4p;
+  });
+}
+
+// Create a listener object for the given port and accept-queue length.
+// The listener is not registered until its listen() method is called.
+template <typename InetTraits>
+auto tcp<InetTraits>::listen(uint16_t port, size_t queue_length) -> listener {
+  return listener(*this, port, queue_length);
+}
+
+// Open an active connection to addr.
+// Random source ports are drawn until the resulting four-tuple is unused and,
+// when the interface has more than one hardware queue, hashes to this event
+// center's CPU. A tcb is then created, registered and asked to connect.
+template <typename InetTraits>
+typename tcp<InetTraits>::connection tcp<InetTraits>::connect(const entity_addr_t &addr) {
+  auto src_ip = _inet._inet.host_address();
+  auto dst_ip = ipv4_address(addr);
+  auto dst_port = addr.get_port();
+
+  connid id;
+  for (;;) {
+    uint16_t src_port = _port_dist(_e);
+    id = connid{src_ip, dst_ip, src_port, (uint16_t)dst_port};
+    if (_tcbs.find(id) != _tcbs.end()) {
+      // Four-tuple already in use, draw another port.
+      continue;
+    }
+    if (_inet._inet.netif()->hw_queues_count() == 1) {
+      break;
+    }
+    auto target_cpu = _inet._inet.netif()->hash2cpu(
+        id.hash(_inet._inet.netif()->rss_key()));
+    if (target_cpu == center->get_id()) {
+      break;
+    }
+  }
+
+  auto tcbp = make_lw_shared<tcb>(*this, id);
+  _tcbs.insert({id, tcbp});
+  tcbp->connect();
+  return connection(tcbp);
+}
+
+// Append the TCP source and destination ports found at offset off to the
+// forwarding hash input. Always returns true, even when no TCP header could
+// be obtained.
+template <typename InetTraits>
+bool tcp<InetTraits>::forward(forward_hash& out_hash_data, Packet& p, size_t off) {
+  auto hdr = p.get_header<tcp_hdr>(off);
+  if (!hdr) {
+    return true;
+  }
+  out_hash_data.push_back(hdr->src_port);
+  out_hash_data.push_back(hdr->dst_port);
+  return true;
+}
+
+// Entry point for an incoming TCP segment.
+// Drops segments that have no readable header, a data offset shorter than the
+// fixed header, or (when the NIC does not offload it) a bad checksum. The
+// segment is then matched against the connection table: without a matching
+// tcb it is handled as CLOSED/LISTEN processing (reset, ignore, or create a
+// new tcb on SYN); with one it is dispatched to the tcb's state handler.
+template <typename InetTraits>
+void tcp<InetTraits>::received(Packet p, ipaddr from, ipaddr to) {
+  auto th = p.get_header<tcp_hdr>(0);
+  if (!th) {
+    return;
+  }
+  // th->data_offset is correct even before ntoh()
+  if (unsigned(th->data_offset * 4) < sizeof(*th)) {
+    return;
+  }
+
+  if (!get_hw_features().rx_csum_offload) {
+    checksummer csum;
+    InetTraits::tcp_pseudo_header_checksum(csum, from, to, p.len());
+    csum.sum(p);
+    if (csum.get() != 0) {
+      return;
+    }
+  }
+  auto h = th->ntoh();
+  // Connection id from the local point of view: "to" is our address.
+  auto id = connid{to, from, h.dst_port, h.src_port};
+  auto tcbi = _tcbs.find(id);
+  lw_shared_ptr<tcb> tcbp;
+  if (tcbi == _tcbs.end()) {
+    auto listener = _listening.find(id.local_port);
+    if (listener == _listening.end() || listener->second->full()) {
+      // 1) In CLOSE state
+      // 1.1 all data in the incoming segment is discarded.  An incoming
+      // segment containing a RST is discarded. An incoming segment not
+      // containing a RST causes a RST to be sent in response.
+      // FIXME:
+      //      if ACK off: <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>
+      //      if ACK on:  <SEQ=SEG.ACK><CTL=RST>
+      return respond_with_reset(&h, id.local_ip, id.foreign_ip);
+    } else {
+      // 2) In LISTEN state
+      // 2.1 first check for an RST
+      if (h.f_rst) {
+        // An incoming RST should be ignored
+        return;
+      }
+      // 2.2 second check for an ACK
+      if (h.f_ack) {
+        // Any acknowledgment is bad if it arrives on a connection
+        // still in the LISTEN state.
+        // <SEQ=SEG.ACK><CTL=RST>
+        return respond_with_reset(&h, id.local_ip, id.foreign_ip);
+      }
+      // 2.3 third check for a SYN
+      if (h.f_syn) {
+        // check the security
+        // NOTE: Ignored for now
+        tcbp = make_lw_shared<tcb>(*this, id);
+        _tcbs.insert({id, tcbp});
+        return tcbp->input_handle_listen_state(&h, std::move(p));
+      }
+      // 2.4 fourth other text or control
+      // So you are unlikely to get here, but if you do, drop the
+      // segment, and return.
+      return;
+    }
+  } else {
+    tcbp = tcbi->second;
+    if (tcbp->state() == tcp_state::SYN_SENT) {
+      // 3) In SYN_SENT State
+      return tcbp->input_handle_syn_sent_state(&h, std::move(p));
+    } else {
+      // 4) In other state, can be one of the following:
+      // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+      // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+      return tcbp->input_handle_other_state(&h, std::move(p));
+    }
+  }
+}
+
+// Send a packet that does not belong to any tcb.
+// The packet is dropped when it does not fit the _queue_space budget.
+// Otherwise the destination's L2 address is resolved and, on success, the
+// packet is appended to _packetq; its budget is returned when the packet
+// provider dequeues it. If resolution fails the packet never reaches
+// _packetq, so the budget is returned here instead of being leaked.
+template <typename InetTraits>
+void tcp<InetTraits>::send_packet_without_tcb(ipaddr from, ipaddr to, Packet p) {
+  // Remember the reserved amount: p is moved away before the callback runs.
+  const auto reserved = p.len();
+  if (!_queue_space.get_or_fail(reserved)) { // drop packets that do not fit the queue
+    return;
+  }
+  _inet.wait_l2_dst_address(to, std::move(p), [this, to, reserved] (const ethernet_address &e_dst, Packet p, int r) mutable {
+    if (r == 0) {
+      _packetq.emplace_back(ipv4_traits::l4packet{to, std::move(p), e_dst, ip_protocol_num::tcp});
+    } else {
+      _queue_space.put(reserved);
+    }
+  });
+}
+
+// Detach from the tcb and close both directions. A handle whose tcb pointer
+// is empty (e.g. after being moved from) has nothing to do.
+template <typename InetTraits>
+tcp<InetTraits>::connection::~connection() {
+  if (!_tcb) {
+    return;
+  }
+  _tcb->_conn = nullptr;
+  close_read();
+  close_write();
+}
+
+// Build the control block for the connection identified by id: copy the
+// address/port four-tuple, allocate an event fd from the manager and create
+// the four event callback objects (released again in the destructor).
+template <typename InetTraits>
+tcp<InetTraits>::tcb::tcb(tcp& t, connid id)
+    : _tcp(t), manager(t.manager), _local_ip(id.local_ip) , _foreign_ip(id.foreign_ip),
+      _local_port(id.local_port), _foreign_port(id.foreign_port),
+      _snd(_tcp.cct),
+      center(t.center),
+      fd(t.manager.get_eventfd()),
+      delayed_ack_event(new tcp<InetTraits>::C_handle_delayed_ack(this)),
+      retransmit_event(new tcp<InetTraits>::C_handle_retransmit(this)),
+      persist_event(new tcp<InetTraits>::C_handle_persist(this)),
+      all_data_ack_event(new tcp<InetTraits>::C_all_data_acked(this)) {}
+
+// Cancel any timers that are still pending, free the callback objects created
+// in the constructor and close the connection's event fd.
+template <typename InetTraits>
+tcp<InetTraits>::tcb::~tcb()
+{
+  // Timers must be cancelled before their callback objects are deleted.
+  if (_delayed_ack_fd)
+    center->delete_time_event(*_delayed_ack_fd);
+  if (retransmit_fd)
+    center->delete_time_event(*retransmit_fd);
+  if (persist_fd)
+    center->delete_time_event(*persist_fd);
+  delete delayed_ack_event;
+  delete retransmit_event;
+  delete persist_event;
+  delete all_data_ack_event;
+  manager.close(fd);
+  fd = -1;
+}
+
+// Forward to tcp::respond_with_reset() using this connection's local and
+// foreign IP addresses.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::respond_with_reset(tcp_hdr* rth)
+{
+  _tcp.respond_with_reset(rth, _local_ip, _foreign_ip);
+}
+
+// Process an acknowledgement up to seg_ack and return the number of newly
+// acknowledged bytes.
+// Segments that are covered completely are removed from the head of
+// _snd.data; each one advances _snd.unacknowledged, updates the congestion
+// window, returns its data_len to user_queue_space and raises EVENT_WRITABLE.
+// If seg_ack ends inside the following segment, that segment is trimmed from
+// the front and the remaining bytes are accounted as a partial ACK.
+template <typename InetTraits>
+uint32_t tcp<InetTraits>::tcb::data_segment_acked(tcp_sequence seg_ack) {
+  uint32_t total_acked_bytes = 0;
+  // Full ACK of segment
+  while (!_snd.data.empty()
+         && (_snd.unacknowledged + _snd.data.front().p.len() <= seg_ack)) {
+    auto acked_bytes = _snd.data.front().p.len();
+    _snd.unacknowledged += acked_bytes;
+    // Ignore retransmitted segments when setting the RTO
+    if (_snd.data.front().nr_transmits == 0) {
+      update_rto(_snd.data.front().tx_time);
+    }
+    update_cwnd(acked_bytes);
+    total_acked_bytes += acked_bytes;
+    _snd.user_queue_space.put(_snd.data.front().data_len);
+    manager.notify(fd, EVENT_WRITABLE);
+    _snd.data.pop_front();
+  }
+  // Partial ACK of segment
+  if (_snd.unacknowledged < seg_ack) {
+    auto acked_bytes = seg_ack - _snd.unacknowledged;
+    if (!_snd.data.empty()) {
+      auto& unacked_seg = _snd.data.front();
+      unacked_seg.p.trim_front(acked_bytes);
+    }
+    _snd.unacknowledged = seg_ack;
+    update_cwnd(acked_bytes);
+    total_acked_bytes += acked_bytes;
+  }
+  return total_acked_bytes;
+}
+
+// Receive-window acceptability test for an incoming segment that starts at
+// `seg_seq` and carries `seg_len` bytes.
+//
+//   SEG.LEN = 0, RCV.WND = 0 : acceptable iff SEG.SEQ = RCV.NXT
+//   SEG.LEN = 0, RCV.WND > 0 : acceptable iff RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+//   SEG.LEN > 0, RCV.WND > 0 : acceptable iff the first or the last byte of
+//                              the segment lies inside the window
+//   SEG.LEN > 0, RCV.WND = 0 : never acceptable
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::segment_acceptable(tcp_sequence seg_seq, unsigned seg_len) {
+  if (seg_len == 0) {
+    if (_rcv.window == 0) {
+      // Closed window: only an empty segment exactly at RCV.NXT passes
+      return seg_seq == _rcv.next;
+    }
+    if (_rcv.window > 0) {
+      return (_rcv.next <= seg_seq) && (seg_seq < _rcv.next + _rcv.window);
+    }
+    return false;
+  }
+
+  // From here on the segment carries data
+  if (_rcv.window > 0) {
+    bool first_byte_inside = (_rcv.next <= seg_seq) && seg_seq < (_rcv.next + _rcv.window);
+    bool last_byte_inside = (_rcv.next <= seg_seq + seg_len - 1) && (seg_seq + seg_len - 1 < _rcv.next + _rcv.window);
+    return first_byte_inside || last_byte_inside;
+  }
+
+  // Data against a closed window is not acceptable
+  return false;
+}
+
+// Initialises the send/receive parameters of this connection from the TCP
+// options found in [opt_start, opt_end) and from the header `th`:
+// window scale factors, MSS values, both windows, the window-update
+// markers (wl1/wl2), the initial congestion window and the initial slow
+// start threshold.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end) {
+  _option.parse(opt_start, opt_end);
+
+  // Scale factors: the remote one governs what we may send, the local one
+  // what we advertise
+  _snd.window_scale = _option._remote_win_scale;
+  _rcv.window_scale = _option._local_win_scale;
+
+  // MSS the peer accepts, and the MSS we accept (also recorded in the
+  // option block)
+  _snd.mss = _option._remote_mss;
+  _option._local_mss = local_mss();
+  _rcv.mss = _option._local_mss;
+
+  // 29200 is Linux's default window size
+  _rcv.window = 29200 << _rcv.window_scale;
+  _snd.window = th->window << _snd.window_scale;
+
+  // Sequence / acknowledgment numbers used for the last window update
+  _snd.wl1 = th->seq;
+  _snd.wl2 = th->ack;
+
+  // Initial congestion window: the larger the MSS, the fewer segments
+  int initial_segments;
+  if (2190 < _snd.mss) {
+    initial_segments = 2;
+  } else if (1095 < _snd.mss) {
+    initial_segments = 3;
+  } else {
+    initial_segments = 4;
+  }
+  _snd.cwnd = initial_segments * _snd.mss;
+
+  // Initial slow start threshold: the peer's advertised (scaled) window
+  _snd.ssthresh = th->window << _snd.window_scale;
+}
+
+// Takes the next chunk of unsent data off _snd.unsent and returns it as a
+// packet; returns an empty Packet when nothing is queued.
+//
+// The amount taken is bounded by can_send() and by the largest TCP payload
+// that can be passed to the NIC (max_packet_len based with TSO, otherwise
+// min(MTU based payload, peer MSS)).  Only the head of the queue is looked
+// at: it is either handed out whole or split at the limit.
+//
+// Merging several queued packets into one segment is intentionally not
+// done.  The previous code kept such a "merge" tail after two exhaustive
+// (`<=` / `>`) branches, so it could never execute; it has been removed.
+// FIXME carried over from that code: appending a shared slice of the next
+// packet would call the packet "deleter" and free managed objects that are
+// still used later.
+template <typename InetTraits>
+Packet tcp<InetTraits>::tcb::get_transmit_packet() {
+  // easy case: empty queue
+  if (_snd.unsent.empty()) {
+    return Packet();
+  }
+  auto can_send = this->can_send();
+  // Max number of TCP payloads we can pass to NIC
+  uint32_t len;
+  if (_tcp.get_hw_features().tx_tso) {
+    // FIXME: Info tap device the size of the split packet
+    len = _tcp.get_hw_features().max_packet_len - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+  } else {
+    len = std::min(uint16_t(_tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min), _snd.mss);
+  }
+  can_send = std::min(can_send, len);
+
+  if (_snd.unsent.front().len() <= can_send) {
+    // The head packet fits: hand it out whole
+    auto p = std::move(_snd.unsent.front());
+    _snd.unsent.pop_front();
+    _snd.unsent_len -= p.len();
+    return p;
+  }
+
+  // The head packet is too large: send its first can_send bytes and keep
+  // the remainder queued
+  auto p = _snd.unsent.front().share(0, can_send);
+  _snd.unsent.front().trim_front(can_send);
+  _snd.unsent_len -= p.len();
+  return p;
+}
+
+// Builds one TCP segment and queues it for transmission.
+//
+// With data_retransmit == true the payload is a shared view of the oldest
+// unacknowledged segment and the sequence number is _snd.unacknowledged;
+// otherwise the payload comes from get_transmit_packet() and _snd.next is
+// advanced by its length.  The function fills in the TCP header and
+// options, prepares the checksum (full software checksum, or just the
+// pseudo-header sum when the hardware offloads L4 checksums), records new
+// data in the retransmission queue and arms the retransmit timer if needed.
+// Does nothing when the connection is in state CLOSED.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::output_one(bool data_retransmit) {
+  if (in_state(CLOSED)) {
+    return;
+  }
+
+  Packet p = data_retransmit ? _snd.data.front().p.share() : get_transmit_packet();
+  Packet clone = p.share();  // early clone to prevent share() from calling packet::unuse_internal_data() on header.
+  uint16_t len = p.len();
+  bool syn_on = syn_needs_on();
+  bool ack_on = ack_needs_on();
+
+  auto options_size = _option.get_size(syn_on, ack_on);
+  auto th = p.prepend_header<tcp_hdr>(options_size);
+
+  th->src_port = _local_port;
+  th->dst_port = _foreign_port;
+
+  th->f_syn = syn_on;
+  th->f_ack = ack_on;
+  if (ack_on) {
+    // This segment carries the ACK, so a pending delayed ACK is obsolete
+    clear_delayed_ack();
+  }
+  th->f_urg = false;
+  th->f_psh = false;
+
+  tcp_sequence seq;
+  if (data_retransmit) {
+    seq = _snd.unacknowledged;
+  } else {
+    seq = syn_on ? _snd.initial : _snd.next;
+    _snd.next += len;
+  }
+  th->seq = seq;
+  th->ack = _rcv.next;
+  // data_offset is expressed in 32-bit words
+  th->data_offset = (sizeof(*th) + options_size) / 4;
+  th->window = _rcv.window >> _rcv.window_scale;
+  th->checksum = 0;
+
+  // FIXME: does the FIN have to fit in the window?
+  bool fin_on = fin_needs_on();
+  th->f_fin = fin_on;
+
+  // Add tcp options
+  _option.fill(th, options_size);
+  *th = th->hton();
+
+  offload_info oi;
+  checksummer csum;
+  uint16_t pseudo_hdr_seg_len = 0;
+
+  oi.tcp_hdr_len = sizeof(tcp_hdr) + options_size;
+
+  if (_tcp.get_hw_features().tx_csum_l4_offload) {
+    oi.needs_csum = true;
+
+    //
+    // tx checksum offloading: both virtio-net's VIRTIO_NET_F_CSUM dpdk's
+    // PKT_TX_TCP_CKSUM - requires th->checksum to be initialized to ones'
+    // complement sum of the pseudo header.
+    //
+    // For TSO the csum should be calculated for a pseudo header with
+    // segment length set to 0. All the rest is the same as for a TCP Tx
+    // CSUM offload case.
+    //
+    if (_tcp.get_hw_features().tx_tso && len > _snd.mss) {
+      oi.tso_seg_size = _snd.mss;
+    } else {
+      pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
+    }
+  } else {
+    pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
+    oi.needs_csum = false;
+  }
+
+  InetTraits::tcp_pseudo_header_checksum(csum, _local_ip, _foreign_ip,
+                                         pseudo_hdr_seg_len);
+
+  if (_tcp.get_hw_features().tx_csum_l4_offload) {
+    // Offload: store the complement of csum.get(), covering only the pseudo header
+    th->checksum = ~csum.get();
+  } else {
+    // Software: checksum over pseudo header plus the whole segment
+    csum.sum(p);
+    th->checksum = csum.get();
+  }
+
+  oi.protocol = ip_protocol_num::tcp;
+
+  p.set_offload_info(oi);
+
+  // First transmission of something that must be acknowledged (data, SYN
+  // or FIN): remember data for retransmission and make sure the timer runs
+  if (!data_retransmit && (len || syn_on || fin_on)) {
+    auto now = clock_type::now();
+    if (len) {
+      unsigned nr_transmits = 0;
+      _snd.data.emplace_back(unacked_segment{std::move(clone),
+                                             len, nr_transmits, now});
+    }
+    if (!retransmit_fd) {
+      start_retransmit_timer();
+    }
+  }
+
+  queue_packet(std::move(p));
+}
+
+// True when nothing is left to send or to be acknowledged: the
+// retransmission queue is empty and both the unsent and the queued byte
+// counters are zero.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::is_all_data_acked() {
+  return _snd.data.empty() && _snd.unsent_len == 0 && _snd.queued_len == 0;
+}
+
+// Drains the receive queue.  All packets currently held in _rcv.data are
+// appended, in order, into a single Packet which is returned; the queue is
+// left empty.  When the queue holds nothing, the returned Tub is left
+// unconstructed.
+template <typename InetTraits>
+Tub<Packet> tcp<InetTraits>::tcb::read() {
+  Tub<Packet> merged;
+  if (!_rcv.data.empty()) {
+    merged.construct();
+    for (auto&& chunk : _rcv.data) {
+      merged->append(std::move(chunk));
+    }
+    _rcv.data.clear();
+  }
+  return merged;
+}
+
+// Queues packet `p` for transmission and kicks the output path when the
+// connection is currently allowed to send.
+//
+// Returns the number of bytes queued, or -ECONNRESET when the connection
+// is in state CLOSED.  Must not be called once the send side has been
+// closed (asserted).  The caller must have made sure that the user queue
+// has room for p.len() bytes; if the reservation fails the process aborts.
+//
+// NOTE(review): cleanup() sets _snd.closed, so a send() issued after a
+// cleanup hits the assertion before the CLOSED check can return
+// -ECONNRESET -- confirm that callers never do that.
+template <typename InetTraits>
+int tcp<InetTraits>::tcb::send(Packet p) {
+  // We can not send after the connection is closed
+  ceph_assert(!_snd.closed);
+
+  if (in_state(CLOSED))
+    return -ECONNRESET;
+
+  auto len = p.len();
+  if (!_snd.user_queue_space.get_or_fail(len)) {
+    // note: caller must ensure enough queue space to send
+    ceph_abort();
+  }
+  // TODO: Handle p.len() > max user_queue_space case
+  // The packet goes straight to the unsent queue.  (The former
+  // `queued_len += len` / `queued_len -= len` pair around this line
+  // cancelled out and has been dropped.)
+  _snd.unsent_len += len;
+  _snd.unsent.push_back(std::move(p));
+  if (can_send() > 0) {
+    output();
+  }
+  return len;
+}
+
+// Closes the send side of the connection.  No-op when the connection is
+// already CLOSED or the send side was closed before.
+//
+// The error code is set to -EPIPE and the file events registered for `fd`
+// are removed.  If every byte has been acknowledged the final cleanup runs
+// right away; otherwise a fresh event fd is obtained and
+// all_data_ack_event is registered on it for EVENT_READABLE.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::close() {
+  if (in_state(CLOSED) || _snd.closed) {
+    return ;
+  }
+  // TODO: We should make this asynchronous
+
+  _errno = -EPIPE;
+  center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);
+  bool acked = is_all_data_acked();
+  if (!acked) {
+    // Data still in flight: defer the final cleanup to the callback
+    _snd._all_data_acked_fd = manager.get_eventfd();
+    center->create_file_event(_snd._all_data_acked_fd, EVENT_READABLE, all_data_ack_event);
+  } else {
+    close_final_cleanup();
+  }
+}
+
+// Decides whether a segment of `seg_len` bytes must be acknowledged
+// immediately (returns true) or whether the ACK may be delayed (returns
+// false).  In the delayed case a 200ms timer is armed unless one is
+// already pending.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::should_send_ack(uint16_t seg_len) {
+  // Larger than our MSS: we've received a TSO packet, ack immediately
+  if (seg_len > _rcv.mss) {
+    _nr_full_seg_received = 0;
+    clear_delayed_ack();
+    return true;
+  }
+
+  // Full sized segment: ack every second one of them
+  if (seg_len == _rcv.mss) {
+    const bool second_full_segment = (_nr_full_seg_received++ >= 1);
+    if (second_full_segment) {
+      _nr_full_seg_received = 0;
+      clear_delayed_ack();
+      return true;
+    }
+  }
+
+  // Delay the ACK.  Arm the timer only if it is not armed already (an
+  // armed timer whose callback has not run yet is left untouched).
+  // The maximum delayed ack timer allowed by RFC1122 is 500ms, most
+  // implementations use 200ms.
+  if (!_delayed_ack_fd) {
+    _delayed_ack_fd.construct(center->create_time_event(200*1000, delayed_ack_event));
+  }
+  return false;
+}
+
+// Cancels the delayed-ACK timer if one is armed: the time event is removed
+// from the event center and the stored event id is destroyed, so
+// _delayed_ack_fd tests false afterwards.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::clear_delayed_ack() {
+  if (_delayed_ack_fd) {
+    center->delete_time_event(*_delayed_ack_fd);
+    _delayed_ack_fd.destroy();
+  }
+}
+
+// Moves segments from the out-of-order map into the in-order receive queue
+// as far as they have become contiguous with _rcv.next.  Segments that lie
+// entirely before _rcv.next are dropped.  Returns true when at least one
+// segment was merged.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::merge_out_of_order() {
+  bool merged = false;
+  if (_rcv.out_of_order.map.empty()) {
+    return merged;
+  }
+  for (auto it = _rcv.out_of_order.map.begin(); it != _rcv.out_of_order.map.end();) {
+    auto& p = it->second;
+    auto seg_beg = it->first;
+    auto seg_len = p.len();
+    auto seg_end = seg_beg + seg_len;
+    if (seg_beg <= _rcv.next && seg_end > _rcv.next) {
+      // This segment has been received out of order and its previous
+      // segment has been received now
+      auto trim = _rcv.next - seg_beg;
+      if (trim) {
+        // Cut off the part that overlaps data we already have
+        p.trim_front(trim);
+        seg_len -= trim;
+      }
+      _rcv.next += seg_len;
+      _rcv.data.push_back(std::move(p));
+      // Since c++11, erase() always returns the value of the following element
+      it = _rcv.out_of_order.map.erase(it);
+      merged = true;
+    } else if (_rcv.next >= seg_end) {
+      // This segment has been received already, drop it
+      it = _rcv.out_of_order.map.erase(it);
+    } else {
+      // seg_beg > _rcv.next, can not merge. Note, seg_beg can grow only,
+      // so we can stop looking here.
+      // (The increment below has no effect: `it` is not used after the break.)
+      it++;
+      break;
+    }
+  }
+  return merged;
+}
+
+// Stores packet `p`, whose first byte has sequence number `seg`, in the
+// out-of-order container by handing it to its merge() operation.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::insert_out_of_order(tcp_sequence seg, Packet p) {
+  _rcv.out_of_order.merge(seg, std::move(p));
+}
+
+// Not implemented: calling this terminates the process via abort().
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::trim_receive_data_after_window() {
+  abort();
+}
+
+// Retransmits the oldest unacknowledged segment right away, without
+// waiting for the retransmit timer.  Its transmit counter is bumped first.
+// Does nothing when there is no unacknowledged data.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::fast_retransmit() {
+  if (_snd.data.empty()) {
+    return;
+  }
+  _snd.data.front().nr_transmits++;
+  retransmit_one();
+  output();
+}
+
+// Feeds one round-trip sample into the retransmission timeout estimate.
+// `tx_time` is the time the acknowledged segment was sent; the sample R is
+// the time elapsed since then, in microseconds.  The resulting _rto is
+// clamped to [_rto_min, _rto_max].
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_rto(clock_type::time_point tx_time) {
+  // Update RTO according to RFC6298
+  auto R = std::chrono::duration_cast<std::chrono::microseconds>(clock_type::now() - tx_time);
+  if (_snd.first_rto_sample) {
+    _snd.first_rto_sample = false;
+    // RTTVAR <- R/2
+    // SRTT <- R
+    _snd.rttvar = R / 2;
+    _snd.srtt = R;
+  } else {
+    // RTTVAR <- (1 - beta) * RTTVAR + beta * |SRTT - R'|
+    // SRTT <- (1 - alpha) * SRTT + alpha * R'
+    // where alpha = 1/8 and beta = 1/4
+    // (RTTVAR is updated first, so it still sees the old SRTT)
+    auto delta = _snd.srtt > R ? (_snd.srtt - R) : (R - _snd.srtt);
+    _snd.rttvar = _snd.rttvar * 3 / 4 + delta / 4;
+    _snd.srtt = _snd.srtt * 7 / 8 +  R / 8;
+  }
+  // RTO <- SRTT + max(G, K * RTTVAR)
+  _rto =  _snd.srtt + std::max(_rto_clk_granularity, 4 * _snd.rttvar);
+
+  // Make sure 1 sec << _rto << 60 sec
+  _rto = std::max(_rto, _rto_min);
+  _rto = std::min(_rto, _rto_max);
+}
+
+// Grows the congestion window after `acked_bytes` bytes were acknowledged.
+// Below ssthresh the window grows by min(acked_bytes, MSS); at or above
+// ssthresh it grows by MSS*MSS/cwnd, but by at least one byte.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_cwnd(uint32_t acked_bytes) {
+  uint32_t smss = _snd.mss;
+  if (_snd.cwnd < _snd.ssthresh) {
+    // In slow start phase
+    _snd.cwnd += std::min(acked_bytes, smss);
+  } else {
+    // In congestion avoidance phase
+    // round_up keeps the window growing when the integer division yields 0
+    uint32_t round_up = 1;
+    _snd.cwnd += std::max(round_up, smss * smss / _snd.cwnd);
+  }
+}
+
+
+// Releases everything this connection still holds: signals the fd as
+// readable, marks the send side closed, drops all queued send and receive
+// data, stops the retransmit and delayed-ACK timers, schedules a
+// C_actual_remove_tcb callback through the event center and unregisters
+// the tcb from the tcp instance.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::cleanup() {
+  manager.notify(fd, EVENT_READABLE);
+  _snd.closed = true;
+  _snd.unsent.clear();
+  _snd.data.clear();
+  _rcv.out_of_order.map.clear();
+  _rcv.data.clear();
+  stop_retransmit_timer();
+  clear_delayed_ack();
+  // The callback object is allocated here and handed to the event center
+  center->dispatch_event_external(new tcp<InetTraits>::C_actual_remove_tcb(this));
+  remove_from_tcbs();
+}
+
+// Generates the initial sequence number for this connection.
+//
+// Per RFC6528, TCP SHOULD generate its Initial Sequence Numbers
+// with the expression:
+//   ISN = M + F(localip, localport, remoteip, remoteport, secretkey)
+//   M is the 4 microsecond timer
+//
+// F is the first 32 bits of an MD5 digest computed over the connection
+// 4-tuple followed by the secret key.  Previously only the secret key was
+// hashed (the 4-tuple stored in `hash` was overwritten by the digest
+// without ever being read), which made F identical for all connections.
+template <typename InetTraits>
+tcp_sequence tcp<InetTraits>::tcb::get_isn() {
+  using namespace std::chrono;
+  uint32_t hash[4];
+  hash[0] = _local_ip.ip;
+  hash[1] = _foreign_ip.ip;
+  // Widen before shifting: shifting the promoted (signed) port value left
+  // by 16 overflows for ports >= 32768
+  hash[2] = (uint32_t(_local_port) << 16) + _foreign_port;
+  hash[3] = _isn_secret.key[15];
+  ceph::crypto::MD5 md5;
+  // Connection identity first, then the secret
+  md5.Update((const unsigned char*)hash, sizeof(hash));
+  md5.Update((const unsigned char*)_isn_secret.key, sizeof(_isn_secret.key));
+  // `hash` is reused as the output buffer for the digest (4 x 32 bits)
+  md5.Final((unsigned char*)hash);
+  auto seq = hash[0];
+  auto m = duration_cast<microseconds>(clock_type::now().time_since_epoch());
+  seq += m.count() / 4;
+  return make_seq(seq);
+}
+
+// Hands out the next packet that is ready to go to the lower layer.
+// If the packet queue is empty, one segment is generated first.  Returns
+// an unconstructed Tub when the connection is in state CLOSED.  After
+// taking a packet, output() is called again when more packets are queued
+// or more data may be sent.
+template <typename InetTraits>
+Tub<typename InetTraits::l4packet> tcp<InetTraits>::tcb::get_packet() {
+  _poll_active = false;
+  if (_packetq.empty()) {
+    output_one();
+  }
+
+  Tub<typename InetTraits::l4packet> p;
+  if (in_state(CLOSED)) {
+    return p;
+  }
+
+  // Outside CLOSED the queue is expected to hold at least one packet here
+  ceph_assert(!_packetq.empty());
+
+  p = std::move(_packetq.front());
+  _packetq.pop_front();
+  if (!_packetq.empty() || (_snd.dupacks < 3 && can_send() > 0)) {
+    // If there are packets to send in the queue or tcb is allowed to send
+    // more add tcp back to polling set to keep sending. In addition, dupacks >= 3
+    // is an indication that an segment is lost, stop sending more in this case.
+    output();
+  }
+  return p;
+}
+
+// Closing the read side is currently a no-op; the notification that used
+// to be sent here is kept below, commented out.
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_read() {
+  // do nothing
+  // _tcb->manager.notify(_tcb->fd, EVENT_READABLE);
+}
+
+// Closes the write side by delegating to tcb::close().
+// _tcb is dereferenced without a null check.
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_write() {
+  _tcb->close();
+}
+
+// Namespace-scope definitions of tcb's static data members.  The constexpr
+// ones carry no initializer here, so their values come from the in-class
+// declarations; before C++17 such definitions are required when the
+// members are odr-used.
+template <typename InetTraits>
+constexpr uint16_t tcp<InetTraits>::tcb::_max_nr_retransmit;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_min;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_max;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_clk_granularity;
+
+// Secret used by get_isn(); one instance per tcp<InetTraits> instantiation.
+template <typename InetTraits>
+typename tcp<InetTraits>::tcb::isn_secret tcp<InetTraits>::tcb::_isn_secret;
+
+
+#endif /* TCP_HH_ */
diff --git a/src/msg/async/dpdk/UserspaceEvent.cc b/src/msg/async/dpdk/UserspaceEvent.cc
new file mode 100644
index 000000000..282dcef12
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "UserspaceEvent.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+// Allocates a userspace fd and returns its number.  A previously released
+// number is reused when one is available; otherwise the highest number in
+// use is increased by one and the fd table is resized to hold it.  The
+// table entry of the returned fd is freshly constructed.
+int UserspaceEventManager::get_eventfd()
+{
+  int fd;
+  if (unused_fds.empty()) {
+    fd = ++max_fd;
+    fds.resize(fd + 1);
+  } else {
+    fd = unused_fds.front();
+    unused_fds.pop_front();
+  }
+
+  // The chosen entry must not be in use
+  Tub<UserspaceFDImpl> &slot = fds[fd];
+  ceph_assert(!slot);
+  slot.construct();
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  return fd;
+}
+
+// Marks the events in `mask` as active on `fd`.  If the fd listens for at
+// least one of them and is not queued yet, it is appended to the wait
+// queue so that the next poll() reports it.
+//
+// Returns 0 on success and -ENOENT when `fd` does not name an open
+// userspace fd.
+int UserspaceEventManager::notify(int fd, int mask)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << mask << dendl;
+  if ((size_t)fd >= fds.size())
+    return -ENOENT;
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  if (!impl)
+    return -ENOENT;
+
+  ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << dendl;
+
+  impl->activating_mask |= mask;
+  // Already queued: poll() will pick up the enlarged mask
+  if (impl->waiting_idx)
+    return 0;
+
+  if (impl->listening_mask & mask) {
+    // The entry is written to index max_wait_idx + 1 (slot 0 is reserved),
+    // so that index has to exist.  The former test `size() <= max_wait_idx`
+    // grew the vector one element too late and allowed a write one past
+    // its end.
+    if (waiting_fds.size() <= max_wait_idx + 1)
+      waiting_fds.resize(waiting_fds.size()*2);
+    impl->waiting_idx = ++max_wait_idx;
+    waiting_fds[max_wait_idx] = fd;
+  }
+
+  ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << " done " << dendl;
+  return 0;
+}
+
+// Releases userspace fd `fd`.  Unknown or already closed fds are ignored.
+// The number becomes available for reuse, and if the fd is still queued
+// for poll() its wait-queue entry is invalidated.
+void UserspaceEventManager::close(int fd)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  if ((size_t)fd >= fds.size())
+    return ;
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  if (!impl)
+    return ;
+
+  // The highest fd simply lowers the high-water mark; any other number is
+  // remembered for reuse
+  if (fd == max_fd)
+    --max_fd;
+  else
+    unused_fds.push_back(fd);
+
+  // Only an fd that actually sits in the wait queue (waiting_idx != 0) has
+  // an entry to remove.  Testing activating_mask instead, as was done
+  // before, also matched fds that were notified but never queued and then
+  // overwrote the reserved slot 0.
+  if (impl->waiting_idx) {
+    if (waiting_fds[max_wait_idx] == fd) {
+      ceph_assert(impl->waiting_idx == max_wait_idx);
+      --max_wait_idx;
+    }
+    // -1 marks a hole that poll() skips
+    waiting_fds[impl->waiting_idx] = -1;
+  }
+  impl.destroy();
+}
+
+// Collects up to `num_events` pending events from the wait queue.
+//
+// For every reported fd, events[k] receives the fd number and masks[k]
+// the events that are both listened for and active; those events are then
+// cleared from the fd's active set and the fd leaves the queue.  Entries
+// that could not be reported because the output arrays are full stay
+// queued for the next call.  `tp` is not used.
+//
+// Returns the number of entries written to `events` / `masks`.
+int UserspaceEventManager::poll(int *events, int *masks, int num_events, struct timeval *tp)
+{
+  int fd;
+  uint32_t i = 0;
+  int count = 0;
+  ceph_assert(num_events);
+  // leave zero slot for waiting_fds
+  while (i < max_wait_idx) {
+    fd = waiting_fds[++i];
+    // hole left behind by close()/unlisten()
+    if (fd == -1)
+      continue;
+
+    events[count] = fd;
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    ceph_assert(impl);
+    masks[count] = impl->listening_mask & impl->activating_mask;
+    ceph_assert(masks[count]);
+    ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << masks[count] << dendl;
+    impl->activating_mask &= (~masks[count]);
+    impl->waiting_idx = 0;
+    if (++count >= num_events)
+      break;
+  }
+
+  // Slide the unprocessed tail down to the front of the queue (slot 1).
+  // Each moved fd must learn its new position: close() and unlisten() use
+  // waiting_idx to locate the fd's entry, and the previous plain memmove()
+  // left those indices pointing at the old slots.
+  const uint32_t remaining = max_wait_idx - i;
+  for (uint32_t j = 1; j <= remaining; ++j) {
+    const int moved = waiting_fds[i + j];
+    waiting_fds[j] = moved;
+    if (moved != -1) {
+      Tub<UserspaceFDImpl> &moved_impl = fds[moved];
+      if (moved_impl)
+        moved_impl->waiting_idx = j;
+    }
+  }
+  max_wait_idx = remaining;
+  return count;
+}
diff --git a/src/msg/async/dpdk/UserspaceEvent.h b/src/msg/async/dpdk/UserspaceEvent.h
new file mode 100644
index 000000000..7e89517df
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_USERSPACEEVENT_H
+#define CEPH_USERSPACEEVENT_H
+
+#include <cstddef>
+#include <errno.h>
+#include <string.h>
+
+#include <vector>
+#include <list>
+
+#include "include/ceph_assert.h"
+#include "include/int_types.h"
+#include "common/Tub.h"
+
+class CephContext;
+
+// Bookkeeping for "userspace file descriptors": small integer handles that
+// carry a set of listened-for events and a set of active events, plus a
+// queue of handles that have something to report to poll().
+//
+// Slot 0 of waiting_fds is reserved so that waiting_idx == 0 can mean
+// "not queued"; valid queue positions are 1..max_wait_idx.
+class UserspaceEventManager {
+  struct UserspaceFDImpl {
+    uint32_t waiting_idx = 0;     // position in waiting_fds, 0 = not queued
+    int16_t read_errno = 0;
+    int16_t write_errno = 0;
+    int8_t listening_mask = 0;    // events the owner wants to hear about
+    int8_t activating_mask = 0;   // events signalled and not yet reported
+    uint32_t magic = 4921;        // sentinel verified by check()
+  };
+  CephContext *cct;
+  int max_fd = 0;                 // highest fd number handed out
+  uint32_t max_wait_idx = 0;      // last used position in waiting_fds
+  std::vector<Tub<UserspaceFDImpl> > fds;   // indexed by fd number
+  std::vector<int> waiting_fds;   // queued fd numbers, -1 marks a hole
+  std::list<uint32_t> unused_fds; // released fd numbers awaiting reuse
+
+ public:
+  explicit UserspaceEventManager(CephContext *c): cct(c) {
+    waiting_fds.resize(1024);
+  }
+
+  int get_eventfd();
+
+  // Adds `mask` to the events `fd` listens for.  If one of the listened
+  // events is already active and the fd is not queued yet, it is queued
+  // for the next poll().  Returns 0, or -ENOENT for an unknown fd.
+  int listen(int fd, int mask) {
+    if ((size_t)fd >= fds.size())
+      return -ENOENT;
+
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    if (!impl)
+      return -ENOENT;
+
+    impl->listening_mask |= mask;
+    if ((impl->activating_mask & impl->listening_mask) && !impl->waiting_idx) {
+      // The entry goes to index max_wait_idx + 1, so that index must
+      // exist.  Testing `size() <= max_wait_idx` grew the vector one
+      // element too late and allowed a write one past its end.
+      if (waiting_fds.size() <= max_wait_idx + 1)
+        waiting_fds.resize(waiting_fds.size()*2);
+      impl->waiting_idx = ++max_wait_idx;
+      waiting_fds[max_wait_idx] = fd;
+    }
+    return 0;
+  }
+
+  // Removes `mask` from the events `fd` listens for.  If no listened
+  // event remains active, the fd's queue entry (if any) is invalidated.
+  // Returns 0, or -ENOENT for an unknown fd.
+  int unlisten(int fd, int mask) {
+    if ((size_t)fd >= fds.size())
+      return -ENOENT;
+
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    if (!impl)
+      return -ENOENT;
+
+    impl->listening_mask &= (~mask);
+    if (!(impl->activating_mask & impl->listening_mask) && impl->waiting_idx) {
+      if (waiting_fds[max_wait_idx] == fd) {
+        ceph_assert(impl->waiting_idx == max_wait_idx);
+        --max_wait_idx;
+      }
+      // -1 marks a hole that poll() skips
+      waiting_fds[impl->waiting_idx] = -1;
+      impl->waiting_idx = 0;
+    }
+    return 0;
+  }
+
+  int notify(int fd, int mask);
+  void close(int fd);
+  int poll(int *events, int *masks, int num_events, struct timeval *tp);
+
+  // Consistency check: every open fd must still carry the magic sentinel.
+  bool check() {
+    for (auto &&m : fds) {
+      if (m && m->magic != 4921)
+        return false;
+    }
+    return true;
+  }
+};
+
+#endif //CEPH_USERSPACEEVENT_H
diff --git a/src/msg/async/dpdk/align.h b/src/msg/async/dpdk/align.h
new file mode 100644
index 000000000..3b48f7899
--- /dev/null
+++ b/src/msg/async/dpdk/align.h
@@ -0,0 +1,50 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_ALIGN_HH_
+#define CEPH_MSG_DPDK_ALIGN_HH_
+
+#include <cstdint>
+#include <cstdlib>
+
+// Rounds `v` up to the next multiple of `align`.
+// The bit-mask arithmetic only yields that result when `align` is a power
+// of two; nothing here checks it.
+template <typename T>
+inline constexpr T align_up(T v, T align) {
+  return (v + align - 1) & ~(align - 1);
+}
+
+// Pointer flavour of align_up(): rounds the address held in `v` up to a
+// multiple of `align`.  Restricted to pointers to one-byte types by the
+// static_assert.
+// NOTE(review): the inner call deduces T from both a uintptr_t and a size_t
+// argument, so it only compiles where these are the same type -- confirm
+// for all supported targets.
+template <typename T>
+inline constexpr T* align_up(T* v, size_t align) {
+  static_assert(sizeof(T) == 1, "align byte pointers only");
+  return reinterpret_cast<T*>(align_up(reinterpret_cast<uintptr_t>(v), align));
+}
+
+// Rounds `v` down to a multiple of `align` by clearing the low bits.
+// Like align_up(), this relies on `align` being a power of two and does
+// not verify it.
+template <typename T>
+inline constexpr T align_down(T v, T align) {
+  return v & ~(align - 1);
+}
+
+// Pointer flavour of align_down(): rounds the address held in `v` down to
+// a multiple of `align`.  Restricted to pointers to one-byte types by the
+// static_assert.
+// NOTE(review): the inner call deduces T from both a uintptr_t and a size_t
+// argument, so it only compiles where these are the same type -- confirm
+// for all supported targets.
+template <typename T>
+inline constexpr T* align_down(T* v, size_t align) {
+  static_assert(sizeof(T) == 1, "align byte pointers only");
+  return reinterpret_cast<T*>(align_down(reinterpret_cast<uintptr_t>(v), align));
+}
+
+#endif /* CEPH_MSG_DPDK_ALIGN_HH_ */
diff --git a/src/msg/async/dpdk/array_map.h b/src/msg/async/dpdk/array_map.h
new file mode 100644
index 000000000..40f7728dc
--- /dev/null
+++ b/src/msg/async/dpdk/array_map.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_ARRAY_MAP_HH_
+#define CEPH_ARRAY_MAP_HH_
+
+#include <array>
+
+// unordered_map implemented as a simple array
+
+// Fixed-capacity map from size_t keys in [0, Max) to Value, backed by a
+// std::array.  Every slot always exists; slots that were never assigned
+// hold a value-initialized Value.
+template <typename Value, size_t Max>
+class array_map {
+  std::array<Value, Max> _a {};
+ public:
+  // Fills the map from {key, value} pairs.  Keys are not range-checked
+  // here, so every key must be smaller than Max.
+  array_map(std::initializer_list<std::pair<size_t, Value>> i) {
+    for (auto kv : i) {
+      _a[kv.first] = kv.second;
+    }
+  }
+  // Unchecked access; `key` must be smaller than Max.
+  Value& operator[](size_t key) { return _a[key]; }
+  const Value& operator[](size_t key) const { return _a[key]; }
+
+  // Checked access: throws std::out_of_range when key >= Max.
+  Value& at(size_t key) {
+    if (key >= Max) {
+      throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max));
+    }
+    return _a[key];
+  }
+};
+
+#endif /* ARRAY_MAP_HH_ */
diff --git a/src/msg/async/dpdk/byteorder.h b/src/msg/async/dpdk/byteorder.h
new file mode 100644
index 000000000..a996ec077
--- /dev/null
+++ b/src/msg/async/dpdk/byteorder.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_BYTEORDER_H_
+#define CEPH_MSG_BYTEORDER_H_
+
+#include <arpa/inet.h>  // for ntohs() and friends
+#include <iosfwd>
+#include <utility>
+
+inline uint64_t ntohq(uint64_t v) {
+  return __builtin_bswap64(v);
+}
+inline uint64_t htonq(uint64_t v) {
+  return __builtin_bswap64(v);
+}
+
+// Overload set giving every fixed-width integer type a uniform
+// ntoh()/hton() spelling; each overload forwards to the conversion of the
+// matching width (single bytes are returned as they are).
+
+// Zero-argument overloads that do nothing.
+// NOTE(review): presumably terminators for variadic helpers elsewhere --
+// confirm against the callers.
+inline void ntoh() {}
+inline void hton() {}
+
+// Unsigned types
+inline uint8_t ntoh(uint8_t x) { return x; }
+inline uint8_t hton(uint8_t x) { return x; }
+inline uint16_t ntoh(uint16_t x) { return ntohs(x); }
+inline uint16_t hton(uint16_t x) { return htons(x); }
+inline uint32_t ntoh(uint32_t x) { return ntohl(x); }
+inline uint32_t hton(uint32_t x) { return htonl(x); }
+inline uint64_t ntoh(uint64_t x) { return ntohq(x); }
+inline uint64_t hton(uint64_t x) { return htonq(x); }
+
+// Signed types: the value passes through the unsigned conversion function
+// of the same width and is converted back on return
+inline int8_t ntoh(int8_t x) { return x; }
+inline int8_t hton(int8_t x) { return x; }
+inline int16_t ntoh(int16_t x) { return ntohs(x); }
+inline int16_t hton(int16_t x) { return htons(x); }
+inline int32_t ntoh(int32_t x) { return ntohl(x); }
+inline int32_t hton(int32_t x) { return htonl(x); }
+inline int64_t ntoh(int64_t x) { return ntohq(x); }
+inline int64_t hton(int64_t x) { return htonq(x); }
+
+#endif /* CEPH_MSG_BYTEORDER_H_ */
diff --git a/src/msg/async/dpdk/capture.h b/src/msg/async/dpdk/capture.h
new file mode 100644
index 000000000..1ace8eeb0
--- /dev/null
+++ b/src/msg/async/dpdk/capture.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_DPDK_CAPTURE_H
+#define CEPH_MSG_DPDK_CAPTURE_H
+
+#include <utility>
+
+// Callable that stores a value `x` together with a callable `f`.
+// Invoking it with arguments (a...) calls f(x, a...), so `x` behaves like
+// a captured variable handed to `f` as its first parameter.
+template <typename T, typename F>
+class capture_impl {
+  T x;
+  F f;
+ public:
+  // Construction from a non-const lvalue capture_impl is disabled
+  capture_impl(capture_impl &) = delete;
+  capture_impl( T && x, F && f )
+      : x{std::forward<T>(x)}, f{std::forward<F>(f)}
+  {}
+
+  // Calls f(x, args...); the return type is whatever that call yields.
+  template <typename ...Ts> auto operator()( Ts&&...args )
+  -> decltype(f( x, std::forward<Ts>(args)... ))
+  {
+    return f( x, std::forward<Ts>(args)... );
+  }
+
+  // Same call through a const capture_impl.
+  template <typename ...Ts> auto operator()( Ts&&...args ) const
+  -> decltype(f( x, std::forward<Ts>(args)... ))
+  {
+    return f( x, std::forward<Ts>(args)... );
+  }
+};
+
+// Convenience factory: builds a capture_impl from `x` and `f`, deducing
+// both types.  T and F are deduced from forwarding references, so an
+// lvalue argument makes the corresponding member a reference to the
+// caller's object rather than a copy.
+template <typename T, typename F>
+capture_impl<T,F> capture( T && x, F && f ) {
+  return capture_impl<T,F>(
+      std::forward<T>(x), std::forward<F>(f) );
+}
+
+#endif //CEPH_MSG_DPDK_CAPTURE_H
diff --git a/src/msg/async/dpdk/circular_buffer.h b/src/msg/async/dpdk/circular_buffer.h
new file mode 100644
index 000000000..2c92c1204
--- /dev/null
+++ b/src/msg/async/dpdk/circular_buffer.h
@@ -0,0 +1,347 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_CIRCULAR_BUFFER_HH_
+#define CEPH_CIRCULAR_BUFFER_HH_
+
+// A growable double-ended queue container that can be efficiently
+// extended (and shrunk) from both ends.  Implementation is a single
+// storage vector.
+//
+// Similar to libstdc++'s std::deque, except that it uses a single level
+// store, and so is more efficient for simple stored items.
+// Similar to boost::circular_buffer_space_optimized, except it uses
+// uninitialized storage for unoccupied elements (and thus move/copy
+// constructors instead of move/copy assignments, which are less efficient).
+
+#include <memory>
+#include <algorithm>
+
+#include "transfer.h"
+
+// Double-ended queue backed by one contiguous allocation.
+//
+// `begin` and `end` are free-running counters: they are reduced modulo the
+// capacity only when storage is indexed (see mask()), and push_front()
+// decrements `begin`, so the raw counters may wrap around the range of
+// size_t.  Only differences between counters are meaningful.
+template <typename T, typename Alloc = std::allocator<T>>
+class circular_buffer {
+  // State of the buffer; derives from the allocator so that allocator calls
+  // can be made directly on `_impl`.
+  struct impl : Alloc {
+    T* storage = nullptr;
+    // begin, end interpreted (mod capacity)
+    size_t begin = 0;
+    size_t end = 0;
+    size_t capacity = 0;
+  };
+  impl _impl;
+ public:
+  using value_type = T;
+  using size_type = size_t;
+  using reference = T&;
+  using pointer = T*;
+  using const_reference = const T&;
+  using const_pointer = const T*;
+ public:
+  // Movable on construction only; copying and assignment are disabled.
+  circular_buffer() = default;
+  circular_buffer(circular_buffer&& X);
+  circular_buffer(const circular_buffer& X) = delete;
+  ~circular_buffer();
+  circular_buffer& operator=(const circular_buffer&) = delete;
+  circular_buffer& operator=(circular_buffer&&) = delete;
+  void push_front(const T& data);
+  void push_front(T&& data);
+  template <typename... A>
+  void emplace_front(A&&... args);
+  void push_back(const T& data);
+  void push_back(T&& data);
+  template <typename... A>
+  void emplace_back(A&&... args);
+  // front()/back()/pop_*() perform no emptiness check.
+  T& front();
+  T& back();
+  void pop_front();
+  void pop_back();
+  bool empty() const;
+  size_t size() const;
+  size_t capacity() const;
+  // Element `idx` positions after the front; no bounds check.
+  T& operator[](size_t idx);
+  // Calls func on every element, front to back.
+  template <typename Func>
+  void for_each(Func func);
+  // access an element, may return wrong or destroyed element
+  // only useful if you do not rely on data accuracy (e.g. prefetch)
+  T& access_element_unsafe(size_t idx);
+ private:
+  void expand();
+  void maybe_expand(size_t nr = 1);
+  size_t mask(size_t idx) const;
+
+  // Random-access iterator holding a pointer to the buffer and a
+  // free-running position counter (same domain as impl::begin/end).
+  template<typename CB, typename ValueType>
+  struct cbiterator : std::iterator<std::random_access_iterator_tag, ValueType> {
+    typedef std::iterator<std::random_access_iterator_tag, ValueType> super_t;
+
+    ValueType& operator*() const { return cb->_impl.storage[cb->mask(idx)]; }
+    ValueType* operator->() const { return &cb->_impl.storage[cb->mask(idx)]; }
+    // prefix
+    cbiterator<CB, ValueType>& operator++() {
+      idx++;
+      return *this;
+    }
+    // postfix
+    cbiterator<CB, ValueType> operator++(int unused) {
+      auto v = *this;
+      idx++;
+      return v;
+    }
+    // prefix
+    cbiterator<CB, ValueType>& operator--() {
+      idx--;
+      return *this;
+    }
+    // postfix
+    cbiterator<CB, ValueType> operator--(int unused) {
+      auto v = *this;
+      idx--;
+      return v;
+    }
+    cbiterator<CB, ValueType> operator+(typename super_t::difference_type n) const {
+      return cbiterator<CB, ValueType>(cb, idx + n);
+    }
+    cbiterator<CB, ValueType> operator-(typename super_t::difference_type n) const {
+      return cbiterator<CB, ValueType>(cb, idx - n);
+    }
+    cbiterator<CB, ValueType>& operator+=(typename super_t::difference_type n) {
+      idx += n;
+      return *this;
+    }
+    cbiterator<CB, ValueType>& operator-=(typename super_t::difference_type n) {
+      idx -= n;
+      return *this;
+    }
+    bool operator==(const cbiterator<CB, ValueType>& rhs) const {
+      return idx == rhs.idx;
+    }
+    bool operator!=(const cbiterator<CB, ValueType>& rhs) const {
+      return idx != rhs.idx;
+    }
+    // The relational operators order two positions by their signed distance
+    // instead of by the raw counters.  The counters wrap (push_front() on a
+    // buffer whose begin counter is 0 moves it to SIZE_MAX), and comparing
+    // raw values would then place begin() after end().
+    bool operator<(const cbiterator<CB, ValueType>& rhs) const {
+      return *this - rhs < 0;
+    }
+    bool operator>(const cbiterator<CB, ValueType>& rhs) const {
+      return *this - rhs > 0;
+    }
+    bool operator>=(const cbiterator<CB, ValueType>& rhs) const {
+      return *this - rhs >= 0;
+    }
+    bool operator<=(const cbiterator<CB, ValueType>& rhs) const {
+      return *this - rhs <= 0;
+    }
+    // Signed distance from rhs to *this (modular difference of the counters).
+    typename super_t::difference_type operator-(const cbiterator<CB, ValueType>& rhs) const {
+      return idx - rhs.idx;
+    }
+   private:
+    CB* cb;
+    size_t idx;
+    cbiterator<CB, ValueType>(CB* b, size_t i) : cb(b), idx(i) {}
+    friend class circular_buffer;
+  };
+  friend class iterator;
+
+ public:
+  typedef cbiterator<circular_buffer, T> iterator;
+  typedef cbiterator<const circular_buffer, const T> const_iterator;
+
+  iterator begin() {
+    return iterator(this, _impl.begin);
+  }
+  const_iterator begin() const {
+    return const_iterator(this, _impl.begin);
+  }
+  iterator end() {
+    return iterator(this, _impl.end);
+  }
+  const_iterator end() const {
+    return const_iterator(this, _impl.end);
+  }
+  const_iterator cbegin() const {
+    return const_iterator(this, _impl.begin);
+  }
+  const_iterator cend() const {
+    return const_iterator(this, _impl.end);
+  }
+};
+
+// Maps a free-running counter onto a storage slot.  The bitwise AND equals
+// `idx % capacity` only when capacity is a power of two, which expand()
+// maintains by starting at 1 and doubling.
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::mask(size_t idx) const {
+  return idx & (_impl.capacity - 1);
+}
+
+// True when the begin and end counters coincide, i.e. no element is stored.
+template <typename T, typename Alloc>
+inline bool circular_buffer<T, Alloc>::empty() const {
+  return _impl.begin == _impl.end;
+}
+
+// Number of stored elements.  Unsigned subtraction keeps the result correct
+// even when the begin counter has wrapped below zero.
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::size() const {
+  return _impl.end - _impl.begin;
+}
+
+// Number of element slots in the current allocation (0 before the first
+// expansion).
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::capacity() const {
+  return _impl.capacity;
+}
+
+// Move constructor: takes over x's storage and counters, then resets x to
+// the default (empty, unallocated) state so its destructor has nothing to
+// destroy.
+template <typename T, typename Alloc>
+inline circular_buffer<T, Alloc>::circular_buffer(circular_buffer&& x)
+    : _impl(std::move(x._impl)) {
+  x._impl = {};
+}
+
+// Visits every stored element in front-to-back order, handing each one to
+// `func` by reference.
+template <typename T, typename Alloc>
+template <typename Func>
+inline void circular_buffer<T, Alloc>::for_each(Func func) {
+  T* const slots = _impl.storage;
+  const size_t index_mask = _impl.capacity - 1;
+  size_t pos = _impl.begin;
+  while (pos != _impl.end) {
+    func(slots[pos & index_mask]);
+    ++pos;
+  }
+}
+
+// Destroys every live element and then releases the backing storage.
+template <typename T, typename Alloc>
+inline circular_buffer<T, Alloc>::~circular_buffer() {
+  // Go through allocator_traits so that allocators without a destroy()
+  // member (std::allocator lost it in C++20) keep working.
+  for_each([this] (T& obj) {
+    std::allocator_traits<Alloc>::destroy(_impl, &obj);
+  });
+  // A default-constructed or moved-from buffer never allocated anything, and
+  // deallocate() is only specified for pointers obtained from allocate().
+  if (_impl.storage) {
+    _impl.deallocate(_impl.storage, _impl.capacity);
+  }
+}
+
+// Doubles the capacity (a buffer of capacity 0 grows to 1) and relocates the
+// live elements, in order, to the start of the new storage via
+// transfer_pass1()/transfer_pass2() from transfer.h.  If the first pass
+// throws, the new storage is cleaned up and the buffer is left unchanged.
+template <typename T, typename Alloc>
+void circular_buffer<T, Alloc>::expand() {
+  auto new_cap = std::max<size_t>(_impl.capacity * 2, 1);
+  auto new_storage = _impl.allocate(new_cap);
+  auto p = new_storage;
+  try {
+    for_each([this, &p] (T& obj) {
+      transfer_pass1(_impl, &obj, p);
+      p++;
+    });
+  } catch (...) {
+    // Roll back: destroy whatever already sits in the new storage, release
+    // it, and propagate.  allocator_traits is used so that allocators
+    // without a destroy() member keep working.
+    while (p != new_storage) {
+      std::allocator_traits<Alloc>::destroy(_impl, --p);
+    }
+    _impl.deallocate(new_storage, new_cap);
+    throw;
+  }
+  p = new_storage;
+  for_each([this, &p] (T& obj) {
+    transfer_pass2(_impl, &obj, p++);
+  });
+  // After the swaps, new_storage/new_cap describe the old allocation.
+  std::swap(_impl.storage, new_storage);
+  std::swap(_impl.capacity, new_cap);
+  _impl.begin = 0;
+  _impl.end = p - _impl.storage;
+  // On the very first expansion there is no old allocation to release.
+  if (new_storage) {
+    _impl.deallocate(new_storage, new_cap);
+  }
+}
+
+// Ensures there is room for `nr` additional elements.  expand() only doubles
+// the capacity, so it is repeated until the request fits; for the default
+// nr == 1 a single expansion is always sufficient.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::maybe_expand(size_t nr) {
+  while (_impl.end - _impl.begin + nr > _impl.capacity) {
+    expand();
+  }
+}
+
+// Copy-constructs `data` in the slot just before the current front.  The
+// begin counter is moved only after construction succeeded, so a throwing
+// copy leaves the buffer contents unchanged.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_front(const T& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.begin - 1)];
+  // allocator_traits works for allocators without a construct() member
+  // (std::allocator lost it in C++20).
+  std::allocator_traits<Alloc>::construct(_impl, p, data);
+  --_impl.begin;
+}
+
+// Move-constructs `data` in the slot just before the current front; the
+// begin counter is moved only after construction succeeded.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_front(T&& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.begin - 1)];
+  // allocator_traits works for allocators without a construct() member
+  // (std::allocator lost it in C++20).
+  std::allocator_traits<Alloc>::construct(_impl, p, std::move(data));
+  --_impl.begin;
+}
+
+// Constructs a new front element in place from `args`; the begin counter is
+// moved only after construction succeeded.
+template <typename T, typename Alloc>
+template <typename... Args>
+inline void circular_buffer<T, Alloc>::emplace_front(Args&&... args) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.begin - 1)];
+  // allocator_traits works for allocators without a construct() member
+  // (std::allocator lost it in C++20).
+  std::allocator_traits<Alloc>::construct(_impl, p, std::forward<Args>(args)...);
+  --_impl.begin;
+}
+
+// Copy-constructs `data` in the slot just past the current back.  The end
+// counter is advanced only after construction succeeded, so a throwing copy
+// leaves the buffer contents unchanged.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_back(const T& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.end)];
+  // allocator_traits works for allocators without a construct() member
+  // (std::allocator lost it in C++20).
+  std::allocator_traits<Alloc>::construct(_impl, p, data);
+  ++_impl.end;
+}
+
+// Move-constructs `data` in the slot just past the current back; the end
+// counter is advanced only after construction succeeded.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_back(T&& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.end)];
+  // allocator_traits works for allocators without a construct() member
+  // (std::allocator lost it in C++20).
+  std::allocator_traits<Alloc>::construct(_impl, p, std::move(data));
+  ++_impl.end;
+}
+
+// Constructs a new back element in place from `args`; the end counter is
+// advanced only after construction succeeded.
+template <typename T, typename Alloc>
+template <typename... Args>
+inline void circular_buffer<T, Alloc>::emplace_back(Args&&... args) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.end)];
+  // allocator_traits works for allocators without a construct() member
+  // (std::allocator lost it in C++20).
+  std::allocator_traits<Alloc>::construct(_impl, p, std::forward<Args>(args)...);
+  ++_impl.end;
+}
+
+// Reference to the first element.  No emptiness check is performed.
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::front() {
+  return _impl.storage[mask(_impl.begin)];
+}
+
+// Reference to the last element (the slot before the end counter).  No
+// emptiness check is performed.
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::back() {
+  return _impl.storage[mask(_impl.end - 1)];
+}
+
+// Destroys the first element and advances the begin counter.  No emptiness
+// check is performed.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::pop_front() {
+  // allocator_traits works for allocators without a destroy() member
+  // (std::allocator lost it in C++20).
+  std::allocator_traits<Alloc>::destroy(_impl, &front());
+  ++_impl.begin;
+}
+
+// Destroys the last element and moves the end counter back by one.  No
+// emptiness check is performed.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::pop_back() {
+  // allocator_traits works for allocators without a destroy() member
+  // (std::allocator lost it in C++20).
+  std::allocator_traits<Alloc>::destroy(_impl, &back());
+  --_impl.end;
+}
+
+// Reference to the element `idx` positions after the front.  The index is
+// not checked against size().
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::operator[](size_t idx) {
+  return _impl.storage[mask(_impl.begin + idx)];
+}
+
+// Same slot computation as operator[]: returns whatever occupies the slot
+// `idx` positions after the front, without checking that it holds a live
+// element.
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::access_element_unsafe(size_t idx) {
+  return _impl.storage[mask(_impl.begin + idx)];
+}
+
+#endif /* CEPH_CIRCULAR_BUFFER_HH_ */
diff --git a/src/msg/async/dpdk/const.h b/src/msg/async/dpdk/const.h
new file mode 100644
index 000000000..ea5dc49e5
--- /dev/null
+++ b/src/msg/async/dpdk/const.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_CONST_H_
+#define CEPH_MSG_CONST_H_
+
+#include <stdint.h>
+
+// IP protocol numbers, stored in 8 bits.
+// NOTE(review): 1 and 6 match the IANA assignments for ICMP and TCP;
+// `unused` (255) is presumably a "no protocol" sentinel -- confirm.
+enum class ip_protocol_num : uint8_t {
+  icmp = 1, tcp = 6, unused = 255
+};
+
+// Ethernet protocol (EtherType) numbers, stored in 16 bits and given here
+// in host byte order.
+enum class eth_protocol_num : uint16_t {
+  ipv4 = 0x0800, arp = 0x0806, ipv6 = 0x86dd
+};
+
+// Header sizes and the largest IP packet length, all in bytes.
+constexpr uint8_t eth_hdr_len = 14;
+constexpr uint8_t tcp_hdr_len_min = 20;
+constexpr uint8_t ipv4_hdr_len_min = 20;
+constexpr uint8_t ipv6_hdr_len_min = 40;
+constexpr uint16_t ip_packet_len_max = 65535;
+
+#endif
diff --git a/src/msg/async/dpdk/dpdk_rte.cc b/src/msg/async/dpdk/dpdk_rte.cc
new file mode 100644
index 000000000..96cf896f8
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.cc
@@ -0,0 +1,185 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <bitset>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_version.h>
+
+#include "DPDK.h"
+#include "dpdk_rte.h"
+
+namespace dpdk {
+
+  // Copies `str` into a mutable char buffer and appends a terminating NUL,
+  // so that data() can be handed out as a writable C string.
+  static inline std::vector<char> string2vector(std::string str) {
+    std::vector<char> buf;
+    buf.reserve(str.size() + 1);
+    buf.assign(str.begin(), str.end());
+    buf.push_back('\0');
+    return buf;
+  }
+
+  // Out-of-class definitions of the static data members of class eal.
+  // `lock` and `cond` guard `funcs`, the queue of closures that the thread
+  // started by eal::init() pops and runs.
+  bool eal::initialized = false;
+  std::thread eal::t;
+  std::mutex eal::lock;
+  std::condition_variable eal::cond;
+  std::list<std::function<void()>> eal::funcs;
+
+  // Number of set bits in `n`.
+  static int bitcount(unsigned long long n)
+  {
+    const std::bitset<CHAR_BIT * sizeof(n)> bits{n};
+    return bits.count();
+  }
+
+  // Number of set bits in the value of the hexadecimal digit `c`.
+  // Anything that is neither a decimal digit nor an upper-case letter is
+  // treated as a lower-case hex letter; callers are expected to have
+  // validated `c` with isxdigit().
+  static int hex2bitcount(unsigned char c)
+  {
+    const int nibble = isdigit(c) ? c - '0'
+                     : isupper(c) ? c - 'A' + 10
+                                  : c - 'a' + 10;
+    return bitcount(nibble);
+  }
+
+  // Counts the bits set in a hexadecimal core mask string.
+  //
+  // @param buf NUL-terminated mask, optionally prefixed with "0x" or "0X"
+  // @return number of set bits, or -EINVAL if a non-hex character is found
+  static int coremask_bitcount(const char *buf)
+  {
+    int count = 0;
+
+    if (buf[0] == '0' &&
+        ((buf[1] == 'x') || (buf[1] == 'X')))
+      buf += 2;
+
+    for (int i = 0; buf[i] != '\0'; i++) {
+      // The <cctype> classifiers are undefined for negative arguments other
+      // than EOF, so the byte is converted to unsigned char first.
+      const unsigned char c = static_cast<unsigned char>(buf[i]);
+      if (isxdigit(c) == 0)
+        return -EINVAL;
+      count += hex2bitcount(c);
+    }
+    return count;
+  }
+
+  // Starts the detached EAL thread, which calls rte_eal_init() and then
+  // serves closures queued on `funcs` forever.  The caller blocks until the
+  // thread reports the outcome of rte_eal_init().
+  //
+  // @param c context providing the ms_dpdk_* configuration values
+  // @return 1 if the EAL was already initialized; -EINVAL if
+  //         ms_dpdk_coremask is not a valid hex mask or selects fewer cores
+  //         than ms_async_op_threads; the negative rte_eal_init() result if
+  //         EAL initialization fails; 0 on success
+  int eal::init(CephContext *c)
+  {
+    if (initialized) {
+      return 1;
+    }
+
+    bool done = false;
+    // Outcome reported by the EAL thread; written and read under `lock`.
+    int init_ret = 0;
+    auto coremask = c->_conf.get_val<std::string>("ms_dpdk_coremask");
+    int coremaskbit = coremask_bitcount(coremask.c_str());
+
+    if (coremaskbit <= 0
+        || static_cast<uint64_t>(coremaskbit) < c->_conf->ms_async_op_threads)
+      return -EINVAL;
+
+    // The lambda captures locals by reference; that is safe because this
+    // function does not return before the thread has set `done`, and the
+    // thread does not touch the captures afterwards.
+    t = std::thread([&]() {
+      // TODO: Inherit these from the app parameters - "opts"
+      std::vector<std::vector<char>> args {
+          string2vector(string("ceph")),
+          string2vector("-c"), string2vector(c->_conf.get_val<std::string>("ms_dpdk_coremask")),
+          string2vector("-n"), string2vector(c->_conf->ms_dpdk_memory_channel),
+      };
+
+      Tub<std::string> hugepages_path;
+      if (!c->_conf->ms_dpdk_hugepages.empty()) {
+        hugepages_path.construct(c->_conf->ms_dpdk_hugepages);
+      }
+
+      // If "hugepages" is not provided and DPDK PMD drivers mode is requested -
+      // use the default DPDK huge tables configuration.
+      if (hugepages_path) {
+        args.push_back(string2vector("--huge-dir"));
+        args.push_back(string2vector(*hugepages_path));
+
+        //
+        // We don't know what is going to be our networking configuration so we
+        // assume there is going to be a queue per-CPU. Plus we'll give a DPDK
+        // 64MB for "other stuff".
+        //
+        // NOTE(review): the CPU count used for sizing comes from the fixed
+        // mask fffefffe, not from the configured coremask.
+        unsigned int x;
+        std::stringstream ss;
+        ss << std::hex << "fffefffe";
+        ss >> x;
+        size_t size_MB = mem_size(bitcount(x)) >> 20;
+        std::stringstream size_MB_str;
+        size_MB_str << size_MB;
+
+        args.push_back(string2vector("-m"));
+        args.push_back(string2vector(size_MB_str.str()));
+      } else if (!c->_conf->ms_dpdk_pmd.empty()) {
+        args.push_back(string2vector("--no-huge"));
+      }
+
+      std::string rte_file_prefix;
+      rte_file_prefix = "rte_";
+      rte_file_prefix += c->_conf->name.to_str();
+      args.push_back(string2vector("--file-prefix"));
+      args.push_back(string2vector(rte_file_prefix));
+
+      std::vector<char*> cargs;
+
+      for (auto&& a: args) {
+        cargs.push_back(a.data());
+      }
+      /* initialise the EAL for all */
+      int ret = rte_eal_init(cargs.size(), cargs.data());
+      if (ret < 0) {
+        // Report the failure and wake the caller; returning without setting
+        // `done` would leave init() blocked on the condition variable
+        // forever.
+        std::unique_lock<std::mutex> l(lock);
+        init_ret = ret;
+        done = true;
+        cond.notify_all();
+        return ret;
+      }
+
+      std::unique_lock<std::mutex> l(lock);
+      initialized = true;
+      done = true;
+      cond.notify_all();
+      // Serve queued closures; each one runs with `lock` held, and waiters
+      // are notified after every completed closure.
+      while (true) {
+        if (!funcs.empty()) {
+          auto f = std::move(funcs.front());
+          funcs.pop_front();
+          f();
+          cond.notify_all();
+        } else {
+          cond.wait(l);
+        }
+      }
+    });
+    t.detach();
+    std::unique_lock<std::mutex> l(lock);
+    while (!done)
+      cond.wait(l);
+    return init_ret;
+  }
+
+  // Amount of memory, in bytes, to reserve for DPDK when `num_cpus` CPUs
+  // are used.
+  size_t eal::mem_size(int num_cpus)
+  {
+    //
+    // PMD mempool memory:
+    //
+    // The networking configuration is not known at this point, so one queue
+    // per CPU is assumed.
+    //
+    const size_t pmd_mempool_bytes = num_cpus * qp_mempool_obj_size();
+
+    // An additional 64MB is handed to DPDK for "other stuff".
+    const size_t misc_bytes = (64UL << 20);
+
+    return pmd_mempool_bytes + misc_bytes;
+  }
+
+} // namespace dpdk
diff --git a/src/msg/async/dpdk/dpdk_rte.h b/src/msg/async/dpdk/dpdk_rte.h
new file mode 100644
index 000000000..4aa838994
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.h
@@ -0,0 +1,74 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef CEPH_DPDK_RTE_H_
+#define CEPH_DPDK_RTE_H_
+
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+#include <bitset>
+#include <rte_config.h>
+#include <rte_version.h>
+#include <boost/program_options.hpp>
+
+/*********************** Compat section ***************************************/
+// We currently support only versions 2.0 and above.
+// The guard rejects only versions older than 2.0.0, so 2.0.0 itself is
+// accepted; the message states exactly that.
+#if (RTE_VERSION < RTE_VERSION_NUM(2,0,0,0))
+#error "DPDK version 2.0.0 or above is required"
+#endif
+
+#if defined(RTE_MBUF_REFCNT_ATOMIC)
+#warning "CONFIG_RTE_MBUF_REFCNT_ATOMIC should be disabled in DPDK's " \
+         "config/common_linuxapp"
+#endif
+/******************************************************************************/
+
+namespace dpdk {
+
+// DPDK Environment Abstraction Layer
+// Static-only facade around the DPDK EAL: init() starts a thread that
+// initializes the EAL and then runs closures queued on `funcs`.
+class eal {
+ public:
+  using cpuset = std::bitset<RTE_MAX_LCORE>;
+
+  // `lock` and `cond` protect and signal changes to `funcs`.
+  static std::mutex lock;
+  static std::condition_variable cond;
+  static std::list<std::function<void()>> funcs;
+  static int init(CephContext *c);
+  // Queues `f` on `funcs` and blocks the caller until the queued wrapper has
+  // run f and set `done`.  The wrapper captures `f` and `done` by reference,
+  // which is valid because this function does not return before the wrapper
+  // has finished.
+  // NOTE(review): nothing here checks that init() has started the thread
+  // that drains `funcs`; without it the wait never ends -- confirm callers.
+  static void execute_on_master(std::function<void()> &&f) {
+    bool done = false;
+    std::unique_lock<std::mutex> l(lock);
+    funcs.emplace_back([&]() { f(); done = true; });
+    cond.notify_all();
+    while (!done)
+      cond.wait(l);
+  }
+  /**
+   * Returns the amount of memory needed for DPDK
+   * @param num_cpus Number of CPUs the application is going to use
+   *
+   * @return memory size in bytes
+   */
+  static size_t mem_size(int num_cpus);
+  // Set by the EAL thread once rte_eal_init() has succeeded.
+  static bool initialized;
+  // Handle of the EAL thread; init() detaches it right after starting it.
+  static std::thread t;
+};
+
+} // namespace dpdk
+#endif // CEPH_DPDK_RTE_H_
diff --git a/src/msg/async/dpdk/ethernet.cc b/src/msg/async/dpdk/ethernet.cc
new file mode 100644
index 000000000..9aca50788
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.cc
@@ -0,0 +1,16 @@
+#include <iomanip>
+
+#include "ethernet.h"
+
+// Prints the address as six colon-separated, zero-padded, two-digit
+// hexadecimal octets (e.g. 00:1b:21:0a:ff:03).  The stream's formatting
+// flags and fill character are restored before returning, so the caller's
+// number base is not changed as a side effect.
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea) {
+  auto& m = ea.mac;
+  using u = uint32_t;
+  const auto saved_flags = os.flags();
+  const auto saved_fill = os.fill('0');
+  os << std::hex << std::right;
+  bool first = true;
+  for (auto octet : m) {
+    if (!first) {
+      os << ":";
+    }
+    first = false;
+    // setw() only affects the next insertion, so it is repeated per octet.
+    os << std::setw(2) << u(octet);
+  }
+  os.fill(saved_fill);
+  os.flags(saved_flags);
+  return os;
+}
diff --git a/src/msg/async/dpdk/ethernet.h b/src/msg/async/dpdk/ethernet.h
new file mode 100644
index 000000000..b007425fe
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_ETHERNET_H_
+#define CEPH_MSG_ETHERNET_H_
+
+#include <array>
+#include <sstream>
+
+#include "include/ceph_assert.h"
+#include "byteorder.h"
+
+// Six-byte Ethernet (MAC) address.  The struct is packed so it can be
+// embedded in packed header structs such as eth_hdr.
+struct ethernet_address {
+  // Leaves `mac` uninitialized.
+  ethernet_address() {}
+
+  // Copies six bytes starting at `eaddr`; the caller must provide at least
+  // that many.
+  ethernet_address(const uint8_t *eaddr) {
+    std::copy(eaddr, eaddr + 6, mac.begin());
+  }
+
+  // Asserts that exactly six bytes are supplied.
+  ethernet_address(std::initializer_list<uint8_t> eaddr) {
+    ceph_assert(eaddr.size() == mac.size());
+    std::copy(eaddr.begin(), eaddr.end(), mac.begin());
+  }
+
+  // Both conversions return an unmodified copy: the address is a byte
+  // sequence, so there is nothing to swap.
+  ethernet_address ntoh() {
+    return *this;
+  }
+  ethernet_address hton() {
+    return *this;
+  }
+  std::array<uint8_t, 6> mac;
+} __attribute__((packed));
+
+// Two addresses are equal when all six bytes match.
+inline bool operator==(const ethernet_address& a, const ethernet_address& b) {
+  const auto& lhs = a.mac;
+  const auto& rhs = b.mac;
+  return lhs == rhs;
+}
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea);
+
+// Link-layer constants and types for Ethernet.
+struct ethernet {
+  using address = ethernet_address;
+  // The all-ones address ff:ff:ff:ff:ff:ff.
+  static address broadcast_address() {
+      return {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+  }
+  // NOTE(review): 1 is presumably the ARP hardware-type code for Ethernet --
+  // confirm against the ARP implementation.
+  static constexpr uint16_t arp_hardware_type() { return 1; }
+};
+
+// Ethernet frame header: destination MAC, source MAC and protocol number.
+// Packed so that it matches the on-wire layout byte for byte.
+struct eth_hdr {
+  ethernet_address dst_mac;
+  ethernet_address src_mac;
+  uint16_t eth_proto;
+  // Copy of this header with eth_proto passed through ::hton(); the MAC
+  // addresses are copied unchanged.
+  eth_hdr hton() {
+    eth_hdr converted(*this);
+    converted.eth_proto = ::hton(eth_proto);
+    return converted;
+  }
+  // Copy of this header with eth_proto passed through ::ntoh(); the MAC
+  // addresses are copied unchanged.
+  eth_hdr ntoh() {
+    eth_hdr converted(*this);
+    converted.eth_proto = ::ntoh(eth_proto);
+    return converted;
+  }
+} __attribute__((packed));
+
+ethernet_address parse_ethernet_address(std::string addr);
+
+#endif /* CEPH_MSG_ETHERNET_H_ */
diff --git a/src/msg/async/dpdk/ip_types.h b/src/msg/async/dpdk/ip_types.h
new file mode 100644
index 000000000..356d8fd6e
--- /dev/null
+++ b/src/msg/async/dpdk/ip_types.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_IP_TYPES_H_H
+#define CEPH_IP_TYPES_H_H
+
+#include <boost/asio/ip/address_v4.hpp>
+#include <string>
+
+class Packet;
+// Forward-declared with the same class-key as its definition in ethernet.h
+// (a struct) to avoid mismatched-tag diagnostics.
+struct ethernet_address;
+// Callback taking a link-layer address, a packet and an int.
+// NOTE(review): the int is presumably a status/error code -- confirm.
+using resolution_cb = std::function<void (const ethernet_address&, Packet, int)>;
+
+// IPv4 endpoint: a 32-bit address plus a 16-bit port.
+struct ipv4_addr {
+  uint32_t ip;
+  uint16_t port;
+
+  // Address 0, port 0.
+  ipv4_addr() : ip(0), port(0) {}
+  ipv4_addr(uint32_t ip, uint16_t port) : ip(ip), port(port) {}
+  // Address 0 with the given port.
+  ipv4_addr(uint16_t port) : ip(0), port(port) {}
+  ipv4_addr(const std::string &addr);
+  ipv4_addr(const std::string &addr, uint16_t port);
+
+  // Takes the IPv4 address of `ad`, passed through ntoh(), and its port.
+  ipv4_addr(const entity_addr_t &ad)
+      : ip(ntoh(ad.in4_addr().sin_addr.s_addr)),
+        port(ad.get_port()) {}
+
+  // Delegates to the const-reference overload above.
+  ipv4_addr(entity_addr_t &&addr) : ipv4_addr(addr) {}
+};
+
+// Bare 32-bit IPv4 address; packed so it can be embedded in packed headers.
+struct ipv4_address {
+  ipv4_address() : ip(0) {}
+  explicit ipv4_address(uint32_t ip) : ip(ip) {}
+  // Parses dotted-decimal text via boost::asio.
+  explicit ipv4_address(const std::string& addr)
+      : ip(static_cast<uint32_t>(boost::asio::ip::address_v4::from_string(addr).to_ulong())) {}
+  // Keeps the address part of an endpoint and drops the port.
+  ipv4_address(ipv4_addr addr) : ip(addr.ip) {}
+
+  uint32_t ip;
+
+  // Copy with the address passed through ::hton().
+  ipv4_address hton() {
+    return ipv4_address(::hton(ip));
+  }
+  // Copy with the address passed through ::ntoh().
+  ipv4_address ntoh() {
+    return ipv4_address(::ntoh(ip));
+  }
+
+  friend bool operator==(ipv4_address x, ipv4_address y) {
+    return x.ip == y.ip;
+  }
+  friend bool operator!=(ipv4_address x, ipv4_address y) {
+    return !(x.ip == y.ip);
+  }
+} __attribute__((packed));
+
+// True for the all-zero address.
+static inline bool is_unspecified(ipv4_address addr) {
+  return addr == ipv4_address{};
+}
+
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a);
+
+namespace std {
+
+  // Lets ipv4_address be used as a key in unordered containers; the hash is
+  // simply the numeric address.
+  template <>
+  struct hash<ipv4_address> {
+    size_t operator()(ipv4_address a) const {
+      return static_cast<size_t>(a.ip);
+    }
+  };
+
+}
+
+#endif //CEPH_IP_TYPES_H_H
diff --git a/src/msg/async/dpdk/net.cc b/src/msg/async/dpdk/net.cc
new file mode 100644
index 000000000..6e361f182
--- /dev/null
+++ b/src/msg/async/dpdk/net.cc
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ */
+
+#include "net.h"
+#include "DPDK.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "net "
+
+// Binds the interface to a device for the event center `center`:
+//  - registers a receive callback (for the center's id) that hands every
+//    incoming packet to dispatch_packet();
+//  - registers a packet provider on the device queue for the center's id
+//    that pulls outgoing packets from the L3 providers in _pkt_providers.
+interface::interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center)
+    : cct(cct), _dev(dev),
+      _rx(_dev->receive(
+          center->get_id(),
+          [center, this] (Packet p) {
+            return dispatch_packet(center, std::move(p));
+          }
+      )),
+      _hw_address(_dev->hw_address()),
+      _hw_features(_dev->get_hw_features()) {
+  auto idx = 0u;
+  // NOTE(review): qid is captured by the lambda below but never used there.
+  unsigned qid = center->get_id();
+  dev->queue_for_cpu(center->get_id()).register_packet_provider([this, idx, qid] () mutable {
+    Tub<Packet> p;
+    // Poll each provider at most once per call, resuming after the provider
+    // polled last time (idx persists across calls because the lambda is
+    // mutable).
+    for (size_t i = 0; i < _pkt_providers.size(); i++) {
+      auto l3p = _pkt_providers[idx++]();
+      if (idx == _pkt_providers.size())
+        idx = 0;
+      if (l3p) {
+        auto l3pv = std::move(*l3p);
+        // Prepend the Ethernet header and convert it with hton().
+        auto eh = l3pv.p.prepend_header<eth_hdr>();
+        eh->dst_mac = l3pv.to;
+        eh->src_mac = _hw_address;
+        eh->eth_proto = uint16_t(l3pv.proto_num);
+        *eh = eh->hton();
+        ldout(this->cct, 10) << "=== tx === proto " << std::hex << uint16_t(l3pv.proto_num)
+                       << " " << _hw_address << " -> " << l3pv.to
+                       << " length " << std::dec << l3pv.p.len() << dendl;
+        p = std::move(l3pv.p);
+        return p;
+      }
+    }
+    // No provider had a packet: return the empty Tub.
+    return p;
+  });
+}
+
+// Registers the receive path for one L3 protocol.  A stream entry keyed by
+// the protocol number is created from the `forward` callback, and `next` is
+// attached as listener of that entry's packet stream.  Registering the same
+// protocol twice trips the assertion.
+subscription<Packet, ethernet_address> interface::register_l3(
+    eth_protocol_num proto_num,
+    std::function<int (Packet p, ethernet_address from)> next,
+    std::function<bool (forward_hash&, Packet& p, size_t)> forward)
+{
+  const uint16_t proto_key = uint16_t(proto_num);
+  auto inserted = _proto_map.emplace(
+      std::piecewise_construct,
+      std::make_tuple(proto_key),
+      std::forward_as_tuple(std::move(forward)));
+  ceph_assert(inserted.second);
+  l3_rx_stream& stream = inserted.first->second;
+  return stream.packet_stream.listen(std::move(next));
+}
+
+// Forwards to the device's hash2cpu() for the given hash value.
+unsigned interface::hash2cpu(uint32_t hash) {
+  return _dev->hash2cpu(hash);
+}
+
+// Returns the device's RSS key; dispatch_packet() feeds it to
+// toeplitz_hash().
+const rss_key_type& interface::rss_key() const {
+  return _dev->rss_key();
+}
+
+// Forwards to the device's hw_queues_count().
+uint16_t interface::hw_queues_count() const {
+  return _dev->hw_queues_count();
+}
+
+// One-shot event that delivers a forwarded packet to the device's
+// l2receive() for queue `dst`, decrements the forwarder's outstanding
+// counter and then deletes itself.
+// NOTE(review): queue_depth refers to a thread-local counter owned by the
+// thread that called interface::forward(); if do_request() runs on a
+// different thread, the decrement is unsynchronized -- confirm.
+class C_handle_l2forward : public EventCallback {
+  std::shared_ptr<DPDKDevice> sdev;
+  unsigned &queue_depth;
+  Packet p;
+  unsigned dst;
+
+ public:
+  C_handle_l2forward(std::shared_ptr<DPDKDevice> &p, unsigned &qd, Packet pkt, unsigned target)
+      : sdev(p), queue_depth(qd), p(std::move(pkt)), dst(target) {}
+  // The argument is unused.
+  void do_request(uint64_t fd) {
+    sdev->l2receive(dst, std::move(p));
+    queue_depth--;
+    delete this;
+  }
+};
+
+// Hands packet `p` to the event center of worker `target` by dispatching a
+// C_handle_l2forward event.  At most 1000 forwards may be outstanding per
+// calling thread; beyond that the packet is dropped without notice.
+void interface::forward(EventCenter *source, unsigned target, Packet p) {
+  // Per-thread count of forwards that have been dispatched but not yet
+  // handled; C_handle_l2forward::do_request() decrements it.
+  static __thread unsigned queue_depth;
+
+  if (queue_depth < 1000) {
+    queue_depth++;
+    // FIXME: need ensure this event not be called after EventCenter destruct
+    _dev->workers[target]->center.dispatch_event_external(
+        new C_handle_l2forward(_dev, queue_depth, std::move(p.free_on_cpu(source)), target));
+  }
+}
+
+// Entry point for every received frame.
+//
+// Looks up the L3 stream registered for the frame's protocol number, asks
+// the device which queue should handle the frame, and either forwards the
+// frame to that worker or strips the Ethernet header and feeds the packet to
+// the local L3 stream.
+//
+// Returns the result of the L3 stream's produce() when the packet is
+// consumed locally, and 0 in every other case (no Ethernet header, unknown
+// protocol, forwarded to another worker, or stream not ready).
+int interface::dispatch_packet(EventCenter *center, Packet p) {
+  auto eh = p.get_header<eth_hdr>();
+  if (eh) {
+    auto i = _proto_map.find(ntoh(eh->eth_proto));
+    auto hwrss = p.rss_hash();
+    if (hwrss) {
+      ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+                     << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+                     << " length " << std::dec << p.len() << " rss_hash " << *p.rss_hash() << dendl;
+    } else {
+      ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+                     << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+                     << " length " << std::dec << p.len() << dendl;
+    }
+    if (i != _proto_map.end()) {
+      l3_rx_stream& l3 = i->second;
+      // The callback supplies the hash used to pick the destination: the
+      // packet's own RSS hash when present, otherwise a Toeplitz hash over
+      // the data collected by the protocol's forward() callback, or 0 if
+      // that callback declines.
+      auto fw = _dev->forward_dst(center->get_id(), [&p, &l3, this] () {
+        auto hwrss = p.rss_hash();
+        if (hwrss) {
+          return *hwrss;
+        } else {
+          forward_hash data;
+          if (l3.forward(data, p, sizeof(eth_hdr))) {
+            return toeplitz_hash(rss_key(), data);
+          }
+          return 0u;
+        }
+      });
+      if (fw != center->get_id()) {
+        ldout(cct, 1) << __func__ << " forward to " << fw << dendl;
+        forward(center, fw, std::move(p));
+      } else {
+        // The source address is copied out before the header is trimmed off.
+        auto h = eh->ntoh();
+        auto from = h.src_mac;
+        p.trim_front(sizeof(*eh));
+        // avoid chaining, since queue length is unlimited
+        // drop instead.
+        if (l3.ready()) {
+          return l3.packet_stream.produce(std::move(p), from);
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// One-shot event that passes an (L2 address, L3 address) pair to a worker's
+// arp_learn() and then deletes itself.
+class C_arp_learn : public EventCallback {
+  DPDKWorker *worker;
+  ethernet_address l2_addr;
+  ipv4_address l3_addr;
+
+ public:
+  C_arp_learn(DPDKWorker *w, ethernet_address l2, ipv4_address l3)
+      : worker(w), l2_addr(l2), l3_addr(l3) {}
+  // The argument is unused.
+  void do_request(uint64_t id) {
+    worker->arp_learn(l2_addr, l3_addr);
+    delete this;
+  }
+};
+
+// Broadcasts a learned (L2, L3) address pair: a C_arp_learn event is
+// dispatched to the event center of every worker of the device.
+void interface::arp_learn(ethernet_address l2, ipv4_address l3)
+{
+  for (auto &&worker : _dev->workers) {
+    auto *learn_event = new C_arp_learn(worker, l2, l3);
+    worker->center.dispatch_event_external(learn_event);
+  }
+}
+
+// Remembers the interface and protocol number, and registers `func` as a
+// packet provider on the interface.
+l3_protocol::l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func)
+    : _netif(netif), _proto_num(proto_num)  {
+  _netif->register_packet_provider(std::move(func));
+}
+
+// Registers the receive and forward-hash callbacks for this protocol with
+// the interface and returns the resulting subscription.
+subscription<Packet, ethernet_address> l3_protocol::receive(
+    std::function<int (Packet, ethernet_address)> rx_fn,
+    std::function<bool (forward_hash &h, Packet &p, size_t s)> forward) {
+  return _netif->register_l3(_proto_num, std::move(rx_fn), std::move(forward));
+}
diff --git a/src/msg/async/dpdk/net.h b/src/msg/async/dpdk/net.h
new file mode 100644
index 000000000..63f0422b7
--- /dev/null
+++ b/src/msg/async/dpdk/net.h
@@ -0,0 +1,138 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_NET_H
+#define CEPH_MSG_DPDK_NET_H
+
+#include "const.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "toeplitz.h"
+
+struct hw_features {  // Offload switches and size limits of a device; every switch defaults to off.
+  // Enable tx ip header checksum offload
+  bool tx_csum_ip_offload = false;
+  // Enable tx l4 (TCP or UDP) checksum offload
+  bool tx_csum_l4_offload = false;
+  // Enable rx checksum offload
+  bool rx_csum_offload = false;
+  // LRO is enabled
+  bool rx_lro = false;
+  // Enable tx TCP segment offload
+  bool tx_tso = false;
+  // Enable tx UDP fragmentation offload
+  bool tx_ufo = false;
+  // Maximum Transmission Unit
+  uint16_t mtu = 1500;
+  // Maximum packet len when TCP/UDP offload is enabled
+  uint16_t max_packet_len = ip_packet_len_max - eth_hdr_len;
+};
+
+// Fixed-capacity (64 byte) buffer that accumulates the bytes fed into the
+// forwarding hash.  Multi-byte values are appended least-significant byte
+// first.
+class forward_hash {
+  uint8_t data[64];
+  size_t end_idx = 0;
+ public:
+  // Number of bytes appended so far.
+  size_t size() const {
+    return end_idx;
+  }
+  // Appends a single byte; asserts when the buffer is already full.
+  void push_back(uint8_t b) {
+    ceph_assert(end_idx < sizeof(data));
+    data[end_idx] = b;
+    ++end_idx;
+  }
+  // Appends the low byte of b, then the high byte.
+  void push_back(uint16_t b) {
+    const auto low_byte = static_cast<uint8_t>(b);
+    const auto high_byte = static_cast<uint8_t>(b >> 8);
+    push_back(low_byte);
+    push_back(high_byte);
+  }
+  // Appends the low 16 bits of b, then the high 16 bits.
+  void push_back(uint32_t b) {
+    const auto low_half = static_cast<uint16_t>(b);
+    const auto high_half = static_cast<uint16_t>(b >> 16);
+    push_back(low_half);
+    push_back(high_half);
+  }
+  // Unchecked read access to the byte at idx.
+  const uint8_t& operator[](size_t idx) const {
+    return data[idx];
+  }
+};
+
+class interface;
+
+class l3_protocol {  // Handle through which one L3 protocol (identified by its ethernet protocol number) attaches to an interface.
+ public:
+  struct l3packet {  // A Packet bundled with its ethernet protocol number and the ethernet address `to`.
+    eth_protocol_num proto_num;
+    ethernet_address to;
+    Packet p;
+  };
+  using packet_provider_type = std::function<Tub<l3packet> ()>;  // callable that yields a Tub<l3packet>
+
+ private:
+  interface* _netif;  // interface passed to the constructor; stored as a raw pointer
+  eth_protocol_num _proto_num;  // protocol number handed to interface::register_l3() by receive()
+
+ public:
+  explicit l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func);  // registers func as a packet provider on netif
+  subscription<Packet, ethernet_address> receive(  // registers rx_fn and forward for this protocol on the interface
+      std::function<int (Packet, ethernet_address)> rx_fn,
+      std::function<bool (forward_hash &h, Packet &p, size_t s)> forward);
+
+ private:
+  friend class interface;
+};
+
+class DPDKDevice;
+struct ipv4_address;
+
+// Ethernet-level front end of a DPDKDevice: keeps one receive stream per
+// registered L3 protocol (keyed by protocol number) together with the list of
+// packet providers registered by those protocols.
+class interface {
+  CephContext *cct;
+  // Receive side of one L3 protocol: the stream packets are produced into,
+  // plus the callback that fills a forward_hash for a packet.
+  struct l3_rx_stream {
+    stream<Packet, ethernet_address> packet_stream;
+    std::function<bool (forward_hash&, Packet&, size_t)> forward;
+    // True once a listener has started the stream.
+    bool ready() { return packet_stream.started(); }
+    // fw is a named rvalue reference and therefore an lvalue inside the
+    // constructor; it has to be moved explicitly, otherwise the std::function
+    // (and whatever it captured) is copied.
+    explicit l3_rx_stream(std::function<bool (forward_hash&, Packet&, size_t)>&& fw) : forward(std::move(fw)) {}
+  };
+  std::unordered_map<uint16_t, l3_rx_stream> _proto_map;
+  std::shared_ptr<DPDKDevice> _dev;
+  subscription<Packet> _rx;
+  ethernet_address _hw_address;
+  struct hw_features _hw_features;
+  std::vector<l3_protocol::packet_provider_type> _pkt_providers;
+
+ private:
+  int dispatch_packet(EventCenter *c, Packet p);
+ public:
+  explicit interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center);
+  ethernet_address hw_address() { return _hw_address; }
+  const struct hw_features& get_hw_features() const { return _hw_features; }
+  // Registers the receive callback `next` and the hash callback `forward`
+  // for protocol proto_num.
+  subscription<Packet, ethernet_address> register_l3(
+      eth_protocol_num proto_num,
+      std::function<int (Packet, ethernet_address)> next,
+      std::function<bool (forward_hash&, Packet&, size_t)> forward);
+  void forward(EventCenter *source, unsigned target, Packet p);
+  unsigned hash2cpu(uint32_t hash);
+  // Appends func to the list of packet providers.
+  void register_packet_provider(l3_protocol::packet_provider_type func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  const rss_key_type& rss_key() const;
+  uint16_t hw_queues_count() const;
+  // Schedules the l2/l3 mapping to be learned on every worker of the device.
+  void arp_learn(ethernet_address l2, ipv4_address l3);
+  friend class l3_protocol;
+};
+
+#endif //CEPH_MSG_DPDK_NET_H
diff --git a/src/msg/async/dpdk/queue.h b/src/msg/async/dpdk/queue.h
new file mode 100644
index 000000000..984ddca13
--- /dev/null
+++ b/src/msg/async/dpdk/queue.h
@@ -0,0 +1,96 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_QUEUE_H_
+#define CEPH_MSG_DPDK_QUEUE_H_
+
+#include <queue>
+
+#include "circular_buffer.h"
+
+template <typename T>
+class queue {  // Bounded FIFO: a std::queue over circular_buffer<T> that push() caps at _max items.
+  std::queue<T, circular_buffer<T>> _q;
+  size_t _max;  // capacity limit, fixed at construction
+
+ public:
+  explicit queue(size_t size): _max(size) {}
+
+  // Push an item.
+  //
+  // Returns false if the queue was full and the item was not pushed.
+  bool push(T&& a);
+
+  // Pops and returns the front item; the queue is not checked for emptiness.
+  T pop();
+
+  // Consumes items from the queue, passing them to @func, until @func
+  // returns false or the queue is empty
+  //
+  // Returns false if func returned false.
+  template <typename Func>
+  bool consume(Func&& func);  // NOTE(review): no definition is visible in this header -- confirm before use
+
+  // Returns true when the queue is empty.
+  bool empty() const;
+
+  // Returns true when the queue is full.
+  bool full() const;
+
+  size_t size() const { return _q.size(); }  // number of items currently queued
+
+  // Destroy any items in the queue
+  void clear() {
+    while (!_q.empty()) {
+      _q.pop();
+    }
+  }
+};
+
+// Appends data at the back of the queue when there is room.
+//
+// Returns true if the item was stored, or false if the queue already holds
+// _max items; in that case data is not moved from.
+template <typename T>
+inline bool queue<T>::push(T&& data) {
+  if (_q.size() >= _max) {
+    return false;
+  }
+  // This class declares no notify_not_empty() hook, so nothing is signalled
+  // after a push; calling such a function here would not compile.
+  _q.push(std::move(data));
+  return true;
+}
+
+// Moves the front element out of the queue, removes it and returns it.
+// The queue is not checked for emptiness first.
+template <typename T>
+inline T queue<T>::pop() {
+  T front_item = std::move(_q.front());
+  _q.pop();
+  return front_item;
+}
+
+// True when no items are queued.
+template <typename T>
+inline bool queue<T>::empty() const {
+  const bool has_no_items = _q.empty();
+  return has_no_items;
+}
+
+// True when the number of queued items equals the configured maximum.
+template <typename T>
+inline bool queue<T>::full() const {
+  const size_t queued = _q.size();
+  return _max == queued;
+}
+
+#endif /* CEPH_MSG_DPDK_QUEUE_H_ */
diff --git a/src/msg/async/dpdk/shared_ptr.h b/src/msg/async/dpdk/shared_ptr.h
new file mode 100644
index 000000000..d078063b3
--- /dev/null
+++ b/src/msg/async/dpdk/shared_ptr.h
@@ -0,0 +1,391 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:4; indent-tabs-mode:nil -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_LW_SHARED_PTR_H_
+#define CEPH_LW_SHARED_PTR_H_
+
+#include <utility>
+#include <type_traits>
+#include <functional>
+#include <iostream>
+
+// This header defines a shared pointer facility, lw_shared_ptr<>,
+// modeled after std::shared_ptr<>.
+//
+// Unlike std::shared_ptr<>, this implementation is not thread
+// safe, and two pointers sharing the same object must not be used in
+// different threads.
+//
+// lw_shared_ptr<> is the more lightweight variant, with a lw_shared_ptr<>
+// occupying just one machine word, and adding just one word to the shared
+// object.  However, it does not support polymorphism.
+//
+// It supports shared_from_this() via the enable_lw_shared_from_this<> base
+// class (enable_shared_from_this<> is only forward-declared in this header).
+//
+
+template <typename T>
+class lw_shared_ptr;
+
+template <typename T>
+class enable_lw_shared_from_this;
+
+template <typename T>
+class enable_shared_from_this;
+
+template <typename T, typename... A>
+lw_shared_ptr<T> make_lw_shared(A&&... a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T&& a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T& a);
+
+struct lw_shared_ptr_counter_base {  // Base sub-object that carries the reference count of a shared allocation.
+    long _count = 0;  // plain (non-atomic) count, incremented and decremented by lw_shared_ptr
+};
+
+
+namespace internal {
+
+template <class T, class U>
+struct lw_shared_ptr_accessors;
+
+template <class T>
+struct lw_shared_ptr_accessors_esft;
+
+template <class T>
+struct lw_shared_ptr_accessors_no_esft;
+
+}
+
+
+// We want to support two use cases for shared_ptr<T>:
+//
+//   1. T is any type (primitive or class type)
+//
+//   2. T is a class type that inherits from enable_shared_from_this<T>.
+//
+// In the first case, we must wrap T in an object containing the counter,
+// since T may be a primitive type and cannot be a base class.
+//
+// In the second case, we want T to reach the counter through its
+// enable_shared_from_this<> base class, so that we can implement
+// shared_from_this().
+//
+// To implement those two conflicting requirements (T alongside its counter;
+// T inherits from an object containing the counter) we use std::conditional<>
+// and some accessor functions to select between two implementations.
+
+
+// CRTP from this to enable shared_from_this:
+template <typename T>
+class enable_lw_shared_from_this : private lw_shared_ptr_counter_base {  // embeds the reference counter in T itself so T can hand out pointers to itself
+    using ctor = T;
+protected:
+    enable_lw_shared_from_this() noexcept {}
+    enable_lw_shared_from_this(enable_lw_shared_from_this&&) noexcept {}  // the counter is not transferred: the new object starts with _count == 0
+    enable_lw_shared_from_this(const enable_lw_shared_from_this&) noexcept {}  // the counter is not copied: the new object starts with _count == 0
+    enable_lw_shared_from_this& operator=(const enable_lw_shared_from_this&) noexcept { return *this; }  // leaves this object's counter untouched
+    enable_lw_shared_from_this& operator=(enable_lw_shared_from_this&&) noexcept { return *this; }  // leaves this object's counter untouched
+public:
+    lw_shared_ptr<T> shared_from_this();  // new lw_shared_ptr sharing this object's counter
+    lw_shared_ptr<const T> shared_from_this() const;  // same, as a pointer to const T
+
+    template <typename X>
+    friend class lw_shared_ptr;
+    template <typename X>
+    friend class ::internal::lw_shared_ptr_accessors_esft;
+    template <typename X, class Y>
+    friend class ::internal::lw_shared_ptr_accessors;
+};
+
+template <typename T>
+struct shared_ptr_no_esft : private lw_shared_ptr_counter_base {  // wrapper that pairs a T with its reference counter; used when T does not derive from enable_lw_shared_from_this<T>
+    T _value;  // the managed object, stored by value
+
+    shared_ptr_no_esft() = default;
+    shared_ptr_no_esft(const T& x) : _value(x) {}  // copy-constructs the managed value
+    shared_ptr_no_esft(T&& x) : _value(std::move(x)) {}  // move-constructs the managed value
+    template <typename... A>
+    shared_ptr_no_esft(A&&... a) : _value(std::forward<A>(a)...) {}  // constructs the managed value in place from a...
+
+    template <typename X>
+    friend class lw_shared_ptr;
+    template <typename X>
+    friend class ::internal::lw_shared_ptr_accessors_no_esft;
+    template <typename X, class Y>
+    friend class ::internal::lw_shared_ptr_accessors;
+};
+
+
+/// Extension point: the user may override this to change how \ref lw_shared_ptr objects are destroyed,
+/// primarily so that incomplete classes can be used.
+///
+/// Customizing the deleter requires that \c T be derived from \c enable_lw_shared_from_this<T>.
+/// The specialization must be visible for all uses of \c lw_shared_ptr<T>.
+///
+/// To customize, the template must have a `static void dispose(T*)` operator that disposes of
+/// the object.
+template <typename T>
+struct lw_shared_ptr_deleter;  // No generic implementation
+
+namespace internal {
+
+template <typename T>
+struct lw_shared_ptr_accessors_esft {  // accessors for T that derive from enable_lw_shared_from_this<T>
+    using concrete_type = std::remove_const_t<T>;  // the allocation is the T itself
+    static T* to_value(lw_shared_ptr_counter_base* counter) {
+        return static_cast<T*>(counter);  // downcast from the counter base sub-object to the enclosing T
+    }
+    static void dispose(lw_shared_ptr_counter_base* counter) {
+        delete static_cast<T*>(counter);  // destroys and frees the T
+    }
+    static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+        // since to_value() is defined above, we don't need to do anything special
+        // to force-instantiate it
+    }
+};
+
+template <typename T>
+struct lw_shared_ptr_accessors_no_esft {  // accessors for T wrapped in shared_ptr_no_esft<T>
+    using concrete_type = shared_ptr_no_esft<T>;  // the allocation is the wrapper, not the bare T
+    static T* to_value(lw_shared_ptr_counter_base* counter) {
+        return &static_cast<concrete_type*>(counter)->_value;  // downcast to the wrapper, then take the address of its T
+    }
+    static void dispose(lw_shared_ptr_counter_base* counter) {
+        delete static_cast<concrete_type*>(counter);  // destroys and frees the whole wrapper
+    }
+    static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+        // since to_value() is defined above, we don't need to do anything special
+        // to force-instantiate it
+    }
+};
+
+// Generic case: lw_shared_ptr_deleter<T> is not specialized, select
+// implementation based on whether T inherits from enable_lw_shared_from_this<T>.
+template <typename T, typename U = void>
+struct lw_shared_ptr_accessors : std::conditional_t<
+         std::is_base_of<enable_lw_shared_from_this<T>, T>::value,
+         lw_shared_ptr_accessors_esft<T>,
+         lw_shared_ptr_accessors_no_esft<T>> {
+};
+
+// Overload when lw_shared_ptr_deleter<T> specialized
+template <typename T>
+struct lw_shared_ptr_accessors<T, std::void_t<decltype(lw_shared_ptr_deleter<T>{})>> {
+    using concrete_type = T;
+    static T* to_value(lw_shared_ptr_counter_base* counter);  // declared only; no definition is visible in this header
+    static void dispose(lw_shared_ptr_counter_base* counter) {
+        lw_shared_ptr_deleter<T>::dispose(to_value(counter));  // hands destruction to the user-provided deleter
+    }
+    static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+        // instantiate to_value(); must be defined by shared_ptr_incomplete.hh
+        to_value(p);
+    }
+};
+
+}
+
+template <typename T>
+class lw_shared_ptr {  // single-pointer shared pointer; the non-atomic reference count lives in the lw_shared_ptr_counter_base of the managed allocation
+    using accessors = ::internal::lw_shared_ptr_accessors<std::remove_const_t<T>>;
+    using concrete_type = typename accessors::concrete_type;
+    mutable lw_shared_ptr_counter_base* _p = nullptr;  // counter sub-object of the managed allocation, or nullptr when empty
+private:
+    lw_shared_ptr(lw_shared_ptr_counter_base* p) noexcept : _p(p) {  // attaches to an existing counter and takes one reference
+        if (_p) {
+            ++_p->_count;
+        }
+    }
+    template <typename... A>
+    static lw_shared_ptr make(A&&... a) {  // allocates a concrete_type from a... and returns the first pointer to it
+        auto p = new concrete_type(std::forward<A>(a)...);
+        accessors::instantiate_to_value(p);
+        return lw_shared_ptr(p);
+    }
+public:
+    using element_type = T;
+
+    lw_shared_ptr() noexcept = default;
+    lw_shared_ptr(std::nullptr_t) noexcept : lw_shared_ptr() {}
+    lw_shared_ptr(const lw_shared_ptr& x) noexcept : _p(x._p) {  // shares x's object and takes one more reference
+        if (_p) {
+            ++_p->_count;
+        }
+    }
+    lw_shared_ptr(lw_shared_ptr&& x) noexcept  : _p(x._p) {  // takes over x's reference; x becomes empty
+        x._p = nullptr;
+    }
+    [[gnu::always_inline]]
+    ~lw_shared_ptr() {
+        if (_p && !--_p->_count) {  // drop one reference; the last owner disposes of the object
+            accessors::dispose(_p);
+        }
+    }
+    lw_shared_ptr& operator=(const lw_shared_ptr& x) noexcept {
+        if (_p != x._p) {  // also covers self-assignment
+            this->~lw_shared_ptr();  // release the current reference first...
+            new (this) lw_shared_ptr(x);  // ...then copy-construct in place from x
+        }
+        return *this;
+    }
+    lw_shared_ptr& operator=(lw_shared_ptr&& x) noexcept {
+        if (_p != x._p) {  // when both already share one counter nothing happens and x keeps its reference
+            this->~lw_shared_ptr();
+            new (this) lw_shared_ptr(std::move(x));
+        }
+        return *this;
+    }
+    lw_shared_ptr& operator=(std::nullptr_t) noexcept {
+        return *this = lw_shared_ptr();
+    }
+    lw_shared_ptr& operator=(T&& x) noexcept {  // NOTE(review): declared noexcept, yet make_lw_shared() allocates with new
+        this->~lw_shared_ptr();
+        new (this) lw_shared_ptr(make_lw_shared<T>(std::move(x)));
+        return *this;
+    }
+
+    T& operator*() const noexcept { return *accessors::to_value(_p); }  // no null check
+    T* operator->() const noexcept { return accessors::to_value(_p); }  // no null check
+    T* get() const noexcept {  // null-safe: returns nullptr for an empty pointer
+        if (_p) {
+            return accessors::to_value(_p);
+        } else {
+            return nullptr;
+        }
+    }
+
+    long int use_count() const noexcept {  // current reference count, 0 for an empty pointer
+        if (_p) {
+            return _p->_count;
+        } else {
+            return 0;
+        }
+    }
+
+    operator lw_shared_ptr<const T>() const noexcept {  // implicit conversion to pointer-to-const; takes another reference
+        return lw_shared_ptr<const T>(_p);
+    }
+
+    explicit operator bool() const noexcept {
+        return _p;
+    }
+
+    bool owned() const noexcept {  // true when this is the only reference; dereferences _p without a null check
+        return _p->_count == 1;
+    }
+
+    bool operator==(const lw_shared_ptr<const T>& x) const {  // identity comparison on the counter pointer
+        return _p == x._p;
+    }
+
+    bool operator!=(const lw_shared_ptr<const T>& x) const {
+        return !operator==(x);
+    }
+
+    bool operator==(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return _p == x._p;
+    }
+
+    bool operator!=(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return !operator==(x);
+    }
+
+    bool operator<(const lw_shared_ptr<const T>& x) const {  // orders by counter pointer value
+        return _p < x._p;
+    }
+
+    bool operator<(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return _p < x._p;
+    }
+
+    template <typename U>
+    friend class lw_shared_ptr;
+
+    template <typename X, typename... A>
+    friend lw_shared_ptr<X> make_lw_shared(A&&...);
+
+    template <typename U>
+    friend lw_shared_ptr<U> make_lw_shared(U&&);
+
+    template <typename U>
+    friend lw_shared_ptr<U> make_lw_shared(U&);
+
+    template <typename U>
+    friend class enable_lw_shared_from_this;
+};
+
+// Creates a new T from the forwarded constructor arguments and returns the
+// first lw_shared_ptr that refers to it.
+template <typename T, typename... A>
+inline lw_shared_ptr<T> make_lw_shared(A&&... a) {
+    using pointer_type = lw_shared_ptr<T>;
+    return pointer_type::make(std::forward<A>(a)...);
+}
+
+// Creates a new shared T by moving from an existing value.
+template <typename T>
+inline lw_shared_ptr<T> make_lw_shared(T&& a) {
+    using pointer_type = lw_shared_ptr<T>;
+    return pointer_type::make(std::move(a));
+}
+
+// Creates a new shared T as a copy of an existing value.
+template <typename T>
+inline lw_shared_ptr<T> make_lw_shared(T& a) {
+    using pointer_type = lw_shared_ptr<T>;
+    return pointer_type::make(a);
+}
+
+// Returns a new lw_shared_ptr<T> attached to this object's embedded counter.
+template <typename T>
+inline lw_shared_ptr<T> enable_lw_shared_from_this<T>::shared_from_this() {
+    lw_shared_ptr_counter_base* counter = this;
+    return lw_shared_ptr<T>(counter);
+}
+
+// Const overload: returns a pointer-to-const attached to the same counter.
+// Constness is cast away only so the counter can be incremented.
+template <typename T>
+inline lw_shared_ptr<const T> enable_lw_shared_from_this<T>::shared_from_this() const {
+    auto* mutable_self = const_cast<enable_lw_shared_from_this*>(this);
+    lw_shared_ptr_counter_base* counter = mutable_self;
+    return lw_shared_ptr<const T>(counter);
+}
+
+// Streams the pointed-to object, or the literal text "null" for an empty
+// pointer.
+template <typename T>
+static inline
+std::ostream& operator<<(std::ostream& out, const lw_shared_ptr<T>& p) {
+    if (p) {
+        return out << *p;
+    }
+    return out << "null";
+}
+
+namespace std {
+
+  // Hashes an lw_shared_ptr<T> exactly like the raw pointer returned by
+  // get(), by delegating to the hash<T*> base.
+  template <typename T>
+  struct hash<lw_shared_ptr<T>> : private hash<T*> {
+    size_t operator()(const lw_shared_ptr<T>& p) const {
+        T* const raw = p.get();
+        return hash<T*>::operator()(raw);
+    }
+  };
+
+}
+
+#endif /* CEPH_LW_SHARED_PTR_H_ */
diff --git a/src/msg/async/dpdk/stream.h b/src/msg/async/dpdk/stream.h
new file mode 100644
index 000000000..1898e8f86
--- /dev/null
+++ b/src/msg/async/dpdk/stream.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_STREAM_H_
+#define CEPH_MSG_STREAM_H_
+
+#include <exception>
+#include <cassert>
+
+// A stream<> is the producer side.  It may call produce() as long
+// as the value returned from the previous invocation indicates readiness.
+// To signify no more data is available, call close().
+//
+// A subscription<> is the consumer side.  It is created by a call
+// to stream::listen().  Calling subscription::start(),
+// which registers the data processing callback, starts processing
+// events.  It may check for end-of-stream by calling done(), which
+// returns the stream's completion status: the value stored by close()
+// or the error code passed to set_exception().
+//
+// The consumer can pause generation of new data by returning
+// a positive integer; when it becomes ready, the producer
+// will resume processing.
+
+template <typename... T>
+class subscription;
+
+// Producer side of a stream/subscription pair carrying values of types T....
+// At most one subscription is attached at a time (tracked in _sub).
+template <typename... T>
+class stream {
+  subscription<T...>* _sub = nullptr;
+  // 0 while the stream is open; close() stores 1 and set_exception() stores
+  // the error code.  Initialized explicitly: the constructor is defaulted and
+  // would otherwise leave the value indeterminate for subscription::done().
+  int done = 0;
+  // True once subscription::start() has installed the callback.  Initialized
+  // explicitly so started() is well-defined before any listener has started.
+  bool ready = false;
+ public:
+  using next_fn = std::function<int (T...)>;
+  stream() = default;
+  stream(const stream&) = delete;
+  stream(stream&&) = delete;
+  ~stream() {
+    // Detach the subscription so it does not keep a dangling back pointer.
+    if (_sub) {
+      _sub->_stream = nullptr;
+    }
+  }
+
+  void operator=(const stream&) = delete;
+  void operator=(stream&&) = delete;
+
+  // Returns a subscription that reads value from this
+  // stream.
+  subscription<T...> listen() {
+    return subscription<T...>(this);
+  }
+
+  // Returns a subscription that reads value from this
+  // stream, and also sets up the listen function.
+  subscription<T...> listen(next_fn next) {
+    auto sub = subscription<T...>(this);
+    sub.start(std::move(next));
+    return sub;
+  }
+
+  // Returns true once the listener has registered its callback and is
+  // therefore able to accept values.
+  bool started() {
+    return ready;
+  }
+
+  // Produce a value.  Call only after started(), and after
+  // a previous produce() is ready.  Forwards the data to the subscription's
+  // callback and returns its result; _sub is not checked for null.
+  int produce(T... data) {
+    return _sub->_next(std::move(data)...);
+  }
+
+  // End the stream.   Call only after started(), and after
+  // a previous produce() is ready.  No functions may be called
+  // after this.
+  void close() {
+    done = 1;
+  }
+
+  // Signal an error.   Call only after started(), and after
+  // a previous produce() is ready.  No functions may be called
+  // after this.
+  void set_exception(int error) {
+    done = error;
+  }
+ private:
+  void start();
+  friend class subscription<T...>;
+};
+
+// Consumer side of a stream/subscription pair.  A subscription and its
+// stream point at each other; each back pointer is cleared when the other
+// side is destroyed, and moving a subscription re-points the stream at the
+// new object.
+template <typename... T>
+class subscription {
+ public:
+  using next_fn = typename stream<T...>::next_fn;
+ private:
+  stream<T...>* _stream;
+  next_fn _next;
+ private:
+  // Attaches to s, which must not already have a subscription.
+  explicit subscription(stream<T...>* s): _stream(s) {
+    ceph_assert(!_stream->_sub);
+    _stream->_sub = this;
+  }
+
+ public:
+  subscription(subscription&& other)
+    : _stream(other._stream), _next(std::move(other._next)) {
+    if (_stream) {
+      _stream->_sub = this;
+    }
+    other._stream = nullptr;
+  }
+  ~subscription() {
+    if (!_stream) {
+      return;
+    }
+    _stream->_sub = nullptr;
+  }
+
+  /// \brief Start receiving events from the stream.
+  ///
+  /// Stores the callback and marks the stream as ready.
+  ///
+  /// \param next Callback to call for each event
+  void start(std::function<int (T...)> next) {
+    _next = std::move(next);
+    _stream->ready = true;
+  }
+
+  // Returns the stream's completion status (its `done` member).  The stream
+  // pointer is not checked for null.
+  int done() {
+    return _stream->done;
+  }
+
+  friend class stream<T...>;
+};
+
+#endif /* CEPH_MSG_STREAM_H_ */
diff --git a/src/msg/async/dpdk/toeplitz.h b/src/msg/async/dpdk/toeplitz.h
new file mode 100644
index 000000000..3ca388082
--- /dev/null
+++ b/src/msg/async/dpdk/toeplitz.h
@@ -0,0 +1,92 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*-
+ * Copyright (c) 2010 David Malone <dwmalone@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef CEPH_MSG_TOEPLITZ_H_
+#define CEPH_MSG_TOEPLITZ_H_
+
+#include <vector>
+
+using rss_key_type = std::vector<uint8_t>;  // RSS hash key bytes, as consumed by toeplitz_hash()
+
+// Mellanox Linux's driver key (40 bytes)
+static const rss_key_type default_rsskey_40bytes = {
+    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
+    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
+    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
+    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
+    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
+};
+
+// Intel's i40e PMD default RSS key (52 bytes)
+static const rss_key_type default_rsskey_52bytes = {
+    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
+    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
+    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
+    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
+    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
+    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
+    0x81, 0x15, 0x03, 0x66
+};
+
+// Computes the 32-bit Toeplitz hash of data under the given key.
+//
+// key:  the RSS key bytes.  Key bytes beyond the end of the key are treated
+//       as zero, both for the initial 32-bit window and while the window
+//       slides, so a short (even empty) key is handled without reading out
+//       of bounds.
+// data: any container exposing size() and operator[] that yields bytes.
+//
+// Returns the hash; 0 when data is empty.
+template<typename T>
+static inline uint32_t toeplitz_hash(const rss_key_type& key, const T& data)
+{
+	const size_t key_len = key.size();
+	// Bounds-checked key access, widened to uint32_t so the shifts below
+	// never operate on a promoted (signed) int.
+	auto key_byte = [&key, key_len](size_t idx) -> uint32_t {
+		return idx < key_len ? key[idx] : 0u;
+	};
+
+	uint32_t hash = 0;
+	// v holds the 32 key bits currently aligned with the input bit.
+	uint32_t v = (key_byte(0) << 24) | (key_byte(1) << 16) |
+	    (key_byte(2) << 8) | key_byte(3);
+
+	for (size_t i = 0; i < data.size(); i++) {
+		for (unsigned b = 0; b < 8; b++) {
+			const unsigned bit = 1u << (7 - b);
+			if (data[i] & bit)
+				hash ^= v;
+			// Slide the window by one bit, pulling in the next key bit.
+			v <<= 1;
+			if (key_byte(i + 4) & bit)
+				v |= 1;
+		}
+	}
+	return hash;
+}
+#endif
diff --git a/src/msg/async/dpdk/transfer.h b/src/msg/async/dpdk/transfer.h
new file mode 100644
index 000000000..599db5bd0
--- /dev/null
+++ b/src/msg/async/dpdk/transfer.h
@@ -0,0 +1,64 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_TRANSFER_H_
+#define CEPH_TRANSFER_H_
+
+// Helper functions for copying or moving multiple objects in an exception
+// safe manner, then destroying the sources.
+//
+// To transfer, call transfer_pass1(allocator, &from, &to) on all object pairs,
+// (this copies the object from @from to @to).  If no exceptions are encountered,
+// call transfer_pass2(allocator, &from, &to).  This destroys the object at the
+// origin.  If exceptions were encountered, simply destroy all copied objects.
+//
+// As an optimization, if the objects are moveable without throwing (noexcept)
+// transfer_pass1() simply moves the objects and destroys the source, and
+// transfer_pass2() does nothing.
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+// Pass 1 for types whose move constructor cannot throw: move-constructs the
+// object at `to` from `*from` and destroys the source immediately.
+//
+// Construction and destruction go through std::allocator_traits instead of
+// calling a.construct() / a.destroy() directly; the traits call the
+// allocator's own members when they exist and fall back to the default
+// behaviour otherwise, so allocators without those members also work.
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+                           typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+    using alloc_traits = std::allocator_traits<Alloc>;
+    alloc_traits::construct(a, to, std::move(*from));
+    alloc_traits::destroy(a, from);
+}
+
+// Pass 2 for types whose move constructor cannot throw: intentionally a
+// no-op, because pass 1 already moved the object and destroyed the source.
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+                           typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr)
+{
+    // nothing left to do
+}
+
+// Pass 1 for types whose move constructor may throw: copy-constructs the
+// object at `to` from `*from` and leaves the source alive, so the whole
+// transfer can still be abandoned if a later copy throws.
+//
+// Construction goes through std::allocator_traits instead of calling
+// a.construct() directly, so allocators without that member also work.
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+               typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+    std::allocator_traits<Alloc>::construct(a, to, *from);
+}
+
+// Pass 2 for types whose move constructor may throw: destroys the source
+// object that pass 1 copied from.  `to` is unused.
+//
+// Destruction goes through std::allocator_traits instead of calling
+// a.destroy() directly, so allocators without that member also work.
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+               typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+    std::allocator_traits<Alloc>::destroy(a, from);
+}
+
+#endif /* CEPH_TRANSFER_H_ */
diff --git a/src/msg/async/frames_v2.cc b/src/msg/async/frames_v2.cc
new file mode 100644
index 000000000..7a0b5907b
--- /dev/null
+++ b/src/msg/async/frames_v2.cc
@@ -0,0 +1,476 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "frames_v2.h"
+
+#include <ostream>
+
+#include <fmt/format.h>
+
+namespace ceph::msgr::v2 {
+
+// Trims bl down to unpadded_len; bl must not be shorter than that.
+static void unpad_zero(bufferlist& bl, uint32_t unpadded_len) {
+  ceph_assert(unpadded_len <= bl.length());
+  if (const uint32_t excess = bl.length() - unpadded_len; excess != 0) {
+    bl.splice(unpadded_len, excess);  // cut the trailing padding off
+  }
+}
+
+// Returns how many segments remain once trailing empty ones are dropped.
+// The result is never below one: a frame keeps a (possibly empty) segment.
+static size_t calc_num_segments(const bufferlist segment_bls[],
+                                size_t segment_count) {
+  ceph_assert(segment_count > 0 && segment_count <= MAX_NUM_SEGMENTS);
+  size_t num_segments = segment_count;
+  // walk back over empty segments, never dropping the first one
+  while (num_segments > 1 && segment_bls[num_segments - 1].length() == 0) {
+    --num_segments;
+  }
+  return num_segments;
+}
+
+static void check_segment_crc(const bufferlist& segment_bl,
+                              uint32_t expected_crc) {
+  // recompute the crc32c (seed -1) and compare it with the expected value
+  const uint32_t actual_crc = segment_bl.crc32c(-1);
+  if (actual_crc == expected_crc) return;
+  throw FrameError(fmt::format(
+      "bad segment crc calculated={} expected={}", actual_crc, expected_crc));
+}
+
+// Returns true for a complete frame and false for one aborted by the
+// sender (to be dropped); any other late_status value is an error.
+static bool check_epilogue_late_status(__u8 late_status) {
+  switch (late_status & FRAME_LATE_STATUS_ABORTED_MASK) {
+  case FRAME_LATE_STATUS_COMPLETE: return true;
+  case FRAME_LATE_STATUS_ABORTED:  return false;
+  default:
+    throw FrameError(fmt::format("bad late_status"));
+  }
+}
+
+void FrameAssembler::fill_preamble(Tag tag,
+                                   preamble_block_t& preamble) const {
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&preamble, 0, sizeof(preamble));
+  preamble.tag = static_cast<__u8>(tag);
+  preamble.num_segments = m_descs.size();
+  for (size_t seg_idx = 0; seg_idx < m_descs.size(); ++seg_idx) {
+    const auto& desc = m_descs[seg_idx];
+    preamble.segments[seg_idx].length = desc.logical_len;
+    preamble.segments[seg_idx].alignment = desc.align;
+  }
+  // the crc covers the whole block except for the crc field itself
+  const auto* raw = reinterpret_cast<const unsigned char*>(&preamble);
+  preamble.crc = ceph_crc32c(0, raw, sizeof(preamble) - sizeof(preamble.crc));
+}
+
+uint64_t FrameAssembler::get_frame_logical_len() const {
+  ceph_assert(!m_descs.empty());
+  uint64_t total_len = 0;  // 64-bit accumulator for the 32-bit segment lengths
+  for (const auto& desc : m_descs) {
+    total_len += desc.logical_len;
+  }
+  return total_len;
+}
+
+uint64_t FrameAssembler::get_frame_onwire_len() const {
+  ceph_assert(!m_descs.empty());
+  uint64_t total_len = get_preamble_onwire_len();  // widened before summing
+  total_len += get_epilogue_onwire_len();
+  for (size_t seg_idx = 0; seg_idx < m_descs.size(); ++seg_idx) {
+    total_len += get_segment_onwire_len(seg_idx);
+  }
+  return total_len;
+}
+
+bufferlist FrameAssembler::asm_crc_rev0(const preamble_block_t& preamble,
+                                        bufferlist segment_bls[]) const {
+  epilogue_crc_rev0_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));  // late_flags stays 0: not aborted
+  // msgr2.0 crc frame: preamble, non-empty segments, epilogue with all crcs
+  bufferlist frame_bl(sizeof(preamble) + sizeof(epilogue));
+  frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble));
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    epilogue.crc_values[i] = segment_bls[i].crc32c(-1);  // taken before claim_append() touches the segment
+    if (segment_bls[i].length() > 0) {
+      frame_bl.claim_append(segment_bls[i]);
+    }
+  }
+  frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue));
+  return frame_bl;
+}
+
+bufferlist FrameAssembler::asm_secure_rev0(const preamble_block_t& preamble,
+                                           bufferlist segment_bls[]) const {
+  bufferlist preamble_bl(sizeof(preamble));
+  preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+                     sizeof(preamble));
+
+  epilogue_secure_rev0_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));  // late_flags stays 0: not aborted
+  bufferlist epilogue_bl(sizeof(epilogue));
+  epilogue_bl.append(reinterpret_cast<const char*>(&epilogue),
+                     sizeof(epilogue));
+
+  // lengths handed to the tx handler: preamble + MAX_NUM_SEGMENTS + epilogue
+  uint32_t onwire_lens[MAX_NUM_SEGMENTS + 2];
+  onwire_lens[0] = preamble_bl.length();
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    onwire_lens[i + 1] = segment_bls[i].length();  // already padded
+  }
+  onwire_lens[m_descs.size() + 1] = epilogue_bl.length();
+  m_crypto->tx->reset_tx_handler(onwire_lens,
+                                 onwire_lens + m_descs.size() + 2);  // only the filled-in entries
+  m_crypto->tx->authenticated_encrypt_update(preamble_bl);
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    if (segment_bls[i].length() > 0) {
+      m_crypto->tx->authenticated_encrypt_update(segment_bls[i]);
+    }
+  }
+  m_crypto->tx->authenticated_encrypt_update(epilogue_bl);
+  return m_crypto->tx->authenticated_encrypt_final();  // one tx pass for the whole frame
+}
+
+bufferlist FrameAssembler::asm_crc_rev1(const preamble_block_t& preamble,
+                                        bufferlist segment_bls[]) const {
+  epilogue_crc_rev1_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));
+  epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE;  // frames built here are marked complete
+
+  bufferlist frame_bl(sizeof(preamble) + FRAME_CRC_SIZE + sizeof(epilogue));
+  frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble));
+  // The first segment, when non-empty, is followed directly by its own crc.
+  ceph_assert(segment_bls[0].length() == m_descs[0].logical_len);
+  if (segment_bls[0].length() > 0) {
+    uint32_t crc = segment_bls[0].crc32c(-1);
+    frame_bl.claim_append(segment_bls[0]);
+    encode(crc, frame_bl);
+  }
+  if (m_descs.size() == 1) {
+    return frame_bl;  // no epilogue if only one segment
+  }
+  // Crcs of the remaining segments go into the epilogue, shifted down by one.
+  for (size_t i = 1; i < m_descs.size(); i++) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    epilogue.crc_values[i - 1] = segment_bls[i].crc32c(-1);
+    if (segment_bls[i].length() > 0) {
+      frame_bl.claim_append(segment_bls[i]);
+    }
+  }
+  frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue));
+  return frame_bl;
+}
+
+bufferlist FrameAssembler::asm_secure_rev1(const preamble_block_t& preamble,
+                                           bufferlist segment_bls[]) const {
+  bufferlist preamble_bl;
+  if (segment_bls[0].length() > FRAME_PREAMBLE_INLINE_SIZE) {
+    // first segment is partially inlined, inline buffer is full
+    preamble_bl.reserve(sizeof(preamble));
+    preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+                       sizeof(preamble));
+    segment_bls[0].splice(0, FRAME_PREAMBLE_INLINE_SIZE, &preamble_bl);
+  } else {
+    // first segment is fully inlined, inline buffer may need padding
+    uint32_t pad_len = FRAME_PREAMBLE_INLINE_SIZE - segment_bls[0].length();
+    preamble_bl.reserve(sizeof(preamble) + pad_len);
+    preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+                       sizeof(preamble));
+    preamble_bl.claim_append(segment_bls[0]);
+    if (pad_len > 0) {
+      preamble_bl.append_zero(pad_len);
+    }
+  }
+  // The preamble block plus the inline buffer go through a tx pass of their own.
+  m_crypto->tx->reset_tx_handler({preamble_bl.length()});
+  m_crypto->tx->authenticated_encrypt_update(preamble_bl);
+  auto frame_bl = m_crypto->tx->authenticated_encrypt_final();
+  // Whatever is left of the first segment gets a separate tx pass.
+  if (segment_bls[0].length() > 0) {
+    m_crypto->tx->reset_tx_handler({segment_bls[0].length()});
+    m_crypto->tx->authenticated_encrypt_update(segment_bls[0]);
+    frame_bl.claim_append(m_crypto->tx->authenticated_encrypt_final());
+  }
+  if (m_descs.size() == 1) {
+    return frame_bl;  // no epilogue if only one segment
+  }
+
+  epilogue_secure_rev1_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));
+  epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE;
+  bufferlist epilogue_bl(sizeof(epilogue));
+  epilogue_bl.append(reinterpret_cast<const char*>(&epilogue),
+                     sizeof(epilogue));
+
+  // MAX_NUM_SEGMENTS - 1 + epilogue
+  uint32_t onwire_lens[MAX_NUM_SEGMENTS];
+  for (size_t i = 1; i < m_descs.size(); i++) {
+    onwire_lens[i - 1] = segment_bls[i].length();  // already padded
+  }
+  onwire_lens[m_descs.size() - 1] = epilogue_bl.length();
+  m_crypto->tx->reset_tx_handler(onwire_lens, onwire_lens + m_descs.size());
+  for (size_t i = 1; i < m_descs.size(); i++) {
+    if (segment_bls[i].length() > 0) {
+      m_crypto->tx->authenticated_encrypt_update(segment_bls[i]);
+    }
+  }
+  m_crypto->tx->authenticated_encrypt_update(epilogue_bl);
+  frame_bl.claim_append(m_crypto->tx->authenticated_encrypt_final());  // third pass: segments 1.. and the epilogue
+  return frame_bl;
+}
+
+bufferlist FrameAssembler::assemble_frame(Tag tag, bufferlist segment_bls[],
+                                          const uint16_t segment_aligns[],
+                                          size_t segment_count) {
+  m_descs.resize(calc_num_segments(segment_bls, segment_count));  // trailing empty segments are dropped
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    m_descs[i].logical_len = segment_bls[i].length();
+    m_descs[i].align = segment_aligns[i];
+  }
+
+  preamble_block_t preamble;
+  fill_preamble(tag, preamble);
+  // NOTE(review): keyed on rx although asm_secure_*() encrypt with tx -- presumably both are set together; confirm.
+  if (m_crypto->rx) {
+    for (size_t i = 0; i < m_descs.size(); i++) {
+      ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+      // We're padding segments to biggest cipher's block size. Although
+      // AES-GCM can live without that as it's a stream cipher, we don't
+      // want to be fixed to stream ciphers only.
+      uint32_t padded_len = get_segment_padded_len(i);
+      if (padded_len > segment_bls[i].length()) {
+        uint32_t pad_len = padded_len - segment_bls[i].length();
+        segment_bls[i].reserve(pad_len);
+        segment_bls[i].append_zero(pad_len);  // pads the caller's bufferlist in place
+      }
+    }
+    if (m_is_rev1) {
+      return asm_secure_rev1(preamble, segment_bls);
+    }
+    return asm_secure_rev0(preamble, segment_bls);
+  }
+  if (m_is_rev1) {
+    return asm_crc_rev1(preamble, segment_bls);
+  }
+  return asm_crc_rev0(preamble, segment_bls);
+}
+
+Tag FrameAssembler::disassemble_preamble(bufferlist& preamble_bl) {
+  if (m_crypto->rx) {
+    m_crypto->rx->reset_rx_handler();
+    if (m_is_rev1) {
+      ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE +
+                                          get_auth_tag_len());
+      m_crypto->rx->authenticated_decrypt_update_final(preamble_bl);
+    } else {
+      ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+      m_crypto->rx->authenticated_decrypt_update(preamble_bl);  // no final here; disasm_all_secure_rev0() finishes with the epilogue
+    }
+  } else {
+    ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+  }
+
+  // I expect ceph_le32 will make the endian conversion for me. Passing
+  // everything through ::Decode is unnecessary.
+  auto preamble = reinterpret_cast<const preamble_block_t*>(
+      preamble_bl.c_str());
+  // check preamble crc before any further processing
+  uint32_t crc = ceph_crc32c(
+      0, reinterpret_cast<const unsigned char*>(preamble),
+      sizeof(*preamble) - sizeof(preamble->crc));
+  if (crc != preamble->crc) {
+    throw FrameError(fmt::format(
+        "bad preamble crc calculated={} expected={}", crc, preamble->crc));
+  }
+
+  // see calc_num_segments()
+  if (preamble->num_segments < 1 ||
+      preamble->num_segments > MAX_NUM_SEGMENTS) {
+    throw FrameError(fmt::format(
+        "bad number of segments num_segments={}", preamble->num_segments));
+  }
+  if (preamble->num_segments > 1 &&
+      preamble->segments[preamble->num_segments - 1].length == 0) {
+    throw FrameError("last segment empty");
+  }
+  // Remember the per-segment lengths and alignments found in the preamble.
+  m_descs.resize(preamble->num_segments);
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    m_descs[i].logical_len = preamble->segments[i].length;
+    m_descs[i].align = preamble->segments[i].alignment;
+  }
+  return static_cast<Tag>(preamble->tag);
+}
+
+bool FrameAssembler::disasm_all_crc_rev0(bufferlist segment_bls[],
+                                         bufferlist& epilogue_bl) const {
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev0_block_t));
+  const auto* epi =
+      reinterpret_cast<const epilogue_crc_rev0_block_t*>(epilogue_bl.c_str());
+  // every segment, the first one included, has its crc in the epilogue
+  for (size_t idx = 0; idx < m_descs.size(); ++idx) {
+    ceph_assert(segment_bls[idx].length() == m_descs[idx].logical_len);
+    check_segment_crc(segment_bls[idx], epi->crc_values[idx]);
+  }
+  return (epi->late_flags & FRAME_LATE_FLAG_ABORTED) == 0;
+}
+
+bool FrameAssembler::disasm_all_secure_rev0(bufferlist segment_bls[],
+                                            bufferlist& epilogue_bl) const {
+  for (size_t i = 0; i < m_descs.size(); i++) {  // rx handler state carries over from disassemble_preamble()
+    ceph_assert(segment_bls[i].length() == get_segment_padded_len(i));
+    if (segment_bls[i].length() > 0) {
+      m_crypto->rx->authenticated_decrypt_update(segment_bls[i]);
+      unpad_zero(segment_bls[i], m_descs[i].logical_len);
+    }
+  }
+  // The epilogue comes last and is expected to carry the auth tag.
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev0_block_t) +
+                                      get_auth_tag_len());
+  m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl);
+  auto epilogue = reinterpret_cast<const epilogue_secure_rev0_block_t*>(
+      epilogue_bl.c_str());
+  return !(epilogue->late_flags & FRAME_LATE_FLAG_ABORTED);  // true: ready for dispatching
+}
+
+void FrameAssembler::disasm_first_crc_rev1(bufferlist& preamble_bl,
+                                           bufferlist& segment_bl) const {
+  ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+  const uint32_t payload_len = m_descs[0].logical_len;
+  if (payload_len == 0) {
+    ceph_assert(segment_bl.length() == 0);  // empty segment: no payload, no crc
+    return;
+  }
+  ceph_assert(segment_bl.length() == payload_len + FRAME_CRC_SIZE);
+  bufferlist::const_iterator crc_it(&segment_bl, payload_len);  // crc trails the payload
+  uint32_t expected_crc;
+  decode(expected_crc, crc_it);
+  segment_bl.splice(payload_len, FRAME_CRC_SIZE);
+  check_segment_crc(segment_bl, expected_crc);
+}
+
+bool FrameAssembler::disasm_remaining_crc_rev1(bufferlist segment_bls[],
+                                               bufferlist& epilogue_bl) const {
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev1_block_t));
+  const auto* epi =
+      reinterpret_cast<const epilogue_crc_rev1_block_t*>(epilogue_bl.c_str());
+  // segment 0 is not checked here, so crc_values[] starts at segment 1
+  for (size_t idx = 1; idx < m_descs.size(); ++idx) {
+    ceph_assert(segment_bls[idx].length() == m_descs[idx].logical_len);
+    check_segment_crc(segment_bls[idx], epi->crc_values[idx - 1]);
+  }
+  return check_epilogue_late_status(epi->late_status);
+}
+
+void FrameAssembler::disasm_first_secure_rev1(bufferlist& preamble_bl,
+                                              bufferlist& segment_bl) const {
+  ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE);
+  uint32_t padded_len = get_segment_padded_len(0);
+  if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) {  // part of the segment lives outside the inline buffer
+    ceph_assert(segment_bl.length() == padded_len + get_auth_tag_len() -
+                                       FRAME_PREAMBLE_INLINE_SIZE);
+    m_crypto->rx->reset_rx_handler();
+    m_crypto->rx->authenticated_decrypt_update_final(segment_bl);
+    // prepend the inline buffer (already decrypted) to segment_bl
+    bufferlist tmp;
+    segment_bl.swap(tmp);
+    preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE,
+                       &segment_bl);
+    segment_bl.claim_append(std::move(tmp));
+  } else {  // the whole segment fits in the inline buffer
+    ceph_assert(segment_bl.length() == 0);
+    preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE,
+                       &segment_bl);
+  }
+  unpad_zero(segment_bl, m_descs[0].logical_len);  // drop cipher and inline-buffer padding
+  ceph_assert(segment_bl.length() == m_descs[0].logical_len);
+}
+
+bool FrameAssembler::disasm_remaining_secure_rev1(
+    bufferlist segment_bls[], bufferlist& epilogue_bl) const {
+  m_crypto->rx->reset_rx_handler();
+  for (size_t i = 1; i < m_descs.size(); i++) {  // segment 0 is skipped here
+    ceph_assert(segment_bls[i].length() == get_segment_padded_len(i));
+    if (segment_bls[i].length() > 0) {
+      m_crypto->rx->authenticated_decrypt_update(segment_bls[i]);
+      unpad_zero(segment_bls[i], m_descs[i].logical_len);
+    }
+  }
+  // The epilogue comes last and is expected to carry the auth tag.
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev1_block_t) +
+                                      get_auth_tag_len());
+  m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl);
+  auto epilogue = reinterpret_cast<const epilogue_secure_rev1_block_t*>(
+      epilogue_bl.c_str());
+  return check_epilogue_late_status(epilogue->late_status);
+}
+
+void FrameAssembler::disassemble_first_segment(bufferlist& preamble_bl,
+                                               bufferlist& segment_bl) const {
+  ceph_assert(!m_descs.empty());
+  if (!m_is_rev1) {
+    // msgr2.0: everything is handled in disassemble_remaining_segments()
+    return;
+  }
+  if (m_crypto->rx) {
+    disasm_first_secure_rev1(preamble_bl, segment_bl);
+    return;
+  }
+  disasm_first_crc_rev1(preamble_bl, segment_bl);
+}
+
+bool FrameAssembler::disassemble_remaining_segments(
+    bufferlist segment_bls[], bufferlist& epilogue_bl) const {
+  ceph_assert(!m_descs.empty());
+  if (!m_is_rev1) {
+    // msgr2.0: the epilogue is always present and every segment,
+    // the first one included, is processed here
+    return m_crypto->rx ? disasm_all_secure_rev0(segment_bls, epilogue_bl)
+                        : disasm_all_crc_rev0(segment_bls, epilogue_bl);
+  }
+  if (m_descs.size() == 1) {
+    // no epilogue if only one segment
+    ceph_assert(epilogue_bl.length() == 0);
+    return true;
+  }
+  // msgr2.1: the first segment is left to
+  // disassemble_first_segment(), only the rest is handled here
+  return m_crypto->rx ? disasm_remaining_secure_rev1(segment_bls, epilogue_bl)
+                      : disasm_remaining_crc_rev1(segment_bls, epilogue_bl);
+}
+
+std::ostream& operator<<(std::ostream& os, const FrameAssembler& frame_asm) {
+  const auto& descs = frame_asm.m_descs;
+  if (!descs.empty()) {  // segment info is printed only when descriptors exist
+    os << frame_asm.get_preamble_onwire_len();
+    for (size_t idx = 0; idx < descs.size(); ++idx) {
+      os << " + " << frame_asm.get_segment_onwire_len(idx);
+      os << " (logical " << descs[idx].logical_len;
+      os << "/" << descs[idx].align << ")";
+    }
+    os << " + " << frame_asm.get_epilogue_onwire_len() << " ";
+  }
+  os << "rev1=" << frame_asm.m_is_rev1 << " rx=" << frame_asm.m_crypto->rx.get();
+  os << " tx=" << frame_asm.m_crypto->tx.get();
+  return os;
+}
+
+}  // namespace ceph::msgr::v2
diff --git a/src/msg/async/frames_v2.h b/src/msg/async/frames_v2.h
new file mode 100644
index 000000000..94d4d1732
--- /dev/null
+++ b/src/msg/async/frames_v2.h
@@ -0,0 +1,842 @@
+#ifndef _MSG_ASYNC_FRAMES_V2_
+#define _MSG_ASYNC_FRAMES_V2_
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "crypto_onwire.h"
+#include <array>
+#include <iosfwd>
+#include <utility>
+
+#include <boost/container/static_vector.hpp>
+
+/**
+ * Protocol V2 Frame Structures
+ * 
+ * Documentation in: doc/dev/msgr2.rst
+ **/
+
+namespace ceph::msgr::v2 {
+
+// We require these features from any peer, period, in order to encode
+// an entity_addrvec_t.
+const uint64_t msgr2_required = CEPH_FEATUREMASK_MSG_ADDR2;
+
+// We additionally assume the peer has the below features *purely for
+// the purpose of encoding the frames themselves*.  The only complex
+// types in the frames are entity_addr_t and entity_addrvec_t, and we
+// specifically want the peer to understand the (new in nautilus)
+// TYPE_ANY.  We narrow this assumption to frames because we
+// expect there may be future clients (the kernel) that understand
+// msgr v2 and understand this encoding but don't necessarily have
+// everything else that SERVER_NAUTILUS implies.  Yes, a fresh feature
+// bit would be a cleaner approach, but those are scarce these days.
+const uint64_t msgr2_frame_assumed =
+		   msgr2_required |
+		   CEPH_FEATUREMASK_SERVER_NAUTILUS;
+
+enum class Tag : __u8 {  // frame tag, stored in preamble_block_t::tag
+  HELLO = 1,  // the enumerators below take consecutive values (2, 3, ...)
+  AUTH_REQUEST,
+  AUTH_BAD_METHOD,
+  AUTH_REPLY_MORE,
+  AUTH_REQUEST_MORE,
+  AUTH_DONE,
+  AUTH_SIGNATURE,
+  CLIENT_IDENT,
+  SERVER_IDENT,
+  IDENT_MISSING_FEATURES,
+  SESSION_RECONNECT,
+  SESSION_RESET,
+  SESSION_RETRY,
+  SESSION_RETRY_GLOBAL,
+  SESSION_RECONNECT_OK,
+  WAIT,
+  MESSAGE,
+  KEEPALIVE2,
+  KEEPALIVE2_ACK,
+  ACK
+};
+
+struct segment_t {  // per-segment entry of preamble_block_t::segments
+  // TODO: this will be dropped with support for `allocation policies`.
+  // We need them because of the rx_buffers zero-copy optimization.
+  static constexpr __u16 PAGE_SIZE_ALIGNMENT = 4096;
+
+  static constexpr __u16 DEFAULT_ALIGNMENT = sizeof(void *);
+
+  ceph_le32 length;     // logical (unpadded) length of the segment
+  ceph_le16 alignment;  // alignment value supplied for the segment
+} __attribute__((packed));
+
+struct SegmentIndex {  // presumably indices into a frame's segment array; users are outside this chunk
+  struct Msg {
+    static constexpr std::size_t HEADER = 0;
+    static constexpr std::size_t FRONT = 1;
+    static constexpr std::size_t MIDDLE = 2;
+    static constexpr std::size_t DATA = 3;
+  };
+
+  struct Control {
+    static constexpr std::size_t PAYLOAD = 0;
+  };
+};
+
+static constexpr uint8_t CRYPTO_BLOCK_SIZE { 16 };  // segments are padded to a multiple of this in secure mode
+
+static constexpr std::size_t MAX_NUM_SEGMENTS = 4;  // upper bound on the number of segments in a frame
+
+// V2 preamble consists of one or more preamble blocks depending on
+// the number of segments a particular frame needs. Each block holds
+// up to MAX_NUM_SEGMENTS segments and has its own CRC.
+//
+// XXX: currently the multi-segment facility is NOT implemented.
+struct preamble_block_t {
+  // Tag. For multi-segmented frames the value is the same
+  // between subsequent preamble blocks.
+  __u8 tag;
+
+  // Number of segments to go in entire frame. First preamble block has
+  // this set to just #segments, second to #segments - MAX_NUM_SEGMENTS,
+  // third to #segments - 2 * MAX_NUM_SEGMENTS and so on.
+  __u8 num_segments;
+
+  segment_t segments[MAX_NUM_SEGMENTS];
+  __u8 _reserved[2];  // left zeroed by fill_preamble()
+
+  // CRC32 for this single preamble block.
+  ceph_le32 crc;
+} __attribute__((packed));
+static_assert(sizeof(preamble_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout<preamble_block_t>::value);
+
+struct epilogue_crc_rev0_block_t {  // msgr2.0 epilogue, crc mode
+  __u8 late_flags;  // FRAME_LATE_FLAG_ABORTED
+  ceph_le32 crc_values[MAX_NUM_SEGMENTS];  // crc_values[i] is the crc32c of segment i
+} __attribute__((packed));
+static_assert(std::is_standard_layout_v<epilogue_crc_rev0_block_t>);
+
+struct epilogue_crc_rev1_block_t {  // msgr2.1 epilogue, crc mode
+  __u8 late_status;  // FRAME_LATE_STATUS_*
+  ceph_le32 crc_values[MAX_NUM_SEGMENTS - 1];  // crc_values[i - 1] is the crc32c of segment i (i >= 1)
+} __attribute__((packed));
+static_assert(std::is_standard_layout_v<epilogue_crc_rev1_block_t>);
+
+struct epilogue_secure_rev0_block_t {  // msgr2.0 epilogue, secure mode
+  __u8 late_flags;  // FRAME_LATE_FLAG_ABORTED
+  __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_flags)];  // fills the block up to CRYPTO_BLOCK_SIZE
+} __attribute__((packed));
+static_assert(sizeof(epilogue_secure_rev0_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout_v<epilogue_secure_rev0_block_t>);
+
+// epilogue_secure_rev0_block_t with late_flags changed to late_status
+struct epilogue_secure_rev1_block_t {  // msgr2.1 epilogue, secure mode
+  __u8 late_status;  // FRAME_LATE_STATUS_*
+  __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_status)];  // fills the block up to CRYPTO_BLOCK_SIZE
+} __attribute__((packed));
+static_assert(sizeof(epilogue_secure_rev1_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout_v<epilogue_secure_rev1_block_t>);
+
+static constexpr uint32_t FRAME_CRC_SIZE = 4;  // crc that trails the first segment in msgr2.1 crc mode
+static constexpr uint32_t FRAME_PREAMBLE_INLINE_SIZE = 48;  // inline buffer for the first segment, msgr2.1 secure mode
+static_assert(FRAME_PREAMBLE_INLINE_SIZE % CRYPTO_BLOCK_SIZE == 0);
+// just for performance, nothing should break otherwise
+static_assert(sizeof(ceph_msg_header2) <= FRAME_PREAMBLE_INLINE_SIZE);
+static constexpr uint32_t FRAME_PREAMBLE_WITH_INLINE_SIZE =
+    sizeof(preamble_block_t) + FRAME_PREAMBLE_INLINE_SIZE;
+
+// A frame can be aborted by the sender after transmitting the
+// preamble and the first segment.  The remainder of the frame
+// is filled with zeros, up until the epilogue.
+//
+// This flag is for msgr2.0.  Note that in crc mode, late_flags
+// is not covered by any crc -- a single bit flip can result in
+// a completed frame being dropped or in an aborted frame with
+// garbage segment payloads being dispatched.
+#define FRAME_LATE_FLAG_ABORTED           (1<<0)
+
+// For msgr2.1, FRAME_LATE_STATUS_ABORTED has the same meaning
+// as FRAME_LATE_FLAG_ABORTED and late_status replaces late_flags.
+// Bit error detection in crc mode is achieved by using a 4-bit
+// nibble per flag with two code words that are far apart in terms
+// of Hamming Distance (HD=4, same as provided by CRC32-C for
+// input lengths over ~5K).
+#define FRAME_LATE_STATUS_ABORTED         0x1
+#define FRAME_LATE_STATUS_COMPLETE        0xe
+#define FRAME_LATE_STATUS_ABORTED_MASK    0xf
+// The same 4-bit code word scheme for a reserved flag in the upper nibble.
+#define FRAME_LATE_STATUS_RESERVED_TRUE   0x10
+#define FRAME_LATE_STATUS_RESERVED_FALSE  0xe0
+#define FRAME_LATE_STATUS_RESERVED_MASK   0xf0
+
+struct FrameError : std::runtime_error {  // thrown when frame validation fails
+  using std::runtime_error::runtime_error;  // same constructors as the base
+};
+
+class FrameAssembler {  // assembles and disassembles msgr2.0 / msgr2.1 frames
+public:
+  // crypto must be non-null
+  FrameAssembler(const ceph::crypto::onwire::rxtx_t* crypto, bool is_rev1)
+      : m_crypto(crypto), m_is_rev1(is_rev1) {}
+
+  void set_is_rev1(bool is_rev1) {
+    m_descs.clear();  // segment descriptors do not survive a revision switch
+    m_is_rev1 = is_rev1;
+  }
+
+  bool get_is_rev1() const {
+    return m_is_rev1;
+  }
+
+  size_t get_num_segments() const {
+    ceph_assert(!m_descs.empty());
+    return m_descs.size();
+  }
+
+  uint32_t get_segment_logical_len(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    return m_descs[seg_idx].logical_len;
+  }
+
+  uint16_t get_segment_align(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    return m_descs[seg_idx].align;
+  }
+
+  // Preamble:
+  //
+  //   preamble_block_t
+  //   [preamble inline buffer + auth tag -- only in msgr2.1 secure mode]
+  //
+  // The preamble is generated unconditionally.
+  //
+  // In msgr2.1 secure mode, the first segment is inlined into the
+  // preamble inline buffer, either fully or partially.
+  uint32_t get_preamble_onwire_len() const {
+    if (m_is_rev1 && m_crypto->rx) {
+      return FRAME_PREAMBLE_WITH_INLINE_SIZE + get_auth_tag_len();
+    }
+    return sizeof(preamble_block_t);
+  }
+
+  // Segment:
+  //
+  //   segment payload
+  //   [zero padding -- only in secure mode]
+  //   [crc or auth tag -- only in msgr2.1, only for the first segment]
+  //
+  // For an empty segment, nothing is generated.  In msgr2.1 secure
+  // mode, if the first segment gets fully inlined into the preamble
+  // inline buffer, it is considered empty.
+  uint32_t get_segment_onwire_len(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    if (m_crypto->rx) {
+      uint32_t padded_len = get_segment_padded_len(seg_idx);
+      if (m_is_rev1 && seg_idx == 0) {
+        if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) {
+          return padded_len + get_auth_tag_len() - FRAME_PREAMBLE_INLINE_SIZE;
+        }
+        return 0;
+      }
+      return padded_len;
+    }
+    if (m_is_rev1 && seg_idx == 0 && m_descs[0].logical_len > 0) {
+      return m_descs[0].logical_len + FRAME_CRC_SIZE;
+    }
+    return m_descs[seg_idx].logical_len;
+  }
+
+  // Epilogue:
+  //
+  //   epilogue_*_block_t
+  //   [auth tag -- only in secure mode]
+  //
+  // For msgr2.0, the epilogue is generated unconditionally.  In
+  // crc mode, it stores crcs for all segments; the preamble is
+  // covered by its own crc.  In secure mode, the epilogue auth tag
+  // covers the whole frame.
+  //
+  // For msgr2.1, the epilogue is generated only if the frame has
+  // more than one segment (i.e. at least one of second to fourth
+  // segments is not empty).  In crc mode, it stores crcs for
+  // second to fourth segments; the preamble and the first segment
+  // are covered by their own crcs.  In secure mode, the epilogue
+  // auth tag covers second to fourth segments; the preamble and the
+  // first segment (if not fully inlined into the preamble inline
+  // buffer) are covered by their own auth tags.
+  //
+  // Note that the auth tag format is an implementation detail of a
+  // particular cipher.  FrameAssembler is concerned only with where
+  // the auth tag is placed (at the end of the ciphertext) and how
+  // long it is (RxHandler::get_extra_size_at_final()).  This is to
+  // provide room for other encryption algorithms: currently we use
+  // AES-128-GCM with 16-byte tags, but it is possible to switch to
+  // e.g. AES-128-CBC + HMAC-SHA512 without affecting the protocol
+  // (except for the cipher negotiation, of course).
+  //
+  // Additionally, each variant of the epilogue contains either
+  // late_flags or late_status field that directs handling of frames
+  // with more than one segment.
+  uint32_t get_epilogue_onwire_len() const {
+    ceph_assert(!m_descs.empty());
+    if (m_is_rev1 && m_descs.size() == 1) {
+      return 0;
+    }
+    if (m_crypto->rx) {
+      return (m_is_rev1 ? sizeof(epilogue_secure_rev1_block_t) :
+                  sizeof(epilogue_secure_rev0_block_t)) + get_auth_tag_len();
+    }
+    return m_is_rev1 ? sizeof(epilogue_crc_rev1_block_t) :
+                       sizeof(epilogue_crc_rev0_block_t);
+  }
+
+  uint64_t get_frame_logical_len() const;
+  uint64_t get_frame_onwire_len() const;
+
+  bufferlist assemble_frame(Tag tag, bufferlist segment_bls[],
+                            const uint16_t segment_aligns[],
+                            size_t segment_count);
+
+  Tag disassemble_preamble(bufferlist& preamble_bl);
+
+  // Like msgr1, and unlike msgr2.0, msgr2.1 allows interpreting the
+  // first segment before reading in the rest of the frame.
+  //
+  // For msgr2.1 (set_is_rev1(true)), you may:
+  //
+  // - read in the first segment
+  // - call disassemble_first_segment()
+  // - use the contents of the first segment, for example to
+  //   look up user-provided buffers based on ceph_msg_header2::tid
+  // - read in the remaining segments, possibly directly into
+  //   user-provided buffers
+  // - read in epilogue
+  // - call disassemble_remaining_segments()
+  //
+  // For msgr2.0 (set_is_rev1(false)), disassemble_first_segment() is
+  // a noop.  To accommodate, disassemble_remaining_segments() always
+  // takes all segments and skips over the first segment in msgr2.1
+  // case.  You must:
+  //
+  // - read in all segments
+  // - read in epilogue
+  // - call disassemble_remaining_segments()
+  //
+  // disassemble_remaining_segments() returns true if the frame is
+  // ready for dispatching, or false if it was aborted by the sender
+  // and must be dropped.
+  void disassemble_first_segment(bufferlist& preamble_bl,
+                                 bufferlist& segment_bl) const;
+  bool disassemble_remaining_segments(bufferlist segment_bls[],
+                                      bufferlist& epilogue_bl) const;
+
+private:
+  struct segment_desc_t {
+    uint32_t logical_len;
+    uint16_t align;
+  };
+
+  uint32_t get_segment_padded_len(size_t seg_idx) const {
+    return p2roundup<uint32_t>(m_descs[seg_idx].logical_len,
+                               CRYPTO_BLOCK_SIZE);
+  }
+
+  uint32_t get_auth_tag_len() const {
+    return m_crypto->rx->get_extra_size_at_final();  // dereferences rx: secure mode only
+  }
+
+  bufferlist asm_crc_rev0(const preamble_block_t& preamble,
+                          bufferlist segment_bls[]) const;
+  bufferlist asm_secure_rev0(const preamble_block_t& preamble,
+                             bufferlist segment_bls[]) const;
+  bufferlist asm_crc_rev1(const preamble_block_t& preamble,
+                          bufferlist segment_bls[]) const;
+  bufferlist asm_secure_rev1(const preamble_block_t& preamble,
+                             bufferlist segment_bls[]) const;
+
+  bool disasm_all_crc_rev0(bufferlist segment_bls[],
+                           bufferlist& epilogue_bl) const;
+  bool disasm_all_secure_rev0(bufferlist segment_bls[],
+                              bufferlist& epilogue_bl) const;
+  void disasm_first_crc_rev1(bufferlist& preamble_bl,
+                             bufferlist& segment_bl) const;
+  bool disasm_remaining_crc_rev1(bufferlist segment_bls[],
+                                 bufferlist& epilogue_bl) const;
+  void disasm_first_secure_rev1(bufferlist& preamble_bl,
+                                bufferlist& segment_bl) const;
+  bool disasm_remaining_secure_rev1(bufferlist segment_bls[],
+                                    bufferlist& epilogue_bl) const;
+
+  void fill_preamble(Tag tag, preamble_block_t& preamble) const;
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const FrameAssembler& frame_asm);
+
+  boost::container::static_vector<segment_desc_t, MAX_NUM_SEGMENTS> m_descs;
+  const ceph::crypto::onwire::rxtx_t* m_crypto;
+  bool m_is_rev1;  // msgr2.1?
+};
+
+// Base of all frames: T supplies the tag; one alignment value per segment.
+template <class T, uint16_t... SegmentAlignmentVs>
+struct Frame {
+  static constexpr size_t SegmentsNumV = sizeof...(SegmentAlignmentVs);
+  static_assert(SegmentsNumV > 0 && SegmentsNumV <= MAX_NUM_SEGMENTS);
+protected:
+  std::array<ceph::bufferlist, SegmentsNumV> segments;
+
+private:
+  static constexpr std::array<uint16_t, SegmentsNumV> alignments{SegmentAlignmentVs...};
+
+public:
+  // Assemble this frame's segments with tx_frame_asm and return the result.
+  ceph::bufferlist get_buffer(FrameAssembler& tx_frame_asm) {
+    auto frame_bl = tx_frame_asm.assemble_frame(T::tag, segments.data(),
+                                                alignments.data(), SegmentsNumV);
+    ceph_assert(frame_bl.length() == tx_frame_asm.get_frame_onwire_len());
+    return frame_bl;
+  }
+};
+
+// ControlFrames are used to manage transceiver state (like connections) and
+// orchestrate transfers of MessageFrames. They use only a single segment with
+// marshalling facilities -- derived classes specify frame structure through
+// Args pack while ControlFrame provides common encode/decode machinery.
+template <class C, typename... Args>
+class ControlFrame : public Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */> {
+protected:
+  ceph::bufferlist &get_payload_segment() {
+    return this->segments[SegmentIndex::Control::PAYLOAD];
+  }
+
+  // this tuple is only used when decoding values from a payload segment
+  std::tuple<Args...> _values;
+
+  // FIXME: for now, we assume specific features for the purposes of encoding
+  // the frames themselves (*not* messages in message frames!).
+  uint64_t features = msgr2_frame_assumed;
+
+  template <typename T>
+  inline void _encode_payload_each(T &t) {  // T is deduced const: _encode() passes const Args&
+    if constexpr (std::is_same<T, std::vector<uint32_t> const>()) {
+      encode((uint32_t)t.size(), this->get_payload_segment(), features);  // element count first
+      for (const auto &elem : t) {
+        encode(elem, this->get_payload_segment(), features);
+      }
+    } else {
+      encode(t, this->get_payload_segment(), features);
+    }
+  }
+
+  template <typename T>
+  inline void _decode_payload_each(T &t, bufferlist::const_iterator &ti) const {
+    if constexpr (std::is_same<T, std::vector<uint32_t>>()) {
+      uint32_t size;  // mirrors the encode path: count first, then that many elements
+      decode(size, ti);
+      t.resize(size);
+      for (uint32_t i = 0; i < size; ++i) {
+        decode(t[i], ti);
+      }
+    } else {
+      decode(t, ti);
+    }
+  }
+
+  template <std::size_t... Is>
+  inline void _decode_payload(bufferlist::const_iterator &ti,
+                              std::index_sequence<Is...>) const {
+    (_decode_payload_each((Args &)std::get<Is>(_values), ti), ...);  // cast strips the const added by this const method
+  }
+
+  template <std::size_t N>
+  inline decltype(auto) get_val() {  // N-th decoded value, in Args order
+    return std::get<N>(_values);
+  }
+
+  ControlFrame()
+    : Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */>() {
+  }
+
+  void _encode(const Args &... args) {  // appends every argument, in order, to the payload segment
+    (_encode_payload_each(args), ...);
+  }
+
+  void _decode(const ceph::bufferlist &bl) {  // fills _values from bl, in Args order
+    auto ti = bl.cbegin();
+    _decode_payload(ti, std::index_sequence_for<Args...>());
+  }
+
+public:
+  static C Encode(const Args &... args) {  // new frame whose payload segment holds the encoded args
+    C c;
+    c._encode(args...);
+    return c;
+  }
+
+  static C Decode(const ceph::bufferlist &payload) {  // new frame with _values decoded from payload
+    C c;
+    c._decode(payload);
+    return c;
+  }
+};
+
+// HELLO payload: entity type and peer address.
+struct HelloFrame : ControlFrame<HelloFrame,
+                                 uint8_t,          // entity type
+                                 entity_addr_t> {  // peer address
+  static const Tag tag = Tag::HELLO;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint8_t& entity_type() { return get_val<0>(); }
+  entity_addr_t& peer_addr() { return get_val<1>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_REQUEST payload: auth method, preferred modes and an auth payload.
+struct AuthRequestFrame : ControlFrame<AuthRequestFrame,
+                                       uint32_t,               // auth method
+                                       std::vector<uint32_t>,  // preferred modes
+                                       bufferlist> {           // auth payload
+  static const Tag tag = Tag::AUTH_REQUEST;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint32_t& method() { return get_val<0>(); }
+  std::vector<uint32_t>& preferred_modes() { return get_val<1>(); }
+  bufferlist& auth_payload() { return get_val<2>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_BAD_METHOD payload: method, result, allowed methods and allowed modes.
+struct AuthBadMethodFrame : ControlFrame<AuthBadMethodFrame,
+                                         uint32_t,                 // method
+                                         int32_t,                  // result
+                                         std::vector<uint32_t>,    // allowed methods
+                                         std::vector<uint32_t>> {  // allowed modes
+  static const Tag tag = Tag::AUTH_BAD_METHOD;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint32_t& method() { return get_val<0>(); }
+  int32_t& result() { return get_val<1>(); }
+  std::vector<uint32_t>& allowed_methods() { return get_val<2>(); }
+  std::vector<uint32_t>& allowed_modes() { return get_val<3>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_REPLY_MORE payload: a single auth payload bufferlist.
+struct AuthReplyMoreFrame : ControlFrame<AuthReplyMoreFrame,
+                                         bufferlist> {  // auth payload
+  static const Tag tag = Tag::AUTH_REPLY_MORE;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  bufferlist& auth_payload() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_REQUEST_MORE payload: a single auth payload bufferlist.
+struct AuthRequestMoreFrame : ControlFrame<AuthRequestMoreFrame,
+                                           bufferlist> {  // auth payload
+  static const Tag tag = Tag::AUTH_REQUEST_MORE;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  bufferlist& auth_payload() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_DONE payload: global id, connection mode and the auth method payload.
+struct AuthDoneFrame : ControlFrame<AuthDoneFrame,
+                                    uint64_t,       // global id
+                                    uint32_t,       // connection mode
+                                    bufferlist> {   // auth method payload
+  static const Tag tag = Tag::AUTH_DONE;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t& global_id() { return get_val<0>(); }
+  uint32_t& con_mode() { return get_val<1>(); }
+  bufferlist& auth_payload() { return get_val<2>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_SIGNATURE payload: a sha256_digest_t signature.
+struct AuthSignatureFrame
+    : ControlFrame<AuthSignatureFrame,
+                   sha256_digest_t> {  // signature
+  static const Tag tag = Tag::AUTH_SIGNATURE;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  sha256_digest_t& signature() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// CLIENT_IDENT payload: addresses, ids, feature masks, flags and client cookie.
+struct ClientIdentFrame
+    : ControlFrame<ClientIdentFrame,
+                   entity_addrvec_t,  // my addresses
+                   entity_addr_t,     // target address
+                   int64_t,           // global_id
+                   uint64_t,          // global seq
+                   uint64_t,          // supported features
+                   uint64_t,          // required features
+                   uint64_t,          // flags
+                   uint64_t> {        // client cookie
+  static const Tag tag = Tag::CLIENT_IDENT;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  entity_addrvec_t& addrs() { return get_val<0>(); }
+  entity_addr_t& target_addr() { return get_val<1>(); }
+  int64_t& gid() { return get_val<2>(); }
+  uint64_t& global_seq() { return get_val<3>(); }
+  uint64_t& supported_features() { return get_val<4>(); }
+  uint64_t& required_features() { return get_val<5>(); }
+  uint64_t& flags() { return get_val<6>(); }
+  uint64_t& cookie() { return get_val<7>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SERVER_IDENT payload: addresses, ids, feature masks, flags and server cookie.
+struct ServerIdentFrame
+    : ControlFrame<ServerIdentFrame,
+                   entity_addrvec_t,  // my addresses
+                   int64_t,           // global_id
+                   uint64_t,          // global seq
+                   uint64_t,          // supported features
+                   uint64_t,          // required features
+                   uint64_t,          // flags
+                   uint64_t> {        // server cookie
+  static const Tag tag = Tag::SERVER_IDENT;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  entity_addrvec_t& addrs() { return get_val<0>(); }
+  int64_t& gid() { return get_val<1>(); }
+  uint64_t& global_seq() { return get_val<2>(); }
+  uint64_t& supported_features() { return get_val<3>(); }
+  uint64_t& required_features() { return get_val<4>(); }
+  uint64_t& flags() { return get_val<5>(); }
+  uint64_t& cookie() { return get_val<6>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SESSION_RECONNECT payload: addresses, both cookies and three sequence numbers.
+struct ReconnectFrame
+    : ControlFrame<ReconnectFrame,
+                   entity_addrvec_t,  // my addresses
+                   uint64_t,          // client cookie
+                   uint64_t,          // server cookie
+                   uint64_t,          // global sequence
+                   uint64_t,          // connect sequence
+                   uint64_t> {        // message sequence
+  static const Tag tag = Tag::SESSION_RECONNECT;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  entity_addrvec_t& addrs() { return get_val<0>(); }
+  uint64_t& client_cookie() { return get_val<1>(); }
+  uint64_t& server_cookie() { return get_val<2>(); }
+  uint64_t& global_seq() { return get_val<3>(); }
+  uint64_t& connect_seq() { return get_val<4>(); }
+  uint64_t& msg_seq() { return get_val<5>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SESSION_RESET payload: a single "full reset" flag.
+struct ResetFrame : ControlFrame<ResetFrame,
+                                 bool> {  // full reset
+  static const Tag tag = Tag::SESSION_RESET;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  bool& full() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SESSION_RETRY payload: the connection sequence number.
+struct RetryFrame : ControlFrame<RetryFrame,
+                                 uint64_t> {  // connection seq
+  static const Tag tag = Tag::SESSION_RETRY;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t& connect_seq() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SESSION_RETRY_GLOBAL payload: the global sequence number.
+struct RetryGlobalFrame : ControlFrame<RetryGlobalFrame,
+                                       uint64_t> {  // global seq
+  static const Tag tag = Tag::SESSION_RETRY_GLOBAL;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t& global_seq() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// WAIT frame: the Args pack is empty, so no payload values are encoded.
+struct WaitFrame : ControlFrame<WaitFrame> {
+  static const Tag tag = Tag::WAIT;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SESSION_RECONNECT_OK payload: a message sequence number.
+struct ReconnectOkFrame : ControlFrame<ReconnectOkFrame,
+                                       uint64_t> {  // message seq
+  static const Tag tag = Tag::SESSION_RECONNECT_OK;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t& msg_seq() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// IDENT_MISSING_FEATURES payload: a mask of the missing features.
+struct IdentMissingFeaturesFrame
+    : ControlFrame<IdentMissingFeaturesFrame,
+                   uint64_t> {  // missing features mask
+  static const Tag tag = Tag::IDENT_MISSING_FEATURES;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t& features() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// KEEPALIVE2 payload: a timestamp (the no-argument Encode() uses ceph_clock_now()).
+struct KeepAliveFrame : ControlFrame<KeepAliveFrame,
+                                     utime_t> {  // timestamp
+  static const Tag tag = Tag::KEEPALIVE2;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  static KeepAliveFrame Encode() {
+    return KeepAliveFrame::Encode(ceph_clock_now());
+  }
+
+  utime_t& timestamp() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// KEEPALIVE2_ACK payload: the ack timestamp.
+struct KeepAliveFrameAck : ControlFrame<KeepAliveFrameAck,
+                                        utime_t> {  // ack timestamp
+  static const Tag tag = Tag::KEEPALIVE2_ACK;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  utime_t& timestamp() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// ACK payload: a message sequence number.
+struct AckFrame : ControlFrame<AckFrame,
+                               uint64_t> {  // message sequence
+  static const Tag tag = Tag::ACK;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t& seq() { return get_val<0>(); }
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+using segment_bls_t =
+    boost::container::static_vector<bufferlist, MAX_NUM_SEGMENTS>;
+
+// Frame that carries a message.  It is made of four segments: the
+// ceph_msg_header2 header followed by the front, middle and data
+// bufferlists of the message.
+struct MessageFrame : Frame<MessageFrame,
+                            /* four segments */
+                            segment_t::DEFAULT_ALIGNMENT,
+                            segment_t::DEFAULT_ALIGNMENT,
+                            segment_t::DEFAULT_ALIGNMENT,
+                            segment_t::PAGE_SIZE_ALIGNMENT> {
+  static const Tag tag = Tag::MESSAGE;
+
+  // Build a frame from a header and the three message payload bufferlists.
+  static MessageFrame Encode(const ceph_msg_header2 &msg_header,
+                             const ceph::bufferlist &front,
+                             const ceph::bufferlist &middle,
+                             const ceph::bufferlist &data) {
+    MessageFrame frame;
+    auto &segs = frame.segments;
+    segs[SegmentIndex::Msg::HEADER].append(
+        reinterpret_cast<const char*>(&msg_header), sizeof(msg_header));
+    segs[SegmentIndex::Msg::FRONT] = front;
+    segs[SegmentIndex::Msg::MIDDLE] = middle;
+    segs[SegmentIndex::Msg::DATA] = data;
+    return frame;
+  }
+
+  // Take over the received segments.  If fewer than SegmentsNumV segments
+  // were received, the missing ones stay default-constructed (empty).
+  static MessageFrame Decode(segment_bls_t& recv_segments) {
+    MessageFrame frame;
+    for (size_t idx = 0; idx < recv_segments.size(); ++idx) {
+      frame.segments[idx] = std::move(recv_segments[idx]);
+    }
+    return frame;
+  }
+
+  const ceph_msg_header2 &header() {
+    auto &header_bl = segments[SegmentIndex::Msg::HEADER];
+    return reinterpret_cast<const ceph_msg_header2&>(*header_bl.c_str());
+  }
+
+  // Mutable access to the three payload segments.
+  ceph::bufferlist& front() {
+    return segments[SegmentIndex::Msg::FRONT];
+  }
+
+  ceph::bufferlist& middle() {
+    return segments[SegmentIndex::Msg::MIDDLE];
+  }
+
+  ceph::bufferlist& data() {
+    return segments[SegmentIndex::Msg::DATA];
+  }
+
+  // Current length of each payload segment.
+  uint32_t front_len() const {
+    return segments[SegmentIndex::Msg::FRONT].length();
+  }
+  uint32_t middle_len() const {
+    return segments[SegmentIndex::Msg::MIDDLE].length();
+  }
+  uint32_t data_len() const {
+    return segments[SegmentIndex::Msg::DATA].length();
+  }
+
+protected:
+  using Frame::Frame;
+};
+
+} // namespace ceph::msgr::v2
+
+#endif // _MSG_ASYNC_FRAMES_V2_
diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc
new file mode 100644
index 000000000..59e641511
--- /dev/null
+++ b/src/msg/async/net_handler.cc
@@ -0,0 +1,245 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#include "net_handler.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "NetHandler "
+
+namespace ceph{
+
+// Create a SOCK_STREAM socket for `domain`, optionally with SO_REUSEADDR.
+// Returns the descriptor on success or a negated error code on failure.
+int NetHandler::create_socket(int domain, bool reuse_addr)
+{
+  const int s = socket_cloexec(domain, SOCK_STREAM, 0);
+  if (s == -1) {
+    const int r = ceph_sock_errno();
+    lderr(cct) << __func__ << " couldn't create socket " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+
+#if !defined(__FreeBSD__)
+  /* Make sure connection-intensive things like the benchmark
+   * will be able to close/open sockets a zillion of times */
+  if (reuse_addr) {
+    int on = 1;
+    if (::setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (SOCKOPT_VAL_TYPE)&on, sizeof(on)) == -1) {
+      const int r = ceph_sock_errno();
+      lderr(cct) << __func__ << " setsockopt SO_REUSEADDR failed: "
+                 << cpp_strerror(r) << dendl;
+      compat_closesocket(s);
+      return -r;
+    }
+  }
+#endif
+
+  return s;
+}
+
+// Switch `sd` to non-blocking mode; returns 0 or a negated error code.
+int NetHandler::set_nonblock(int sd)
+{
+  #ifdef _WIN32
+  ULONG mode = 1;
+  const int r = ioctlsocket(sd, FIONBIO, &mode);
+  if (r) {
+    // r is SOCKET_ERROR here: "-r" would be positive and pass callers' "< 0" checks
+    const int err = ceph_sock_errno();
+    lderr(cct) << __func__ << " ioctlsocket(FIONBIO) failed: " << r
+                           << " " << WSAGetLastError() << dendl;
+    return -err;
+  }
+  #else
+  /* Set the socket nonblocking.
+   * Note that fcntl(2) for F_GETFL and F_SETFL can't be
+   * interrupted by a signal. */
+  const int flags = fcntl(sd, F_GETFL);
+  if (flags < 0) {
+    const int r = ceph_sock_errno();
+    lderr(cct) << __func__ << " fcntl(F_GETFL) failed: " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+  if (fcntl(sd, F_SETFL, flags | O_NONBLOCK) < 0) {
+    const int r = ceph_sock_errno();
+    lderr(cct) << __func__ << " fcntl(F_SETFL,O_NONBLOCK): " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+  #endif
+  return 0;
+}
+
+// Best-effort socket tuning; every failure is logged.  The return value is
+// the negated result of the last option attempted only (0 if none was).
+int NetHandler::set_socket_options(int sd, bool nodelay, int size)
+{
+  int r = 0;
+  if (nodelay) {  // disable the Nagle algorithm
+    int enable = 1;
+    r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (SOCKOPT_VAL_TYPE)&enable, sizeof(enable));
+    if (r < 0) {
+      r = ceph_sock_errno();
+      ldout(cct, 0) << "couldn't set TCP_NODELAY: " << cpp_strerror(r) << dendl;
+    }
+  }
+  if (size) {  // a non-zero size requests that receive buffer size
+    r = ::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (SOCKOPT_VAL_TYPE)&size, sizeof(size));
+    if (r < 0) {
+      r = ceph_sock_errno();
+      ldout(cct, 0) << "couldn't set SO_RCVBUF to " << size << ": " << cpp_strerror(r) << dendl;
+    }
+  }
+
+#ifdef CEPH_USE_SO_NOSIGPIPE
+  int nosigpipe = 1;  // block SIGPIPE on this socket
+  r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (SOCKOPT_VAL_TYPE)&nosigpipe, sizeof(nosigpipe));
+  if (r) {
+    r = ceph_sock_errno();
+    ldout(cct,0) << "couldn't set SO_NOSIGPIPE: " << cpp_strerror(r) << dendl;
+  }
+#endif
+  return -r;
+}
+
+// Set the IP ToS (AF_INET) or IPv6 traffic class (AF_INET6) of `sd` to
+// IPTOS_CLASS_CS6 where that macro exists, then set SO_PRIORITY to `prio`.
+// A negative `prio` is a no-op; setsockopt failures are logged, not returned.
+void NetHandler::set_priority(int sd, int prio, int domain)
+{
+#ifdef SO_PRIORITY
+  if (prio < 0) {
+    return;
+  }
+  int r = -1;
+#ifdef IPTOS_CLASS_CS6
+  int iptos = IPTOS_CLASS_CS6;
+  if (domain == AF_INET) {
+    r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, (SOCKOPT_VAL_TYPE)&iptos, sizeof(iptos));
+  } else if (domain == AF_INET6) {
+    r = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, (SOCKOPT_VAL_TYPE)&iptos, sizeof(iptos));
+  } else {
+    lderr(cct) << "couldn't set ToS of unknown family (" << domain << ")"
+               << " to " << iptos << dendl;
+    return;
+  }
+  if (r < 0) {
+    r = ceph_sock_errno();
+    ldout(cct,0) << "couldn't set TOS to " << iptos
+                 << ": " << cpp_strerror(r) << dendl;
+  }
+
+#endif  // IPTOS_CLASS_CS6
+  // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
+  // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
+  // We need to call setsockopt(SO_PRIORITY) after it.
+  r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, (SOCKOPT_VAL_TYPE)&prio, sizeof(prio));
+  if (r < 0) {
+    r = ceph_sock_errno();
+    ldout(cct, 0) << __func__ << " couldn't set SO_PRIORITY to " << prio
+                  << ": " << cpp_strerror(r) << dendl;
+  }
+#else
+  return;
+#endif  // SO_PRIORITY
+}
+
+// Create a socket for `addr`'s family, apply the configured TCP options,
+// optionally bind it to `bind_addr` (port forced to 0) and connect to `addr`.
+// Returns the socket, or a negated error code.  With `nonblock` set, a
+// connect that reports EINPROGRESS/EAGAIN also returns the socket.
+int NetHandler::generic_connect(const entity_addr_t& addr, const entity_addr_t &bind_addr, bool nonblock)
+{
+  const int s = create_socket(addr.get_family());
+  if (s < 0)
+    return s;
+
+  if (nonblock) {
+    const int nb_ret = set_nonblock(s);
+    if (nb_ret < 0) {
+      compat_closesocket(s);
+      return nb_ret;
+    }
+  }
+
+  // result deliberately ignored: set_socket_options() logs its own failures
+  set_socket_options(s, cct->_conf->ms_tcp_nodelay, cct->_conf->ms_tcp_rcvbuf);
+
+  entity_addr_t local_addr = bind_addr;
+  if (cct->_conf->ms_bind_before_connect && (!local_addr.is_blank_ip())) {
+    local_addr.set_port(0);
+    if (::bind(s, local_addr.get_sockaddr(), local_addr.get_sockaddr_len()) < 0) {
+      const int err = ceph_sock_errno();
+      ldout(cct, 2) << __func__ << " client bind error " << ", " << cpp_strerror(err) << dendl;
+      compat_closesocket(s);
+      return -err;
+    }
+  }
+
+  if (::connect(s, addr.get_sockaddr(), addr.get_sockaddr_len()) < 0) {
+    const int err = ceph_sock_errno();
+    // Windows can return WSAEWOULDBLOCK (converted to EAGAIN).
+    if (nonblock && (err == EINPROGRESS || err == EAGAIN))
+      return s;
+
+    ldout(cct, 10) << __func__ << " connect: " << cpp_strerror(err) << dendl;
+    compat_closesocket(s);
+    return -err;
+  }
+
+  return s;
+}
+
+// Re-issue connect() on `sd`: 0 when connected (including EISCONN),
+// 1 when the attempt is still pending, a negated error code otherwise.
+int NetHandler::reconnect(const entity_addr_t &addr, int sd)
+{
+  const int ret = ::connect(sd, addr.get_sockaddr(), addr.get_sockaddr_len());
+  // single read of the socket error, reused for the checks and the log line
+  const int r = (ret < 0) ? ceph_sock_errno() : 0;
+  if (ret < 0 && r != EISCONN) {
+    ldout(cct, 10) << __func__ << " reconnect: " << cpp_strerror(r) << dendl;
+    if (r == EINPROGRESS || r == EALREADY || r == EAGAIN)
+      return 1;
+    return -r;
+  }
+
+  return 0;
+}
+
+int NetHandler::connect(const entity_addr_t &addr, const entity_addr_t& bind_addr)
+{
+  return generic_connect(addr, bind_addr, false);  // nonblock=false: set_nonblock() is not applied
+}
+
+int NetHandler::nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr)
+{
+  return generic_connect(addr, bind_addr, true);  // nonblock=true: socket is made non-blocking before connect()
+}
+
+
+}
diff --git a/src/msg/async/net_handler.h b/src/msg/async/net_handler.h
new file mode 100644
index 000000000..190423772
--- /dev/null
+++ b/src/msg/async/net_handler.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_NET_UTILS_H
+#define CEPH_COMMON_NET_UTILS_H
+#include "common/config.h"
+
+namespace ceph {
+  class NetHandler {
+    int generic_connect(const entity_addr_t& addr, const entity_addr_t& bind_addr, bool nonblock);  // shared by connect() and nonblock_connect()
+
+    CephContext *cct;  // used for logging and config lookups
+   public:
+    int create_socket(int domain, bool reuse_addr=false);  // returns the new socket, or a negative error code
+    explicit NetHandler(CephContext *c): cct(c) {}
+    int set_nonblock(int sd);  // returns 0 on success
+    int set_socket_options(int sd, bool nodelay, int size);  // failures are logged
+    int connect(const entity_addr_t &addr, const entity_addr_t& bind_addr);  // returns the socket, or a negative error code
+    
+    /**
+     * Try to reconnect the socket.
+     *
+     * @return    0         success (the socket is connected)
+     *            > 0       connect still in progress: just break, and wait for event
+     *            < 0       connect failed (negative error code): need to goto fail
+     */
+    int reconnect(const entity_addr_t &addr, int sd);
+    int nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr);  // as connect(), but the socket is non-blocking
+    void set_priority(int sd, int priority, int domain);  // a negative priority is a no-op
+  };
+}
+
+#endif
diff --git a/src/msg/async/rdma/Infiniband.cc b/src/msg/async/rdma/Infiniband.cc
new file mode 100644
index 000000000..52323f948
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.cc
@@ -0,0 +1,1321 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "Infiniband.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "RDMAStack.h"
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "Infiniband "
+
+static const uint32_t MAX_SHARED_RX_SGE_COUNT = 1;
+static const uint32_t MAX_INLINE_DATA = 0; // passed as cap.max_inline_data in QueuePair::init()
+static const uint32_t TCP_MSG_LEN = sizeof("0000:00000000:00000000:00000000:00000000000000000000000000000000"); // sizeof counts the literal's trailing NUL; shape matches the lid:qpn:psn:peer_qpn:gid text built in send_cm_meta()
+static const uint32_t CQ_DEPTH = 30000;
+
+// Queries the attributes of port ipn on ictxt and looks up the local GID to use; aborts on any failure.
+Port::Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn): ctxt(ictxt), port_num(ipn),
+  gid_idx(cct->_conf.get_val<int64_t>("ms_async_rdma_gid_idx"))
+{
+  // ibv_query_port() reports failure as a non-zero errno value rather than -1.
+  int r = ibv_query_port(ctxt, port_num, &port_attr);
+  if (r) {
+    lderr(cct) << __func__  << " query port failed  " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+
+  lid = port_attr.lid;
+  ceph_assert(gid_idx < port_attr.gid_tbl_len);
+#ifdef HAVE_IBV_EXP
+  union ibv_gid cgid;
+  struct ibv_exp_gid_attr gid_attr;
+  bool malformed = false;
+
+  ldout(cct,1) << __func__ << " using experimental verbs for gid" << dendl;
+  // search for requested GID in GIDs table
+  ldout(cct, 1) << __func__ << " looking for local GID " << (cct->_conf->ms_async_rdma_local_gid)
+    << " of type " << (cct->_conf->ms_async_rdma_roce_ver) << dendl;
+  r = sscanf(cct->_conf->ms_async_rdma_local_gid.c_str(),
+             "%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx"
+             ":%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx",
+             &cgid.raw[ 0], &cgid.raw[ 1],
+             &cgid.raw[ 2], &cgid.raw[ 3],
+             &cgid.raw[ 4], &cgid.raw[ 5],
+             &cgid.raw[ 6], &cgid.raw[ 7],
+             &cgid.raw[ 8], &cgid.raw[ 9],
+             &cgid.raw[10], &cgid.raw[11],
+             &cgid.raw[12], &cgid.raw[13],
+             &cgid.raw[14], &cgid.raw[15]);
+
+  if (r != 16) {
+    ldout(cct, 1) << __func__ << " malformed or no GID supplied, using GID index 0" << dendl;
+    malformed = true;
+  }
+
+  gid_attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE;
+
+  for (gid_idx = 0; gid_idx < port_attr.gid_tbl_len; gid_idx++) {
+    r = ibv_query_gid(ctxt, port_num, gid_idx, &gid);
+    if (r) {
+      lderr(cct) << __func__  << " query gid of port " << static_cast<int>(port_num) << " index " << gid_idx << " failed  " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    r = ibv_exp_query_gid_attr(ctxt, port_num, gid_idx, &gid_attr);
+    if (r) {
+      lderr(cct) << __func__  << " query gid attributes of port " << static_cast<int>(port_num) << " index " << gid_idx << " failed  " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+
+    if (malformed) break; // stay with gid_idx=0
+    if ( (gid_attr.type == cct->_conf->ms_async_rdma_roce_ver) &&
+         (memcmp(&gid, &cgid, 16) == 0) ) {
+      ldout(cct, 1) << __func__ << " found at index " << gid_idx << dendl;
+      break;
+    }
+  }
+
+  if (gid_idx == port_attr.gid_tbl_len) {
+    lderr(cct) << __func__ << " Requested local GID was not found in GID table" << dendl;
+    ceph_abort();
+  }
+#else
+  r = ibv_query_gid(ctxt, port_num, gid_idx, &gid);
+  if (r) {
+    lderr(cct) << __func__  << " query gid failed  " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+#endif
+}
+
+// Opens ib_dev and caches its name and attributes; asserts or aborts when the device cannot be used.
+Device::Device(CephContext *cct, ibv_device* ib_dev): device(ib_dev), active_port(nullptr) {
+  ceph_assert(device);
+  ctxt = ibv_open_device(device);
+  ceph_assert(ctxt);
+  name = ibv_get_device_name(device);
+
+  // ibv_query_device() returns the errno value itself on failure, so report r rather than errno.
+  int r = ibv_query_device(ctxt, &device_attr);
+  if (r) {
+    lderr(cct) << __func__ << " failed to query rdma device. " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+}
+
+// Wraps a verbs context supplied by the caller (nothing is opened here) and caches the device name and attributes.
+Device::Device(CephContext *cct, struct ibv_context *ib_ctx)
+  : device(ib_ctx ? ib_ctx->device : nullptr), active_port(nullptr) {
+  // A null ib_ctx trips this assert rather than being dereferenced in the initializer list.
+  ceph_assert(device);
+  ctxt = ib_ctx;
+  name = ibv_get_device_name(device);
+
+  // ibv_query_device() returns the errno value itself on failure, so report r rather than errno.
+  int r = ibv_query_device(ctxt, &device_attr);
+  if (r) {
+    lderr(cct) << __func__ << " failed to query rdma device. " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+}
+
+// Probes ports 1..phys_port_cnt and keeps the Port equal to port_num if its state is IBV_PORT_ACTIVE; asserts if none qualifies.
+void Device::binding_port(CephContext *cct, int port_num) {
+  port_cnt = device_attr.phys_port_cnt;
+  for (uint8_t port_id = 1; port_id <= port_cnt; ++port_id) {
+    Port *port = new Port(cct, ctxt, port_id);
+    if (port_id == port_num && port->get_port_attr()->state == IBV_PORT_ACTIVE) {
+      active_port = port;
+      ldout(cct, 1) << __func__ << " found active port " << static_cast<int>(port_id) << dendl;
+      break;
+    } else {
+      ldout(cct, 10) << __func__ << " port " << static_cast<int>(port_id) << " is not what we want. state: " << ibv_port_state_str(port->get_port_attr()->state) << dendl; // cast: a uint8_t would be streamed as a character
+      delete port;
+    }
+  }
+  if (nullptr == active_port) {
+    lderr(cct) << __func__ << "  port not found" << dendl;
+    ceph_assert(active_port);
+  }
+}
+
+
+// Records the queue pair parameters only; the verbs QP itself is created later by init().
+Infiniband::QueuePair::QueuePair(
+    CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+    int port, ibv_srq *srq,
+    Infiniband::CompletionQueue* txcq, Infiniband::CompletionQueue* rxcq,
+    uint32_t tx_queue_len, uint32_t rx_queue_len, struct rdma_cm_id *cid, uint32_t q_key)
+  : cct(c), infiniband(infiniband), type(type),
+    ctxt(infiniband.device->ctxt),
+    ib_physical_port(port),
+    pd(infiniband.pd->pd),
+    srq(srq), qp(nullptr),
+    cm_id(cid), peer_cm_meta{0}, local_cm_meta{0},
+    txcq(txcq), rxcq(rxcq),
+    initial_psn(lrand48() & PSN_MSK),
+    max_send_wr(tx_queue_len + 1),  // one extra WR for the beacon
+    max_recv_wr(rx_queue_len),
+    q_key(q_key),
+    dead(false)
+{
+  switch (type) {
+  case IBV_QPT_RC: case IBV_QPT_UD: case IBV_QPT_RAW_PACKET:
+    break;  // the only queue pair types accepted here
+  default:
+    lderr(cct) << __func__ << " invalid queue pair type" << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+}
+
+// Moves the queue pair to IBV_QPS_ERR. Returns 0 on success and -1 when ibv_modify_qp() fails.
+int Infiniband::QueuePair::modify_qp_to_error(void) {
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ibv_qp_attr attr;
+  memset(&attr, 0, sizeof(attr));
+  attr.qp_state = IBV_QPS_ERR;
+  if (ibv_modify_qp(qp, &attr, IBV_QP_STATE) != 0) {
+    lderr(cct) << __func__ << " failed to transition to ERROR state: " << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  ldout(cct, 20) << __func__ << " transition to ERROR state successfully." << dendl;
+  return 0;
+}
+
+int Infiniband::QueuePair::modify_qp_to_rts(void)
+{
+  // Move the QP from RTR to RTS; returns 0 on success, -1 if ibv_modify_qp() fails.
+  ibv_qp_attr qpa;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_RTS;
+  /*
+   * How long to wait before retrying if packet lost or server dead.
+   * Supposedly the timeout is 4.096us*2^timeout.  However, the actual
+   * timeout appears to be 4.096us*2^(timeout+1), so the setting below
+   * (0x12 = 18) is about 1.07s nominally, or about 2.1s if that holds.
+   */
+  qpa.timeout = 0x12;
+  // How many times to retry after timeouts before giving up.
+  qpa.retry_cnt = 7;
+  /*
+   * How many times to retry after RNR (receiver not ready) condition
+   * before giving up. Occurs when the remote side has not yet posted
+   * a receive request.
+   */
+  qpa.rnr_retry = 7; // 7 is infinite retry.
+  qpa.sq_psn = local_cm_meta.psn; // send PSN is the local PSN recorded in local_cm_meta
+  qpa.max_rd_atomic = 1;
+
+  int attr_mask = IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC; // one flag per attribute assigned above
+  int r = ibv_modify_qp(qp, &qpa, attr_mask);
+  if (r) {
+    lderr(cct) << __func__ << " failed to transition to RTS state: " << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  ldout(cct, 20) << __func__ << " transition to RTS state successfully." << dendl;
+  return 0;
+}
+
+int Infiniband::QueuePair::modify_qp_to_rtr(void)
+{
+  // Move the QP from INIT to RTR using the peer data in peer_cm_meta; returns 0 on success, -1 if ibv_modify_qp() fails.
+  ibv_qp_attr qpa;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_RTR;
+  qpa.path_mtu = IBV_MTU_1024;
+  qpa.dest_qp_num = peer_cm_meta.local_qpn; // the QP number the peer reported as its own
+  qpa.rq_psn = peer_cm_meta.psn;
+  qpa.max_dest_rd_atomic = 1;
+  qpa.min_rnr_timer = 0x12;
+  qpa.ah_attr.is_global = 1;
+  qpa.ah_attr.grh.hop_limit = 6;
+  qpa.ah_attr.grh.dgid = peer_cm_meta.gid;
+  qpa.ah_attr.grh.sgid_index = infiniband.get_device()->get_gid_idx();
+  qpa.ah_attr.grh.traffic_class = cct->_conf->ms_async_rdma_dscp;
+  // grh.flow_label is left at the 0 written by the memset above
+
+  qpa.ah_attr.dlid = peer_cm_meta.lid;
+  qpa.ah_attr.sl = cct->_conf->ms_async_rdma_sl;
+  qpa.ah_attr.src_path_bits = 0;
+  qpa.ah_attr.port_num = (uint8_t)(ib_physical_port);
+
+  ldout(cct, 20) << __func__ << " Choosing gid_index " << (int)qpa.ah_attr.grh.sgid_index << ", sl " << (int)qpa.ah_attr.sl << dendl;
+
+  int attr_mask = IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC; // IBV_QP_AV covers the ah_attr fields set above
+
+  int r = ibv_modify_qp(qp, &qpa, attr_mask);
+  if (r) {
+    lderr(cct) << __func__ << " failed to transition to RTR state: " << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  ldout(cct, 20) << __func__ << " transition to RTR state successfully." << dendl;
+  return 0;
+}
+
+// Moves the QP from RESET to INIT; which attributes are applied depends on the QP type.
+// Returns 0 on success and -1 when ibv_modify_qp() fails.
+int Infiniband::QueuePair::modify_qp_to_init(void) {
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ibv_qp_attr attr;
+  memset(&attr, 0, sizeof(attr));
+  attr.qp_state = IBV_QPS_INIT;
+  attr.pkey_index = 0;
+  attr.port_num = (uint8_t)(ib_physical_port);
+  attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
+  attr.qkey = q_key;
+
+  // State and port are applied for every type; RC and UD add their own attributes.
+  int attr_mask = IBV_QP_STATE | IBV_QP_PORT;
+  if (type == IBV_QPT_RC) {
+    // reliable connection: access flags and pkey index
+    attr_mask |= IBV_QP_ACCESS_FLAGS;
+    attr_mask |= IBV_QP_PKEY_INDEX;
+  } else if (type == IBV_QPT_UD) {
+    // unreliable datagram: qkey and pkey index
+    attr_mask |= IBV_QP_QKEY;
+    attr_mask |= IBV_QP_PKEY_INDEX;
+  } else if (type != IBV_QPT_RAW_PACKET) {
+    // anything other than RC, UD or RAW_PACKET is rejected
+    ceph_abort();
+  }
+
+  if (ibv_modify_qp(qp, &attr, attr_mask) != 0) {
+    lderr(cct) << __func__ << " failed to switch to INIT state Queue Pair, qp number: " << qp->qp_num
+               << " Error: " << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  ldout(cct, 20) << __func__ << " successfully switch to INIT state Queue Pair, qp number: " << qp->qp_num << dendl;
+  return 0;
+}
+
+// Creates the verbs QP (ibv_create_qp, or rdma_create_qp when ms_async_rdma_cm is set), fills local_cm_meta
+// and, when no SRQ is used, posts the initial receive buffers. Returns 0 on success, -1 on failure.
+int Infiniband::QueuePair::init() {
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  ibv_qp_init_attr qpia;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&qpia, 0, sizeof(qpia));
+  qpia.send_cq = txcq->get_cq();
+  qpia.recv_cq = rxcq->get_cq();
+  if (srq) {
+    qpia.srq = srq;                      // use the same shared receive queue
+  } else {
+    qpia.cap.max_recv_wr = max_recv_wr;
+    qpia.cap.max_recv_sge = 1;
+  }
+  qpia.cap.max_send_wr  = max_send_wr; // max outstanding send requests
+  qpia.cap.max_send_sge = 1;           // max send scatter-gather elements
+  qpia.cap.max_inline_data = MAX_INLINE_DATA;          // max bytes of immediate data on send q
+  qpia.qp_type = type;                 // RC, UC, UD, or XRC
+  qpia.sq_sig_all = 0;                 // only generate CQEs on requested WQEs
+
+  if (!cct->_conf->ms_async_rdma_cm) {
+    qp = ibv_create_qp(pd, &qpia);
+    if (qp == nullptr) {
+      const int err = errno;  // captured first: the logging below may overwrite errno
+      lderr(cct) << __func__ << " failed to create queue pair" << cpp_strerror(err) << dendl;
+      if (err == ENOMEM) {
+        lderr(cct) << __func__ << " try reducing ms_async_rdma_receive_queue_length, "
+                                  " ms_async_rdma_send_buffers or" " ms_async_rdma_buffer_size" << dendl;
+      }
+      return -1;
+    }
+    if (modify_qp_to_init() != 0) {
+      ibv_destroy_qp(qp);
+      qp = nullptr;  // do not keep a handle to the destroyed QP around
+      return -1;
+    }
+  } else {
+    ceph_assert(cm_id->verbs == pd->context);
+    if (rdma_create_qp(cm_id, pd, &qpia)) {
+      lderr(cct) << __func__ << " failed to create queue pair with rdmacm library" << cpp_strerror(errno) << dendl;
+      return -1;
+    }
+    qp = cm_id->qp;
+  }
+  ldout(cct, 20) << __func__ << " successfully create queue pair: " << "qp=" << qp << dendl;
+  local_cm_meta.local_qpn = get_local_qp_number();
+  local_cm_meta.psn = get_initial_psn();
+  local_cm_meta.lid = infiniband.get_lid();
+  local_cm_meta.peer_qpn = 0;
+  local_cm_meta.gid = infiniband.get_gid();
+  if (!srq) {
+    int rq_wrs = infiniband.post_chunks_to_rq(max_recv_wr, this);
+    if (rq_wrs  == 0) {
+      lderr(cct) << __func__ << " intialize no SRQ Queue Pair, qp number: " << qp->qp_num
+                 << " fatal error: can't post SQ WR " << dendl;
+      return -1;
+    }
+    ldout(cct, 20) << __func__ << " initialize no SRQ Queue Pair, qp number: "
+                   << qp->qp_num << " post SQ WR " << rq_wrs << dendl;
+  }
+  return 0;
+}
+
+// Decodes the 32 hex digits at wgid, eight per 32-bit word, into the 16 raw bytes of cm_meta_data->gid.
+void Infiniband::QueuePair::wire_gid_to_gid(const char *wgid, ib_cm_meta_t* cm_meta_data) {
+  char tmp[9];
+  tmp[8] = 0;
+  for (int i = 0; i < 4; ++i) {
+    uint32_t v32 = 0;  // stays 0 when the group is not valid hex, instead of being read uninitialized
+    memcpy(tmp, wgid + i * 8, 8);
+    sscanf(tmp, "%x", &v32);
+    v32 = ntohl(v32);
+    memcpy(&cm_meta_data->gid.raw[i * 4], &v32, sizeof(v32));  // byte copy: no alignment or aliasing assumptions
+  }
+}
+
+// Formats the 16 raw GID bytes as 32 hex digits plus a terminating NUL, so wgid needs room for 33 chars.
+void Infiniband::QueuePair::gid_to_wire_gid(const ib_cm_meta_t& cm_meta_data, char wgid[]) {
+  for (int word = 0; word < 4; ++word)
+    sprintf(wgid + word * 8, "%08x", htonl(*(uint32_t *)(cm_meta_data.gid.raw + word * 4)));
+}
+
+/*
+ * Read the peer's connection metadata from socket_fd into peer_cm_meta.
+ * > 0: a complete, well-formed message was read
+ *   0: read() returned 0 (treated as a disconnect message)
+ * < 0: -errno from read(), or -EINVAL (short/malformed message, injected failure)
+ */
+int Infiniband::QueuePair::recv_cm_meta(CephContext *cct, int socket_fd) {
+  char msg[TCP_MSG_LEN];
+  char gid[33];
+  ssize_t r = ::read(socket_fd, &msg, sizeof(msg));
+  // Drop incoming qpt
+  if (cct->_conf->ms_inject_socket_failures && socket_fd >= 0 &&
+      rand() % cct->_conf->ms_inject_socket_failures == 0) {
+    ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+    return -EINVAL;
+  }
+  if (r < 0) {
+    r = -errno;
+    lderr(cct) << __func__ << " got error " << r << ": " << cpp_strerror(r) << dendl;
+  } else if (r == 0) { // valid disconnect message of length 0
+    ldout(cct, 10) << __func__ << " got disconnect message " << dendl;
+  } else if ((size_t)r != sizeof(msg)) { // invalid message
+    ldout(cct, 1) << __func__ << " got bad length (" << r << ") " << dendl;
+    r = -EINVAL;
+  } else { // full-length message; the content is peer-controlled, so parse it defensively
+    msg[sizeof(msg) - 1] = '\0'; // keeps sscanf() inside msg; %32s below keeps the GID inside gid[33]
+    if (sscanf(msg, "%hx:%x:%x:%x:%32s", &(peer_cm_meta.lid), &(peer_cm_meta.local_qpn), &(peer_cm_meta.psn), &(peer_cm_meta.peer_qpn), gid) != 5 || strlen(gid) != 32) {
+      ldout(cct, 1) << __func__ << " got malformed message" << dendl;
+      return -EINVAL;
+    }
+    wire_gid_to_gid(gid, &peer_cm_meta);
+    ldout(cct, 5) << __func__ << " recevd: " << peer_cm_meta.lid << ", " << peer_cm_meta.local_qpn << ", " << peer_cm_meta.psn << ", " << peer_cm_meta.peer_qpn << ", " << gid << dendl;
+  }
+  return r;
+}
+
+// Sends local_cm_meta to the peer as one fixed-size text message (TCP_MSG_LEN bytes, NUL included).
+// Returns 0 on success and a negative errno value on failure; a short write yields -EINVAL.
+int Infiniband::QueuePair::send_cm_meta(CephContext *cct, int socket_fd) {
+  int retry = 0;
+  int err = 0;
+  ssize_t r;
+  char msg[TCP_MSG_LEN];
+  char gid[33];
+retry:
+  gid_to_wire_gid(local_cm_meta, gid);
+  snprintf(msg, sizeof(msg), "%04x:%08x:%08x:%08x:%s", local_cm_meta.lid, local_cm_meta.local_qpn, local_cm_meta.psn, local_cm_meta.peer_qpn, gid);
+  ldout(cct, 10) << __func__ << " sending: " << local_cm_meta.lid << ", " << local_cm_meta.local_qpn
+                 << ", " << local_cm_meta.psn << ", " << local_cm_meta.peer_qpn << ", "  << gid  << dendl;
+  r = ::write(socket_fd, msg, sizeof(msg));
+  err = errno; // saved right away: the logging below may overwrite errno
+  // Drop incoming qpt
+  if (cct->_conf->ms_inject_socket_failures && socket_fd >= 0 && rand() % cct->_conf->ms_inject_socket_failures == 0) {
+    ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+    return -EINVAL;
+  }
+  if ((size_t)r != sizeof(msg)) {
+    // FIXME need to handle EAGAIN instead of retry
+    if (r < 0 && (err == EINTR || err == EAGAIN) && retry < 3) {
+      retry++;
+      goto retry;
+    }
+    if (r < 0) {
+      lderr(cct) << __func__ << " send returned error " << err << ": " << cpp_strerror(err) << dendl;
+      return -err;
+    }
+    // errno says nothing about a short write; -errno could even be 0 here and read as success
+    lderr(cct) << __func__ << " send got bad length (" << r << ") " << dendl;
+    return -EINVAL;
+  }
+  return 0;
+}
+
+/**
+ * Switch QP to ERROR state and then post a beacon to be able to drain all
+ * WCEs and then safely destroy QP. See RDMADispatcher::handle_tx_event()
+ * for details.
+ *
+ * \return
+ *      -1 if the QueuePair can't switch to ERROR, a negative errno value
+ *      if the beacon can't be posted, 0 for success or if already dead.
+ */
+int Infiniband::QueuePair::to_dead()
+{
+  if (dead)
+    return 0;
+
+  if (modify_qp_to_error()) {
+    return -1;
+  }
+  ldout(cct, 20) << __func__ << " force trigger error state Queue Pair, qp number: " << local_cm_meta.local_qpn
+                 << " bound remote QueuePair, qp number: " << local_cm_meta.peer_qpn << dendl;
+  struct ibv_send_wr *bad_wr = nullptr, beacon;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&beacon, 0, sizeof(beacon));
+  beacon.wr_id = BEACON_WRID;
+  beacon.opcode = IBV_WR_SEND;
+  beacon.send_flags = IBV_SEND_SIGNALED;
+  // ibv_post_send() hands back the error code itself; errno may be stale or changed by the logging.
+  const int r = ibv_post_send(qp, &beacon, &bad_wr);
+  if (r) {
+    lderr(cct) << __func__ << " failed to send a beacon: " << cpp_strerror(r) << dendl;
+    return r > 0 ? -r : r;  // always negative, never 0 on failure
+  }
+  ldout(cct, 20) << __func__ << " trigger error state Queue Pair, qp number: " << local_cm_meta.local_qpn << " Beacon sent " << dendl;
+  dead = true;
+  return 0;
+}
+
+// Queries the QP for its destination QP number and stores it in *rqp when rqp is non-null.
+// Returns 0 on success and -1 when ibv_query_qp() fails.
+int Infiniband::QueuePair::get_remote_qp_number(uint32_t *rqp) const {
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+  if (ibv_query_qp(qp, &attr, IBV_QP_DEST_QPN, &init_attr) != 0) {
+    lderr(cct) << __func__ << " failed to query qp: "
+      << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (rqp != nullptr) {
+    *rqp = attr.dest_qp_num;
+  }
+  return 0;
+}
+
+/**
+ * Query the destination LID from this QueuePair's address vector (IBV_QP_AV)
+ * and store it in *lid when lid is non-null. LIDs are "local IDs" in
+ * infiniband terminology: short, locally routable addresses.
+ * Returns 0 on success and -1 when ibv_query_qp() fails.
+ */
+int Infiniband::QueuePair::get_remote_lid(uint16_t *lid) const {
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  if (ibv_query_qp(qp, &attr, IBV_QP_AV, &init_attr) != 0) {
+    lderr(cct) << __func__ << " failed to query qp: "
+      << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (lid != nullptr) {
+    *lid = attr.ah_attr.dlid;
+  }
+  return 0;
+}
+
+/**
+ * Get the state of a QueuePair: the qp_state value reported by
+ * ibv_query_qp(), or -1 when that query fails.
+ */
+int Infiniband::QueuePair::get_state() const {
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  const int rc = ibv_query_qp(qp, &attr, IBV_QP_STATE, &init_attr);
+  if (rc == 0)
+    return attr.qp_state;
+
+  lderr(cct) << __func__ << " failed to get state: "
+    << cpp_strerror(errno) << dendl;
+  return -1;
+}
+
+// Starts with no verbs channel and no bound CQ; init() creates the channel.
+Infiniband::CompletionChannel::CompletionChannel(CephContext *c, Infiniband &ib)
+  : cct(c), infiniband(ib), channel(nullptr), cq(nullptr), cq_events_that_need_ack(0)
+{}
+
+// Destroys the completion channel if one exists; a failed destroy is logged and then asserted on.
+Infiniband::CompletionChannel::~CompletionChannel() {
+  if (channel) {
+    int r = ibv_destroy_comp_channel(channel);
+    if (r != 0)  // failure comes back as a non-zero errno value, so `r < 0` would never log it
+      lderr(cct) << __func__ << " failed to destroy cc: " << cpp_strerror(r) << dendl;
+    ceph_assert(r == 0);
+  }
+}
+
+// Creates the completion channel and switches its fd to non-blocking. Returns 0 on success, -1 on failure.
+int Infiniband::CompletionChannel::init() {
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  channel = ibv_create_comp_channel(infiniband.device->ctxt);
+  if (!channel) {
+    lderr(cct) << __func__ << " failed to create receive completion channel: " << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  int rc = ceph::NetHandler(cct).set_nonblock(channel->fd);
+  if (rc < 0) {
+    ibv_destroy_comp_channel(channel);
+    channel = nullptr;  // the destructor tests `channel`; cleared so it is not destroyed a second time
+    return -1;
+  }
+  return 0;
+}
+
+// Acknowledges every CQ event counted so far on the bound cq, then resets the counter.
+void Infiniband::CompletionChannel::ack_events() {
+  ibv_ack_cq_events(cq, cq_events_that_need_ack);
+  cq_events_that_need_ack = 0;
+}
+
+bool Infiniband::CompletionChannel::get_cq_event()
+{ // Fetches one CQ event from the channel; returns false when none could be retrieved.
+  ibv_cq *cq = NULL; // filled in by ibv_get_cq_event(); this local shadows the member cq
+  void *ev_ctx;
+  if (ibv_get_cq_event(channel, &cq, &ev_ctx)) {
+    if (errno != EAGAIN && errno != EINTR) // those two are not logged
+      lderr(cct) << __func__ << " failed to retrieve CQ event: "
+                 << cpp_strerror(errno) << dendl;
+    return false;
+  }
+
+  /* accumulate number of cq events that need to
+   * be acked, and periodically ack them
+   */
+  if (++cq_events_that_need_ack == MAX_ACK_EVENT) {
+    ldout(cct, 20) << __func__ << " ack aq events." << dendl;
+    ibv_ack_cq_events(cq, MAX_ACK_EVENT);
+    cq_events_that_need_ack = 0;
+  }
+
+  return true;
+}
+
+
+// Destroys the completion queue if one exists; a failed destroy is logged and then asserted on.
+Infiniband::CompletionQueue::~CompletionQueue() {
+  if (cq) {
+    int r = ibv_destroy_cq(cq);
+    if (r != 0)  // failure comes back as a non-zero errno value, so `r < 0` would never log it
+      lderr(cct) << __func__ << " failed to destroy cq: " << cpp_strerror(r) << dendl;
+    ceph_assert(r == 0);
+  }
+}
+
+// Creates the CQ on the completion channel, arms its notification and binds it to the channel.
+// Returns 0 on success and -1 on failure; a CQ that cannot be armed is destroyed again.
+int Infiniband::CompletionQueue::init() {
+  cq = ibv_create_cq(infiniband.device->ctxt, queue_depth, this, channel->get_channel(), 0);
+  if (cq == nullptr) {
+    lderr(cct) << __func__ << " failed to create receive completion queue: "
+      << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  const bool armed = (ibv_req_notify_cq(cq, 0) == 0);
+  if (!armed) {
+    lderr(cct) << __func__ << " ibv_req_notify_cq failed: " << cpp_strerror(errno) << dendl;
+    ibv_destroy_cq(cq);
+    cq = nullptr;
+    return -1;
+  }
+  channel->bind_cq(cq);
+  ldout(cct, 20) << __func__ << " successfully create cq=" << cq << dendl;
+  return 0;
+}
+
+// Re-arms CQ notification and returns the ibv_req_notify_cq() result. NOTE: solicite_only is not consulted; 0 is always passed.
+int Infiniband::CompletionQueue::rearm_notify(bool solicite_only) {
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  int r = ibv_req_notify_cq(cq, 0);
+  if (r != 0)  // failure comes back as a non-zero errno value, so `r < 0` would miss it
+    lderr(cct) << __func__ << " failed to notify cq: " << cpp_strerror(r) << dendl;
+  return r;
+}
+
+// Polls up to num_entries completions into ret_wc_array; returns the ibv_poll_cq() count, or -1 when it is negative.
+int Infiniband::CompletionQueue::poll_cq(int num_entries, ibv_wc *ret_wc_array) {
+  const int polled = ibv_poll_cq(cq, num_entries, ret_wc_array);
+  if (polled >= 0)
+    return polled;
+  lderr(cct) << __func__ << " poll_completion_queue occur met error: "
+    << cpp_strerror(errno) << dendl;
+  return -1;
+}
+
+
+// Allocates a protection domain on the device's verbs context; aborts when the allocation fails.
+Infiniband::ProtectionDomain::ProtectionDomain(CephContext *cct, Device *device)
+  : pd(ibv_alloc_pd(device->ctxt)) {
+  if (!pd) {
+    lderr(cct) << __func__ << " failed to allocate infiniband protection domain: " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+}
+
+// Releases the protection domain; the result of ibv_dealloc_pd() is not checked.
+Infiniband::ProtectionDomain::~ProtectionDomain() {
+  ibv_dealloc_pd(pd);
+}
+
+
+// Stores the arguments in the members of the same name; nothing is allocated or registered here.
+Infiniband::MemoryManager::Chunk::Chunk(ibv_mr* m, uint32_t bytes, char* buffer,
+    uint32_t offset, uint32_t bound, uint32_t lkey, QueuePair* qp)
+  : mr(m), qp(qp), lkey(lkey), bytes(bytes), offset(offset), bound(bound), buffer(buffer)
+{}
+
+// Intentionally empty: destroying a Chunk releases neither its memory
+// region nor its buffer.
+Infiniband::MemoryManager::Chunk::~Chunk() {}
+
+// Current position inside the buffer: advanced by read()/write(), zeroed by the reset/prepare calls.
+uint32_t Infiniband::MemoryManager::Chunk::get_offset() {
+  return offset;
+}
+
+// Number of bytes between the current offset and bound.
+uint32_t Infiniband::MemoryManager::Chunk::get_size() const {
+  return bound - offset;
+}
+
+// Marks the first b bytes of the buffer as readable: bound becomes b and offset restarts at 0.
+void Infiniband::MemoryManager::Chunk::prepare_read(uint32_t b) {
+  bound = b;
+  offset = 0;
+}
+
+// End position used by read() and get_size(); set by prepare_read() and the reset calls.
+uint32_t Infiniband::MemoryManager::Chunk::get_bound() {
+  return bound;
+}
+
+// Copies up to len of the bytes between offset and bound into buf, advances offset, and returns the count copied.
+uint32_t Infiniband::MemoryManager::Chunk::read(char* buf, uint32_t len) {
+  const uint32_t available = get_size();
+  const uint32_t n = (len < available) ? len : available;
+  memcpy(buf, buffer + offset, n);
+  offset += n;
+  return n;
+}
+
+// Appends up to len bytes from buf at offset, limited by the room left (bytes - offset); returns the count written.
+uint32_t Infiniband::MemoryManager::Chunk::write(char* buf, uint32_t len) {
+  const uint32_t n = (len < bytes - offset) ? len : (bytes - offset);
+  memcpy(buffer + offset, buf, n);
+  offset += n;
+  return n;
+}
+
+// True once offset has reached the buffer capacity (bytes).
+bool Infiniband::MemoryManager::Chunk::full() {
+  return bytes == offset;
+}
+
+// Empties the chunk for reading: offset and bound both go back to 0.
+void Infiniband::MemoryManager::Chunk::reset_read_chunk() {
+  bound = 0;
+  offset = 0;
+}
+
+// Readies the chunk for writing from the start: offset 0, bound at the full capacity (bytes).
+void Infiniband::MemoryManager::Chunk::reset_write_chunk() {
+  bound = bytes;
+  offset = 0;
+}
+
+// Remembers the owning manager and the per-chunk buffer size; fill() does the allocation.
+Infiniband::MemoryManager::Cluster::Cluster(MemoryManager& m, uint32_t s)
+  : manager(m), buffer_size(s)
+{}
+
+// Deregisters the memory region shared by the chunks, destroys the chunks in place, then frees both allocations.
+Infiniband::MemoryManager::Cluster::~Cluster() {
+  int r = ibv_dereg_mr(chunk_base->mr);
+  ceph_assert(r == 0);
+
+  // fill() built the chunks with placement new, so their destructors are run by hand.
+  for (decltype(num_chunk) i = 0; i < num_chunk; ++i) {
+    chunk_base[i].~Chunk();
+  }
+  ::free(chunk_base);
+  manager.free(base);
+}
+
+// Allocates num buffers of buffer_size bytes as one registered region and builds a Chunk for each; returns 0.
+int Infiniband::MemoryManager::Cluster::fill(uint32_t num) {
+  ceph_assert(!base);
+  num_chunk = num;
+  uint32_t bytes = buffer_size * num;
+
+  base = (char*)manager.malloc(bytes);
+  ceph_assert(base);  // checked before any pointer arithmetic on base
+  end = base + bytes;
+  chunk_base = static_cast<Chunk*>(::malloc(sizeof(Chunk) * num));
+  ceph_assert(chunk_base);  // memset() below must not be handed a failed allocation
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(static_cast<void*>(chunk_base), 0, sizeof(Chunk) * num);
+  free_chunks.reserve(num);
+  ibv_mr* m = ibv_reg_mr(manager.pd->pd, base, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+  ceph_assert(m);
+  Chunk* chunk = chunk_base;
+  for (uint32_t offset = 0; offset < bytes; offset += buffer_size){
+    new(chunk) Chunk(m, buffer_size, base + offset, 0, buffer_size, m->lkey);
+    free_chunks.push_back(chunk++);
+  }
+  return 0;
+}
+
+// Puts the given chunks back on the free list under the cluster lock, resetting each for writing first.
+void Infiniband::MemoryManager::Cluster::take_back(std::vector<Chunk*> &ck) {
+  std::lock_guard l{lock};
+  for (Chunk* returned : ck) {
+    returned->reset_write_chunk();
+    free_chunks.push_back(returned);
+  }
+}
+
+// Appends enough free chunks to cover block_size bytes (rounded up, capped by what is free); returns how many were taken.
+int Infiniband::MemoryManager::Cluster::get_buffers(std::vector<Chunk*> &chunks, size_t block_size) {
+  std::lock_guard l{lock};
+  uint32_t wanted = (block_size + buffer_size - 1) / buffer_size;
+  if (free_chunks.size() < wanted)
+    wanted = free_chunks.size();
+  uint32_t taken = 0;
+  for (; taken < wanted; ++taken) {
+    chunks.push_back(free_chunks.back());
+    free_chunks.pop_back();
+  }
+  return taken;
+}
+
+// Tells whether nbufs more buffers fit under ms_async_rdma_receive_buffers.
+// A limit of 0 or less means unlimited; a refusal is logged.
+bool Infiniband::MemoryManager::MemPoolContext::can_alloc(unsigned nbufs) {
+  const auto& limit = manager->cct->_conf->ms_async_rdma_receive_buffers;
+  if (limit <= 0)  /* unlimited */
+    return true;
+
+  const bool fits = n_bufs_allocated + nbufs <= (unsigned)limit;
+  if (!fits) {
+    lderr(manager->cct) << __func__ << " WARNING: OUT OF RX BUFFERS: allocated: " <<
+        n_bufs_allocated << " requested: " << nbufs <<
+        " limit: " << limit << dendl;
+  }
+  return fits;
+}
+
+// Remembers the perf counters to report into and, when non-null, publishes the current buffer count.
+void Infiniband::MemoryManager::MemPoolContext::set_stat_logger(PerfCounters *logger) {
+  perf_logger = logger;
+  if (logger) perf_logger->set(l_msgr_rdma_rx_bufs_total, n_bufs_allocated);
+}
+
+// Adjusts the allocated-buffer count by nbufs (which may be negative) and mirrors the
+// change into the perf counter once a logger has been set.
+void Infiniband::MemoryManager::MemPoolContext::update_stats(int nbufs) {
+  n_bufs_allocated += nbufs;
+  if (perf_logger == nullptr)
+    return;
+
+  if (nbufs <= 0) {
+    perf_logger->dec(l_msgr_rdma_rx_bufs_total, -nbufs);
+  } else {
+    perf_logger->inc(l_msgr_rdma_rx_bufs_total, nbufs);
+  }
+}
+
+// Allocates from the underlying boost pool while ctx is installed through PoolAllocator::with_context();
+// this will trigger pool expansion via PoolAllocator::malloc().
+void *Infiniband::MemoryManager::mem_pool::slow_malloc() {
+  return PoolAllocator::with_context(ctx, [this] {
+    return boost::pool<PoolAllocator>::malloc();
+  });
+}
+
+Infiniband::MemoryManager::MemPoolContext*
+Infiniband::MemoryManager::PoolAllocator::g_ctx = nullptr; // PoolAllocator::malloc() asserts this is non-null and reads its manager
+
+// Returns the one mutex shared by PoolAllocator users; it is created on first call.
+ceph::mutex& Infiniband::MemoryManager::PoolAllocator::get_lock() {
+  // lock is taken by mem_pool::slow_malloc() (per the original note) and by PoolAllocator::free()
+  static ceph::mutex lock = ceph::make_mutex("pool-alloc-lock");
+  return lock;
+}
+
+char *Infiniband::MemoryManager::PoolAllocator::malloc(const size_type block_size)
+{ // Allocates and RDMA-registers one pool block behind a mem_info header; returns the chunk area, or NULL on refusal/failure.
+  ceph_assert(g_ctx);
+  MemoryManager *manager = g_ctx->manager;
+  CephContext *cct = manager->cct;
+  size_t chunk_buffer_size = sizeof(Chunk) + cct->_conf->ms_async_rdma_buffer_size; // one slot: Chunk object plus its data
+  size_t chunk_buffer_number = block_size / chunk_buffer_size; // whole slots that fit in the block
+
+  if (!g_ctx->can_alloc(chunk_buffer_number))
+    return NULL;
+
+  mem_info *minfo= static_cast<mem_info *>(manager->malloc(block_size + sizeof(mem_info)));
+  if (!minfo) {
+    lderr(cct) << __func__ << " failed to allocate " << chunk_buffer_number << " buffers "
+      " Its block size is : " << block_size + sizeof(mem_info) << dendl;
+    return NULL;
+  }
+
+  minfo->mr = ibv_reg_mr(manager->pd->pd, minfo->chunks, block_size, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); // registers the chunk area only, not the header
+  if (minfo->mr == NULL) {
+    lderr(cct) << __func__ << " failed to do rdma memory registration " << block_size << " bytes. "
+      " relase allocated memory now." << dendl;
+    manager->free(minfo);
+    return NULL;
+  }
+
+  minfo->nbufs = chunk_buffer_number;
+  // save this chunk context; PoolAllocator::free() reads it back through the header
+  minfo->ctx   = g_ctx;
+
+  // note that the memory can be allocated before perf logger is set
+  g_ctx->update_stats(chunk_buffer_number);
+
+  /* initialize chunks: one Chunk is constructed in place at the start of every slot */
+  Chunk *chunk = minfo->chunks;
+  for (unsigned i = 0; i < chunk_buffer_number; i++) {
+    new(chunk) Chunk(minfo->mr, cct->_conf->ms_async_rdma_buffer_size, chunk->data, 0, 0, minfo->mr->lkey);
+    chunk = reinterpret_cast<Chunk *>(reinterpret_cast<char *>(chunk) + chunk_buffer_size); // step one slot, in bytes
+  }
+
+  return reinterpret_cast<char *>(minfo->chunks);
+}
+
+
+void Infiniband::MemoryManager::PoolAllocator::free(char * const block)
+{ // Undoes PoolAllocator::malloc() for one block: stats, memory registration, then the allocation itself.
+  mem_info *m;
+  std::lock_guard l{get_lock()};
+  // block is the address malloc() returned, i.e. the chunks member of a mem_info
+  Chunk *mem_info_chunk = reinterpret_cast<Chunk *>(block);
+  m = reinterpret_cast<mem_info *>(reinterpret_cast<char *>(mem_info_chunk) - offsetof(mem_info, chunks)); // step back to the enclosing header
+  m->ctx->update_stats(-m->nbufs);
+  ibv_dereg_mr(m->mr); // return value is not checked
+  m->ctx->manager->free(m);
+}
+
+Infiniband::MemoryManager::MemoryManager(CephContext *c, Device *d, ProtectionDomain *p)
+  : cct(c), device(d), pd(p),
+    rxbuf_pool_ctx(this),
+    rxbuf_pool(&rxbuf_pool_ctx, sizeof(Chunk) + c->_conf->ms_async_rdma_buffer_size, // presumably (context, element size, initial count, max count) — confirm against mem_pool
+               c->_conf->ms_async_rdma_receive_buffers > 0 ?
+                  // if possible make initial pool size 2 * receive_queue_len
+                  // that way there will be no pool expansion upon receive of the
+                  // first packet; the size is capped at ms_async_rdma_receive_buffers.
+                  (c->_conf->ms_async_rdma_receive_buffers < 2 * c->_conf->ms_async_rdma_receive_queue_len ?
+                   c->_conf->ms_async_rdma_receive_buffers :  2 * c->_conf->ms_async_rdma_receive_queue_len) :
+                  // rx pool is infinite, we can set any initial size that we want
+                   2 * c->_conf->ms_async_rdma_receive_queue_len,
+                   device->device_attr.max_mr_size / (sizeof(Chunk) + cct->_conf->ms_async_rdma_buffer_size)) // NOTE(review): reads the members device and cct, so they must be declared before rxbuf_pool — confirm in the header
+{
+}
+
+// Frees the send cluster created by create_tx_pool(), if there is one.
+Infiniband::MemoryManager::~MemoryManager() {
+  // deleting a null pointer is a no-op, so no explicit check is needed
+  delete send;
+}
+
+void* Infiniband::MemoryManager::huge_pages_malloc(size_t size)
+{ // Returns size usable bytes that follow a HUGE_PAGE_SIZE_2MB header, or NULL if both mmap and malloc fail.
+  size_t real_size = ALIGN_TO_PAGE_2MB(size) + HUGE_PAGE_SIZE_2MB; // payload plus the header area
+  char *ptr = (char *)mmap(NULL, real_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB, -1, 0);
+  if (ptr == MAP_FAILED) { // no huge-page mapping: fall back to the heap
+    ptr = (char *)std::malloc(real_size);
+    if (ptr == NULL) return NULL;
+    real_size = 0; // 0 in the header tells huge_pages_free() to use std::free()
+  }
+  *((size_t *)ptr) = real_size; // header word: mapped length, or 0 for a malloc'ed block
+  return ptr + HUGE_PAGE_SIZE_2MB;
+}
+
+void Infiniband::MemoryManager::huge_pages_free(void *ptr)
+{ // Releases a block obtained from huge_pages_malloc(); a NULL ptr is ignored.
+  if (ptr == NULL) return;
+  void *real_ptr = (char *)ptr - HUGE_PAGE_SIZE_2MB; // step back to the header written by huge_pages_malloc()
+  size_t real_size = *((size_t *)real_ptr);
+  ceph_assert(real_size % HUGE_PAGE_SIZE_2MB == 0);
+  if (real_size != 0) // non-zero length: the block was mmap'ed
+    munmap(real_ptr, real_size);
+  else // zero marks the std::malloc() fallback
+    std::free(real_ptr);
+}
+
+
+// Allocates size bytes through huge_pages_malloc() when ms_async_rdma_enable_hugepage is set,
+// and through std::malloc() otherwise.
+void* Infiniband::MemoryManager::malloc(size_t size) {
+  const bool use_hugepage = cct->_conf->ms_async_rdma_enable_hugepage;
+  return use_hugepage ? huge_pages_malloc(size)
+                      : std::malloc(size);
+}
+
+// Releases ptr with std::free(), or with huge_pages_free() when ms_async_rdma_enable_hugepage is set.
+void Infiniband::MemoryManager::free(void *ptr) {
+  if (!cct->_conf->ms_async_rdma_enable_hugepage)
+    std::free(ptr);
+  else
+    huge_pages_free(ptr);
+}
+
+// Builds the send-side Cluster: tx_num buffers of `size` bytes each, stored in `send`.
+void Infiniband::MemoryManager::create_tx_pool(uint32_t size, uint32_t tx_num) {
+  ceph_assert(device);
+  ceph_assert(pd);
+
+  send = new Cluster(*this, size);
+  send->fill(tx_num);
+}
+
+// Hand a batch of TX chunks back to the send cluster.
+void Infiniband::MemoryManager::return_tx(std::vector<Chunk*> &chunks)
+{
+  Cluster *tx_cluster = send;
+  tx_cluster->take_back(chunks);
+}
+
+// Ask the send cluster for TX chunks covering `bytes`; the chunks are
+// appended to `c` and the cluster's return value is passed through.
+int Infiniband::MemoryManager::get_send_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  const int rc = send->get_buffers(c, bytes);
+  return rc;
+}
+
+// Set to true at the end of verify_prereq().
+static std::atomic<bool> init_prereq = {false};
+
+/**
+ * Check and establish the process-wide prerequisites for using RDMA:
+ *  - export RDMAV_HUGEPAGES_SAFE when huge pages are enabled,
+ *  - call ibv_fork_init(),
+ *  - warn when the memlock limit is not unlimited.
+ *
+ * Failure of either of the first two steps aborts the process.
+ *
+ * \param[in] cct
+ *      Context providing the configuration and the log.
+ */
+void Infiniband::verify_prereq(CephContext *cct) {
+   int rc = 0;
+   ldout(cct, 20) << __func__ << " ms_async_rdma_enable_hugepage value is: " << cct->_conf->ms_async_rdma_enable_hugepage <<  dendl;
+   if (cct->_conf->ms_async_rdma_enable_hugepage){
+     rc =  setenv("RDMAV_HUGEPAGES_SAFE","1",1);
+     if (rc) {
+       lderr(cct) << __func__ << " failed to export RDMAV_HUGEPAGES_SAFE. On RDMA must be exported before using huge pages. Application aborts." << dendl;
+       ceph_abort();
+     }
+     // Read the variable back only after setenv() is known to have
+     // succeeded, and never stream a null pointer into the log.
+     const char *hugepages_safe = getenv("RDMAV_HUGEPAGES_SAFE");
+     ldout(cct, 0) << __func__ << " RDMAV_HUGEPAGES_SAFE is set as: " << (hugepages_safe ? hugepages_safe : "<unset>") <<  dendl;
+   }
+
+   //On RDMA MUST be called before fork
+   rc = ibv_fork_init();
+   if (rc) {
+      lderr(cct) << __func__ << " failed to call ibv_fork_init(). On RDMA must be called before fork. Application aborts." << dendl;
+      ceph_abort();
+   }
+
+   //Check ulimit
+   struct rlimit limit;
+   if (getrlimit(RLIMIT_MEMLOCK, &limit) != 0) {
+      // 'limit' is not filled in on failure, so it must not be inspected
+      lderr(cct) << __func__ << " failed to query RLIMIT_MEMLOCK; unable to verify the user memlock limit (ulimit -l)." << dendl;
+   } else if (limit.rlim_cur != RLIM_INFINITY || limit.rlim_max != RLIM_INFINITY) {
+      lderr(cct) << __func__ << "!!! WARNING !!! For RDMA to work properly user memlock (ulimit -l) must be big enough to allow large amount of registered memory."
+				  " We recommend setting this parameter to infinity" << dendl;
+   }
+   init_prereq = true;
+}
+
+// Capture the device name and port number from the configuration and make
+// sure verify_prereq() has run; the device itself is opened later in init().
+Infiniband::Infiniband(CephContext *cct)
+  : cct(cct),
+    device_name(cct->_conf->ms_async_rdma_device_name),
+    port_num( cct->_conf->ms_async_rdma_port_num)
+{
+  const bool prereq_done = init_prereq;
+  if (!prereq_done) {
+    verify_prereq(cct);
+  }
+  ldout(cct, 20) << __func__ << " constructing Infiniband..." << dendl;
+}
+
+/**
+ * One-time initialisation of the RDMA device state.
+ *
+ * Under the instance lock this opens the device list, selects the
+ * configured device and port, creates the protection domain, derives the
+ * RX/TX queue lengths from the device limits and the configuration,
+ * creates the memory manager with its TX pool and, when SRQ support is
+ * configured, creates the shared receive queue and posts the initial
+ * receive buffers to it.  Subsequent calls return immediately.
+ *
+ * Configuration errors (receive queue longer than the receive buffer
+ * count, TX pool larger than the device's maximum memory region) abort
+ * the process.
+ */
+void Infiniband::init()
+{
+  std::lock_guard l{lock};
+
+  if (initialized)
+    return;
+
+  device_list = new DeviceList(cct);
+  initialized = true;
+
+  device = device_list->get_device(device_name.c_str());
+  ceph_assert(device);
+  device->binding_port(cct, port_num);
+  ib_physical_port = device->active_port->get_port_num();
+  pd = new ProtectionDomain(cct, device);
+  ceph_assert(ceph::NetHandler(cct).set_nonblock(device->ctxt->async_fd) == 0);
+
+  support_srq = cct->_conf->ms_async_rdma_support_srq;
+  if (support_srq) {
+    ceph_assert(device->device_attr.max_srq);
+    rx_queue_len = device->device_attr.max_srq_wr;
+  }
+  else
+    rx_queue_len = device->device_attr.max_qp_wr;
+  // never use more receive WRs than the configuration asks for
+  if (rx_queue_len > cct->_conf->ms_async_rdma_receive_queue_len) {
+    rx_queue_len = cct->_conf->ms_async_rdma_receive_queue_len;
+    ldout(cct, 1) << __func__ << " assigning: " << rx_queue_len << " receive buffers" << dendl;
+  } else {
+    ldout(cct, 0) << __func__ << " using the max allowed receive buffers: " << rx_queue_len << dendl;
+  }
+
+  // check for the misconfiguration
+  if (cct->_conf->ms_async_rdma_receive_buffers > 0 &&
+      rx_queue_len > (unsigned)cct->_conf->ms_async_rdma_receive_buffers) {
+    lderr(cct) << __func__ << " rdma_receive_queue_len (" <<
+                  rx_queue_len << ") > ms_async_rdma_receive_buffers(" <<
+                  cct->_conf->ms_async_rdma_receive_buffers << ")." << dendl;
+    ceph_abort();
+  }
+
+  // Keep extra one WR for a beacon to indicate all WCEs were consumed
+  tx_queue_len = device->device_attr.max_qp_wr - 1;
+  if (tx_queue_len > cct->_conf->ms_async_rdma_send_buffers) {
+    tx_queue_len = cct->_conf->ms_async_rdma_send_buffers;
+    ldout(cct, 1) << __func__ << " assigning: " << tx_queue_len << " send buffers"  << dendl;
+  } else {
+    ldout(cct, 0) << __func__ << " using the max allowed send buffers: " << tx_queue_len << dendl;
+  }
+
+  //check for the memory region size misconfiguration
+  if ((uint64_t)cct->_conf->ms_async_rdma_buffer_size * tx_queue_len > device->device_attr.max_mr_size) {
+    lderr(cct) << __func__ << " Out of max memory region size " << dendl;
+    ceph_abort();
+  }
+
+  ldout(cct, 1) << __func__ << " device allow " << device->device_attr.max_cqe
+                << " completion entries" << dendl;
+
+  memory_manager = new MemoryManager(cct, device, pd);
+  memory_manager->create_tx_pool(cct->_conf->ms_async_rdma_buffer_size, tx_queue_len);
+
+  if (support_srq) {
+    srq = create_shared_receive_queue(rx_queue_len, MAX_SHARED_RX_SGE_COUNT);
+    // create_shared_receive_queue() returns NULL on error; stop with a clear
+    // assertion instead of handing a null SRQ to ibv_post_srq_recv()
+    ceph_assert(srq);
+    post_chunks_to_rq(rx_queue_len, NULL); //add to srq
+  }
+}
+
+// Tear down what init() created; nothing to do if init() never ran.
+Infiniband::~Infiniband()
+{
+  if (initialized) {
+    if (support_srq) {
+      ibv_destroy_srq(srq);
+    }
+    delete memory_manager;
+    delete pd;
+    delete device_list;
+  }
+}
+
+/**
+ * Create a shared receive queue on this instance's protection domain.
+ * This is a thin wrapper around ibv_create_srq().
+ *
+ * \param[in] max_wr
+ *      The max number of outstanding work requests in the SRQ.
+ * \param[in] max_sge
+ *      The max number of scatter elements per WR.
+ * \return
+ *      A valid ibv_srq pointer, or NULL on error.
+ */
+ibv_srq* Infiniband::create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge)
+{
+  ibv_srq_init_attr init_attr;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&init_attr, 0, sizeof(init_attr));
+
+  init_attr.attr.max_sge = max_sge;
+  init_attr.attr.max_wr = max_wr;
+  init_attr.srq_context = device->ctxt;
+
+  ibv_srq *created = ibv_create_srq(pd->pd, &init_attr);
+  return created;
+}
+
+// Fetch TX chunks covering `bytes` from the memory manager's send pool;
+// the manager's return value is passed through unchanged.
+int Infiniband::get_tx_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  MemoryManager *mgr = memory_manager;
+  return mgr->get_send_buffers(c, bytes);
+}
+
+/**
+ * Factory for QueuePair objects.  Prefer this over calling the QueuePair
+ * constructor directly so that derivatives of Infiniband (e.g. a
+ * MockInfiniband, if it existed) can return mocked-out QueuePair
+ * derivatives.
+ *
+ * The new QueuePair is built with this instance's physical port, SRQ and
+ * TX/RX queue lengths, then initialised.
+ *
+ * \return
+ *      QueuePair on success or NULL if init fails
+ * See QueuePair::QueuePair for parameter documentation.
+ */
+Infiniband::QueuePair* Infiniband::create_queue_pair(CephContext *cct, CompletionQueue *tx,
+    CompletionQueue* rx, ibv_qp_type type, struct rdma_cm_id *cm_id)
+{
+  Infiniband::QueuePair *created = new QueuePair(
+      cct, *this, type, ib_physical_port, srq, tx, rx, tx_queue_len, rx_queue_len, cm_id);
+
+  // init() reports success with 0
+  if (created->init() == 0)
+    return created;
+
+  delete created;
+  return nullptr;
+}
+
+/**
+ * Take up to \a rq_wr_num RX buffers from the memory manager and post
+ * them as one chained list of receive work requests, either to the shared
+ * receive queue (when SRQ support is enabled) or to \a qp's own receive
+ * queue.
+ *
+ * \param[in] rq_wr_num
+ *      Number of receive work requests to post; values <= 0 post nothing.
+ * \param[in] qp
+ *      Queue pair to post to; must be non-null when SRQ is not in use.
+ *      For a queue pair without an SRQ each chunk is also recorded on the
+ *      queue pair.
+ * \return
+ *      Number of buffers actually posted, which may be less than requested
+ *      (including 0) when the RX pool runs out of memory.
+ */
+int Infiniband::post_chunks_to_rq(int rq_wr_num, QueuePair *qp)
+{
+  // Posting an empty request list would hand uninitialised memory to the
+  // verbs layer, so there is nothing to do for a non-positive count.
+  if (rq_wr_num <= 0)
+    return 0;
+
+  // Value-initialised (all zero), so every 'next' pointer starts as null
+  // and the chain is always correctly terminated.
+  std::vector<ibv_recv_wr> rx_work_request(rq_wr_num);
+  std::vector<ibv_sge> isge(rq_wr_num);
+
+  int i = 0;
+  while (i < rq_wr_num) {
+    Chunk *chunk = get_memory_manager()->get_rx_buffer();
+    if (chunk == nullptr) {
+      lderr(cct) << __func__ << " WARNING: out of memory. Request " << rq_wr_num <<
+                 " rx buffers. Only get " << i << " rx buffers." << dendl;
+      if (i == 0)
+        return 0;
+      break; // got some buffers, so they still need posting to the receive queue
+    }
+
+    isge[i].addr = reinterpret_cast<uint64_t>(chunk->data);
+    isge[i].length = chunk->bytes;
+    isge[i].lkey = chunk->lkey;
+
+    rx_work_request[i].wr_id = reinterpret_cast<uint64_t>(chunk);// assign chunk address as work request id
+
+    if (i != 0) {
+      rx_work_request[i - 1].next = &rx_work_request[i];
+    }
+    rx_work_request[i].sg_list = &isge[i];
+    rx_work_request[i].num_sge = 1;
+
+    if (qp && !qp->get_srq()) {
+       chunk->set_qp(qp);
+       qp->add_rq_wr(chunk);
+    }
+    i++;
+  }
+
+  int ret = 0;
+  ibv_recv_wr *badworkrequest = nullptr;
+  if (support_srq) {
+    ret = ibv_post_srq_recv(srq, rx_work_request.data(), &badworkrequest);
+  } else {
+    ceph_assert(qp);
+    ret = ibv_post_recv(qp->get_qp(), rx_work_request.data(), &badworkrequest);
+  }
+
+  ceph_assert(badworkrequest == nullptr && ret == 0);
+  return i;
+}
+
+// Build and initialise a CompletionChannel; returns NULL (after deleting
+// the object) when its init() reports a non-zero result.
+Infiniband::CompletionChannel* Infiniband::create_comp_channel(CephContext *c)
+{
+  Infiniband::CompletionChannel *channel = new Infiniband::CompletionChannel(c, *this);
+  if (channel->init() == 0)
+    return channel;
+
+  delete channel;
+  return nullptr;
+}
+
+// Build and initialise a CompletionQueue of depth CQ_DEPTH attached to
+// `cc`; returns NULL (after deleting the object) when its init() reports a
+// non-zero result.
+Infiniband::CompletionQueue* Infiniband::create_comp_queue(
+    CephContext *cct, CompletionChannel *cc)
+{
+  Infiniband::CompletionQueue *queue =
+      new Infiniband::CompletionQueue(cct, *this, CQ_DEPTH, cc);
+  if (queue->init() == 0)
+    return queue;
+
+  delete queue;
+  return nullptr;
+}
+
+/**
+ * Destroy the verbs queue pair (if one was created) and give every RX
+ * chunk still recorded on this queue pair back to the memory manager.
+ */
+Infiniband::QueuePair::~QueuePair()
+{
+  // 'qp' may be null here (the null check below exists for that reason), so
+  // it must not be dereferenced unconditionally while logging; 0 is
+  // reported as the qp number in that case.
+  ldout(cct, 20) << __func__ << " destroy Queue Pair, qp number: " << (qp ? qp->qp_num : 0) << " left SQ WR " << recv_queue.size() << dendl;
+  if (qp) {
+    ldout(cct, 20) << __func__ << " destroy qp=" << qp << dendl;
+    ceph_assert(!ibv_destroy_qp(qp));
+  }
+
+  for (auto& chunk: recv_queue) {
+    infiniband.get_memory_manager()->release_rx_buffer(chunk);
+  }
+  recv_queue.clear();
+}
+
+/**
+ * Return a string representation of the `status' field from the Verbs
+ * struct `ibv_wc'.
+ *
+ * \param[in] status
+ *      The integer status obtained in ibv_wc.status.
+ * \return
+ *      A string corresponding to the given status, or a fixed
+ *      "out of range" marker for values outside
+ *      [IBV_WC_SUCCESS, IBV_WC_GENERAL_ERR].
+ */
+const char* Infiniband::wc_status_to_string(int status)
+{
+  // indexed directly by the status value
+  static const char *status_names[] = {
+      "SUCCESS",
+      "LOC_LEN_ERR",
+      "LOC_QP_OP_ERR",
+      "LOC_EEC_OP_ERR",
+      "LOC_PROT_ERR",
+      "WR_FLUSH_ERR",
+      "MW_BIND_ERR",
+      "BAD_RESP_ERR",
+      "LOC_ACCESS_ERR",
+      "REM_INV_REQ_ERR",
+      "REM_ACCESS_ERR",
+      "REM_OP_ERR",
+      "RETRY_EXC_ERR",
+      "RNR_RETRY_EXC_ERR",
+      "LOC_RDD_VIOL_ERR",
+      "REM_INV_RD_REQ_ERR",
+      "REM_ABORT_ERR",
+      "INV_EECN_ERR",
+      "INV_EEC_STATE_ERR",
+      "FATAL_ERR",
+      "RESP_TIMEOUT_ERR",
+      "GENERAL_ERR"
+  };
+
+  const bool in_range = status >= IBV_WC_SUCCESS && status <= IBV_WC_GENERAL_ERR;
+  return in_range ? status_names[status] : "<status out of range!>";
+}
+
+// Map an ibv_qp_state value to its enumerator name; any other value yields
+// the fixed " out of range." marker.
+const char* Infiniband::qp_state_string(int status) {
+  const char *name = " out of range.";
+  switch (status) {
+    case IBV_QPS_RESET: name = "IBV_QPS_RESET"; break;
+    case IBV_QPS_INIT:  name = "IBV_QPS_INIT";  break;
+    case IBV_QPS_RTR:   name = "IBV_QPS_RTR";   break;
+    case IBV_QPS_RTS:   name = "IBV_QPS_RTS";   break;
+    case IBV_QPS_SQD:   name = "IBV_QPS_SQD";   break;
+    case IBV_QPS_SQE:   name = "IBV_QPS_SQE";   break;
+    case IBV_QPS_ERR:   name = "IBV_QPS_ERR";   break;
+    default: break;
+  }
+  return name;
+}
diff --git a/src/msg/async/rdma/Infiniband.h b/src/msg/async/rdma/Infiniband.h
new file mode 100644
index 000000000..f18442e4e
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.h
@@ -0,0 +1,591 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INFINIBAND_H
+#define CEPH_INFINIBAND_H
+
+#include <boost/pool/pool.hpp>
+// need this because boost messes with ceph log/assert definitions
+#include "include/ceph_assert.h"
+
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "include/common_fwd.h"
+#include "include/int_types.h"
+#include "include/page.h"
+#include "include/scope_guard.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/ceph_mutex.h"
+#include "common/perf_counters.h"
+#include "msg/msg_types.h"
+#include "msg/async/net_handler.h"
+
+// Size of one huge page as used by this module: 2 MiB.
+#define HUGE_PAGE_SIZE_2MB (2 * 1024 * 1024)
+// Round x up to the next multiple of HUGE_PAGE_SIZE_2MB.
+#define ALIGN_TO_PAGE_2MB(x) \
+    (((x) + (HUGE_PAGE_SIZE_2MB - 1)) & ~(HUGE_PAGE_SIZE_2MB - 1))
+
+// Width in bits of a packet sequence number, and the mask selecting the
+// low PSN_LEN bits.
+#define PSN_LEN 24
+#define PSN_MSK ((1 << PSN_LEN) - 1)
+
+// NOTE(review): presumably the wr_id of the "beacon" work request for which
+// Infiniband::init() reserves one TX WR -- confirm against the users.
+#define BEACON_WRID 0xDEADBEEF
+
+// Connection-management metadata for one end of a queue pair; QueuePair
+// keeps one instance for the local side and one for the peer.
+// The struct is packed, i.e. laid out without padding between the fields.
+// NOTE(review): the per-field meanings below are inferred from the names --
+// confirm against QueuePair::send_cm_meta()/recv_cm_meta().
+struct ib_cm_meta_t {
+  uint16_t lid;        // port LID
+  uint32_t local_qpn;  // queue pair number of the side described
+  uint32_t psn;        // packet sequence number
+  uint32_t peer_qpn;   // queue pair number of the other side
+  union ibv_gid gid;   // port GID
+} __attribute__((packed));
+
+class RDMAStack;
+
+// Holds the attributes of one port of an RDMA device context and exposes
+// them through simple accessors.
+// NOTE(review): the constructor is defined elsewhere; how lid, gid and
+// gid_idx are populated is not visible here.
+class Port {
+  struct ibv_context* ctxt;
+  int port_num;
+  struct ibv_port_attr port_attr;
+  uint16_t lid;
+  int gid_idx;
+  union ibv_gid gid;
+
+ public:
+  explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
+  uint16_t get_lid() { return lid; }
+  ibv_gid  get_gid() { return gid; }
+  int get_port_num() { return port_num; }
+  // returns a pointer into this object; valid only while the Port lives
+  ibv_port_attr* get_port_attr() { return &port_attr; }
+  int get_gid_idx() { return gid_idx; }
+};
+
+
+// One RDMA device together with its open context and, once
+// binding_port() has run, the port selected for use.
+class Device {
+  ibv_device *device;
+  const char* name;
+  uint8_t  port_cnt = 0;
+
+ public:
+  explicit Device(CephContext *c, ibv_device* ib_dev);
+  explicit Device(CephContext *c, ibv_context *ib_ctx);
+
+  // The device context is closed only when a port object exists.
+  ~Device() {
+    if (!active_port)
+      return;
+    delete active_port;
+    ceph_assert(ibv_close_device(ctxt) == 0);
+  }
+
+  const char* get_name() {
+    return name;
+  }
+
+  // The accessors below forward to the active port, which must exist.
+  uint16_t get_lid() {
+    return active_port->get_lid();
+  }
+  ibv_gid get_gid() {
+    return active_port->get_gid();
+  }
+  int get_gid_idx() {
+    return active_port->get_gid_idx();
+  }
+
+  void binding_port(CephContext *c, int port_num);
+
+  struct ibv_context *ctxt;
+  ibv_device_attr device_attr;
+  Port* active_port;
+};
+
+
+// Owns the list of RDMA devices visible to the process and one Device
+// wrapper per entry.  With ms_async_rdma_cm enabled the wrappers are built
+// from the rdma_cm device contexts, otherwise from the verbs device list.
+class DeviceList {
+  struct ibv_device ** device_list;
+  struct ibv_context ** device_context_list;
+  int num;
+  Device** devices;
+ public:
+  explicit DeviceList(CephContext *cct): device_list(nullptr), device_context_list(nullptr),
+                                         num(0), devices(nullptr) {
+    device_list = ibv_get_device_list(&num);
+    ceph_assert(device_list);
+    ceph_assert(num);
+    const bool use_cm = cct->_conf->ms_async_rdma_cm;
+    if (use_cm) {
+      // Ask rdma_cm how many contexts it returned: that list is indexed
+      // below and must never be read past its end, even if it is shorter
+      // than the verbs device list.
+      int cm_num = 0;
+      device_context_list = rdma_get_devices(&cm_num);
+      ceph_assert(device_context_list);
+      if (cm_num < num)
+        num = cm_num;
+      ceph_assert(num > 0);
+    }
+    devices = new Device*[num];
+
+    for (int i = 0;i < num; ++i) {
+      if (use_cm) {
+         devices[i] = new Device(cct, device_context_list[i]);
+      } else {
+         devices[i] = new Device(cct, device_list[i]);
+      }
+    }
+  }
+
+  // The object owns raw arrays, so copying it would lead to double frees.
+  DeviceList(const DeviceList&) = delete;
+  DeviceList& operator=(const DeviceList&) = delete;
+
+  ~DeviceList() {
+    for (int i=0; i < num; ++i) {
+      delete devices[i];
+    }
+    delete []devices;
+    ibv_free_device_list(device_list);
+    // only populated when ms_async_rdma_cm was enabled
+    if (device_context_list)
+      rdma_free_devices(device_context_list);
+  }
+
+  // Look a device up by name.  An empty name matches the first device.
+  // Returns NULL when no device matches.
+  Device* get_device(const char* device_name) {
+    for (int i = 0; i < num; ++i) {
+      if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) {
+        return devices[i];
+      }
+    }
+    return NULL;
+  }
+};
+
+// stat counters
+enum {
+  l_msgr_rdma_dispatcher_first = 94000,
+
+  l_msgr_rdma_polling,
+  l_msgr_rdma_inflight_tx_chunks,
+  l_msgr_rdma_rx_bufs_in_use,
+  l_msgr_rdma_rx_bufs_total,
+
+  l_msgr_rdma_tx_total_wc,
+  l_msgr_rdma_tx_total_wc_errors,
+  l_msgr_rdma_tx_wc_retry_errors,
+  l_msgr_rdma_tx_wc_wr_flush_errors,
+
+  l_msgr_rdma_rx_total_wc,
+  l_msgr_rdma_rx_total_wc_errors,
+  l_msgr_rdma_rx_fin,
+
+  l_msgr_rdma_handshake_errors,
+
+  l_msgr_rdma_total_async_events,
+  l_msgr_rdma_async_last_wqe_events,
+
+  l_msgr_rdma_created_queue_pair,
+  l_msgr_rdma_active_queue_pair,
+
+  l_msgr_rdma_dispatcher_last,
+};
+
+// Second perf-counter index range, starting at 95000 and bracketed by
+// l_msgr_rdma_first / l_msgr_rdma_last.
+// NOTE(review): presumably the per-worker counters -- confirm against the
+// code that registers them.
+enum {
+  l_msgr_rdma_first = 95000,
+
+  l_msgr_rdma_tx_no_mem,
+  // NOTE(review): "parital" looks like a misspelling of "partial"; the
+  // identifier is kept because other files may refer to it.
+  l_msgr_rdma_tx_parital_mem,
+  l_msgr_rdma_tx_failed,
+
+  l_msgr_rdma_tx_chunks,
+  l_msgr_rdma_tx_bytes,
+  l_msgr_rdma_rx_chunks,
+  l_msgr_rdma_rx_bytes,
+  l_msgr_rdma_pending_sent_conns,
+
+  l_msgr_rdma_last,
+};
+
+class RDMADispatcher;
+
+class Infiniband {
+ public:
+  // Wrapper around a verbs protection domain handle.
+  // NOTE(review): constructor and destructor are defined elsewhere;
+  // presumably they allocate and release `pd` -- confirm.
+  class ProtectionDomain {
+   public:
+    explicit ProtectionDomain(CephContext *cct, Device *device);
+    ~ProtectionDomain();
+
+    // the underlying handle; the pointer itself cannot be reseated
+    ibv_pd* const pd;
+  };
+
+  class QueuePair;
+  class MemoryManager {
+   public:
+    // Bookkeeping header for one registered buffer.  RX chunks are posted
+    // to a receive queue using `data`, `bytes` and `lkey`
+    // (see Infiniband::post_chunks_to_rq()).
+    // NOTE(review): the member functions are defined elsewhere; the exact
+    // semantics of offset/bound are not visible here.
+    class Chunk {
+     public:
+      Chunk(ibv_mr* m, uint32_t bytes, char* buffer, uint32_t offset = 0, uint32_t bound = 0, uint32_t lkey = 0, QueuePair* qp = nullptr);
+      ~Chunk();
+
+      uint32_t get_offset();
+      uint32_t get_size() const;
+      void prepare_read(uint32_t b);
+      uint32_t get_bound();
+      uint32_t read(char* buf, uint32_t len);
+      uint32_t write(char* buf, uint32_t len);
+      bool full();
+      void reset_read_chunk();
+      void reset_write_chunk();
+      // Associate the chunk with the queue pair it is posted to, or detach it.
+      void set_qp(QueuePair *qp) { this->qp = qp; }
+      void clear_qp() { set_qp(nullptr); }
+      QueuePair* get_qp() { return qp; }
+
+     public:
+      ibv_mr* mr;
+      QueuePair *qp;
+      uint32_t lkey;
+      uint32_t bytes;
+      uint32_t offset;
+      uint32_t bound;
+      char* buffer; // TODO: remove buffer/refactor TX
+      // zero-length trailing array: must remain the last member
+      char  data[0];
+    };
+
+    // Pool of equally sized TX chunks.  Buffers live in [base, end) and
+    // their Chunk headers in the array starting at chunk_base.
+    class Cluster {
+     public:
+      Cluster(MemoryManager& m, uint32_t s);
+      ~Cluster();
+
+      int fill(uint32_t num);
+      void take_back(std::vector<Chunk*> &ck);
+      int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
+
+      // Map a buffer address to its Chunk header by dividing the offset
+      // from `base` by the buffer size.  No range check is performed.
+      Chunk *get_chunk_by_buffer(const char *c) {
+        const uint32_t slot = (c - base) / buffer_size;
+        return chunk_base + slot;
+      }
+
+      // True when `c` points inside this cluster's buffer area.
+      bool is_my_buffer(const char *c) const {
+        return !(c < base || c >= end);
+      }
+
+      // True when `c` points into this cluster's Chunk header array.
+      bool is_valid_chunk(const Chunk* c) const {
+        return !(c < chunk_base || c >= chunk_base + num_chunk);
+      }
+
+      MemoryManager& manager;
+      uint32_t buffer_size;
+      uint32_t num_chunk = 0;
+      ceph::mutex lock = ceph::make_mutex("cluster_lock");
+      std::vector<Chunk*> free_chunks;
+      char *base = nullptr;
+      char *end = nullptr;
+      Chunk* chunk_base = nullptr;
+    };
+
+    // Per-pool bookkeeping shared with PoolAllocator: the owning memory
+    // manager, the number of buffers allocated so far and an optional
+    // perf-counter sink.
+    class MemPoolContext {
+      PerfCounters *perf_logger;
+
+     public:
+      MemoryManager *manager;
+      unsigned n_bufs_allocated;
+      explicit MemPoolContext(MemoryManager *m) :
+        perf_logger(nullptr),
+        manager(m),
+        n_bufs_allocated(0) {}
+      // true if it is possible to alloc
+      // more memory for the pool
+      bool can_alloc(unsigned nbufs);
+      void update_stats(int val);
+      void set_stat_logger(PerfCounters *logger);
+    };
+
+    // Allocator plugged into boost::pool.  Because boost calls the static
+    // malloc()/free() without any user argument, the MemPoolContext to use
+    // is published through the static g_ctx for the duration of a
+    // with_context() call.
+    class PoolAllocator {
+      struct mem_info {
+        ibv_mr   *mr;
+        MemPoolContext *ctx;
+        unsigned nbufs;
+        Chunk    chunks[0];
+      };
+     public:
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+
+      static char * malloc(const size_type bytes);
+      static void free(char * const block);
+
+      // Run `func` with `ctx` installed as the allocator context.  The
+      // allocator lock is held for the whole call and the context is reset
+      // to nullptr on the way out, whichever way `func` exits.
+      template<typename Func>
+      static std::invoke_result_t<Func> with_context(MemPoolContext* ctx,
+                                                     Func&& func) {
+        std::lock_guard l{get_lock()};
+        g_ctx = ctx;
+        scope_guard reset_ctx{[] { g_ctx = nullptr; }};
+        // `func` is a forwarding reference: forward it so that an lvalue
+        // callable is invoked as an lvalue instead of being moved from
+        return std::forward<Func>(func)();
+      }
+    private:
+      static ceph::mutex& get_lock();
+      static MemPoolContext* g_ctx;
+    };
+
+    /**
+     * boost::pool specialisation that carries a 'context' usable in a
+     * thread safe way while memory is allocated or freed.  This allows
+     * different pool configurations and bookkeeping per CephContext, and
+     * lets the same allocator serve both the RX and the TX pool.
+     * TODO: use boost pool to allocate TX chunks too
+     */
+    class mem_pool : public boost::pool<PoolAllocator> {
+     private:
+      MemPoolContext *ctx;
+      void *slow_malloc();
+
+     public:
+      ceph::mutex lock = ceph::make_mutex("mem_pool_lock");
+
+      explicit mem_pool(MemPoolContext *ctx,
+                        const size_type nrequested_size,
+                        const size_type nnext_size = 32,
+                        const size_type nmax_size = 0)
+        : pool(nrequested_size, nnext_size, nmax_size),
+          ctx(ctx) { }
+
+      // Hand out one element: straight from the free list when it has one,
+      // otherwise via the slow path that has to allocate more memory.
+      void *malloc() {
+        if (store().empty()) {
+          return slow_malloc();
+        }
+        return (store().malloc)();
+      }
+    };
+
+    MemoryManager(CephContext *c, Device *d, ProtectionDomain *p);
+    ~MemoryManager();
+
+    // Raw allocation helpers; the huge-page variants are used when
+    // ms_async_rdma_enable_hugepage is set.
+    void* malloc(size_t size);
+    void  free(void *ptr);
+
+    // TX side: all of these forward to the `send` cluster, which exists
+    // only after create_tx_pool() has been called.
+    void create_tx_pool(uint32_t size, uint32_t tx_num);
+    void return_tx(std::vector<Chunk*> &chunks);
+    int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
+    bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
+    bool is_valid_chunk(const Chunk* c) { return send->is_valid_chunk(c); }
+    Chunk *get_tx_chunk_by_buffer(const char *c) {
+      return send->get_chunk_by_buffer(c);
+    }
+    uint32_t get_tx_buffer_size() const {
+      return send->buffer_size;
+    }
+
+    // RX side: take one chunk from the RX pool under the pool lock.
+    // The result is whatever mem_pool::malloc() returns, reinterpreted as a
+    // Chunk; callers check it against nullptr.
+    Chunk *get_rx_buffer() {
+       std::lock_guard l{rxbuf_pool.lock};
+       return reinterpret_cast<Chunk *>(rxbuf_pool.malloc());
+    }
+
+    // Detach the chunk from its queue pair and return it to the RX pool,
+    // under the pool lock.
+    void release_rx_buffer(Chunk *chunk) {
+      std::lock_guard l{rxbuf_pool.lock};
+      chunk->clear_qp();
+      rxbuf_pool.free(chunk);
+    }
+
+    void set_rx_stat_logger(PerfCounters *logger) {
+      rxbuf_pool_ctx.set_stat_logger(logger);
+    }
+
+    CephContext  *cct;
+   private:
+    // TODO: Cluster -> TxPool txbuf_pool
+    // chunk layout fix
+    //  
+    Cluster* send = nullptr;// SEND
+    Device *device;
+    ProtectionDomain *pd;
+    // NOTE(review): rxbuf_pool's constructor takes a MemPoolContext*, so
+    // rxbuf_pool_ctx is presumably meant to be declared (and hence
+    // constructed) before rxbuf_pool -- keep this order.
+    MemPoolContext rxbuf_pool_ctx;
+    mem_pool     rxbuf_pool;
+
+
+    void* huge_pages_malloc(size_t size);
+    void  huge_pages_free(void *ptr);
+  };
+
+ private:
+  // Queue lengths are computed in init() from the device limits and the
+  // configuration.
+  uint32_t tx_queue_len = 0;
+  uint32_t rx_queue_len = 0;
+  uint32_t max_sge = 0;
+  uint8_t  ib_physical_port = 0;
+  MemoryManager* memory_manager = nullptr;
+  ibv_srq* srq = nullptr;             // shared receive work queue
+  Device *device = NULL;
+  ProtectionDomain *pd = NULL;
+  DeviceList *device_list = nullptr;
+  CephContext *cct;
+  // serialises init()
+  ceph::mutex lock = ceph::make_mutex("IB lock");
+  // set by init(); the destructor does nothing while this is false
+  bool initialized = false;
+  // Reference bound in the constructor to the configuration value
+  // ms_async_rdma_device_name.
+  // NOTE(review): assumes the configuration object outlives this instance.
+  const std::string &device_name;
+  uint8_t port_num;
+  bool support_srq = false;
+
+ public:
+  explicit Infiniband(CephContext *c);
+  ~Infiniband();
+  // Opens the device and creates the shared resources; repeated calls
+  // return immediately.
+  void init();
+  // Process-wide prerequisite checks; also invoked from the constructor
+  // when they have not run yet.
+  static void verify_prereq(CephContext *cct);
+
+  // Wrapper around a verbs completion event channel.  A completion queue
+  // is attached with bind_cq(); get_fd() exposes the channel's file
+  // descriptor.
+  // NOTE(review): init(), get_cq_event() and ack_events() are defined
+  // elsewhere; how MAX_ACK_EVENT and cq_events_that_need_ack interact is
+  // not visible here.
+  class CompletionChannel {
+    static const uint32_t MAX_ACK_EVENT = 5000;
+    CephContext *cct;
+    Infiniband& infiniband;
+    ibv_comp_channel *channel;
+    ibv_cq *cq;
+    uint32_t cq_events_that_need_ack;
+
+   public:
+    CompletionChannel(CephContext *c, Infiniband &ib);
+    ~CompletionChannel();
+    int init();
+    bool get_cq_event();
+    int get_fd() { return channel->fd; }
+    ibv_comp_channel* get_channel() { return channel; }
+    void bind_cq(ibv_cq *c) { cq = c; }
+    void ack_events();
+  };
+
+  // this class encapsulates the creation, use, and destruction of an RC
+  // completion queue.
+  //
+  // You need to call init and it will create a cq and associate to comp channel
+  class CompletionQueue {
+   public:
+    // The constructor only stores its arguments; `cq` stays NULL until
+    // init() is called.
+    CompletionQueue(CephContext *c, Infiniband &ib,
+                    const uint32_t qd, CompletionChannel *cc)
+      : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
+    ~CompletionQueue();
+    int init();
+    int poll_cq(int num_entries, ibv_wc *ret_wc_array);
+
+    ibv_cq* get_cq() const { return cq; }
+    int rearm_notify(bool solicited_only=true);
+    CompletionChannel* get_cc() const { return channel; }
+   private:
+    CephContext *cct;
+    Infiniband&  infiniband;     // Infiniband to which this CQ belongs
+    CompletionChannel *channel;
+    ibv_cq *cq;
+    uint32_t queue_depth;
+  };
+
+  // this class encapsulates the creation, use, and destruction of an RC
+  // queue pair.
+  //
+  // you need call init and it will create a qp and bring it to the INIT state.
+  // after obtaining the lid, qpn, and psn of a remote queue pair, one
+  // must call plumb() to bring the queue pair to the RTS state.
+  class QueuePair {
+   public:
+    typedef MemoryManager::Chunk Chunk;
+    QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+              int ib_physical_port,  ibv_srq *srq,
+              Infiniband::CompletionQueue* txcq,
+              Infiniband::CompletionQueue* rxcq,
+              uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0);
+    ~QueuePair();
+
+    int modify_qp_to_error();
+    int modify_qp_to_rts();
+    int modify_qp_to_rtr();
+    int modify_qp_to_init();
+    int init();
+
+    /**
+     * Get the initial packet sequence number for this QueuePair.
+     * This is randomly generated on creation. It should not be confused
+     * with the remote side's PSN, which is set in #plumb().
+     */
+    uint32_t get_initial_psn() const { return initial_psn; };
+    /**
+     * Get the local queue pair number for this QueuePair.
+     * QPNs are analogous to UDP/TCP port numbers.
+     * Dereferences the verbs handle, so it requires a created qp.
+     */
+    uint32_t get_local_qp_number() const { return qp->qp_num; };
+    /**
+     * Get the remote queue pair number for this QueuePair, as set in #plumb().
+     * QPNs are analogous to UDP/TCP port numbers.
+     */
+    int get_remote_qp_number(uint32_t *rqp) const;
+    /**
+     * Get the remote infiniband address for this QueuePair, as set in #plumb().
+     * LIDs are "local IDs" in infiniband terminology. They are short, locally
+     * routable addresses.
+     */
+    int get_remote_lid(uint16_t *lid) const;
+    /**
+     * Get the state of a QueuePair.
+     */
+    int get_state() const;
+    /*
+     * send/receive connection management meta data
+     */
+    int send_cm_meta(CephContext *cct, int socket_fd);
+    int recv_cm_meta(CephContext *cct, int socket_fd);
+    void wire_gid_to_gid(const char *wgid, ib_cm_meta_t* cm_meta_data);
+    void gid_to_wire_gid(const ib_cm_meta_t& cm_meta_data, char wgid[]);
+    ibv_qp* get_qp() const { return qp; }
+    Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
+    Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
+    int to_dead();
+    bool is_dead() const { return dead; }
+    ib_cm_meta_t& get_peer_cm_meta() { return peer_cm_meta; }
+    ib_cm_meta_t& get_local_cm_meta() { return local_cm_meta; }
+    // Record a chunk posted to this queue pair's own receive queue.
+    // Does nothing when the queue pair uses a shared receive queue.
+    void add_rq_wr(Chunk* chunk)
+    {
+      if (srq) return;
+
+      std::lock_guard l{lock};
+      recv_queue.push_back(chunk);
+    }
+
+    // Forget a chunk previously recorded with add_rq_wr(); asserts that the
+    // chunk is present.  Does nothing when a shared receive queue is used.
+    // The lookup is a linear search over recv_queue.
+    void remove_rq_wr(Chunk* chunk) {
+      if (srq) return;
+
+      std::lock_guard l{lock};
+      auto it = std::find(recv_queue.begin(), recv_queue.end(), chunk);
+      ceph_assert(it != recv_queue.end());
+      recv_queue.erase(it);
+    }
+    ibv_srq* get_srq() const { return srq; }
+
+   private:
+    CephContext  *cct;
+    Infiniband&  infiniband;     // Infiniband to which this QP belongs
+    ibv_qp_type  type;           // QP type (IBV_QPT_RC, etc.)
+    ibv_context* ctxt;           // device context of the HCA to use
+    int ib_physical_port;
+    ibv_pd*      pd;             // protection domain
+    ibv_srq*     srq;            // shared receive queue
+    ibv_qp*      qp;             // infiniband verbs QP handle
+    struct rdma_cm_id *cm_id;
+    ib_cm_meta_t peer_cm_meta;
+    ib_cm_meta_t local_cm_meta;
+    Infiniband::CompletionQueue* txcq;
+    Infiniband::CompletionQueue* rxcq;
+    uint32_t     initial_psn;    // initial packet sequence number
+    uint32_t     max_send_wr;
+    uint32_t     max_recv_wr;
+    uint32_t     q_key;
+    bool dead;
+    // RX chunks currently posted to this QP's own receive queue
+    // (unused when an SRQ is in use); guarded by `lock`
+    std::vector<Chunk*> recv_queue;
+    ceph::mutex lock = ceph::make_mutex("queue_pair_lock");
+  };
+
+ public:
+  typedef MemoryManager::Cluster Cluster;
+  typedef MemoryManager::Chunk Chunk;
+  // Factory: returns NULL when the new queue pair fails to initialise.
+  QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*,
+      ibv_qp_type type, struct rdma_cm_id *cm_id);
+  ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
+  // post rx buffers to srq, return number of buffers actually posted
+  int post_chunks_to_rq(int num, QueuePair *qp = nullptr);
+  // Return an RX chunk to the pool, first removing it from the receive
+  // list of the queue pair it is attached to (if any).
+  void post_chunk_to_pool(Chunk* chunk) {
+    QueuePair *qp = chunk->get_qp();
+    if (qp != nullptr) {
+      qp->remove_rq_wr(chunk);
+    }
+    get_memory_manager()->release_rx_buffer(chunk);
+  }
+  int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
+  // Factories: both return NULL when the new object fails to initialise.
+  CompletionChannel *create_comp_channel(CephContext *c);
+  CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);
+  // The accessors below that go through `device` or `memory_manager`
+  // dereference pointers that are only set by init().
+  uint8_t get_ib_physical_port() { return ib_physical_port; }
+  uint16_t get_lid() { return device->get_lid(); }
+  ibv_gid get_gid() { return device->get_gid(); }
+  MemoryManager* get_memory_manager() { return memory_manager; }
+  Device* get_device() { return device; }
+  int get_async_fd() { return device->ctxt->async_fd; }
+  bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);}
+  Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
+  static const char* wc_status_to_string(int status);
+  static const char* qp_state_string(int status);
+  uint32_t get_rx_queue_len() const { return rx_queue_len; }
+};
+
+#endif
diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
new file mode 100644
index 000000000..5ab6c9b2e
--- /dev/null
+++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
@@ -0,0 +1,602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "RDMAStack.h"
+
+// Event callback that try_connect() registers on tcp_fd; it forwards to
+// RDMAConnectedSocketImpl::handle_connection_established() until close()d.
+class C_handle_connection_established : public EventCallback {
+  RDMAConnectedSocketImpl *csi;
+  bool active = true;  // cleared by close(); a disarmed callback does nothing
+ public:
+  C_handle_connection_established(RDMAConnectedSocketImpl *w) : csi(w) {}
+  void do_request(uint64_t fd) final {
+    if (!active) return;
+    csi->handle_connection_established();
+  }
+  void close() { active = false; }
+};
+
+// Event callback registered for readable events on tcp_fd; it forwards to
+// RDMAConnectedSocketImpl::handle_connection() until close()d.
+class C_handle_connection_read : public EventCallback {
+  RDMAConnectedSocketImpl *csi;
+  bool active = true;  // cleared by close(); a disarmed callback does nothing
+ public:
+  explicit C_handle_connection_read(RDMAConnectedSocketImpl *w): csi(w) {}
+  void do_request(uint64_t fd) final {
+    if (!active) return;
+    csi->handle_connection();
+  }
+  void close() { active = false; }
+};
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAConnectedSocketImpl "
+
+RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, std::shared_ptr<Infiniband> &ib,
+                                                 std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+                                                 RDMAWorker *w)
+  : cct(cct), connected(0), error(0), ib(ib),
+    dispatcher(rdma_dispatcher), worker(w),
+    is_server(false), read_handler(new C_handle_connection_read(this)),
+    established_handler(new C_handle_connection_established(this)),
+    active(false), pending(false)
+{ // Unless ms_async_rdma_cm is set, create and register the queue pair and the notification eventfd here.
+  if (!cct->_conf->ms_async_rdma_cm) {
+    qp = ib->create_queue_pair(cct, dispatcher->get_tx_cq(), dispatcher->get_rx_cq(), IBV_QPT_RC, NULL);
+    if (!qp) {
+      lderr(cct) << __func__ << " queue pair create failed" << dendl;
+      return;  // qp stays null; RDMAServerSocketImpl::accept() checks get_qp() for this case
+    }
+    local_qpn = qp->get_local_qp_number();
+    notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);  // NOTE(review): result is not checked here
+    dispatcher->register_qp(qp, this);
+    dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+    dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+  }
+}
+
+RDMAConnectedSocketImpl::~RDMAConnectedSocketImpl()
+{ // Tear down: detach handlers, retire the QP, give rx chunks back to the pool, close fds.
+  ldout(cct, 20) << __func__ << " destruct." << dendl;
+  cleanup();  // disarm and delete the handshake event handlers
+  worker->remove_pending_conn(this);
+  dispatcher->schedule_qp_destroy(local_qpn);
+
+  for (unsigned i=0; i < wc.size(); ++i) {  // completions that were queued but never read
+    dispatcher->post_chunk_to_pool(reinterpret_cast<Chunk*>(wc[i].wr_id));  // wr_id carries the Chunk pointer
+  }
+  for (unsigned i=0; i < buffers.size(); ++i) {  // chunks prefetched but not fully consumed
+    dispatcher->post_chunk_to_pool(buffers[i]);
+  }
+
+  std::lock_guard l{lock};
+  if (notify_fd >= 0)
+    ::close(notify_fd);
+  if (tcp_fd >= 0)
+    ::close(tcp_fd);
+  error = ECONNRESET;
+}
+
+void RDMAConnectedSocketImpl::pass_wc(std::vector<ibv_wc> &&v)
+{ // Queue the given work completions under `lock`, then signal notify_fd.
+  std::lock_guard guard{lock};
+  if (!wc.empty())
+    wc.insert(wc.end(), v.begin(), v.end());  // append behind what is already queued
+  else
+    wc = std::move(v);  // nothing queued yet: take over the caller's storage
+  notify();
+}
+
+void RDMAConnectedSocketImpl::get_wc(std::vector<ibv_wc> &w)
+{
+  // Swap the queued completions into `w`; `w` is untouched when none are queued.
+  std::lock_guard guard{lock};
+  if (!wc.empty())
+    wc.swap(w);
+}
+
+int RDMAConnectedSocketImpl::activate()
+{ // Move the queue pair to RTR then RTS using the peer's QP number; returns 0 on success, -1 on failure.
+  qp->get_local_cm_meta().peer_qpn = qp->get_peer_cm_meta().local_qpn;
+  if (qp->modify_qp_to_rtr() != 0)
+    return -1;
+
+  if (qp->modify_qp_to_rts() != 0)
+    return -1;
+
+  if (!is_server) {
+    connected = 1; // the client side counts as connected from this point
+    ldout(cct, 20) << __func__ << " handle fake send, wake it up. QP: " << local_qpn << dendl;
+    submit(false);  // flush data that send() queued in pending_bl before the connection was up
+  }
+  active = true;
+  peer_qpn = qp->get_local_cm_meta().peer_qpn;
+
+  return 0;
+}
+
+int RDMAConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+  ldout(cct, 20) << __func__ << " nonblock:" << opts.nonblock << ", nodelay:"
+                 << opts.nodelay << ", rbuf_size: " << opts.rcbuf_size << dendl;
+  ceph::NetHandler net(cct);
+  // Returns 0 once the handshake has been started, or a negative error code.
+  // we construct a socket to transport ib sync message
+  // but we shouldn't block in tcp connecting
+  if (opts.nonblock) {
+    tcp_fd = net.nonblock_connect(peer_addr, opts.connect_bind_addr);
+  } else {
+    tcp_fd = net.connect(peer_addr, opts.connect_bind_addr);
+  }
+
+  if (tcp_fd < 0) {
+    return -errno;
+  }
+
+  int r = net.set_socket_options(tcp_fd, opts.nodelay, opts.rcbuf_size);
+  if (r < 0) {
+    ::close(tcp_fd);
+    tcp_fd = -1;
+    return r;  // not -errno: ::close() above may already have overwritten errno
+  }
+
+  ldout(cct, 20) << __func__ << " tcp_fd: " << tcp_fd << dendl;
+  net.set_priority(tcp_fd, opts.priority, peer_addr.get_family());
+  r = 0;
+  if (opts.nonblock) {  // finish the handshake from the event loop once the socket is ready
+    worker->center.create_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE , established_handler);
+  } else {  // blocking connect already completed: send our handshake right away
+    r = handle_connection_established(false);
+  }
+  return r;
+}
+
+int RDMAConnectedSocketImpl::handle_connection_established(bool need_set_fault) {
+  ldout(cct, 20) << __func__ << " start " << dendl;  // client side: send the first handshake message over tcp_fd
+  // stop watching tcp_fd for the connect-completion (readable/writable) events
+  worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+  if (1 == connected) {  // already connected: this callback should not have fired
+    ldout(cct, 1) << __func__ << " warnning: logic failed " << dendl;
+    if (need_set_fault) {
+      fault();
+    }
+    return -1;
+  }
+  // send handshake msg to server
+  qp->get_local_cm_meta().peer_qpn = 0;  // the server's handle_connection() treats peer_qpn == 0 as the first message
+  int r = qp->send_cm_meta(cct, tcp_fd);
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " send handshake msg failed." << r << dendl;
+    if (need_set_fault) {
+      fault();
+    }
+    return r;
+  }
+  worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);  // now wait for the server's reply
+  ldout(cct, 20) << __func__ << " finish " << dendl;
+  return 0;
+}
+
+void RDMAConnectedSocketImpl::handle_connection() {  // read one handshake message from tcp_fd and advance the handshake
+  ldout(cct, 20) << __func__ << " QP: " << local_qpn << " tcp_fd: " << tcp_fd << " notify_fd: " << notify_fd << dendl;
+  int r = qp->recv_cm_meta(cct, tcp_fd);
+  if (r <= 0) {
+    if (r != -EAGAIN) {  // -EAGAIN is not treated as a failure: just return and wait for the next event
+      dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+      ldout(cct, 1) << __func__ << " recv handshake msg failed." << dendl;
+      fault();
+    }
+    return;
+  }
+
+  if (1 == connected) {  // a handshake message after the connection is up is unexpected
+    ldout(cct, 1) << __func__ << " warnning: logic failed: read len: " << r << dendl;
+    fault();
+    return;
+  }
+
+  if (!is_server) {// first time: cm meta sync + ack from server
+    if (!connected) {
+      r = activate();
+      ceph_assert(!r);
+    }
+    notify();
+    r = qp->send_cm_meta(cct, tcp_fd);  // client ack back to the server
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " send client ack failed." << dendl;
+      dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+      fault();
+    }
+  } else {
+    if (qp->get_peer_cm_meta().peer_qpn == 0) {// first time: cm meta sync from client
+      if (active) {
+        ldout(cct, 10) << __func__ << " server is already active." << dendl;
+        return ;
+      }
+      r = activate();
+      ceph_assert(!r);
+      r = qp->send_cm_meta(cct, tcp_fd);  // reply with the server's own cm meta
+      if (r < 0) {
+        ldout(cct, 1) << __func__ << " server ack failed." << dendl;
+        dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+        fault();
+        return ;
+      }
+    } else { // second time: cm meta ack from client
+      connected = 1;
+      ldout(cct, 10) << __func__ << " handshake of rdma is done. server connected: " << connected << dendl;
+      //cleanup();
+      submit(false);  // flush anything send() queued while the handshake was in progress
+      notify();
+    }
+  }
+}
+
+ssize_t RDMAConnectedSocketImpl::read(char* buf, size_t len)
+{ // Copy up to `len` received bytes into `buf`; returns the byte count, -EAGAIN when none, or -error.
+  eventfd_t event_val = 0;
+  int r = eventfd_read(notify_fd, &event_val);  // consume the pending notification; r is only logged
+  ldout(cct, 20) << __func__ << " notify_fd : " << event_val << " in " << local_qpn
+                 << " r = " << r << dendl;
+
+  if (!active) {
+    ldout(cct, 1) << __func__ << " when ib not active. len: " << len << dendl;
+    return -EAGAIN;
+  }
+
+  if (0 == connected) {
+    ldout(cct, 1) << __func__ << " when ib not connected. len: " << len <<dendl;
+    return -EAGAIN;
+  }
+  ssize_t read = 0;
+  read = read_buffers(buf,len);
+
+  if (is_server && connected == 0) {  // NOTE(review): looks unreachable, the connected == 0 case returned above
+    ldout(cct, 20) << __func__ << " we do not need last handshake, QP: " << local_qpn << " peer QP: " << peer_qpn << dendl;
+    connected = 1; //if so, we don't need the last handshake
+    cleanup();
+    submit(false);
+  }
+
+  if (!buffers.empty()) {  // data is left over: re-arm notify_fd
+    notify();
+  }
+
+  if (read == 0 && error)
+    return -error;
+  return read == 0 ? -EAGAIN : read;
+}
+
+void RDMAConnectedSocketImpl::buffer_prefetch(void)
+{ // Move all queued receive completions into `buffers` as readable chunks.
+  std::vector<ibv_wc> cqe;
+  get_wc(cqe);
+  if(cqe.empty())
+    return;
+
+  for(size_t i = 0; i < cqe.size(); ++i) {
+    ibv_wc* response = &cqe[i];
+    ceph_assert(response->status == IBV_WC_SUCCESS);
+    Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);  // wr_id carries the Chunk pointer
+    chunk->prepare_read(response->byte_len);
+
+    if (chunk->get_size() == 0) {  // empty chunk: counted as rx_fin and, once connected, taken as remote close
+      chunk->reset_read_chunk();
+      dispatcher->perf_logger->inc(l_msgr_rdma_rx_fin);
+      if (connected) {
+        error = ECONNRESET;
+        ldout(cct, 20) << __func__ << " got remote close msg..." << dendl;
+      }
+      dispatcher->post_chunk_to_pool(chunk);
+      continue;
+    } else {
+      buffers.push_back(chunk);
+      ldout(cct, 25) << __func__ << " buffers add a chunk: " << chunk->get_offset() << ":" << chunk->get_bound() << dendl;
+    }
+  }
+  worker->perf_logger->inc(l_msgr_rdma_rx_chunks, cqe.size());
+}
+
+ssize_t RDMAConnectedSocketImpl::read_buffers(char* buf, size_t len)
+{ // Drain prefetched chunks into `buf` until `len` bytes are copied or no chunk is left; returns bytes copied.
+  size_t read_size = 0, tmp = 0;
+  buffer_prefetch();
+  auto pchunk = buffers.begin();
+  while (pchunk != buffers.end()) {
+    tmp = (*pchunk)->read(buf + read_size, len - read_size);
+    read_size += tmp;
+    ldout(cct, 25) << __func__ << " read chunk " << *pchunk << " bytes length" << tmp << " offset: "
+                   << (*pchunk)->get_offset() << " ,bound: " << (*pchunk)->get_bound() << dendl;
+
+    if ((*pchunk)->get_size() == 0) {  // chunk fully consumed: recycle it and move on
+      (*pchunk)->reset_read_chunk();
+      dispatcher->post_chunk_to_pool(*pchunk);
+      update_post_backlog();
+      ldout(cct, 25) << __func__ << " read over one chunk " << dendl;
+      pchunk++;
+    }
+
+    if (read_size == len) {  // caller's buffer is full; a partly read chunk stays at pchunk
+      break;
+    }
+  }
+
+  buffers.erase(buffers.begin(), pchunk);  // drop only the chunks that were fully consumed
+  ldout(cct, 25) << __func__ << " got " << read_size  << " bytes, buffers size: " << buffers.size() << dendl;
+  worker->perf_logger->inc(l_msgr_rdma_rx_bytes, read_size);
+  return read_size;
+}
+
+ssize_t RDMAConnectedSocketImpl::send(ceph::buffer::list &bl, bool more)
+{ // Queue `bl` in pending_bl and try to submit it; returns bl's original length or a negative error.
+  if (error) {
+    if (!active)
+      return -EPIPE;
+    return -error;
+  }
+  size_t bytes = bl.length();
+  if (!bytes)
+    return 0;
+  {
+    std::lock_guard l{lock};
+    pending_bl.claim_append(bl);
+    if (!connected) {  // not connected yet: report success now, the data is submitted after the handshake
+      ldout(cct, 20) << __func__ << " fake send to upper, QP: " << local_qpn << dendl;
+      return bytes;
+    }
+  }
+  ldout(cct, 20) << __func__ << " QP: " << local_qpn << dendl;
+  ssize_t r = submit(more);
+  if (r < 0 && r != -EAGAIN)  // -EAGAIN from submit() is not reported: the data stays queued in pending_bl
+    return r;
+  return bytes;
+}
+
+size_t RDMAConnectedSocketImpl::tx_copy_chunk(std::vector<Chunk*> &tx_buffers,
+    size_t req_copy_len, decltype(std::cbegin(pending_bl.buffers()))& start,
+    const decltype(std::cbegin(pending_bl.buffers()))& end)
+{ // Copy the buffers in [start, end) into newly obtained tx chunks appended to tx_buffers; returns bytes copied.
+  ceph_assert(start != end);
+  auto chunk_idx = tx_buffers.size();  // index of the first chunk appended by get_reged_mem() below
+  if (0 == worker->get_reged_mem(this, tx_buffers, req_copy_len)) {
+    ldout(cct, 1) << __func__ << " no enough buffers in worker " << worker << dendl;
+    worker->perf_logger->inc(l_msgr_rdma_tx_no_mem);
+    return 0;
+  }
+
+  Chunk *current_chunk = tx_buffers[chunk_idx];
+  size_t write_len = 0;
+  while (start != end) {
+    const uintptr_t addr = reinterpret_cast<uintptr_t>(start->c_str());
+
+    size_t slice_write_len = 0;  // bytes of the current source buffer copied so far
+    while (slice_write_len < start->length()) {
+      size_t real_len = current_chunk->write((char*)addr + slice_write_len, start->length() - slice_write_len);
+
+      slice_write_len += real_len;
+      write_len += real_len;
+      req_copy_len -= real_len;
+
+      if (current_chunk->full()) {
+        if (++chunk_idx == tx_buffers.size())
+          return write_len;  // out of chunks: partial copy, `start` is left on the current buffer
+        current_chunk = tx_buffers[chunk_idx];
+      }
+    }
+
+    ++start;
+  }
+  ceph_assert(req_copy_len == 0);
+  return write_len;
+}
+
+ssize_t RDMAConnectedSocketImpl::submit(bool more)
+{ // Post as much of pending_bl as possible; returns 0 when all was posted, -EAGAIN when data remains, or another negative error.
+  if (error)
+    return -error;
+  std::lock_guard l{lock};
+  size_t bytes = pending_bl.length();
+  ldout(cct, 20) << __func__ << " we need " << bytes << " bytes. iov size: "
+                 << pending_bl.get_num_buffers() << dendl;
+  if (!bytes)
+    return 0;
+
+  std::vector<Chunk*> tx_buffers;
+  auto it = std::cbegin(pending_bl.buffers());
+  auto copy_start = it;  // first buffer of the current run that still has to be copied
+  size_t total_copied = 0, wait_copy_len = 0;  // wait_copy_len: bytes in [copy_start, it) awaiting a copy
+  while (it != pending_bl.buffers().end()) {
+    if (ib->is_tx_buffer(it->raw_c_str())) {  // buffer already lives in a tx chunk: no copy needed
+      if (wait_copy_len) {
+        size_t copied = tx_copy_chunk(tx_buffers, wait_copy_len, copy_start, it);
+        total_copied += copied;
+        if (copied < wait_copy_len)  // ran out of tx chunks: post what we have
+          goto sending;
+        wait_copy_len = 0;
+      }
+      ceph_assert(copy_start == it);
+      tx_buffers.push_back(ib->get_tx_chunk_by_buffer(it->raw_c_str()));
+      total_copied += it->length();
+      ++copy_start;
+    } else {
+      wait_copy_len += it->length();
+    }
+    ++it;
+  }
+  if (wait_copy_len)
+    total_copied += tx_copy_chunk(tx_buffers, wait_copy_len, copy_start, it);
+
+ sending:
+  if (total_copied == 0)
+    return -EAGAIN;
+  ceph_assert(total_copied <= pending_bl.length());
+  ceph::buffer::list swapped;
+  if (total_copied < pending_bl.length()) {  // keep only the tail that was not taken into tx chunks
+    worker->perf_logger->inc(l_msgr_rdma_tx_parital_mem);
+    pending_bl.splice(total_copied, pending_bl.length() - total_copied, &swapped);
+    pending_bl.swap(swapped);
+  } else {
+    pending_bl.clear();
+  }
+
+  ldout(cct, 20) << __func__ << " left bytes: " << pending_bl.length() << " in buffers "
+                 << pending_bl.get_num_buffers() << " tx chunks " << tx_buffers.size() << dendl;
+
+  int r = post_work_request(tx_buffers);
+  if (r < 0)
+    return r;
+
+  ldout(cct, 20) << __func__ << " finished sending " << total_copied << " bytes." << dendl;
+  return pending_bl.length() ? -EAGAIN : 0;
+}
+
+int RDMAConnectedSocketImpl::post_work_request(std::vector<Chunk*> &tx_buffers)
+{
+  // Post one signaled IBV_WR_SEND per tx chunk, chained into a single
+  // ibv_post_send() call.  Returns 0 on success or a negative error code.
+  if (tx_buffers.empty())
+    return 0;  // nothing to post; also keeps tx_buffers[0] below valid
+  ldout(cct, 20) << __func__ << " QP: " << local_qpn << " " << tx_buffers[0] << dendl;
+  ibv_sge isge[tx_buffers.size()];
+  ibv_send_wr iswr[tx_buffers.size()];
+  ibv_send_wr* pre_wr = NULL;
+  // FIPS zeroization audit 20191115: these memsets are not security related.
+  memset(iswr, 0, sizeof(iswr));
+  memset(isge, 0, sizeof(isge));
+
+  for (size_t i = 0; i < tx_buffers.size(); ++i) {
+    Chunk *chunk = tx_buffers[i];
+    isge[i].addr = reinterpret_cast<uint64_t>(chunk->buffer);
+    isge[i].length = chunk->get_offset();  // the chunk's current offset is the length to send
+    isge[i].lkey = chunk->mr->lkey;
+    ldout(cct, 25) << __func__ << " sending buffer: " << chunk << " length: " << isge[i].length  << dendl;
+
+    iswr[i].wr_id = reinterpret_cast<uint64_t>(chunk);  // lets the completion be mapped back to its chunk
+    iswr[i].next = NULL;
+    iswr[i].sg_list = &isge[i];
+    iswr[i].num_sge = 1;
+    iswr[i].opcode = IBV_WR_SEND;
+    iswr[i].send_flags = IBV_SEND_SIGNALED;
+
+    worker->perf_logger->inc(l_msgr_rdma_tx_bytes, isge[i].length);
+    if (pre_wr)
+      pre_wr->next = &iswr[i];  // link behind the previous work request
+    pre_wr = &iswr[i];
+  }
+
+  ibv_send_wr *bad_tx_work_request = nullptr;
+  // ibv_post_send() reports the failure reason through its return value;
+  // use that instead of reading errno after logging may have changed it.
+  const int rc = ibv_post_send(qp->get_qp(), iswr, &bad_tx_work_request);
+  if (rc) {
+    const int err = rc < 0 ? -rc : rc;  // tolerate providers that return a negative code
+    ldout(cct, 1) << __func__ << " failed to send data"
+                  << " (most probably should be peer not ready): "
+                  << cpp_strerror(err) << dendl;
+    worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+    return -err;
+  }
+  worker->perf_logger->inc(l_msgr_rdma_tx_chunks, tx_buffers.size());
+  ldout(cct, 20) << __func__ << " qp state is " << get_qp_state() << dendl;
+  return 0;
+}
+
+void RDMAConnectedSocketImpl::fin() {
+  // Post a zero-length signaled send; buffer_prefetch() treats an empty chunk as the peer closing.
+  ibv_send_wr wr;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&wr, 0, sizeof(wr));
+  wr.wr_id = reinterpret_cast<uint64_t>(qp);  // tagged with the QP pointer, not with a Chunk
+  wr.num_sge = 0;
+  wr.opcode = IBV_WR_SEND;
+  wr.send_flags = IBV_SEND_SIGNALED;
+  ibv_send_wr* bad_tx_work_request = nullptr;
+  const int rc = ibv_post_send(qp->get_qp(), &wr, &bad_tx_work_request);
+  if (rc) {  // the return value carries the failure reason; errno may be stale
+    ldout(cct, 1) << __func__ << " failed to send message="
+                  << " ibv_post_send failed(most probably should be peer not ready): "
+                  << cpp_strerror(rc < 0 ? -rc : rc) << dendl;
+    worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+  }
+}
+
+void RDMAConnectedSocketImpl::cleanup() {  // disarm and free both handshake event handlers
+  if (read_handler && tcp_fd >= 0) {
+    (static_cast<C_handle_connection_read*>(read_handler))->close();  // disarm first so a late event becomes a no-op
+    worker->center.submit_to(worker->center.get_id(), [this]() {
+      worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+    }, false);  // NOTE(review): presumably `false` makes submit_to wait for the lambda before the delete below - confirm
+    delete read_handler;
+    read_handler = nullptr;
+  }
+  if (established_handler) {
+    (static_cast<C_handle_connection_established*>(established_handler))->close();
+    delete established_handler;
+    established_handler = nullptr;
+  }
+}
+
+void RDMAConnectedSocketImpl::notify()
+{
+  // Add one to the notify_fd eventfd counter; the write is asserted to succeed.
+  const int rc = eventfd_write(notify_fd, eventfd_t{1});
+  ceph_assert(rc == 0);
+}
+
+void RDMAConnectedSocketImpl::shutdown()
+{
+  // Send the fin message unless an error is already recorded, then mark the socket reset and inactive.
+  if (error == 0) fin();
+  error = ECONNRESET;
+  active = false;
+}
+
+void RDMAConnectedSocketImpl::close()
+{
+  // Same steps as shutdown(): fin unless an error is already recorded, then mark reset and inactive.
+  if (error == 0) fin();
+  error = ECONNRESET;
+  active = false;
+}
+
+void RDMAConnectedSocketImpl::fault()
+{ // Record a connection reset and signal notify_fd.
+  ldout(cct, 1) << __func__ << " tcp fd " << tcp_fd << dendl;
+  error = ECONNRESET;
+  connected = 1;  // read() returns -EAGAIN while connected == 0; with 1 it can reach its error return
+  notify();
+}
+
+void RDMAConnectedSocketImpl::set_accept_fd(int sd)
+{ // Adopt an accepted TCP socket as the server-side handshake channel and watch it for reads.
+  tcp_fd = sd;
+  is_server = true;
+  worker->center.submit_to(worker->center.get_id(), [this]() {
+			   worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+			   }, true);  // NOTE(review): presumably `true` dispatches without waiting - confirm against EventCenter
+}
+
+void RDMAConnectedSocketImpl::post_chunks_to_rq(int num)
+{ // Try to post `num` rx chunks for this QP; the shortfall is added to post_backlog.
+  post_backlog += num - ib->post_chunks_to_rq(num, qp);  // post_chunks_to_rq returns how many were actually posted
+}
+
+void RDMAConnectedSocketImpl::update_post_backlog()
+{ // Retry the backlog recorded by post_chunks_to_rq(); only the still-unposted count remains.
+  if (post_backlog)  // subtract what got posted (the old `-= backlog - posted` left backlog == posted)
+    post_backlog -= dispatcher->post_chunks_to_rq(post_backlog, qp);
+}
diff --git a/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
new file mode 100644
index 000000000..606dbd281
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
@@ -0,0 +1,183 @@
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPConnectedSocketImpl "
+
+#define TIMEOUT_MS 3000
+#define RETRY_COUNT 7
+
+RDMAIWARPConnectedSocketImpl::RDMAIWARPConnectedSocketImpl(CephContext *cct, std::shared_ptr<Infiniband>& ib,
+                                                           std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+                                                           RDMAWorker *w, RDMACMInfo *info)
+  : RDMAConnectedSocketImpl(cct, ib, rdma_dispatcher, w), cm_con_handler(new C_handle_cm_connection(this))
+{ // With `info` this is the server side of an accepted request; without it, a client that creates its own cm id.
+  status = IDLE;
+  notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);  // NOTE(review): result is not checked here
+  if (info) {
+    is_server = true;
+    cm_id = info->cm_id;
+    cm_channel = info->cm_channel;
+    status = RDMA_ID_CREATED;
+    peer_qpn = info->qp_num;
+    if (alloc_resource()) {
+      close_notify();  // marks the socket closed so the destructor's wait on `closed` returns
+      return;
+    }
+    worker->center.submit_to(worker->center.get_id(), [this]() {
+      worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+      status = CHANNEL_FD_CREATED;
+    }, false);
+    status = RESOURCE_ALLOCATED;  // NOTE(review): also written by the lambda above; final value depends on ordering - confirm
+    qp->get_local_cm_meta().peer_qpn = peer_qpn;
+    qp->get_peer_cm_meta().local_qpn = peer_qpn;
+  } else {
+    is_server = false;
+    cm_channel = rdma_create_event_channel();  // NOTE(review): result is not checked
+    rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP);  // NOTE(review): return value is ignored
+    status = RDMA_ID_CREATED;
+    ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+  }
+}
+
+RDMAIWARPConnectedSocketImpl::~RDMAIWARPConnectedSocketImpl() {
+  ldout(cct, 20) << __func__ << " destruct." << dendl;
+  std::unique_lock l(close_mtx);
+  close_condition.wait(l, [&] { return closed; });  // blocks until close_notify() has set `closed`
+  if (status >= RDMA_ID_CREATED) {  // a cm id and channel exist from this state onward
+    rdma_destroy_id(cm_id);
+    rdma_destroy_event_channel(cm_channel);
+  }
+}
+
+int RDMAIWARPConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+  worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);  // CM events arrive through cm_con_handler
+  status = CHANNEL_FD_CREATED;
+  if (rdma_resolve_addr(cm_id, NULL, const_cast<struct sockaddr*>(peer_addr.get_sockaddr()), TIMEOUT_MS)) {
+    lderr(cct) << __func__ << " failed to resolve addr" << dendl;
+    return -1;
+  }
+  return 0;  // only address resolution was started; later steps run in handle_cm_connection()
+}
+
+void RDMAIWARPConnectedSocketImpl::close() {  // mark reset and inactive, disconnect if connected, then signal closure
+  error = ECONNRESET;
+  active = false;
+  if (status >= CONNECTED) {
+    rdma_disconnect(cm_id);
+  }
+  close_notify();  // sets `closed`, which the destructor waits on
+}
+
+void RDMAIWARPConnectedSocketImpl::shutdown() {  // only marks the socket reset and inactive
+  error = ECONNRESET;
+  active = false;
+}
+
+void RDMAIWARPConnectedSocketImpl::handle_cm_connection() {
+  // Take one event from the RDMA-CM channel and advance the connection
+  // state accordingly; the event is acknowledged before returning.
+  struct rdma_cm_event *event = nullptr;
+  if (rdma_get_cm_event(cm_channel, &event)) {
+    lderr(cct) << __func__ << " failed to get cm event: " << cpp_strerror(errno) << dendl;
+    return;  // `event` was never filled in, so there is nothing to read or ack
+  }
+  ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(event->event)
+                             << " (cm id: " << cm_id << ")" << dendl;
+  struct rdma_conn_param cm_params;
+  switch (event->event) {
+    case RDMA_CM_EVENT_ADDR_RESOLVED:
+      status = ADDR_RESOLVED;
+      if (rdma_resolve_route(cm_id, TIMEOUT_MS)) {
+        lderr(cct) << __func__ << " failed to resolve rdma addr" << dendl;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ROUTE_RESOLVED:
+      status = ROUTE_RESOLVED;
+      if (alloc_resource()) {
+        lderr(cct) << __func__ << " failed to alloc resource while resolving the route" << dendl;
+        connected = -ECONNREFUSED;
+        notify();
+        break;
+      }
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(&cm_params, 0, sizeof(cm_params));
+      cm_params.retry_count = RETRY_COUNT;
+      cm_params.qp_num = local_qpn;
+      if (rdma_connect(cm_id, &cm_params)) {
+        lderr(cct) << __func__ << " failed to connect remote rdma port" << dendl;
+        connected = -ECONNREFUSED;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ESTABLISHED:
+      ldout(cct, 20) << __func__ << " qp_num=" << cm_id->qp->qp_num << dendl;
+      status = CONNECTED;
+      if (!is_server) {
+        peer_qpn = event->param.conn.qp_num;
+        activate();
+        qp->get_local_cm_meta().peer_qpn = peer_qpn;
+        qp->get_peer_cm_meta().local_qpn = peer_qpn;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ADDR_ERROR:
+    case RDMA_CM_EVENT_ROUTE_ERROR:
+    case RDMA_CM_EVENT_CONNECT_ERROR:
+    case RDMA_CM_EVENT_UNREACHABLE:
+    case RDMA_CM_EVENT_REJECTED:
+      lderr(cct) << __func__ << " rdma connection rejected" << dendl;
+      connected = -ECONNREFUSED;
+      notify();
+      break;
+    case RDMA_CM_EVENT_DISCONNECTED:
+      status = DISCONNECTED;
+      close_notify();
+      if (!error) {
+        error = ECONNRESET;
+        notify();
+      }
+      break;
+    case RDMA_CM_EVENT_DEVICE_REMOVAL:
+      break;
+    default:
+      ceph_abort_msg("unhandled event");
+      break;
+  }
+  rdma_ack_cm_event(event);
+}
+
+void RDMAIWARPConnectedSocketImpl::activate() {  // only flips the flags; no QP state change is made here
+  ldout(cct, 30) << __func__ << dendl;
+  active = true;
+  connected = 1;
+}
+
+int RDMAIWARPConnectedSocketImpl::alloc_resource() {
+  // Create the RC queue pair for cm_id and register it; returns 0, or -1 when creation fails.
+  ldout(cct, 30) << __func__ << dendl;
+  qp = ib->create_queue_pair(cct, dispatcher->get_tx_cq(),
+                             dispatcher->get_rx_cq(), IBV_QPT_RC, cm_id);
+  if (qp == nullptr)
+    return -1;
+  local_qpn = qp->get_local_qp_number();
+  dispatcher->register_qp(qp, this);
+  dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+  dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+  return 0;
+}
+
+void RDMAIWARPConnectedSocketImpl::close_notify() {
+  // Stop watching the CM channel, then set `closed` and wake the destructor waiting on it.
+  ldout(cct, 30) << __func__ << dendl;
+  if (status >= CHANNEL_FD_CREATED)
+    worker->center.delete_file_event(cm_channel->fd, EVENT_READABLE);
+  std::unique_lock guard(close_mtx);
+  if (closed)
+    return;  // already signalled once
+  closed = true;
+  close_condition.notify_all();
+}
diff --git a/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
new file mode 100644
index 000000000..0500b4420
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
@@ -0,0 +1,119 @@
+#include <poll.h>
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPServerSocketImpl "
+
+RDMAIWARPServerSocketImpl::RDMAIWARPServerSocketImpl(
+  CephContext *cct, std::shared_ptr<Infiniband>& ib,
+  std::shared_ptr<RDMADispatcher>& rdma_dispatcher, RDMAWorker *w,
+  entity_addr_t& a, unsigned addr_slot)
+  : RDMAServerSocketImpl(cct, ib, rdma_dispatcher, w, a, addr_slot)
+{ // All state lives in the base class; the CM id and channel are created later in listen().
+}
+
+int RDMAIWARPServerSocketImpl::listen(entity_addr_t &sa,
+				      const SocketOptions &opt)
+{ // Create a CM id, bind it to `sa` and listen; returns 0 or a negative error code.
+  ldout(cct, 20) << __func__ << " bind to rdma point" << dendl;
+  cm_channel = rdma_create_event_channel();  // NOTE(review): result is not checked
+  rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP);  // NOTE(review): return value is ignored
+  ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+  int rc = rdma_bind_addr(cm_id, const_cast<struct sockaddr*>(sa.get_sockaddr()));
+  if (rc < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    goto err;
+  }
+  rc = rdma_listen(cm_id, 128);  // backlog is fixed at 128
+  if (rc < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to listen to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    goto err;
+  }
+  server_setup_socket = cm_channel->fd;  // the CM channel fd is what accept() polls
+  rc = net.set_nonblock(server_setup_socket);
+  if (rc < 0) {
+    goto err;
+  }
+  ldout(cct, 20) << __func__ << " fd of cm_channel is " << server_setup_socket << dendl;
+  return 0;
+
+err:
+  server_setup_socket = -1;  // abort_accept() tests this before destroying the id and channel
+  rdma_destroy_id(cm_id);
+  rdma_destroy_event_channel(cm_channel);
+  return rc;
+}
+
+int RDMAIWARPServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt,
+    entity_addr_t *out, Worker *w)
+{
+  // Accept one pending RDMA-CM request: returns 0 with *sock and *out filled
+  // in, -EAGAIN when nothing is pending, or another negative error code.
+  ldout(cct, 15) << __func__ << dendl;
+  ceph_assert(sock);
+  struct pollfd pfd = {
+    .fd = cm_channel->fd,
+    .events = POLLIN,
+    .revents = 0,
+  };
+  int ret = poll(&pfd, 1, 0);
+  ceph_assert(ret >= 0);
+  if (!ret)
+    return -EAGAIN;
+  struct rdma_cm_event *cm_event = nullptr;
+  if (rdma_get_cm_event(cm_channel, &cm_event))
+    return -errno;  // no event was returned, so cm_event must not be touched
+  ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(cm_event->event) << dendl;
+  struct rdma_cm_id *event_cm_id = cm_event->id;
+  struct rdma_event_channel *event_channel = rdma_create_event_channel();
+  // Return set_nonblock()'s own result: errno may be overwritten by the
+  // logging and cleanup calls below.
+  const int rc = net.set_nonblock(event_channel->fd);
+  if (rc < 0) {
+    lderr(cct) << __func__ << " failed to switch event channel to non-block, close event channel " << dendl;
+    rdma_destroy_event_channel(event_channel);
+    rdma_ack_cm_event(cm_event);
+    return rc;
+  }
+  rdma_migrate_id(event_cm_id, event_channel);
+
+  struct rdma_conn_param *remote_conn_param = &cm_event->param.conn;
+  struct rdma_conn_param local_conn_param;
+  RDMACMInfo info(event_cm_id, event_channel, remote_conn_param->qp_num);
+  RDMAIWARPConnectedSocketImpl* server =
+    new RDMAIWARPConnectedSocketImpl(cct, ib, dispatcher, dynamic_cast<RDMAWorker*>(w), &info);
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&local_conn_param, 0, sizeof(local_conn_param));
+  local_conn_param.qp_num = server->get_local_qpn();
+
+  if (rdma_accept(event_cm_id, &local_conn_param)) {
+    // Every event obtained from rdma_get_cm_event() must be acknowledged.
+    // NOTE(review): `server` is still leaked on this path - confirm a safe teardown.
+    rdma_ack_cm_event(cm_event);
+    return -EAGAIN;
+  }
+  server->activate();
+  ldout(cct, 20) << __func__ << " accepted a new QP" << dendl;
+  rdma_ack_cm_event(cm_event);
+
+  std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+  *sock = ConnectedSocket(std::move(csi));
+  struct sockaddr *addr = &event_cm_id->route.addr.dst_addr;
+  out->set_sockaddr(addr);
+  return 0;
+}
+
+void RDMAIWARPServerSocketImpl::abort_accept()
+{ // Destroy the listening CM id and its event channel, at most once.
+  if (server_setup_socket < 0) return;  // listen() failed or was never called
+  rdma_destroy_id(cm_id);
+  rdma_destroy_event_channel(cm_channel);
+  server_setup_socket = -1;  // a repeated call must not destroy them again
+}
diff --git a/src/msg/async/rdma/RDMAServerSocketImpl.cc b/src/msg/async/rdma/RDMAServerSocketImpl.cc
new file mode 100644
index 000000000..665faa931
--- /dev/null
+++ b/src/msg/async/rdma/RDMAServerSocketImpl.cc
@@ -0,0 +1,135 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAServerSocketImpl "
+
+RDMAServerSocketImpl::RDMAServerSocketImpl(
+  CephContext *cct, std::shared_ptr<Infiniband>& ib,
+  std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+  RDMAWorker *w, entity_addr_t& a, unsigned slot)
+  : ServerSocketImpl(a.get_type(), slot),
+    cct(cct), net(cct), server_setup_socket(-1), ib(ib),
+    dispatcher(rdma_dispatcher), worker(w), sa(a)
+{ // Only stores its arguments; server_setup_socket stays -1 until listen() succeeds.
+}
+
+int RDMAServerSocketImpl::listen(entity_addr_t &sa, const SocketOptions &opt)
+{ // Create, configure, bind and listen on the TCP socket used for the RDMA handshake; returns 0 or a negative error.
+  int rc = 0;
+  server_setup_socket = net.create_socket(sa.get_family(), true);
+  if (server_setup_socket < 0) {
+    rc = -errno;
+    lderr(cct) << __func__ << " failed to create server socket: "
+               << cpp_strerror(errno) << dendl;
+    return rc;
+  }
+
+  rc = net.set_nonblock(server_setup_socket);
+  if (rc < 0) {
+    goto err;
+  }
+
+  rc = net.set_socket_options(server_setup_socket, opt.nodelay, opt.rcbuf_size);
+  if (rc < 0) {
+    goto err;
+  }
+
+  rc = ::bind(server_setup_socket, sa.get_sockaddr(), sa.get_sockaddr_len());
+  if (rc < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    goto err;
+  }
+
+  rc = ::listen(server_setup_socket, cct->_conf->ms_tcp_listen_backlog);
+  if (rc < 0) {
+    rc = -errno;
+    lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(errno) << dendl;
+    goto err;
+  }
+
+  ldout(cct, 20) << __func__ << " bind to " << sa.get_sockaddr() << " on port " << sa.get_port()  << dendl;
+  return 0;
+
+err:  // common failure exit: rc already holds the negative error code
+  ::close(server_setup_socket);
+  server_setup_socket = -1;
+  return rc;
+}
+
+int RDMAServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w)
+{
+  ldout(cct, 15) << __func__ << dendl;
+  // Accept one TCP handshake connection and wrap it in a new RDMA socket.
+  ceph_assert(sock);
+
+  sockaddr_storage ss;
+  socklen_t slen = sizeof(ss);
+  int sd = accept_cloexec(server_setup_socket, (sockaddr*)&ss, &slen);
+  if (sd < 0) {
+    return -errno;
+  }
+
+  int r = net.set_nonblock(sd);
+  if (r < 0) {
+    ::close(sd);
+    return r;  // not -errno: ::close() above may already have overwritten errno
+  }
+
+  r = net.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    ::close(sd);
+    return r;  // same reasoning as above
+  }
+
+  ceph_assert(NULL != out); //out should not be NULL in accept connection
+
+  out->set_type(addr_type);
+  out->set_sockaddr((sockaddr*)&ss);
+  net.set_priority(sd, opt.priority, out->get_family());
+
+  RDMAConnectedSocketImpl* server;
+  //Worker* w = dispatcher->get_stack()->get_worker();
+  server = new RDMAConnectedSocketImpl(cct, ib, dispatcher, dynamic_cast<RDMAWorker*>(w));
+  if (!server->get_qp()) {
+    lderr(cct) << __func__ << " server->qp is null" << dendl;
+    // can't use delete server here, destructor will fail.
+    server->cleanup();
+    ::close(sd);
+    return -1;
+  }
+  server->set_accept_fd(sd);  // the socket takes over sd and starts watching it for the handshake
+  ldout(cct, 20) << __func__ << " accepted a new QP, tcp_fd: " << sd << dendl;
+  std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+  *sock = ConnectedSocket(std::move(csi));
+
+  return 0;
+}
+
+void RDMAServerSocketImpl::abort_accept()
+{ // Close the listening handshake socket, at most once.
+  if (server_setup_socket >= 0) ::close(server_setup_socket);
+  server_setup_socket = -1;  // a repeated call must not close an fd that may have been reused
+}
diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc
new file mode 100644
index 000000000..49bafd0b4
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.cc
@@ -0,0 +1,813 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <poll.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "include/str_list.h"
+#include "include/compat.h"
+#include "common/Cycles.h"
+#include "common/deleter.h"
+#include "common/Tub.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "RDMAStack "
+
+// Stops the polling thread, then asserts that no queue pair is still
+// registered or waiting for destruction.
+RDMADispatcher::~RDMADispatcher()
+{
+  ldout(cct, 20) << __func__ << " destructing rdma dispatcher" << dendl;
+  // Sets `done` and joins the polling thread if it was started.
+  polling_stop();
+
+  // Every QP must have been unregistered and destroyed by now.
+  ceph_assert(qp_conns.empty());
+  ceph_assert(num_qp_conn == 0);
+  ceph_assert(dead_queue_pairs.empty());
+}
+
+/**
+ * Build the dispatcher and register its perf counters.
+ *
+ * Only counters are set up here; completion channels/queues and the polling
+ * thread are created later by polling_start().
+ *
+ * @param c   Ceph context used for logging, configuration and perf counters
+ * @param ib  shared Infiniband device wrapper
+ */
+RDMADispatcher::RDMADispatcher(CephContext* c, std::shared_ptr<Infiniband>& ib)
+  : cct(c), ib(ib)
+{
+  PerfCountersBuilder plb(cct, "AsyncMessenger::RDMADispatcher", l_msgr_rdma_dispatcher_first, l_msgr_rdma_dispatcher_last);
+
+  plb.add_u64_counter(l_msgr_rdma_polling, "polling", "Whether dispatcher thread is polling");
+  plb.add_u64_counter(l_msgr_rdma_inflight_tx_chunks, "inflight_tx_chunks", "The number of inflight tx chunks");
+  plb.add_u64_counter(l_msgr_rdma_rx_bufs_in_use, "rx_bufs_in_use", "The number of rx buffers that are holding data and being processed");
+  plb.add_u64_counter(l_msgr_rdma_rx_bufs_total, "rx_bufs_total", "The total number of rx buffers");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc, "tx_total_wc", "The number of tx work completions");
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc_errors, "tx_total_wc_errors", "The number of tx errors");
+  plb.add_u64_counter(l_msgr_rdma_tx_wc_retry_errors, "tx_retry_errors", "The number of tx retry errors");
+  plb.add_u64_counter(l_msgr_rdma_tx_wc_wr_flush_errors, "tx_wr_flush_errors", "The number of tx work request flush errors");
+
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc, "rx_total_wc", "The number of total rx work completion");
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc_errors, "rx_total_wc_errors", "The number of total rx error work completion");
+  plb.add_u64_counter(l_msgr_rdma_rx_fin, "rx_fin", "The number of rx finish work request");
+
+  plb.add_u64_counter(l_msgr_rdma_total_async_events, "total_async_events", "The number of async events");
+  plb.add_u64_counter(l_msgr_rdma_async_last_wqe_events, "async_last_wqe_events", "The number of last wqe events");
+
+  plb.add_u64_counter(l_msgr_rdma_handshake_errors, "handshake_errors", "The number of handshake errors");
+
+  // The descriptions of these two counters were previously swapped.
+  plb.add_u64_counter(l_msgr_rdma_created_queue_pair, "created_queue_pair", "Created queue pair number");
+  plb.add_u64_counter(l_msgr_rdma_active_queue_pair, "active_queue_pair", "Active queue pair number");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+  Cycles::init();
+}
+
+// Lazily create the completion channels/queues and launch the polling
+// thread. Calling it again while the thread is running is a no-op.
+void RDMADispatcher::polling_start()
+{
+  // listen/connect can reach this point from different worker threads
+  std::lock_guard guard{lock};
+
+  if (!t.joinable()) {
+    // dispatcher thread not running yet: set everything up
+    ib->get_memory_manager()->set_rx_stat_logger(perf_logger);
+
+    tx_cc = ib->create_comp_channel(cct);
+    ceph_assert(tx_cc);
+    rx_cc = ib->create_comp_channel(cct);
+    ceph_assert(rx_cc);
+    tx_cq = ib->create_comp_queue(cct, tx_cc);
+    ceph_assert(tx_cq);
+    rx_cq = ib->create_comp_queue(cct, rx_cc);
+    ceph_assert(rx_cq);
+
+    t = std::thread(&RDMADispatcher::polling, this);
+    ceph_pthread_setname(t.native_handle(), "rdma-polling");
+  }
+}
+
+// Ask the polling thread to finish, wait for it, then release the
+// completion queues and channels. Does nothing beyond setting `done`
+// when the thread was never started (or was already joined).
+void RDMADispatcher::polling_stop()
+{
+  {
+    std::lock_guard guard{lock};
+    done = true;
+  }
+
+  if (t.joinable()) {
+    t.join();
+
+    // acknowledge outstanding CQ events before tearing the objects down
+    tx_cc->ack_events();
+    rx_cc->ack_events();
+    delete tx_cq;
+    delete rx_cq;
+    delete tx_cc;
+    delete rx_cc;
+  }
+}
+
+/**
+ * Drain and handle all pending asynchronous verbs events of the device.
+ *
+ * Loops on ibv_get_async_event() until it fails (EAGAIN is the normal
+ * "nothing left" case and is not logged). Each event is counted, logged,
+ * handled according to its type and finally acknowledged with
+ * ibv_ack_async_event().
+ */
+void RDMADispatcher::handle_async_event()
+{
+  ldout(cct, 30) << __func__ << dendl;
+  while (1) {
+    ibv_async_event async_event;
+    if (ibv_get_async_event(ib->get_device()->ctxt, &async_event)) {
+      if (errno != EAGAIN)
+       lderr(cct) << __func__ << " ibv_get_async_event failed. (errno=" << errno
+                  << " " << cpp_strerror(errno) << ")" << dendl;
+      return;
+    }
+    perf_logger->inc(l_msgr_rdma_total_async_events);
+    ldout(cct, 1) << __func__ << "Event : " << ibv_event_type_str(async_event.event_type) << dendl;
+
+    switch (async_event.event_type) {
+      /***********************CQ events********************/
+      case IBV_EVENT_CQ_ERR:
+        lderr(cct) << __func__ << " Fatal Error, effect all QP bound with same CQ, "
+                   << " CQ Overflow, dev = " << ib->get_device()->ctxt
+                   << " Need destroy and recreate resource " << dendl;
+        break;
+      /***********************QP events********************/
+      case IBV_EVENT_QP_FATAL:
+        {
+          /* Error occurred on a QP and it transitioned to error state */
+          ibv_qp* ib_qp = async_event.element.qp;
+          uint32_t qpn = ib_qp->qp_num;
+          // get_qp() returns nullptr when the QP is neither registered nor
+          // queued for destruction, so it must not be dereferenced blindly.
+          QueuePair* qp = get_qp(qpn);
+          lderr(cct) << __func__ << " Fatal Error, event associate qp number: " << qpn
+                     << " Queue Pair status: "
+                     << (qp ? Infiniband::qp_state_string(qp->get_state()) : "unknown")
+                     << " Event : " << ibv_event_type_str(async_event.event_type) << dendl;
+        }
+        break;
+      case IBV_EVENT_QP_LAST_WQE_REACHED:
+        {
+          /*
+           * 1. The QP bound with SRQ is in IBV_QPS_ERR state & no more WQE on the RQ of the QP
+           *    Reason: QP is force switched into Error before posting Beacon WR.
+           *            The QP's WRs will be flushed into CQ with IBV_WC_WR_FLUSH_ERR status
+           *            For SRQ, only WRs on the QP which is switched into Error status will be flushed.
+           *    Handle: Only confirm that qp enter into dead queue pairs
+           * 2. The CQE with error was generated for the last WQE
+           *    Handle: output error log
+           */
+          perf_logger->inc(l_msgr_rdma_async_last_wqe_events);
+          ibv_qp* ib_qp = async_event.element.qp;
+          uint32_t qpn = ib_qp->qp_num;
+          std::lock_guard l{lock};
+          RDMAConnectedSocketImpl *conn = get_conn_lockless(qpn);
+          QueuePair* qp = get_qp_lockless(qpn);
+
+          if (qp && !qp->is_dead()) {
+            lderr(cct) << __func__ << " QP not dead, event associate qp number: " << qpn
+                       << " Queue Pair status: " << Infiniband::qp_state_string(qp->get_state())
+                       << " Event : " << ibv_event_type_str(async_event.event_type) << dendl;
+          }
+          if (!conn) {
+            ldout(cct, 20) << __func__ << " Connection's QP maybe entered into dead status. "
+                           << " qp number: " << qpn << dendl;
+          } else {
+             conn->fault();
+             if (qp) {
+                if (!cct->_conf->ms_async_rdma_cm) {
+                  enqueue_dead_qp_lockless(qpn);
+                }
+             }
+          }
+        }
+        break;
+      case IBV_EVENT_QP_REQ_ERR:
+        /* Invalid Request Local Work Queue Error */
+        [[fallthrough]];
+      case IBV_EVENT_QP_ACCESS_ERR:
+        /* Local access violation error */
+        [[fallthrough]];
+      case IBV_EVENT_COMM_EST:
+        /* Communication was established on a QP */
+        [[fallthrough]];
+      case IBV_EVENT_SQ_DRAINED:
+        /* Send Queue was drained of outstanding messages in progress */
+        [[fallthrough]];
+      case IBV_EVENT_PATH_MIG:
+        /* A connection has migrated to the alternate path */
+        [[fallthrough]];
+      case IBV_EVENT_PATH_MIG_ERR:
+        /* A connection failed to migrate to the alternate path */
+        break;
+      /***********************SRQ events*******************/
+      case IBV_EVENT_SRQ_ERR:
+        /* Error occurred on an SRQ */
+        [[fallthrough]];
+      case IBV_EVENT_SRQ_LIMIT_REACHED:
+        /* SRQ limit was reached */
+        break;
+      /***********************Port events******************/
+      case IBV_EVENT_PORT_ACTIVE:
+        /* Link became active on a port */
+        [[fallthrough]];
+      case IBV_EVENT_PORT_ERR:
+        /* Link became unavailable on a port */
+        [[fallthrough]];
+      case IBV_EVENT_LID_CHANGE:
+        /* LID was changed on a port */
+        [[fallthrough]];
+      case IBV_EVENT_PKEY_CHANGE:
+        /* P_Key table was changed on a port */
+        [[fallthrough]];
+      case IBV_EVENT_SM_CHANGE:
+        /* SM was changed on a port */
+        [[fallthrough]];
+      case IBV_EVENT_CLIENT_REREGISTER:
+        /* SM sent a CLIENT_REREGISTER request to a port */
+        [[fallthrough]];
+      case IBV_EVENT_GID_CHANGE:
+        /* GID table was changed on a port */
+        break;
+
+      /***********************CA events******************/
+      //CA events:
+      case IBV_EVENT_DEVICE_FATAL:
+        /* CA is in FATAL state */
+        lderr(cct) << __func__ << " ibv_get_async_event: dev = " << ib->get_device()->ctxt
+                   << " evt: " << ibv_event_type_str(async_event.event_type) << dendl;
+        break;
+      default:
+        lderr(cct) << __func__ << " ibv_get_async_event: dev = " << ib->get_device()->ctxt
+                   << " unknown event: " << async_event.event_type << dendl;
+        break;
+    }
+    // every successfully fetched event must be acknowledged
+    ibv_ack_async_event(&async_event);
+  }
+}
+
+// Hand a chunk back via Infiniband::post_chunk_to_pool() while holding the
+// dispatcher lock, and decrement the "rx_bufs_in_use" perf counter.
+void RDMADispatcher::post_chunk_to_pool(Chunk* chunk)
+{
+  std::lock_guard l{lock};
+  ib->post_chunk_to_pool(chunk);
+  perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+}
+
+// Forward to Infiniband::post_chunks_to_rq(num, qp) while holding the
+// dispatcher lock; the result of that call is returned unchanged.
+int RDMADispatcher::post_chunks_to_rq(int num, QueuePair *qp)
+{
+  std::lock_guard l{lock};
+  return ib->post_chunks_to_rq(num, qp);
+}
+
+/**
+ * Body of the dispatcher thread.
+ *
+ * Busy-polls the tx and rx completion queues and dispatches completions to
+ * handle_tx_event()/handle_rx_event(). When both queues are idle it destroys
+ * dead queue pairs, checks the exit condition, and - once idle for longer
+ * than ms_async_rdma_polling_us - handles async events, re-arms CQ
+ * notification and blocks in poll() on the completion channels.
+ *
+ * The loop exits when `done` is set and no QP is registered or pending
+ * destruction.
+ */
+void RDMADispatcher::polling()
+{
+  // constexpr so that `wc` is a real fixed-size array, not a VLA extension
+  static constexpr int MAX_COMPLETIONS = 32;
+  ibv_wc wc[MAX_COMPLETIONS];
+
+  ldout(cct, 20) << __func__ << " going to poll tx cq: " << tx_cq << " rx cq: " << rx_cq << dendl;
+  uint64_t last_inactive = Cycles::rdtsc();
+  bool rearmed = false;
+  int r = 0;
+
+  while (true) {
+    int tx_ret = tx_cq->poll_cq(MAX_COMPLETIONS, wc);
+    if (tx_ret > 0) {
+      ldout(cct, 20) << __func__ << " tx completion queue got " << tx_ret
+                     << " responses."<< dendl;
+      handle_tx_event(wc, tx_ret);
+    }
+
+    int rx_ret = rx_cq->poll_cq(MAX_COMPLETIONS, wc);
+    if (rx_ret > 0) {
+      ldout(cct, 20) << __func__ << " rx completion queue got " << rx_ret
+                     << " responses."<< dendl;
+      handle_rx_event(wc, rx_ret);
+    }
+
+    if (!tx_ret && !rx_ret) {
+      perf_logger->set(l_msgr_rdma_inflight_tx_chunks, inflight);
+      //
+      // Clean up dead QPs when rx/tx CQs are in idle. The thing is that
+      // we can destroy QPs even earlier, just when beacon has been received,
+      // but we have two CQs (rx & tx), thus beacon WC can be popped from tx
+      // CQ before other WCs are fully consumed from rx CQ. For safety, we
+      // wait for beacon and then "no-events" from CQs.
+      //
+      // Calling empty() on the vector without the lock is used only as a
+      // hint (accuracy is not important here); the swap below is locked.
+      //
+      if (!dead_queue_pairs.empty()) {
+        decltype(dead_queue_pairs) dead_qps;
+        {
+          std::lock_guard l{lock};
+          dead_queue_pairs.swap(dead_qps);
+        }
+
+        for (auto& qp: dead_qps) {
+          perf_logger->dec(l_msgr_rdma_active_queue_pair);
+          ldout(cct, 10) << __func__ << " finally delete qp = " << qp << dendl;
+          delete qp;
+        }
+      }
+
+      if (!num_qp_conn && done && dead_queue_pairs.empty())
+        break;
+
+      uint64_t now = Cycles::rdtsc();
+      if (Cycles::to_microseconds(now - last_inactive) > cct->_conf->ms_async_rdma_polling_us) {
+        handle_async_event();
+        if (!rearmed) {
+          // Clean up cq events after rearm notify ensure no new incoming event
+          // arrived between polling and rearm
+          tx_cq->rearm_notify();
+          rx_cq->rearm_notify();
+          rearmed = true;
+          continue;
+        }
+
+        struct pollfd channel_poll[2];
+        channel_poll[0].fd = tx_cc->get_fd();
+        channel_poll[0].events = POLLIN;
+        channel_poll[0].revents = 0;
+        channel_poll[1].fd = rx_cc->get_fd();
+        channel_poll[1].events = POLLIN;
+        channel_poll[1].revents = 0;
+        r = 0;
+        perf_logger->set(l_msgr_rdma_polling, 0);
+        // wake up every 100ms to re-check `done`
+        while (!done && r == 0) {
+          r = TEMP_FAILURE_RETRY(poll(channel_poll, 2, 100));
+          if (r < 0) {
+            r = -errno;
+            lderr(cct) << __func__ << " poll failed " << r << dendl;
+            ceph_abort();
+          }
+        }
+        if (r > 0 && tx_cc->get_cq_event())
+          ldout(cct, 20) << __func__ << " got tx cq event." << dendl;
+        if (r > 0 && rx_cc->get_cq_event())
+          ldout(cct, 20) << __func__ << " got rx cq event." << dendl;
+        last_inactive = Cycles::rdtsc();
+        perf_logger->set(l_msgr_rdma_polling, 1);
+        rearmed = false;
+      }
+    }
+  }
+}
+
+// Pop at most one worker from the pending list (under w_lock) and notify it
+// outside the lock. Returns immediately when no worker is pending.
+void RDMADispatcher::notify_pending_workers() {
+  if (!num_pending_workers)
+    return;
+
+  RDMAWorker *next = nullptr;
+  {
+    std::lock_guard guard{w_lock};
+    if (!pending_workers.empty()) {
+      next = pending_workers.front();
+      pending_workers.pop_front();
+      --num_pending_workers;
+    }
+  }
+
+  if (next)
+    next->notify_worker();
+}
+
+// Record the (QP, connected socket) pair under the QP's local number.
+// Registering the same QP number twice trips an assertion.
+void RDMADispatcher::register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi)
+{
+  std::lock_guard guard{lock};
+  const auto qpn = qp->get_local_qp_number();
+  ceph_assert(!qp_conns.count(qpn));
+  qp_conns[qpn] = std::make_pair(qp, csi);
+  ++num_qp_conn;
+}
+
+// Look up the connected socket registered for QP number `qp`.
+// Returns nullptr when the QP is not registered or is already dead.
+// Takes no lock itself.
+RDMAConnectedSocketImpl* RDMADispatcher::get_conn_lockless(uint32_t qp)
+{
+  const auto entry = qp_conns.find(qp);
+  if (entry == qp_conns.end() || entry->second.first->is_dead())
+    return nullptr;
+  return entry->second.second;
+}
+
+// Find the QueuePair with local number `qp`, looking first at the registered
+// QPs and then at those queued for destruction. Returns nullptr if neither
+// holds it. Takes no lock itself.
+Infiniband::QueuePair* RDMADispatcher::get_qp_lockless(uint32_t qp)
+{
+  // registered QPs first
+  const auto entry = qp_conns.find(qp);
+  if (entry != qp_conns.end()) {
+    return entry->second.first;
+  }
+
+  // then the ones waiting to be destroyed
+  for (auto *dead : dead_queue_pairs) {
+    if (dead->get_local_qp_number() == qp) {
+      return dead;
+    }
+  }
+
+  return nullptr;
+}
+
+// Locked wrapper around get_qp_lockless(): returns the QueuePair with local
+// number `qp`, or nullptr if it is neither registered nor pending
+// destruction. The lock is released before returning.
+Infiniband::QueuePair* RDMADispatcher::get_qp(uint32_t qp)
+{
+  std::lock_guard l{lock};
+  return get_qp_lockless(qp);
+}
+
+// Move the QP registered under `qpn` from qp_conns to dead_queue_pairs so
+// the polling thread destroys it later. Logs an error and does nothing when
+// the QP is not registered. Takes no lock itself.
+void RDMADispatcher::enqueue_dead_qp_lockless(uint32_t qpn)
+{
+  const auto entry = qp_conns.find(qpn);
+  if (entry != qp_conns.end()) {
+    dead_queue_pairs.push_back(entry->second.first);
+    qp_conns.erase(entry);
+    --num_qp_conn;
+    return;
+  }
+  lderr(cct) << __func__ << " QP [" << qpn << "] is not registered." << dendl;
+}
+
+// Locked wrapper around enqueue_dead_qp_lockless(); handle_tx_event() calls
+// it when a beacon completion arrives.
+void RDMADispatcher::enqueue_dead_qp(uint32_t qpn)
+{
+  std::lock_guard l{lock};
+  enqueue_dead_qp_lockless(qpn);
+}
+
+// Start tearing down the QP registered under `qpn`.
+//
+// If QueuePair::to_dead() succeeds (returns 0) the map entry is kept and
+// only its socket pointer is cleared, so get_conn_lockless() returns null.
+// If it fails, the QP is moved straight to the dead list.
+void RDMADispatcher::schedule_qp_destroy(uint32_t qpn)
+{
+  std::lock_guard guard{lock};
+
+  const auto entry = qp_conns.find(qpn);
+  if (entry == qp_conns.end()) {
+    lderr(cct) << __func__ << " QP [" << qpn << "] is not registered." << dendl;
+    return;
+  }
+
+  QueuePair *victim = entry->second.first;
+  if (!victim->to_dead()) {
+    // Switched to dead: keep the entry, drop only the socket pointer.
+    entry->second.second = nullptr;
+    return;
+  }
+
+  // Failed to switch to dead. This is abnormal, but nothing more can be
+  // done, so just queue the QP for destruction.
+  dead_queue_pairs.push_back(victim);
+  qp_conns.erase(entry);
+  --num_qp_conn;
+}
+
+/**
+ * Process a batch of tx work completions.
+ *
+ * Beacon completions enqueue their QP for destruction. Failed completions
+ * are counted and logged, and the connection is faulted for statuses other
+ * than retry-exceeded and flush. Completed tx chunks are collected and
+ * returned through post_tx_buffer().
+ *
+ * @param cqe  array of work completions polled from the tx CQ
+ * @param n    number of valid entries in `cqe`
+ */
+void RDMADispatcher::handle_tx_event(ibv_wc *cqe, int n)
+{
+  std::vector<Chunk*> tx_chunks;
+
+  for (int i = 0; i < n; ++i) {
+    ibv_wc* response = &cqe[i];
+
+    // If it's beacon WR, enqueue the QP to be destroyed later
+    if (response->wr_id == BEACON_WRID) {
+      enqueue_dead_qp(response->qp_num);
+      continue;
+    }
+
+    ldout(cct, 20) << __func__ << " QP number: " << response->qp_num << " len: " << response->byte_len
+                   << " status: " << ib->wc_status_to_string(response->status) << dendl;
+
+    if (response->status != IBV_WC_SUCCESS) {
+      // Count every failed tx completion, mirroring what handle_rx_event()
+      // does for rx_total_wc_errors; the counter was registered but never
+      // incremented before.
+      perf_logger->inc(l_msgr_rdma_tx_total_wc_errors);
+
+      switch(response->status) {
+        case IBV_WC_RETRY_EXC_ERR:
+          {
+            perf_logger->inc(l_msgr_rdma_tx_wc_retry_errors);
+
+            ldout(cct, 1) << __func__ << " Responder ACK timeout, possible disconnect, or Remote QP in bad state "
+                          << " WCE status(" << response->status << "): " << ib->wc_status_to_string(response->status)
+                          << " WCE QP number " << response->qp_num << " Opcode " << response->opcode
+                          << " wr_id: 0x" << std::hex << response->wr_id << std::dec << dendl;
+
+            std::lock_guard l{lock};
+            RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+            if (conn) {
+              ldout(cct, 1) << __func__ << " SQ WR return error, remote Queue Pair, qp number: "
+                            << conn->get_peer_qpn() << dendl;
+            }
+          }
+          break;
+        case IBV_WC_WR_FLUSH_ERR:
+          {
+            perf_logger->inc(l_msgr_rdma_tx_wc_wr_flush_errors);
+
+            std::lock_guard l{lock};
+            QueuePair *qp = get_qp_lockless(response->qp_num);
+            if (qp) {
+              ldout(cct, 20) << __func__ << " qp state is " << Infiniband::qp_state_string(qp->get_state()) << dendl;
+            }
+            if (qp && qp->is_dead()) {
+              ldout(cct, 20) << __func__ << " outstanding SQ WR is flushed into CQ since QueuePair is dead " << dendl;
+            } else {
+              lderr(cct) << __func__ << " Invalid/Unsupported request to consume outstanding SQ WR,"
+                         << " WCE status(" << response->status << "): " << ib->wc_status_to_string(response->status)
+                         << " WCE QP number " << response->qp_num << " Opcode " << response->opcode
+                         << " wr_id: 0x" << std::hex << response->wr_id << std::dec << dendl;
+
+              RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+              if (conn) {
+                ldout(cct, 1) << __func__ << " SQ WR return error, remote Queue Pair, qp number: "
+                              << conn->get_peer_qpn() << dendl;
+              }
+            }
+          }
+          break;
+
+        default:
+          {
+            lderr(cct) << __func__ << " SQ WR return error,"
+                       << " WCE status(" << response->status << "): " << ib->wc_status_to_string(response->status)
+                       << " WCE QP number " << response->qp_num << " Opcode " << response->opcode
+                       << " wr_id: 0x" << std::hex << response->wr_id << std::dec << dendl;
+
+            std::lock_guard l{lock};
+            RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+            if (conn && conn->is_connected()) {
+              ldout(cct, 20) << __func__ << " SQ WR return error Queue Pair error state is : " << conn->get_qp_state()
+                             << " remote Queue Pair, qp number: " << conn->get_peer_qpn() << dendl;
+              conn->fault();
+            } else {
+              ldout(cct, 1) << __func__ << " Disconnected, qp_num = " << response->qp_num << " discard event" << dendl;
+            }
+          }
+          break;
+      }
+    }
+
+    auto chunk = reinterpret_cast<Chunk *>(response->wr_id);
+    //TX completion may come either from
+    // 1) regular send message, WCE wr_id points to chunk
+    // 2) 'fin' message, wr_id points to the QP
+    if (ib->get_memory_manager()->is_valid_chunk(chunk)) {
+      tx_chunks.push_back(chunk);
+    } else if (reinterpret_cast<QueuePair*>(response->wr_id)->get_local_qp_number() == response->qp_num ) {
+      ldout(cct, 1) << __func__ << " sending of the disconnect msg completed" << dendl;
+    } else {
+      ldout(cct, 1) << __func__ << " not tx buffer, chunk " << chunk << dendl;
+      ceph_abort();
+    }
+  }
+
+  perf_logger->inc(l_msgr_rdma_tx_total_wc, n);
+  post_tx_buffer(tx_chunks);
+}
+
+/**
+ * Return completed tx Chunks to the memory manager.
+ *
+ * Decrements `inflight` by the number of chunks, hands them back through
+ * MemoryManager::return_tx() and then calls notify_pending_workers().
+ * Does nothing when `chunks` is empty.
+ *
+ * \param[in] chunks
+ *      The Chunks to give back.
+ *
+ * This function returns nothing.
+ */
+void RDMADispatcher::post_tx_buffer(std::vector<Chunk*> &chunks)
+{
+  if (chunks.empty())
+    return ;
+
+  inflight -= chunks.size();
+  ib->get_memory_manager()->return_tx(chunks);
+  ldout(cct, 30) << __func__ << " release " << chunks.size()
+                 << " chunks, inflight " << inflight << dendl;
+  notify_pending_workers();
+}
+
+/**
+ * Process a batch of rx work completions.
+ *
+ * Successful completions are grouped per connected socket and delivered
+ * with pass_wc() at the end; completions whose socket is gone, and all
+ * failed completions, give their chunk straight back to the pool.
+ * The dispatcher lock is held for the whole call.
+ *
+ * \param cqe        array of work completions polled from the rx CQ
+ * \param rx_number  number of valid entries in `cqe`
+ */
+void RDMADispatcher::handle_rx_event(ibv_wc *cqe, int rx_number)
+{
+  // All completions are counted as "in use" up front; each path that returns
+  // its chunk to the pool below decrements the counter again.
+  perf_logger->inc(l_msgr_rdma_rx_total_wc, rx_number);
+  perf_logger->inc(l_msgr_rdma_rx_bufs_in_use, rx_number);
+
+  std::map<RDMAConnectedSocketImpl*, std::vector<ibv_wc> > polled;
+  std::lock_guard l{lock};//make sure connected socket alive when pass wc
+
+  for (int i = 0; i < rx_number; ++i) {
+    ibv_wc* response = &cqe[i];
+    // For rx completions wr_id carries the Chunk pointer.
+    Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+    RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+    QueuePair *qp = get_qp_lockless(response->qp_num);
+
+    switch (response->status) {
+      case IBV_WC_SUCCESS:
+        ceph_assert(response->opcode == IBV_WC_RECV);
+        if (!conn) {
+          // No live socket for this QP: nobody will consume the data.
+          ldout(cct, 1) << __func__ << " csi with qpn " << response->qp_num << " may be dead. chunk 0x"
+                        << std::hex << chunk << " will be back." << std::dec << dendl;
+          ib->post_chunk_to_pool(chunk);
+          perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+        } else {
+          // Post one receive buffer to replace the one just consumed.
+          conn->post_chunks_to_rq(1);
+          polled[conn].push_back(*response);
+
+          // Without an SRQ the QP tracks its own posted receive WRs.
+          if (qp != nullptr && !qp->get_srq()) {
+            qp->remove_rq_wr(chunk);
+            chunk->clear_qp();
+          }
+        }
+        break;
+
+      case IBV_WC_WR_FLUSH_ERR:
+        perf_logger->inc(l_msgr_rdma_rx_total_wc_errors);
+
+        if (qp) {
+          ldout(cct, 20) << __func__ << " qp state is " << Infiniband::qp_state_string(qp->get_state()) << dendl;
+        }
+        if (qp && qp->is_dead()) {
+          // Expected: WRs are flushed once the QP has been moved to dead.
+          ldout(cct, 20) << __func__ << " outstanding RQ WR is flushed into CQ since QueuePair is dead " << dendl;
+        } else {
+          ldout(cct, 1) << __func__ << " RQ WR return error,"
+                     << " WCE status(" << response->status << "): " << ib->wc_status_to_string(response->status)
+                     << " WCE QP number " << response->qp_num << " Opcode " << response->opcode
+                     << " wr_id: 0x" << std::hex << response->wr_id << std::dec << dendl;
+          if (conn) {
+            ldout(cct, 1) << __func__ << " RQ WR return error, remote Queue Pair, qp number: "
+                       << conn->get_peer_qpn() << dendl;
+          }
+        }
+
+        ib->post_chunk_to_pool(chunk);
+        perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+        break;
+
+      default:
+        perf_logger->inc(l_msgr_rdma_rx_total_wc_errors);
+
+        ldout(cct, 1) << __func__ << " RQ WR return error,"
+                      << " WCE status(" << response->status << "): " << ib->wc_status_to_string(response->status)
+                      << " WCE QP number " << response->qp_num << " Opcode " << response->opcode
+                      << " wr_id: 0x" << std::hex << response->wr_id << std::dec << dendl;
+        // Any other error faults the connection if it is still connected.
+        if (conn && conn->is_connected())
+          conn->fault();
+
+        ib->post_chunk_to_pool(chunk);
+        perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+        break;
+    }
+  }
+
+  // Deliver the successful completions, one batch per connected socket,
+  // still under the lock.
+  for (auto &i : polled)
+    i.first->pass_wc(std::move(i.second));
+  polled.clear();
+}
+
+/**
+ * Create an RDMA worker and register its per-worker perf counters under the
+ * name "AsyncMessenger::RDMAWorker-<id>".
+ *
+ * @param c          Ceph context
+ * @param worker_id  index of this worker, forwarded to the Worker base
+ */
+RDMAWorker::RDMAWorker(CephContext *c, unsigned worker_id)
+  : Worker(c, worker_id),
+    tx_handler(new C_handle_cq_tx(this))
+{
+  // initialize perf_logger
+  char name[128];
+  // bounded formatting: never writes past the end of `name`
+  snprintf(name, sizeof(name), "AsyncMessenger::RDMAWorker-%u", id);
+  PerfCountersBuilder plb(cct, name, l_msgr_rdma_first, l_msgr_rdma_last);
+
+  plb.add_u64_counter(l_msgr_rdma_tx_no_mem, "tx_no_mem", "The count of no tx buffer");
+  plb.add_u64_counter(l_msgr_rdma_tx_parital_mem, "tx_parital_mem", "The count of parital tx buffer");
+  plb.add_u64_counter(l_msgr_rdma_tx_failed, "tx_failed_post", "The number of tx failed posted");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_chunks, "tx_chunks", "The number of tx chunks transmitted");
+  plb.add_u64_counter(l_msgr_rdma_tx_bytes, "tx_bytes", "The bytes of tx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_rx_chunks, "rx_chunks", "The number of rx chunks transmitted");
+  plb.add_u64_counter(l_msgr_rdma_rx_bytes, "rx_bytes", "The bytes of rx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_pending_sent_conns, "pending_sent_conns", "The count of pending sent conns");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+}
+
+// Releases the tx completion handler allocated in the constructor.
+RDMAWorker::~RDMAWorker()
+{
+  delete tx_handler;
+}
+
+// Only checks that a dispatcher has been attached to this worker
+// (RDMAStack's constructor calls set_dispatcher() on every worker).
+void RDMAWorker::initialize()
+{
+  ceph_assert(dispatcher);
+}
+
+// Initialise the device, make sure the dispatcher thread runs, then create
+// and start a server socket (iWARP flavour when ms_async_rdma_type is
+// "iwarp"). On success *sock owns the new socket and 0 is returned;
+// otherwise the negative result of the socket's listen() is returned.
+int RDMAWorker::listen(entity_addr_t &sa, unsigned addr_slot,
+                       const SocketOptions &opt, ServerSocket *sock)
+{
+  ib->init();
+  dispatcher->polling_start();
+
+  std::unique_ptr<RDMAServerSocketImpl> impl;
+  if (cct->_conf->ms_async_rdma_type == "iwarp") {
+    impl.reset(new RDMAIWARPServerSocketImpl(cct, ib, dispatcher, this, sa, addr_slot));
+  } else {
+    impl.reset(new RDMAServerSocketImpl(cct, ib, dispatcher, this, sa, addr_slot));
+  }
+
+  const int rc = impl->listen(sa, opt);
+  if (rc < 0) {
+    // `impl` is destroyed on return
+    return rc;
+  }
+
+  *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(impl.release()));
+  return 0;
+}
+
+// Initialise the device, make sure the dispatcher thread runs, then create
+// a connected socket (iWARP flavour when ms_async_rdma_type is "iwarp") and
+// start connecting to `addr`. On success *socket owns the new socket and 0
+// is returned; otherwise the negative result of try_connect() is returned.
+int RDMAWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+  ib->init();
+  dispatcher->polling_start();
+
+  std::unique_ptr<RDMAConnectedSocketImpl> impl;
+  if (cct->_conf->ms_async_rdma_type == "iwarp") {
+    impl.reset(new RDMAIWARPConnectedSocketImpl(cct, ib, dispatcher, this));
+  } else {
+    impl.reset(new RDMAConnectedSocketImpl(cct, ib, dispatcher, this));
+  }
+
+  const int rc = impl->try_connect(addr, opts);
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " try connecting failed." << dendl;
+    // `impl` is destroyed on return
+    return rc;
+  }
+
+  *socket = ConnectedSocket(std::move(impl));
+  return 0;
+}
+
+// Reserve registered tx buffers for `bytes` bytes and append them to `c`.
+// Returns the number of chunks obtained. When fewer bytes than requested
+// could be reserved and `o` is given, `o` is queued as a pending sender and
+// this worker is registered as pending with the dispatcher.
+// Must be called from the worker's own event-center thread.
+int RDMAWorker::get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes)
+{
+  ceph_assert(center.in_thread());
+
+  int nchunks = ib->get_tx_buffers(c, bytes);
+  size_t reserved = ib->get_memory_manager()->get_tx_buffer_size() * nchunks;
+  ldout(cct, 30) << __func__ << " need " << bytes << " bytes, reserve " << reserved << " registered  bytes, inflight " << dispatcher->inflight << dendl;
+  dispatcher->inflight += nchunks;
+
+  const bool short_of_memory = reserved < bytes;
+  if (short_of_memory && o) {
+    if (!o->is_pending()) {
+      pending_sent_conns.push_back(o);
+      perf_logger->inc(l_msgr_rdma_pending_sent_conns, 1);
+      o->set_pending(1);
+    }
+    dispatcher->make_pending_worker(this);
+  }
+  return nchunks;
+}
+
+
+
+// Retry submission for every socket waiting for tx buffers. If a socket
+// still gets -EAGAIN it is re-queued, this worker is marked pending again
+// and processing stops. Any other error faults the socket. When the queue
+// has been drained, the dispatcher is asked to wake further workers.
+void RDMAWorker::handle_pending_message()
+{
+  ldout(cct, 20) << __func__ << " pending conns " << pending_sent_conns.size() << dendl;
+  while (!pending_sent_conns.empty()) {
+    RDMAConnectedSocketImpl *conn = pending_sent_conns.front();
+    pending_sent_conns.pop_front();
+
+    const ssize_t rc = conn->submit(false);
+    ldout(cct, 20) << __func__ << " sent pending bl socket=" << conn << " r=" << rc << dendl;
+
+    if (rc == -EAGAIN) {
+      // still no buffers: try again later
+      pending_sent_conns.push_back(conn);
+      dispatcher->make_pending_worker(this);
+      return ;
+    }
+    if (rc < 0) {
+      conn->fault();
+    }
+
+    conn->set_pending(0);
+    perf_logger->dec(l_msgr_rdma_pending_sent_conns, 1);
+  }
+  dispatcher->notify_pending_workers();
+}
+
+// Create the shared Infiniband object and the dispatcher, then attach both
+// to every worker of this stack.
+RDMAStack::RDMAStack(CephContext *cct)
+  : NetworkStack(cct), ib(std::make_shared<Infiniband>(cct)),
+    rdma_dispatcher(std::make_shared<RDMADispatcher>(cct, ib))
+{
+  ldout(cct, 20) << __func__ << " constructing RDMAStack..." << dendl;
+
+  const unsigned worker_count = get_num_worker();
+  for (unsigned idx = 0; idx != worker_count; ++idx) {
+    auto *rdma_worker = dynamic_cast<RDMAWorker*>(get_worker(idx));
+    rdma_worker->set_dispatcher(rdma_dispatcher);
+    rdma_worker->set_ib(ib);
+  }
+  ldout(cct, 20) << " creating RDMAStack:" << this << " with dispatcher:" << rdma_dispatcher.get() << dendl;
+}
+
+// When hugepage support is enabled (ms_async_rdma_enable_hugepage), removes
+// the RDMAV_HUGEPAGES_SAFE variable from the process environment.
+RDMAStack::~RDMAStack()
+{
+  if (cct->_conf->ms_async_rdma_enable_hugepage) {
+    unsetenv("RDMAV_HUGEPAGES_SAFE");	// remove the env variable on destruction
+  }
+}
+
+/**
+ * Start the thread for worker slot `i`, running `func`.
+ *
+ * The thread table is only ever grown: shrinking it would destroy the
+ * std::thread objects of higher slots, and destroying a joinable
+ * std::thread terminates the process.
+ *
+ * @param i     worker slot index
+ * @param func  thread body; moved into the new thread
+ */
+void RDMAStack::spawn_worker(unsigned i, std::function<void ()> &&func)
+{
+  if (threads.size() <= i)
+    threads.resize(i+1);
+  threads[i] = std::thread(std::move(func));
+}
+
+// Wait for the thread of worker slot `i` to finish. The slot must exist and
+// hold a joinable thread, otherwise the assertion fails.
+void RDMAStack::join_worker(unsigned i)
+{
+  ceph_assert(threads.size() > i && threads[i].joinable());
+  threads[i].join();
+}
diff --git a/src/msg/async/rdma/RDMAStack.h b/src/msg/async/rdma/RDMAStack.h
new file mode 100644
index 000000000..8389fee8c
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.h
@@ -0,0 +1,345 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_RDMASTACK_H
+#define CEPH_MSG_RDMASTACK_H
+
+#include <sys/eventfd.h>
+
+#include <list>
+#include <vector>
+#include <thread>
+
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "msg/async/Stack.h"
+#include "Infiniband.h"
+
+class RDMAConnectedSocketImpl;
+class RDMAServerSocketImpl;
+class RDMAStack;
+class RDMAWorker;
+
+// Dispatcher shared by the RDMA workers.  It holds the tx/rx completion queue
+// and completion channel pointers, a qp_num -> (QueuePair, connection) map,
+// a list of queue pairs awaiting deletion, and the list of workers that asked
+// to be notified (see make_pending_worker / notify_pending_workers).
+class RDMADispatcher {
+  typedef Infiniband::MemoryManager::Chunk Chunk;
+  typedef Infiniband::QueuePair QueuePair;
+
+  // NOTE(review): presumably the thread started by polling_start() -- confirm
+  // in RDMAStack.cc.
+  std::thread t;
+  CephContext *cct;
+  std::shared_ptr<Infiniband> ib;
+  Infiniband::CompletionQueue* tx_cq = nullptr;
+  Infiniband::CompletionQueue* rx_cq = nullptr;
+  Infiniband::CompletionChannel *tx_cc = nullptr, *rx_cc = nullptr;
+  bool done = false;
+  std::atomic<uint64_t> num_qp_conn = {0};
+  // protect `qp_conns`, `dead_queue_pairs`
+  ceph::mutex lock = ceph::make_mutex("RDMADispatcher::lock");
+  // qp_num -> InfRcConnection
+  // The main usage of `qp_conns` is looking up connection by qp_num,
+  // so the lifecycle of element in `qp_conns` is the lifecycle of qp.
+  //
+  // NOTE(review): the steps below describe how a queue pair is retired and
+  // mention a `qp` parameter, so they read like documentation for a teardown
+  // routine rather than for the `qp_conns` member -- confirm where they belong.
+  /**
+   * Making a queue pair dead:
+   * 1. Connection calls mark_down
+   * 2. Move the Queue Pair into the Error state (QueuePair::to_dead)
+   * 3. Post a beacon
+   * 4. Wait for the beacon, which indicates the queues are drained
+   * 5. Destroy the QP by calling ibv_destroy_qp()
+   *
+   * @param qp The queue pair to make dead
+   */
+  ceph::unordered_map<uint32_t, std::pair<QueuePair*, RDMAConnectedSocketImpl*> > qp_conns;
+
+  /// if a queue pair is closed when transmit buffers are active
+  /// on it, the transmit buffers never get returned via tx_cq.  To
+  /// work around this problem, don't delete queue pairs immediately. Instead,
+  /// save them in this vector and delete them at a safe time, when there are
+  /// no outstanding transmit buffers to be lost.
+  std::vector<QueuePair*> dead_queue_pairs;
+
+  // Number of entries in `pending_workers`; incremented by
+  // make_pending_worker() when a worker is added.
+  std::atomic<uint64_t> num_pending_workers = {0};
+  // protect pending workers
+  ceph::mutex w_lock =
+    ceph::make_mutex("RDMADispatcher::for worker pending list");
+  // fixme: lockfree
+  std::list<RDMAWorker*> pending_workers;
+  void enqueue_dead_qp_lockless(uint32_t qp);
+  void enqueue_dead_qp(uint32_t qpn);
+
+ public:
+  PerfCounters *perf_logger;
+
+  explicit RDMADispatcher(CephContext* c, std::shared_ptr<Infiniband>& ib);
+  virtual ~RDMADispatcher();
+  void handle_async_event();
+
+  void polling_start();
+  void polling_stop();
+  void polling();
+  void register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi);
+  // Add `w` to `pending_workers` under `w_lock` unless it is already listed;
+  // a newly added worker also bumps `num_pending_workers`.
+  void make_pending_worker(RDMAWorker* w) {
+    std::lock_guard l{w_lock};
+    // Linear scan keeps the list free of duplicates.
+    auto it = std::find(pending_workers.begin(), pending_workers.end(), w);
+    if (it != pending_workers.end())
+      return;
+    pending_workers.push_back(w);
+    ++num_pending_workers;
+  }
+  RDMAConnectedSocketImpl* get_conn_lockless(uint32_t qp);
+  QueuePair* get_qp_lockless(uint32_t qp);
+  QueuePair* get_qp(uint32_t qp);
+  void schedule_qp_destroy(uint32_t qp);
+  Infiniband::CompletionQueue* get_tx_cq() const { return tx_cq; }
+  Infiniband::CompletionQueue* get_rx_cq() const { return rx_cq; }
+  void notify_pending_workers();
+  void handle_tx_event(ibv_wc *cqe, int n);
+  void post_tx_buffer(std::vector<Chunk*> &chunks);
+  void handle_rx_event(ibv_wc *cqe, int rx_number);
+
+  std::atomic<uint64_t> inflight = {0};
+
+  void post_chunk_to_pool(Chunk* chunk);
+  int post_chunks_to_rq(int num, QueuePair *qp = nullptr);
+};
+
+// Worker specialisation for the RDMA stack.  Besides the listen/connect
+// overrides it keeps a list of connections with data still waiting to be
+// sent (`pending_sent_conns`) and a callback (`tx_handler`) that can be
+// dispatched to this worker's event center via notify_worker().
+class RDMAWorker : public Worker {
+  typedef Infiniband::CompletionQueue CompletionQueue;
+  typedef Infiniband::CompletionChannel CompletionChannel;
+  typedef Infiniband::MemoryManager::Chunk Chunk;
+  typedef Infiniband::MemoryManager MemoryManager;
+  typedef std::vector<Chunk*>::iterator ChunkIter;
+  std::shared_ptr<Infiniband> ib;
+  EventCallbackRef tx_handler;
+  // Connections queued for handle_pending_message().
+  std::list<RDMAConnectedSocketImpl*> pending_sent_conns;
+  std::shared_ptr<RDMADispatcher> dispatcher;
+  ceph::mutex lock = ceph::make_mutex("RDMAWorker::lock");
+
+  // Event callback that forwards to RDMAWorker::handle_pending_message().
+  class C_handle_cq_tx : public EventCallback {
+    RDMAWorker *worker;
+    public:
+    explicit C_handle_cq_tx(RDMAWorker *w): worker(w) {}
+    // The fd argument is ignored.
+    void do_request(uint64_t fd) {
+      worker->handle_pending_message();
+    }
+  };
+
+ public:
+  PerfCounters *perf_logger;
+  explicit RDMAWorker(CephContext *c, unsigned i);
+  virtual ~RDMAWorker();
+  virtual int listen(entity_addr_t &addr,
+		     unsigned addr_slot,
+		     const SocketOptions &opts, ServerSocket *) override;
+  virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+  virtual void initialize() override;
+  int get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes);
+  // Drop `o` from the pending list.  Must be called from this worker's event
+  // center thread (asserted).
+  void remove_pending_conn(RDMAConnectedSocketImpl *o) {
+    ceph_assert(center.in_thread());
+    pending_sent_conns.remove(o);
+  }
+  void handle_pending_message();
+  void set_dispatcher(std::shared_ptr<RDMADispatcher>& dispatcher) { this->dispatcher = dispatcher; }
+  void set_ib(std::shared_ptr<Infiniband> &ib) {this->ib = ib;}
+  // Queue `tx_handler` on this worker's event center from outside its thread.
+  void notify_worker() {
+    center.dispatch_event_external(tx_handler);
+  }
+};
+
+// Plain bundle of the RDMA connection-manager handles and the queue pair
+// number passed to RDMAIWARPConnectedSocketImpl's constructor.  The pointers
+// are stored as given; this struct does not release them.
+struct RDMACMInfo {
+  RDMACMInfo(rdma_cm_id *cid, rdma_event_channel *cm_channel_, uint32_t qp_num_)
+    : cm_id(cid), cm_channel(cm_channel_), qp_num(qp_num_) {}
+  rdma_cm_id *cm_id;              // connection-manager id
+  rdma_event_channel *cm_channel; // event channel associated with cm_id
+  uint32_t qp_num;                // queue pair number
+};
+
+// ConnectedSocketImpl backed by an Infiniband queue pair.  fd() exposes
+// `notify_fd`, and outgoing data that could not be sent yet is kept in
+// `pending_bl` (see submit()).
+class RDMAConnectedSocketImpl : public ConnectedSocketImpl {
+ public:
+  typedef Infiniband::MemoryManager::Chunk Chunk;
+  typedef Infiniband::CompletionChannel CompletionChannel;
+  typedef Infiniband::CompletionQueue CompletionQueue;
+
+ protected:
+  CephContext *cct;
+  Infiniband::QueuePair *qp;
+  uint32_t peer_qpn = 0;
+  uint32_t local_qpn = 0;
+  // NOTE(review): `connected` and `error` have no in-class initializer;
+  // presumably set by the constructor in RDMAConnectedSocketImpl.cc -- confirm.
+  int connected;
+  int error;
+  std::shared_ptr<Infiniband> ib;
+  std::shared_ptr<RDMADispatcher> dispatcher;
+  RDMAWorker* worker;
+  std::vector<Chunk*> buffers;
+  int notify_fd = -1;             // returned by fd()
+  ceph::buffer::list pending_bl;
+
+  ceph::mutex lock = ceph::make_mutex("RDMAConnectedSocketImpl::lock");
+  std::vector<ibv_wc> wc;
+  bool is_server;
+  EventCallbackRef read_handler;
+  EventCallbackRef established_handler;
+  int tcp_fd = -1;
+  bool active; // whether the qp is active
+  bool pending; // read/written through is_pending()/set_pending()
+  int post_backlog = 0;
+
+  void notify();
+  void buffer_prefetch(void);
+  ssize_t read_buffers(char* buf, size_t len);
+  int post_work_request(std::vector<Chunk*>&);
+  size_t tx_copy_chunk(std::vector<Chunk*> &tx_buffers, size_t req_copy_len,
+      decltype(std::cbegin(pending_bl.buffers()))& start,
+      const decltype(std::cbegin(pending_bl.buffers()))& end);
+
+ public:
+  RDMAConnectedSocketImpl(CephContext *cct, std::shared_ptr<Infiniband>& ib,
+			  std::shared_ptr<RDMADispatcher>& rdma_dispatcher, RDMAWorker *w);
+  virtual ~RDMAConnectedSocketImpl();
+
+  void pass_wc(std::vector<ibv_wc> &&v);
+  void get_wc(std::vector<ibv_wc> &w);
+  virtual int is_connected() override { return connected; }
+
+  virtual ssize_t read(char* buf, size_t len) override;
+  virtual ssize_t send(ceph::buffer::list &bl, bool more) override;
+  virtual void shutdown() override;
+  virtual void close() override;
+  virtual int fd() const override { return notify_fd; }
+  void fault();
+  // Dereferences `qp`, so it must only be called while a queue pair is set.
+  const char* get_qp_state() { return Infiniband::qp_state_string(qp->get_state()); }
+  uint32_t get_peer_qpn () const { return peer_qpn; }
+  uint32_t get_local_qpn () const { return local_qpn; }
+  Infiniband::QueuePair* get_qp () const { return qp; }
+  ssize_t submit(bool more);
+  int activate();
+  void fin();
+  void handle_connection();
+  int handle_connection_established(bool need_set_fault = true);
+  void cleanup();
+  void set_accept_fd(int sd);
+  virtual int try_connect(const entity_addr_t&, const SocketOptions &opt);
+  bool is_pending() {return pending;}
+  void set_pending(bool val) {pending = val;}
+  void post_chunks_to_rq(int num);
+  void update_post_backlog();
+};
+
+// States recorded in RDMAIWARPConnectedSocketImpl::status.  Values start at 1
+// and increase in declaration order.
+// NOTE(review): the names suggest the stages of RDMA connection-manager
+// setup and teardown; the transitions live outside this header.
+enum RDMA_CM_STATUS {
+  IDLE = 1,
+  RDMA_ID_CREATED,
+  CHANNEL_FD_CREATED,
+  RESOURCE_ALLOCATED,
+  ADDR_RESOLVED,
+  ROUTE_RESOLVED,
+  CONNECTED,
+  DISCONNECTED,
+  ERROR
+};
+
+// iWARP flavour of RDMAConnectedSocketImpl that additionally tracks an RDMA
+// connection-manager id and event channel, plus a status (RDMA_CM_STATUS).
+class RDMAIWARPConnectedSocketImpl : public RDMAConnectedSocketImpl {
+  public:
+  // `info` is optional and defaults to nullptr.
+  RDMAIWARPConnectedSocketImpl(CephContext *cct, std::shared_ptr<Infiniband>& ib,
+			       std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+			       RDMAWorker *w, RDMACMInfo *info = nullptr);
+    ~RDMAIWARPConnectedSocketImpl();
+    virtual int try_connect(const entity_addr_t&, const SocketOptions &opt) override;
+    virtual void close() override;
+    virtual void shutdown() override;
+    virtual void handle_cm_connection();
+    // NOTE(review): this hides (does not override) the non-virtual
+    // `int activate()` declared in RDMAConnectedSocketImpl; which one runs
+    // depends on the static type of the object it is called through.
+    void activate();
+    int alloc_resource();
+    void close_notify();
+
+  private:
+    rdma_cm_id *cm_id = nullptr;
+    rdma_event_channel *cm_channel = nullptr;
+    EventCallbackRef cm_con_handler;
+    // NOTE(review): close_mtx/close_condition/closed presumably implement a
+    // wait-for-close handshake (see close_notify()) -- confirm in the .cc file.
+    std::mutex close_mtx;
+    std::condition_variable close_condition;
+    bool closed = false;
+    RDMA_CM_STATUS status = IDLE;
+
+
+  // Event callback that forwards to handle_cm_connection(); the fd argument
+  // is ignored.
+  class C_handle_cm_connection : public EventCallback {
+    RDMAIWARPConnectedSocketImpl *csi;
+    public:
+      C_handle_cm_connection(RDMAIWARPConnectedSocketImpl *w): csi(w) {}
+      void do_request(uint64_t fd) {
+        csi->handle_cm_connection();
+      }
+  };
+};
+
+// Listening-side socket for the RDMA stack.  fd() exposes
+// `server_setup_socket`.
+class RDMAServerSocketImpl : public ServerSocketImpl {
+  protected:
+    CephContext *cct;
+    ceph::NetHandler net;
+    int server_setup_socket;      // returned by fd()
+    std::shared_ptr<Infiniband> ib;
+    std::shared_ptr<RDMADispatcher> dispatcher;
+    RDMAWorker *worker;
+    entity_addr_t sa;
+
+ public:
+  RDMAServerSocketImpl(CephContext *cct, std::shared_ptr<Infiniband>& ib,
+                       std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+		       RDMAWorker *w, entity_addr_t& a, unsigned slot);
+
+  // Virtual so that RDMAIWARPServerSocketImpl can override it.
+  virtual int listen(entity_addr_t &sa, const SocketOptions &opt);
+  virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+  virtual void abort_accept() override;
+  virtual int fd() const override { return server_setup_socket; }
+};
+
+// iWARP flavour of RDMAServerSocketImpl: overrides listen/accept/abort_accept
+// and keeps its own RDMA connection-manager id and event channel.
+class RDMAIWARPServerSocketImpl : public RDMAServerSocketImpl {
+  public:
+    RDMAIWARPServerSocketImpl(
+      CephContext *cct, std::shared_ptr<Infiniband>& ib,
+      std::shared_ptr<RDMADispatcher>& rdma_dispatcher,
+      RDMAWorker* w, entity_addr_t& addr, unsigned addr_slot);
+    virtual int listen(entity_addr_t &sa, const SocketOptions &opt) override;
+    virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+    virtual void abort_accept() override;
+  private:
+    rdma_cm_id *cm_id = nullptr;              // connection-manager id
+    rdma_event_channel *cm_channel = nullptr; // connection-manager event channel
+};
+
+// NetworkStack implementation for RDMA.  Creates RDMAWorker objects, owns the
+// worker threads, and shares one Infiniband object and one RDMADispatcher
+// with the workers.
+class RDMAStack : public NetworkStack {
+  std::vector<std::thread> threads;   // indexed by worker id (see spawn_worker/join_worker)
+  // NOTE(review): not referenced anywhere in this header and has no
+  // initializer -- confirm whether it is still used.
+  PerfCounters *perf_counter;
+  std::shared_ptr<Infiniband> ib;
+  std::shared_ptr<RDMADispatcher> rdma_dispatcher;
+
+  // Flag behind is_ready()/ready().
+  std::atomic<bool> fork_finished = {false};
+
+  // The caller receives a raw pointer to a newly allocated RDMAWorker.
+  virtual Worker* create_worker(CephContext *c, unsigned worker_id) override {
+    return new RDMAWorker(c, worker_id);
+  }
+
+ public:
+  explicit RDMAStack(CephContext *cct);
+  virtual ~RDMAStack();
+  virtual bool nonblock_connect_need_writable_event() const override { return false; }
+
+  virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override;
+  virtual void join_worker(unsigned i) override;
+  virtual bool is_ready() override { return fork_finished.load(); };
+  virtual void ready() override { fork_finished = true; };
+};
+
+
+#endif
diff --git a/src/msg/msg_types.cc b/src/msg/msg_types.cc
new file mode 100644
index 000000000..ba088e84f
--- /dev/null
+++ b/src/msg/msg_types.cc
@@ -0,0 +1,395 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#include "msg_types.h"
+
+#include <arpa/inet.h>
+#include <stdlib.h>
+#include <string.h>
+#include <netdb.h>
+
+#include "common/Formatter.h"
+
+// Write this entity name to the formatter as two fields: "type" (the type
+// string) followed by "num".
+void entity_name_t::dump(ceph::Formatter *f) const
+{
+  const char *kind = type_str();
+  f->dump_string("type", kind);
+
+  const auto id = num();
+  f->dump_unsigned("num", id);
+}
+
+// Write this address to the formatter as "type" (type name), "addr" (the
+// streamed sockaddr) and "nonce", in that order.
+void entity_addr_t::dump(ceph::Formatter *f) const
+{
+  const auto type_name = get_type_name(type);
+  f->dump_string("type", type_name);
+
+  const sockaddr *raw_addr = get_sockaddr();
+  f->dump_stream("addr") << raw_addr;
+
+  f->dump_unsigned("nonce", nonce);
+}
+
+// Write this instance to the formatter as two nested objects: "name"
+// followed by "addr".
+void entity_inst_t::dump(ceph::Formatter *f) const
+{
+  f->dump_object("name", name);
+  f->dump_object("addr", addr);
+}
+
+// Append four heap-allocated sample names to `o`, in this order: MON with the
+// default number, MON 1, OSD 1, CLIENT 1.  The pointers are handed to the
+// caller through the list.
+void entity_name_t::generate_test_instances(std::list<entity_name_t*>& o)
+{
+  const entity_name_t samples[] = {
+    entity_name_t::MON(),
+    entity_name_t::MON(1),
+    entity_name_t::OSD(1),
+    entity_name_t::CLIENT(1),
+  };
+  for (const entity_name_t &sample : samples) {
+    o.push_back(new entity_name_t(sample));
+  }
+}
+
+// Append three heap-allocated sample addresses to `o`: a default one, one
+// with nonce 1, and a legacy IPv4 address 127.0.1.2 port 2 with nonce 5.
+void entity_addr_t::generate_test_instances(std::list<entity_addr_t*>& o)
+{
+  o.push_back(new entity_addr_t());
+
+  entity_addr_t *with_nonce = new entity_addr_t();
+  with_nonce->set_nonce(1);
+  o.push_back(with_nonce);
+
+  entity_addr_t *legacy_v4 = new entity_addr_t();
+  legacy_v4->set_type(entity_addr_t::TYPE_LEGACY);
+  legacy_v4->set_nonce(5);
+  legacy_v4->set_family(AF_INET);
+  const int quads[4] = {127, 0, 1, 2};
+  for (int pos = 0; pos < 4; ++pos) {
+    legacy_v4->set_in4_quad(pos, quads[pos]);
+  }
+  legacy_v4->set_port(2);
+  o.push_back(legacy_v4);
+}
+
+// Append two heap-allocated sample instances to `o`: a default-constructed
+// one, then one built from a default name and a default address.
+void entity_inst_t::generate_test_instances(std::list<entity_inst_t*>& o)
+{
+  entity_name_t default_name;
+  entity_addr_t default_addr;
+
+  o.push_back(new entity_inst_t());
+  o.push_back(new entity_inst_t(default_name, default_addr));
+}
+
+// Parse the whole of `s` as a single address.
+//
+// @param s  text to parse; it does not need to be NUL-terminated
+// @return true only if an address was parsed and it consumed all of `s`
+bool entity_addr_t::parse(const std::string_view s)
+{
+  // The const char* overload scans until it meets a NUL terminator, but a
+  // string_view is not guaranteed to be followed by one.  Parse a terminated
+  // copy so the scan can never run past the end of the caller's buffer.
+  const std::string text(s);
+  const char* start = text.c_str();
+  const char* end = nullptr;
+  bool got = parse(start, &end);
+  return got && end == start + text.size();
+}
+
+// Parse a textual address of the form
+//   [v1:|v2:|any:] ( <ipv4> | <ipv6> | '[' <ip> ']' ) [':' <port>] ['/' <nonce>]
+// or a lone "-" (TYPE_NONE).
+//
+// @param s             NUL-terminated input
+// @param end           if non-null, receives the first unparsed character on
+//                      success; on failure it is left pointing at `s`
+// @param default_type  type used when no prefix is present; 0 selects
+//                      TYPE_DEFAULT
+// @return true if an address was parsed.  *this is reset on entry, so on
+//         failure it may be left partially filled in.
+bool entity_addr_t::parse(const char *s, const char **end, int default_type)
+{
+  *this = entity_addr_t();
+
+  const char *start = s;
+  if (end) {
+    *end = s;
+  }
+
+  int newtype;
+  if (strncmp("v1:", s, 3) == 0) {
+    start += 3;
+    newtype = TYPE_LEGACY;
+  } else if (strncmp("v2:", s, 3) == 0) {
+    start += 3;
+    newtype = TYPE_MSGR2;
+  } else if (strncmp("any:", s, 4) == 0) {
+    start += 4;
+    newtype = TYPE_ANY;
+  } else if (*s == '-') {
+    // "-" is the textual form of TYPE_NONE; *this was already reset to it.
+    newtype = TYPE_NONE;
+    if (end) {
+      *end = s + 1;
+    }
+    return true;
+  } else {
+    newtype = default_type ? default_type : TYPE_DEFAULT;
+  }
+
+  bool brackets = false;
+  if (*start == '[') {
+    start++;
+    brackets = true;
+  }
+
+  // inet_pton() requires a null terminated input, so let's fill two
+  // buffers, one with ipv4 allowed characters, and one with ipv6, and
+  // then see which parses.  Each copy loop stops one byte short of the
+  // buffer size so that the terminator written afterwards always fits;
+  // without that, a long run of address characters made `*o = 0` write
+  // one byte past the end of the buffer.
+  char buf4[39];
+  char *o = buf4;
+  const char *p = start;
+  while (o < buf4 + sizeof(buf4) - 1 &&
+	 *p && ((*p == '.') ||
+		(*p >= '0' && *p <= '9'))) {
+    *o++ = *p++;
+  }
+  *o = 0;
+
+  char buf6[64];  // an ipv6 address needs at most 39 characters + null.
+  o = buf6;
+  p = start;
+  while (o < buf6 + sizeof(buf6) - 1 &&
+	 *p && ((*p == ':') ||
+		(*p >= '0' && *p <= '9') ||
+		(*p >= 'a' && *p <= 'f') ||
+		(*p >= 'A' && *p <= 'F'))) {
+    *o++ = *p++;
+  }
+  *o = 0;
+  //cout << "buf4 is '" << buf4 << "', buf6 is '" << buf6 << "'" << std::endl;
+
+  // ipv4?
+  struct in_addr a4;
+  struct in6_addr a6;
+  if (inet_pton(AF_INET, buf4, &a4)) {
+    u.sin.sin_addr.s_addr = a4.s_addr;
+    u.sa.sa_family = AF_INET;
+    p = start + strlen(buf4);
+  } else if (inet_pton(AF_INET6, buf6, &a6)) {
+    u.sa.sa_family = AF_INET6;
+    memcpy(&u.sin6.sin6_addr, &a6, sizeof(a6));
+    p = start + strlen(buf6);
+  } else {
+    return false;
+  }
+
+  if (brackets) {
+    if (*p != ']')
+      return false;
+    p++;
+  }
+
+  //cout << "p is " << *p << std::endl;
+  if (*p == ':') {
+    // parse a port, too!
+    p++;
+    int port = atoi(p);
+    // atoi() accepts a leading sign; a negative value would otherwise be
+    // wrapped into a valid-looking port by htons() in set_port().
+    if (port < 0 || port > MAX_PORT_NUMBER) {
+      return false;
+    }
+    set_port(port);
+    while (*p && *p >= '0' && *p <= '9')
+      p++;
+  }
+
+  if (*p == '/') {
+    // parse nonce, too
+    p++;
+    int non = atoi(p);
+    set_nonce(non);
+    while (*p && *p >= '0' && *p <= '9')
+      p++;
+  }
+
+  if (end)
+    *end = p;
+
+  type = newtype;
+
+  //cout << *this << std::endl;
+  return true;
+}
+
+// Stream an entity_addr_t.  TYPE_NONE prints as "-".  Otherwise the output is
+// "<sockaddr>/<nonce>", preceded by "<type name>:" unless the type is
+// TYPE_ANY.
+std::ostream& operator<<(std::ostream& out, const entity_addr_t &addr)
+{
+  const bool is_none = (addr.type == entity_addr_t::TYPE_NONE);
+  if (is_none) {
+    out << "-";
+    return out;
+  }
+
+  const bool print_type = (addr.type != entity_addr_t::TYPE_ANY);
+  if (print_type) {
+    out << entity_addr_t::get_type_name(addr.type) << ":";
+  }
+
+  out << addr.get_sockaddr();
+  out << '/' << addr.nonce;
+  return out;
+}
+
+// Stream a sockaddr.  AF_INET prints as "a.b.c.d:port", AF_INET6 as
+// "[addr]:port", and any other family as a diagnostic naming the family.
+std::ostream& operator<<(std::ostream& out, const sockaddr *psa)
+{
+  // Zero-filled, so an inet_ntop() failure simply prints an empty host.
+  char text[NI_MAXHOST] = { 0 };
+
+  if (psa->sa_family == AF_INET) {
+    const sockaddr_in *v4 = reinterpret_cast<const sockaddr_in*>(psa);
+    inet_ntop(AF_INET, &v4->sin_addr, text, NI_MAXHOST);
+    out << text << ':' << ntohs(v4->sin_port);
+    return out;
+  }
+
+  if (psa->sa_family == AF_INET6) {
+    const sockaddr_in6 *v6 = reinterpret_cast<const sockaddr_in6*>(psa);
+    inet_ntop(AF_INET6, &v6->sin6_addr, text, NI_MAXHOST);
+    out << '[' << text << "]:" << ntohs(v6->sin6_port);
+    return out;
+  }
+
+  out << "(unrecognized address family " << psa->sa_family << ")";
+  return out;
+}
+
+// Stream a sockaddr_storage by viewing it as a generic sockaddr and reusing
+// the sockaddr* overload.
+std::ostream& operator<<(std::ostream& out, const sockaddr_storage &ss)
+{
+  const sockaddr *generic = reinterpret_cast<const sockaddr*>(&ss);
+  return out << generic;
+}
+
+
+// entity_addrvec_t
+
+// Parse either a single address or a bracketed, comma-separated list of
+// addresses ("[addr,addr,...]") into `v`.
+//
+// @param s    NUL-terminated input
+// @param end  if non-null, receives the position after the parsed text; it is
+//             reset to `s` when a bracketed list fails to parse
+// @return true if at least one address was parsed
+bool entity_addrvec_t::parse(const char *s, const char **end)
+{
+  const char *orig_s = s;
+  // Local stand-in so the code below can write through `end` unconditionally.
+  const char *static_end;
+  if (!end) {
+    end = &static_end;
+  } else {
+    *end = s;
+  }
+  v.clear();
+  bool brackets = false;
+  if (*s == '[') {
+    // weirdness: make sure this isn't an IPV6 addr!
+    entity_addr_t a;
+    const char *p;
+    if (!a.parse(s, &p) || !a.is_ipv6()) {
+      // it's not
+      brackets = true;
+      ++s;
+    }
+  }
+  while (*s) {
+    entity_addr_t a;
+    bool r = a.parse(s, end);
+    if (!r) {
+      if (brackets) {
+	// A bad element invalidates the whole bracketed list.
+	v.clear();
+	*end = orig_s;
+	return false;
+      }
+      break;
+    }
+    v.push_back(a);
+    s = *end;
+    if (!brackets) {
+      // Without brackets only a single address is accepted.
+      break;
+    }
+    if (*s != ',') {
+      break;
+    }
+    ++s;
+  }
+  if (brackets) {
+    // A bracketed list must be closed by ']'.
+    if (*s == ']') {
+      ++s;
+      *end = s;
+    } else {
+      *end = orig_s;
+      v.clear();
+      return false;
+    }
+  }
+  return !v.empty();
+}
+
+// Encode this address vector into `bl`.  When `features` includes
+// CEPH_FEATURE_MSG_ADDR2 a marker byte of 2 is written followed by the
+// vector; otherwise a single legacy address is written with features 0.
+void entity_addrvec_t::encode(ceph::buffer::list& bl, uint64_t features) const
+{
+  using ceph::encode;
+  const bool peer_has_addr2 = (features & CEPH_FEATURE_MSG_ADDR2) != 0;
+  if (peer_has_addr2) {
+    encode((__u8)2, bl);
+    encode(v, bl, features);
+  } else {
+    // encode a single legacy entity_addr_t for unfeatured peers
+    encode(legacy_addr(), bl, 0);
+  }
+}
+
+// Decode an address vector from `bl`.  The leading marker byte selects the
+// wire format:
+//   0  - a legacy single address
+//   1  - a single versioned address (type, nonce, length-prefixed sockaddr)
+//   2  - a full vector of addresses
+//   >2 - rejected with ceph::buffer::malformed_input
+void entity_addrvec_t::decode(ceph::buffer::list::const_iterator& bl)
+{
+  using ceph::decode;
+  __u8 marker;
+  decode(marker, bl);
+  if (marker == 0) {
+    // legacy!
+    entity_addr_t addr;
+    addr.decode_legacy_addr_after_marker(bl);
+    v.clear();
+    v.push_back(addr);
+    return;
+  }
+  if (marker == 1) {
+    entity_addr_t addr;
+    DECODE_START(1, bl);
+    decode(addr.type, bl);
+    decode(addr.nonce, bl);
+    __u32 elen;
+    decode(elen, bl);
+    if (elen) {
+      // get_sockaddr() returns a const pointer; the cast allows filling the
+      // address in place.
+      struct sockaddr *sa = (struct sockaddr *)addr.get_sockaddr();
+#if defined(__FreeBSD__) || defined(__APPLE__)
+      sa->sa_len = 0;
+#endif
+      uint16_t ss_family;
+      if (elen < sizeof(ss_family)) {
+        throw ceph::buffer::malformed_input("elen smaller than family len");
+      }
+      decode(ss_family, bl);
+      sa->sa_family = ss_family;
+      elen -= sizeof(ss_family);
+      // Bound the payload by the sockaddr size for the family just decoded,
+      // so the copy below cannot overrun the address storage.
+      if (elen > addr.get_sockaddr_len() - sizeof(sa->sa_family)) {
+        throw ceph::buffer::malformed_input("elen exceeds sockaddr len");
+      }
+      bl.copy(elen, sa->sa_data);
+    }
+    DECODE_FINISH(bl);
+    v.clear();
+    v.push_back(addr);
+    return;
+  }
+  if (marker > 2)
+    throw ceph::buffer::malformed_input("entity_addrvec_marker > 2");
+  decode(v, bl);
+}
+
+// Write every address in `v` to the formatter as an "addr" object inside an
+// "addrvec" array section.
+void entity_addrvec_t::dump(ceph::Formatter *f) const
+{
+  f->open_array_section("addrvec");
+  for (const auto &one_addr : v) {
+    f->dump_object("addr", one_addr);
+  }
+  f->close_section();
+}
+
+// Append three heap-allocated sample vectors to `ls`, holding zero, one and
+// two default-constructed addresses respectively.
+void entity_addrvec_t::generate_test_instances(std::list<entity_addrvec_t*>& ls)
+{
+  for (int naddrs = 0; naddrs < 3; ++naddrs) {
+    entity_addrvec_t *sample = new entity_addrvec_t();
+    for (int j = 0; j < naddrs; ++j) {
+      sample->v.push_back(entity_addr_t());
+    }
+    ls.push_back(sample);
+  }
+}
+
+// Return the IP part of this address as text, without port or nonce.  The
+// result is empty when the family is neither AF_INET nor AF_INET6, or when
+// inet_ntop() fails.
+std::string entity_addr_t::ip_only_to_str() const
+{
+  char addr_buf[INET6_ADDRSTRLEN];
+  const char *host_ip = nullptr;
+
+  const int family = get_family();
+  if (family == AF_INET) {
+    host_ip = inet_ntop(AF_INET, &in4_addr().sin_addr,
+                        addr_buf, INET_ADDRSTRLEN);
+  } else if (family == AF_INET6) {
+    host_ip = inet_ntop(AF_INET6, &in6_addr().sin6_addr,
+                        addr_buf, INET6_ADDRSTRLEN);
+  }
+
+  if (!host_ip) {
+    return std::string();
+  }
+  return std::string(host_ip);
+}
+
+// Return "<ip>:<port>", with the IP wrapped in square brackets for IPv6.
+std::string entity_addr_t::ip_n_port_to_str() const
+{
+  const std::string ip = ip_only_to_str();
+  std::string text = is_ipv6() ? '[' + ip + ']' : ip;
+  text += ':';
+  text += std::to_string(get_port());
+  return text;
+}
+
diff --git a/src/msg/msg_types.h b/src/msg/msg_types.h
new file mode 100644
index 000000000..f1f0c5e5c
--- /dev/null
+++ b/src/msg/msg_types.h
@@ -0,0 +1,851 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_TYPES_H
+#define CEPH_MSG_TYPES_H
+
+#include <sstream>
+
+#include <netinet/in.h>
+
+#include "include/ceph_features.h"
+#include "include/types.h"
+#include "include/blobhash.h"
+#include "include/encoding.h"
+
+#define MAX_PORT_NUMBER 65535
+
+#ifdef _WIN32
+// ceph_sockaddr_storage matches the Linux format.
+#define AF_INET6_LINUX 10
+#endif
+
+namespace ceph {
+  class Formatter;
+}
+
+std::ostream& operator<<(std::ostream& out, const sockaddr_storage &ss);
+std::ostream& operator<<(std::ostream& out, const sockaddr *sa);
+
+typedef uint8_t entity_type_t;
+
+// Name of a cluster entity: a type (mon, mds, osd, client, mgr) plus a
+// number.  A negative number means the entity is "new" (see is_new()).
+class entity_name_t {
+public:
+  entity_type_t _type;
+  int64_t _num;
+
+public:
+  static const int TYPE_MON = CEPH_ENTITY_TYPE_MON;
+  static const int TYPE_MDS = CEPH_ENTITY_TYPE_MDS;
+  static const int TYPE_OSD = CEPH_ENTITY_TYPE_OSD;
+  static const int TYPE_CLIENT = CEPH_ENTITY_TYPE_CLIENT;
+  static const int TYPE_MGR = CEPH_ENTITY_TYPE_MGR;
+
+  // Number used by the static constructors when none is given.
+  static const int64_t NEW = -1;
+
+  // cons
+  entity_name_t() : _type(0), _num(0) { }
+  entity_name_t(int t, int64_t n) : _type(t), _num(n) { }
+  explicit entity_name_t(const ceph_entity_name &n) :
+    _type(n.type), _num(n.num) { }
+
+  // static cons
+  static entity_name_t MON(int64_t i=NEW) { return entity_name_t(TYPE_MON, i); }
+  static entity_name_t MDS(int64_t i=NEW) { return entity_name_t(TYPE_MDS, i); }
+  static entity_name_t OSD(int64_t i=NEW) { return entity_name_t(TYPE_OSD, i); }
+  static entity_name_t CLIENT(int64_t i=NEW) { return entity_name_t(TYPE_CLIENT, i); }
+  static entity_name_t MGR(int64_t i=NEW) { return entity_name_t(TYPE_MGR, i); }
+
+  int64_t num() const { return _num; }
+  int type() const { return _type; }
+  const char *type_str() const {
+    return ceph_entity_type_name(type());
+  }
+
+  bool is_new() const { return num() < 0; }
+
+  bool is_client() const { return type() == TYPE_CLIENT; }
+  bool is_mds() const { return type() == TYPE_MDS; }
+  bool is_osd() const { return type() == TYPE_OSD; }
+  bool is_mon() const { return type() == TYPE_MON; }
+  bool is_mgr() const { return type() == TYPE_MGR; }
+
+  operator ceph_entity_name() const {
+    ceph_entity_name n = { _type, init_le64(_num) };
+    return n;
+  }
+
+  // Parse the whole of `s` ("<type>.<num>"); trailing characters make the
+  // parse fail.
+  bool parse(const std::string& s) {
+    const char *start = s.c_str();
+    char *end;
+    bool got = parse(start, &end);
+    return got && end == start + s.length();
+  }
+  // Parse "<type>.<num>" from the NUL-terminated string `start`.
+  //
+  // @param start  input; must begin with mon., osd., mds., client. or mgr.
+  // @param end    receives the first character after the number; only
+  //               written once a known prefix has been matched
+  // @return true if a prefix and a number were parsed.  _type may already
+  //         have been updated when the number part fails to parse.
+  bool parse(const char *start, char **end) {
+    // strncmp() tests only the prefix; strstr(...) == start would scan the
+    // whole string before reaching the same answer.
+    if (strncmp(start, "mon.", 4) == 0) {
+      _type = TYPE_MON;
+      start += 4;
+    } else if (strncmp(start, "osd.", 4) == 0) {
+      _type = TYPE_OSD;
+      start += 4;
+    } else if (strncmp(start, "mds.", 4) == 0) {
+      _type = TYPE_MDS;
+      start += 4;
+    } else if (strncmp(start, "client.", 7) == 0) {
+      _type = TYPE_CLIENT;
+      start += 7;
+    } else if (strncmp(start, "mgr.", 4) == 0) {
+      _type = TYPE_MGR;
+      start += 4;
+    } else {
+      return false;
+    }
+    // strtoll() would silently skip leading whitespace, so reject it here.
+    // isspace() is only defined for unsigned char values (and EOF), hence
+    // the cast.
+    if (isspace(static_cast<unsigned char>(*start)))
+      return false;
+    _num = strtoll(start, end, 10);
+    if (*end == nullptr || *end == start)
+      return false;
+    return true;
+  }
+
+  DENC(entity_name_t, v, p) {
+    denc(v._type, p);
+    denc(v._num, p);
+  }
+  void dump(ceph::Formatter *f) const;
+
+  static void generate_test_instances(std::list<entity_name_t*>& o);
+};
+WRITE_CLASS_DENC(entity_name_t)
+
+// Two entity names are equal when both type and number match.
+inline bool operator== (const entity_name_t& l, const entity_name_t& r) {
+  if (l.type() != r.type())
+    return false;
+  return l.num() == r.num();
+}
+// Two entity names differ when either the type or the number differs.
+inline bool operator!= (const entity_name_t& l, const entity_name_t& r) {
+  if (l.type() != r.type())
+    return true;
+  return l.num() != r.num();
+}
+// Order entity names by type first, then by number.
+inline bool operator< (const entity_name_t& l, const entity_name_t& r) {
+  if (l.type() != r.type())
+    return l.type() < r.type();
+  return l.num() < r.num();
+}
+
+// Stream an entity name as "<type>.<num>", or "<type>.?" when the number is
+// negative (a "new" entity).
+inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) {
+  out << addr.type_str();
+  // is_new() is exactly num() < 0, so one check covers both conditions.
+  if (addr.is_new()) {
+    return out << ".?";
+  }
+  return out << '.' << addr.num();
+}
+// Stream a ceph_entity_name by converting it to entity_name_t and reusing
+// that type's stream operator.
+inline std::ostream& operator<<(std::ostream& out, const ceph_entity_name& addr) {
+  const entity_name_t name(addr.type, static_cast<int64_t>(addr.num));
+  return out << name;
+}
+
+// std::hash specialisation so entity_name_t can be used as a key in
+// unordered containers.
+namespace std {
+  template<> struct hash< entity_name_t >
+  {
+    // Hashes the XOR of type and number with rjhash32().
+    size_t operator()( const entity_name_t &m ) const
+    {
+      return rjhash32(m.type() ^ m.num());
+    }
+  };
+} // namespace std
+
+// Wire format for a sockaddr that matches Linux's layout: a 16-bit family
+// followed by padding up to 128 bytes in total.  The struct is packed.
+struct ceph_sockaddr_storage {
+  ceph_le16 ss_family;
+  __u8 __ss_padding[128 - sizeof(ceph_le16)];
+
+  // Raw-encode a copy of this struct whose family field has been passed
+  // through htons().
+  void encode(ceph::buffer::list& bl) const {
+    struct ceph_sockaddr_storage ss = *this;
+    ss.ss_family = htons(ss.ss_family);
+    ceph::encode_raw(ss, bl);
+  }
+
+  // Raw-decode into a temporary, pass the family field through ntohs(), then
+  // assign the result to *this.
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    struct ceph_sockaddr_storage ss;
+    ceph::decode_raw(ss, bl);
+    ss.ss_family = ntohs(ss.ss_family);
+    *this = ss;
+  }
+} __attribute__ ((__packed__));
+WRITE_CLASS_ENCODER(ceph_sockaddr_storage)
+
+/*
+ * Encode a native sockaddr_storage into `bl`, with ss_family in network
+ * byte order.
+ *
+ * - Linux: the native struct is raw-encoded after htons() on ss_family.
+ * - FreeBSD/macOS: the native struct starts with ss_len, so that field is
+ *   skipped and the rest is copied into a ceph_sockaddr_storage.
+ * - Windows: copied into a ceph_sockaddr_storage, with AF_INET6 replaced by
+ *   the Linux value (AF_INET6_LINUX).
+ * - Other platforms: copied into a zeroed ceph_sockaddr_storage.
+ */
+static inline void encode(const sockaddr_storage& a, ceph::buffer::list& bl) {
+#if defined(__linux__)
+  struct sockaddr_storage ss = a;
+  ss.ss_family = htons(ss.ss_family);
+  ceph::encode_raw(ss, bl);
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+  ceph_sockaddr_storage ss{};
+  auto src = (unsigned char const *)&a;
+  auto dst = (unsigned char *)&ss;
+  // Skip the BSD-only length field, then carry the family over by value.
+  src += sizeof(a.ss_len);
+  ss.ss_family = a.ss_family;
+  src += sizeof(a.ss_family);
+  dst += sizeof(ss.ss_family);
+  // Copy only as many bytes as remain in both structures.
+  const auto copy_size = std::min((unsigned char*)(&a + 1) - src,
+				  (unsigned char*)(&ss + 1) - dst);
+  ::memcpy(dst, src, copy_size);
+  encode(ss, bl);
+#elif defined(_WIN32)
+  ceph_sockaddr_storage ss{};
+  ::memcpy(&ss, &a, std::min(sizeof(ss), sizeof(a)));
+  // The Windows AF_INET6 definition doesn't match the Linux one.
+  if (a.ss_family == AF_INET6) {
+    ss.ss_family = AF_INET6_LINUX;
+  }
+  encode(ss, bl);
+#else
+  ceph_sockaddr_storage ss;
+  ::memset(&ss, '\0', sizeof(ss));
+  ::memcpy(&ss, &a, std::min(sizeof(ss), sizeof(a)));
+  encode(ss, bl);
+#endif
+}
+// Decode a sockaddr_storage from `bl`; the inverse of the encode() overload
+// for sockaddr_storage.
+//
+// - Linux: raw-decoded in place, then ntohs() on ss_family.
+// - FreeBSD/macOS: decoded as a ceph_sockaddr_storage, then copied into the
+//   native struct after its ss_len field, which is set to 0.
+// - Windows: decoded as a ceph_sockaddr_storage and copied, with the Linux
+//   AF_INET6 value (AF_INET6_LINUX) replaced by the native AF_INET6.
+// - Other platforms: decoded as a ceph_sockaddr_storage and copied.
+static inline void decode(sockaddr_storage& a,
+			  ceph::buffer::list::const_iterator& bl) {
+#if defined(__linux__)
+  ceph::decode_raw(a, bl);
+  a.ss_family = ntohs(a.ss_family);
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+  ceph_sockaddr_storage ss{};
+  decode(ss, bl);
+  auto src = (unsigned char const *)&ss;
+  auto dst = (unsigned char *)&a;
+  a.ss_len = 0;
+  dst += sizeof(a.ss_len);
+  a.ss_family = ss.ss_family;
+  src += sizeof(ss.ss_family);
+  dst += sizeof(a.ss_family);
+  // Copy only as many bytes as remain in both structures.
+  auto const copy_size = std::min((unsigned char*)(&ss + 1) - src,
+				  (unsigned char*)(&a + 1) - dst);
+  ::memcpy(dst, src, copy_size);
+#elif defined(_WIN32)
+  ceph_sockaddr_storage ss{};
+  decode(ss, bl);
+  ::memcpy(&a, &ss, std::min(sizeof(ss), sizeof(a)));
+  if (a.ss_family == AF_INET6_LINUX) {
+    a.ss_family = AF_INET6;
+  }
+#else
+  ceph_sockaddr_storage ss{};
+  decode(ss, bl);
+  ::memcpy(&a, &ss, std::min(sizeof(ss), sizeof(a)));
+#endif
+}
+
+/*
+ * an entity's network address.
+ * includes a random value that prevents it from being reused.
+ * thus identifies a particular process instance.
+ *
+ * This also happens to work to support cidr ranges, in which
+ * case the nonce contains the netmask. It's great!
+ */
+struct entity_addr_t {
+  typedef enum {
+    TYPE_NONE = 0,
+    TYPE_LEGACY = 1,  ///< legacy msgr1 protocol (ceph jewel and older)
+    TYPE_MSGR2 = 2,   ///< msgr2 protocol (new in ceph kraken)
+    TYPE_ANY = 3,  ///< ambiguous
+    TYPE_CIDR = 4,
+  } type_t;
+  static const type_t TYPE_DEFAULT = TYPE_MSGR2;
+  static std::string_view get_type_name(int t) {
+    switch (t) {
+    case TYPE_NONE: return "none";
+    case TYPE_LEGACY: return "v1";
+    case TYPE_MSGR2: return "v2";
+    case TYPE_ANY: return "any";
+    case TYPE_CIDR: return "cidr";
+    default: return "???";
+    }
+  };
+
+  __u32 type;
+  __u32 nonce;
+  union {
+    sockaddr sa;
+    sockaddr_in sin;
+    sockaddr_in6 sin6;
+  } u;
+
+  entity_addr_t() : type(0), nonce(0) {
+    memset(&u, 0, sizeof(u));
+  }
+  entity_addr_t(__u32 _type, __u32 _nonce) : type(_type), nonce(_nonce) {
+    memset(&u, 0, sizeof(u));
+  }
+  explicit entity_addr_t(const ceph_entity_addr &o) {
+    type = o.type;
+    nonce = o.nonce;
+    memcpy(&u, &o.in_addr, sizeof(u));
+#if !defined(__FreeBSD__)
+    u.sa.sa_family = ntohs(u.sa.sa_family);
+#endif
+  }
+
+  uint32_t get_type() const { return type; }
+  void set_type(uint32_t t) { type = t; }
+  bool is_legacy() const { return type == TYPE_LEGACY; }
+  bool is_msgr2() const { return type == TYPE_MSGR2; }
+  bool is_any() const { return type == TYPE_ANY; }
+  // this isn't a guarantee; some client addrs will match it
+  bool maybe_cidr() const { return get_port() == 0 && nonce != 0; }
+
+  __u32 get_nonce() const { return nonce; }
+  void set_nonce(__u32 n) { nonce = n; }
+
+  int get_family() const {
+    return u.sa.sa_family;
+  }
+  void set_family(int f) {
+    u.sa.sa_family = f;
+  }
+
+  bool is_ipv4() const {
+    return u.sa.sa_family == AF_INET;
+  }
+  bool is_ipv6() const {
+    return u.sa.sa_family == AF_INET6;
+  }
+
+  sockaddr_in &in4_addr() {
+    return u.sin;
+  }
+  const sockaddr_in &in4_addr() const{
+    return u.sin;
+  }
+  sockaddr_in6 &in6_addr(){
+    return u.sin6;
+  }
+  const sockaddr_in6 &in6_addr() const{
+    return u.sin6;
+  }
+  const sockaddr *get_sockaddr() const {
+    return &u.sa;
+  }
+  // Size in bytes of the active sockaddr view: sizeof(sockaddr_in) for
+  // AF_INET, sizeof(sockaddr_in6) for AF_INET6, and the size of the whole
+  // union for any other family.
+  size_t get_sockaddr_len() const {
+    const auto family = u.sa.sa_family;
+    if (family == AF_INET) {
+      return sizeof(u.sin);
+    }
+    if (family == AF_INET6) {
+      return sizeof(u.sin6);
+    }
+    return sizeof(u);
+  }
+  // Replace the stored socket address with *sa.
+  //
+  // Only AF_INET, AF_INET6 and AF_UNSPEC are accepted; for any other family
+  // the object is left unchanged and false is returned.  AF_UNSPEC simply
+  // clears the stored address.
+  bool set_sockaddr(const struct sockaddr *sa)
+  {
+    const auto family = sa->sa_family;
+    if (family != AF_INET && family != AF_INET6 && family != AF_UNSPEC) {
+      return false;
+    }
+    // pre-zero, since at most a portion of the union is overwritten below
+    memset(&u, 0, sizeof(u));
+    if (family == AF_INET) {
+      memcpy(&u.sin, sa, sizeof(u.sin));
+    } else if (family == AF_INET6) {
+      memcpy(&u.sin6, sa, sizeof(u.sin6));
+    }
+    return true;
+  }
+
+  // Return the address widened to a sockaddr_storage: the union's bytes come
+  // first and every remaining byte is zero.
+  sockaddr_storage get_sockaddr_storage() const {
+    sockaddr_storage widened;
+    memset(&widened, 0, sizeof(widened));
+    memcpy(&widened, &u, sizeof(u));
+    return widened;
+  }
+
+  // Set one byte of the IPv4 address (in its in-memory order) and force the
+  // family to AF_INET.  pos is not range-checked here.
+  // NOTE(review): callers presumably pass pos in [0, 3] -- confirm.
+  void set_in4_quad(int pos, int val) {
+    u.sin.sin_family = AF_INET;
+    auto *octets = reinterpret_cast<unsigned char*>(&u.sin.sin_addr.s_addr);
+    octets[pos] = val;
+  }
+  // Store port (host byte order) into the active IPv4/IPv6 sockaddr.
+  // Calls ceph_abort() when the family is neither AF_INET nor AF_INET6.
+  void set_port(int port) {
+    const auto family = u.sa.sa_family;
+    if (family == AF_INET) {
+      u.sin.sin_port = htons(port);
+    } else if (family == AF_INET6) {
+      u.sin6.sin6_port = htons(port);
+    } else {
+      ceph_abort();
+    }
+  }
+  // Port in host byte order, or 0 when the family is neither AF_INET nor
+  // AF_INET6.
+  int get_port() const {
+    const auto family = u.sa.sa_family;
+    if (family == AF_INET) {
+      return ntohs(u.sin.sin_port);
+    }
+    if (family == AF_INET6) {
+      return ntohs(u.sin6.sin6_port);
+    }
+    return 0;
+  }
+
+  // Convert to a ceph_entity_addr.  The type field is always written as 0
+  // (this object's own type is not carried over); the nonce and the zero-
+  // padded sockaddr are copied, and everywhere except FreeBSD the family is
+  // passed through htons().
+  operator ceph_entity_addr() const {
+    ceph_entity_addr wire;
+    wire.type = 0;
+    wire.nonce = nonce;
+    wire.in_addr = get_sockaddr_storage();
+#if !defined(__FreeBSD__)
+    wire.in_addr.ss_family = htons(wire.in_addr.ss_family);
+#endif
+    return wire;
+  }
+
+  // Loose equality: port and nonce must match exactly.  Beyond that, the
+  // addresses are considered equal if either side has a blank IP, or if the
+  // raw sockaddr bytes are identical.  The type tag is not compared.
+  bool probably_equals(const entity_addr_t &o) const {
+    if (get_port() != o.get_port() || get_nonce() != o.get_nonce()) {
+      return false;
+    }
+    if (is_blank_ip() || o.is_blank_ip()) {
+      return true;
+    }
+    return memcmp(&u, &o.u, sizeof(u)) == 0;
+  }
+
+  // True when both addresses share the same family and the same IP; port,
+  // nonce and type are ignored.  Non-IP families never match.
+  bool is_same_host(const entity_addr_t &o) const {
+    if (u.sa.sa_family != o.u.sa.sa_family) {
+      return false;
+    }
+    switch (u.sa.sa_family) {
+    case AF_INET:
+      return u.sin.sin_addr.s_addr == o.u.sin.sin_addr.s_addr;
+    case AF_INET6:
+      return memcmp(u.sin6.sin6_addr.s6_addr,
+                    o.u.sin6.sin6_addr.s6_addr,
+                    sizeof(u.sin6.sin6_addr.s6_addr)) == 0;
+    default:
+      return false;
+    }
+  }
+
+  // True when the IP part is the wildcard address (INADDR_ANY for IPv4,
+  // in6addr_any for IPv6).  Any non-IP family also counts as blank.
+  bool is_blank_ip() const {
+    const auto family = u.sa.sa_family;
+    if (family == AF_INET) {
+      return u.sin.sin_addr.s_addr == INADDR_ANY;
+    }
+    if (family == AF_INET6) {
+      return memcmp(&u.sin6.sin6_addr, &in6addr_any, sizeof(in6addr_any)) == 0;
+    }
+    return true;
+  }
+
+  // True when the stored family is AF_INET or AF_INET6.
+  bool is_ip() const {
+    const auto family = u.sa.sa_family;
+    return family == AF_INET || family == AF_INET6;
+  }
+
+  // String renderings of this address; only declared here, defined out of
+  // line.
+  // NOTE(review): judging by the names, the first omits the port and the
+  // second includes it -- confirm against the definitions.
+  std::string ip_only_to_str() const;
+  std::string ip_n_port_to_str() const;
+
+  // Render as "<sockaddr>/<nonce>", where the sockaddr part is whatever
+  // streaming get_sockaddr() produces.
+  std::string get_legacy_str() const {
+    std::ostringstream rendered;
+    rendered << get_sockaddr();
+    rendered << "/";
+    rendered << get_nonce();
+    return rendered.str();
+  }
+
+  // Parse a textual address into this object; only declared here, defined
+  // out of line.
+  // NOTE(review): presumably returns true on success -- confirm against the
+  // definitions.
+  bool parse(const std::string_view s);
+  // C-string variant; `end` and `type` are optional and default to 0.
+  bool parse(const char *s, const char **end = 0, int type=0);
+
+  // Decode the remainder of a legacy-format address from bl.
+  //
+  // Reads and discards a __u8 and a __u16, then reads the nonce and a full
+  // sockaddr_storage.  The resulting type is TYPE_NONE when the family ends
+  // up AF_UNSPEC and TYPE_LEGACY otherwise.  The return value of
+  // set_sockaddr() is not checked.
+  void decode_legacy_addr_after_marker(ceph::buffer::list::const_iterator& bl)
+  {
+    using ceph::decode;
+    __u8 skipped_byte;
+    __u16 skipped_rest;
+    decode(skipped_byte, bl);
+    decode(skipped_rest, bl);
+    decode(nonce, bl);
+    sockaddr_storage raw;
+    decode(raw, bl);
+    set_sockaddr(reinterpret_cast<sockaddr*>(&raw));
+    type = (get_family() == AF_UNSPEC) ? TYPE_NONE : TYPE_LEGACY;
+  }
+
+  // Right now, these only deal with sockaddr_storage that have only family and content.
+  // Apparently on BSD there is also an ss_len that we need to handle; this requires
+  // broader study
+
+  // Serialize this address into bl.
+  //
+  // Without CEPH_FEATURE_MSG_ADDR2 the legacy layout is written: a zero
+  // __u32, the nonce and the whole zero-padded sockaddr_storage.  Otherwise
+  // a marker byte of 1 is followed by a versioned section holding the type,
+  // the nonce, the sockaddr length, and then the family plus the remaining
+  // sockaddr bytes.
+  void encode(ceph::buffer::list& bl, uint64_t features) const {
+    using ceph::encode;
+    if ((features & CEPH_FEATURE_MSG_ADDR2) == 0) {
+      encode((__u32)0, bl);
+      encode(nonce, bl);
+      sockaddr_storage ss = get_sockaddr_storage();
+      encode(ss, bl);
+      return;
+    }
+    encode((__u8)1, bl);
+    ENCODE_START(1, 1, bl);
+    if (HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+      encode(type, bl);
+    } else {
+      // map any -> legacy for old clients.  this is primarily for the benefit
+      // of OSDMap's blocklist, but is reasonable in general since any: is
+      // meaningless for pre-nautilus clients or daemons.
+      auto t = type;
+      if (t == TYPE_ANY) {
+        t = TYPE_LEGACY;
+      }
+      encode(t, bl);
+    }
+    encode(nonce, bl);
+    __u32 elen = get_sockaddr_len();
+    // BSD-style sockaddrs carry a leading sa_len field that is not put on
+    // the wire, so it is excluded from the encoded length.  Use defined()
+    // so that the test does not evaluate an undefined macro on other
+    // platforms (matches the check used in decode()).
+#if defined(__FreeBSD__) || defined(__APPLE__)
+    elen -= sizeof(u.sa.sa_len);
+#endif
+    encode(elen, bl);
+    if (elen) {
+      uint16_t ss_family = u.sa.sa_family;
+#if defined(_WIN32)
+      // the family is written using the AF_INET6_LINUX value on Windows
+      if (ss_family == AF_INET6) {
+        ss_family = AF_INET6_LINUX;
+      }
+#endif
+      encode(ss_family, bl);
+      // what remains after the family is the raw sa_data payload
+      elen -= sizeof(u.sa.sa_family);
+      bl.append(u.sa.sa_data, elen);
+    }
+    ENCODE_FINISH(bl);
+  }
+  // Decode an address from bl.
+  //
+  // The first byte selects the layout: 0 hands off to
+  // decode_legacy_addr_after_marker(), 1 is the versioned layout handled
+  // below, and anything else throws ceph::buffer::malformed_input.
+  // malformed_input is also thrown when the encoded sockaddr length is
+  // shorter than the family field or longer than the sockaddr for the
+  // decoded family.
+  // NOTE(review): when the encoded length is 0 the union u is left as it
+  // was before the call -- confirm that is intended for reused objects.
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    using ceph::decode;
+    __u8 marker;
+    decode(marker, bl);
+    if (marker == 0) {
+      decode_legacy_addr_after_marker(bl);
+      return;
+    }
+    if (marker != 1)
+      throw ceph::buffer::malformed_input("entity_addr_t marker != 1");
+    DECODE_START(1, bl);
+    decode(type, bl);
+    decode(nonce, bl);
+    __u32 elen;
+    decode(elen, bl);
+    if (elen) {
+#if defined(__FreeBSD__) || defined(__APPLE__)
+      u.sa.sa_len = 0;
+#endif
+      uint16_t ss_family;
+      // the payload must at least contain the family field
+      if (elen < sizeof(ss_family)) {
+	throw ceph::buffer::malformed_input("elen smaller than family len");
+      }
+      decode(ss_family, bl);
+#if defined(_WIN32)
+      // map the AF_INET6_LINUX value back to the local AF_INET6
+      if (ss_family == AF_INET6_LINUX) {
+        ss_family = AF_INET6;
+      }
+#endif
+      u.sa.sa_family = ss_family;
+      elen -= sizeof(ss_family);
+      // the family is set above, so get_sockaddr_len() now reflects the
+      // decoded family; reject payloads that would overflow that sockaddr
+      if (elen > get_sockaddr_len() - sizeof(u.sa.sa_family)) {
+	throw ceph::buffer::malformed_input("elen exceeds sockaddr len");
+      }
+      bl.copy(elen, u.sa.sa_data);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  // Write this address to the given Formatter; only declared here, defined
+  // out of line.
+  void dump(ceph::Formatter *f) const;
+
+  // Append sample instances to o; only declared here, defined out of line.
+  // NOTE(review): presumably used by encoding tests -- confirm.
+  static void generate_test_instances(std::list<entity_addr_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(entity_addr_t)
+
+std::ostream& operator<<(std::ostream& out, const entity_addr_t &addr);
+
+// Comparison operators for entity_addr_t.  Ordering and equality are both
+// defined by a bytewise memcmp() over the whole object (type, nonce and
+// sockaddr); the remaining operators are expressed through == and <.
+inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) {
+  return memcmp(&a, &b, sizeof(entity_addr_t)) == 0;
+}
+inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) {
+  return memcmp(&a, &b, sizeof(entity_addr_t)) < 0;
+}
+inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) {
+  return !(a == b);
+}
+inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) {
+  return !(b < a);
+}
+inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) {
+  return b < a;
+}
+inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) {
+  return !(a < b);
+}
+
+namespace std {
+// Hash an entity_addr_t by feeding its raw object bytes to blobhash.
+template<> struct hash<entity_addr_t> {
+  size_t operator()( const entity_addr_t& x ) const {
+    static blobhash hasher;
+    const size_t nbytes = sizeof(x);
+    return hasher(&x, nbytes);
+  }
+};
+} // namespace std
+
+// An ordered list of addresses belonging to one entity, with helpers to
+// pick an address by type.
+struct entity_addrvec_t {
+  std::vector<entity_addr_t> v;
+
+  entity_addrvec_t() {}
+  explicit entity_addrvec_t(const entity_addr_t& a) : v({ a }) {}
+
+  unsigned size() const { return v.size(); }
+  bool empty() const { return v.empty(); }
+
+  // First TYPE_LEGACY address, or a default-constructed address if none.
+  entity_addr_t legacy_addr() const {
+    return addr_of_type(entity_addr_t::TYPE_LEGACY);
+  }
+  // First address that is legacy or "any" (the latter relabelled as
+  // legacy).  If neither exists, the front address is returned relabelled
+  // as legacy regardless of its real type.
+  entity_addr_t as_legacy_addr() const {
+    for (auto& a : v) {
+      if (a.is_legacy()) {
+        return a;
+      }
+      if (a.is_any()) {
+        auto b = a;
+        b.set_type(entity_addr_t::TYPE_LEGACY);
+        return b;
+      }
+    }
+    // hrm... lie!
+    auto a = front();
+    a.set_type(entity_addr_t::TYPE_LEGACY);
+    return a;
+  }
+  // First address, or a default-constructed address when the list is empty.
+  entity_addr_t front() const {
+    if (!v.empty()) {
+      return v.front();
+    }
+    return entity_addr_t();
+  }
+  // First TYPE_LEGACY address if there is one, otherwise front().
+  entity_addr_t legacy_or_front_addr() const {
+    for (auto& a : v) {
+      if (a.type == entity_addr_t::TYPE_LEGACY) {
+        return a;
+      }
+    }
+    return front();
+  }
+  std::string get_legacy_str() const {
+    return legacy_or_front_addr().get_legacy_str();
+  }
+
+  // First TYPE_MSGR2 address, or a default-constructed address if none.
+  entity_addr_t msgr2_addr() const {
+    return addr_of_type(entity_addr_t::TYPE_MSGR2);
+  }
+  bool has_msgr2() const {
+    for (auto& a : v) {
+      if (a.is_msgr2()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Pick an address for the requested type.
+  //  - TYPE_LEGACY / TYPE_MSGR2: the first address of that type; if that is
+  //    missing or has a blank IP, fall back to the first TYPE_ANY address.
+  //  - TYPE_ANY: front().
+  //  - anything else: a default-constructed address.
+  entity_addr_t pick_addr(uint32_t type) const {
+    entity_addr_t picked_addr;
+    switch (type) {
+    case entity_addr_t::TYPE_LEGACY:
+      [[fallthrough]];
+    case entity_addr_t::TYPE_MSGR2:
+      picked_addr = addr_of_type(type);
+      break;
+    case entity_addr_t::TYPE_ANY:
+      return front();
+    default:
+      return {};
+    }
+    if (!picked_addr.is_blank_ip()) {
+      return picked_addr;
+    } else {
+      return addr_of_type(entity_addr_t::TYPE_ANY);
+    }
+  }
+
+  // First address whose type tag equals `type`, or a default-constructed
+  // address when there is no such entry.
+  entity_addr_t addr_of_type(uint32_t type) const {
+    for (auto &a : v) {
+      if (a.type == type) {
+        return a;
+      }
+    }
+    return entity_addr_t();
+  }
+
+  bool parse(const char *s, const char **end = 0);
+
+  // Insert the port of every address into *ports.
+  void get_ports(std::set<int> *ports) const {
+    for (auto& a : v) {
+      ports->insert(a.get_port());
+    }
+  }
+  std::set<int> get_ports() const {
+    std::set<int> r;
+    get_ports(&r);
+    return r;
+  }
+
+  void encode(ceph::buffer::list& bl, uint64_t features) const;
+  void decode(ceph::buffer::list::const_iterator& bl);
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<entity_addrvec_t*>& ls);
+
+  // Equal either as full vectors, or when one side consists of exactly one
+  // legacy address and that address equals the other side's legacy address.
+  bool legacy_equals(const entity_addrvec_t& o) const {
+    if (v == o.v) {
+      return true;
+    }
+    if (v.size() == 1 &&
+        front().is_legacy() &&
+        front() == o.legacy_addr()) {
+      return true;
+    }
+    if (o.v.size() == 1 &&
+        o.front().is_legacy() &&
+        o.front() == legacy_addr()) {
+      return true;
+    }
+    return false;
+  }
+
+  // Element-wise entity_addr_t::probably_equals() over our addresses.
+  // Returns false if `o` has fewer addresses than we do: every one of our
+  // entries needs a counterpart, and indexing o.v past its end would be
+  // undefined behaviour.  Extra trailing entries in `o` are ignored.
+  bool probably_equals(const entity_addrvec_t& o) const {
+    if (o.v.size() < v.size()) {
+      return false;
+    }
+    for (unsigned i = 0; i < v.size(); ++i) {
+      if (!v[i].probably_equals(o.v[i])) {
+        return false;
+      }
+    }
+    return true;
+  }
+  // True when some entry compares equal to `a`.
+  bool contains(const entity_addr_t& a) const {
+    for (auto& i : v) {
+      if (a == i) {
+        return true;
+      }
+    }
+    return false;
+  }
+  // True when some entry is on the same host as `a`.
+  bool is_same_host(const entity_addr_t& a) const {
+    for (auto& i : v) {
+      if (i.is_same_host(a)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Prints nothing for an empty list, the bare address for a single entry,
+  // and the whole vector otherwise.
+  friend std::ostream& operator<<(std::ostream& out, const entity_addrvec_t& av) {
+    if (av.v.empty()) {
+      return out;
+    } else if (av.v.size() == 1) {
+      return out << av.v[0];
+    } else {
+      return out << av.v;
+    }
+  }
+
+  friend bool operator==(const entity_addrvec_t& l, const entity_addrvec_t& r) {
+    return l.v == r.v;
+  }
+  friend bool operator!=(const entity_addrvec_t& l, const entity_addrvec_t& r) {
+    return l.v != r.v;
+  }
+  friend bool operator<(const entity_addrvec_t& l, const entity_addrvec_t& r) {
+    return l.v < r.v;  // see lexicographical_compare()
+  }
+};
+WRITE_CLASS_ENCODER_FEATURES(entity_addrvec_t);
+
+namespace std {
+// Hash an entity_addrvec_t as the sum of the blobhash of each address's raw
+// bytes; an empty vector hashes to 0.
+template<> struct hash<entity_addrvec_t> {
+  size_t operator()( const entity_addrvec_t& x) const {
+    static blobhash hasher;
+    size_t total = 0;
+    for (auto& addr : x.v) {
+      const char *bytes = (const char*)&addr;
+      total += hasher(bytes, sizeof(addr));
+    }
+    return total;
+  }
+};
+} // namespace std
+
+/*
+ * a particular entity instance
+ */
+struct entity_inst_t {
+  entity_name_t name;
+  entity_addr_t addr;
+  entity_inst_t() {}
+  entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {}
+  // Implicit conversion from the ceph_entity_inst struct is intentional.
+  // cppcheck-suppress noExplicitConstructor
+  entity_inst_t(const ceph_entity_inst& i) : name(i.name), addr(i.addr) { }
+  entity_inst_t(const ceph_entity_name& n, const ceph_entity_addr &a) : name(n), addr(a) {}
+  // Convert to ceph_entity_inst.  Marked const because the conversion does
+  // not modify this object, which also lets const instances be converted.
+  // NOTE(review): relies on entity_name_t's conversion to ceph_entity_name
+  // being callable on a const object -- confirm.
+  operator ceph_entity_inst() const {
+    ceph_entity_inst i = {name, addr};
+    return i;
+  }
+
+  // Encode the name, then the address (the address encoding depends on
+  // `features`).
+  void encode(ceph::buffer::list& bl, uint64_t features) const {
+    using ceph::encode;
+    encode(name, bl);
+    encode(addr, bl, features);
+  }
+  // Decode in the same order: name first, then address.
+  void decode(ceph::buffer::list::const_iterator& bl) {
+    using ceph::decode;
+    decode(name, bl);
+    decode(addr, bl);
+  }
+
+  void dump(ceph::Formatter *f) const;
+  static void generate_test_instances(std::list<entity_inst_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(entity_inst_t)
+
+
+// Comparison operators for entity_inst_t: the name is compared first and the
+// address breaks ties.
+inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) {
+  if (!(a.name == b.name)) {
+    return false;
+  }
+  return a.addr == b.addr;
+}
+inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) {
+  if (a.name != b.name) {
+    return true;
+  }
+  return a.addr != b.addr;
+}
+inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) {
+  if (a.name < b.name) {
+    return true;
+  }
+  if (!(a.name == b.name)) {
+    return false;
+  }
+  return a.addr < b.addr;
+}
+inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) {
+  if (a.name < b.name) {
+    return true;
+  }
+  if (!(a.name == b.name)) {
+    return false;
+  }
+  return a.addr <= b.addr;
+}
+inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) {
+  return b < a;
+}
+inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) {
+  return b <= a;
+}
+
+namespace std {
+  // Hash an entity_inst_t as the XOR of its name hash and its address hash.
+  template<> struct hash< entity_inst_t >
+  {
+    size_t operator()( const entity_inst_t& x ) const
+    {
+      static hash< entity_name_t > name_hasher;
+      static hash< entity_addr_t > addr_hasher;
+      const auto name_part = name_hasher(x.name);
+      const auto addr_part = addr_hasher(x.addr);
+      return name_part ^ addr_part;
+    }
+  };
+} // namespace std
+
+
+// Print as "<name> <addr>".
+inline std::ostream& operator<<(std::ostream& out, const entity_inst_t &i)
+{
+  auto& after_name = out << i.name;
+  auto& after_sep = after_name << " ";
+  return after_sep << i.addr;
+}
+// Print a ceph_entity_inst by converting it to entity_inst_t first.
+inline std::ostream& operator<<(std::ostream& out, const ceph_entity_inst &i)
+{
+  const entity_inst_t converted(i);
+  return out << converted;
+}
+
+#endif
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-07 18:45:59 +0000
commit	19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree	42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/msg
parent	Initial commit. (diff)
download	ceph-upstream/16.2.11+ds.tar.xz ceph-upstream/16.2.11+ds.zip